In [232]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


Reading the CSV files

In [233]:
# Load the training and test sets from CSV files in the working directory.
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [234]:
# Preview the first three rows of the raw training data.
df_train.head(3)

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,7513,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0


Checking for missing values

In [235]:
# Count the missing values in each column of the training data.
df_train.isnull().sum()

employee_id                0
department                 0
region                     0
education               2409
gender                     0
recruitment_channel        0
no_of_trainings            0
age                        0
previous_year_rating    4124
length_of_service          0
KPIs_met >80%              0
awards_won?                0
avg_training_score         0
is_promoted                0
dtype: int64

In [236]:
# Frequency of each education level (missing entries are not counted).
df_train["education"].value_counts()

education
Bachelor's          36669
Master's & above    14925
Below Secondary       805
Name: count, dtype: int64

In [237]:
# Frequency of each previous-year rating (missing entries are not counted).
df_train["previous_year_rating"].value_counts()

previous_year_rating
3.0    18618
5.0    11741
4.0     9877
1.0     6223
2.0     4225
Name: count, dtype: int64

Filling missing values

In [238]:
def fill_missings(dataframe):
    """Fill the gaps in ``previous_year_rating`` and ``education``.

    ``previous_year_rating`` gaps receive the column mean rounded to the
    nearest whole number. ``education`` gaps receive the next valid value
    below them (backward fill); gaps at the very end of the column, which
    have nothing below them, receive the last valid value above (forward
    fill).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing the ``previous_year_rating`` and ``education``
        columns. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object, with both columns filled.
    """
    media = dataframe['previous_year_rating'].mean().round()
    # Assign the filled column back to the frame. Calling
    # fillna(inplace=True) on a selected column is chained assignment and
    # does not update the frame when pandas Copy-on-Write is active.
    dataframe['previous_year_rating'] = dataframe['previous_year_rating'].fillna(media)
    # bfill() replaces the deprecated fillna(method='bfill'); the trailing
    # ffill() covers missing values in the last rows, which a backward fill
    # alone would leave as NaN.
    dataframe['education'] = dataframe['education'].bfill().ffill()
    return dataframe

In [239]:
# Impute the training data, then re-count missing values per column to verify.
df_train = fill_missings(df_train)
df_train.isnull().sum()

employee_id             0
department              0
region                  0
education               0
gender                  0
recruitment_channel     0
no_of_trainings         0
age                     0
previous_year_rating    0
length_of_service       0
KPIs_met >80%           0
awards_won?             0
avg_training_score      0
is_promoted             0
dtype: int64

In [240]:
# Preview the first three rows after imputation.
df_train.head(3)

Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,7513,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0


Dummy (one-hot) encoding

In [241]:
def dummyzation(dataframe):
    """Replace the categorical columns with integer dummy columns.

    ``department``, ``region``, ``education``, ``gender`` and
    ``recruitment_channel`` are encoded in that order. Each set of dummy
    columns is appended on the right and the source column is removed.
    The dummy columns are named after the category values themselves (no
    prefix). ``gender`` drops its first level, so it yields one column
    fewer than it has categories. Finally ``employee_id`` is removed.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing the five categorical columns and ``employee_id``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the encoded columns; the input is left untouched.
    """
    # (column to encode, whether its first level is dropped), in encoding order.
    encoding_plan = (
        ('department', False),
        ('region', False),
        ('education', False),
        ('gender', True),
        ('recruitment_channel', False),
    )

    for column, skip_first_level in encoding_plan:
        indicators = pd.get_dummies(
            dataframe[column], dtype=int, drop_first=skip_first_level
        )
        dataframe = pd.concat([dataframe, indicators], axis=1).drop(columns=[column])

    return dataframe.drop(columns=['employee_id'])

In [242]:
# Encode the categorical columns of the training data and preview the result.
df_train = dummyzation(df_train)
df_train.head(3)


Unnamed: 0,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted,Analytics,Finance,...,region_7,region_8,region_9,Bachelor's,Below Secondary,Master's & above,m,other,referred,sourcing
0,1,35,5.0,8,1,0,49,0,0,0,...,1,0,0,0,0,1,0,0,0,1
1,1,30,5.0,4,0,0,60,0,0,0,...,0,0,0,1,0,0,1,1,0,0
2,1,34,3.0,7,0,0,50,0,0,0,...,0,0,0,1,0,0,1,0,0,1


Filling missing values and dummy-encoding df_test

In [243]:
# Apply the same imputation and encoding steps to the test data.
df_test = fill_missings(df_test)
df_test = dummyzation(df_test)
# The test frame is encoded on its own, so its dummy columns follow the
# categories present in the test data. Reindex onto the training feature
# columns (everything except the target): a category absent from the test
# data becomes an all-zero column, a category unseen in training is dropped,
# and the column order matches the training features.
df_test = df_test.reindex(columns=df_train.columns.drop('is_promoted'), fill_value=0)
df_test.head(3)

Unnamed: 0,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,Analytics,Finance,HR,...,region_7,region_8,region_9,Bachelor's,Below Secondary,Master's & above,m,other,referred,sourcing
0,1,24,3.0,1,1,0,77,0,0,0,...,0,0,0,1,0,0,1,0,0,1
1,1,31,3.0,5,0,0,51,0,0,1,...,0,0,0,1,0,0,0,1,0,0
2,1,31,1.0,4,0,0,47,0,0,0,...,0,0,0,1,0,0,1,1,0,0


Model training

In [244]:
from sklearn.linear_model import LogisticRegression

In [245]:
# Target: the promotion flag. Features: every other column of df_train.
y = df_train['is_promoted']
X = df_train.drop(columns=['is_promoted'])

# Logistic-regression classifier allowed up to 1000 solver iterations.
modelo = LogisticRegression(max_iter=1000)


In [246]:
# Scale X: fit a StandardScaler on the feature matrix and transform it.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [247]:
# Fit the classifier on the scaled features.
modelo.fit(X_scaled,y)

In [248]:
# Predict on the same scaled features the model was fitted on, so any score
# computed from these predictions is a training-set score, not a held-out estimate.
predicciones = modelo.predict(X_scaled)

Model evaluation

In [249]:
from sklearn.metrics import precision_score, recall_score,accuracy_score, f1_score, confusion_matrix

In [253]:
# Accuracy of the predictions against the true labels y.
accuracy_score(y,predicciones)

0.9320902058093709

In [251]:
# Precision of the predictions against the true labels y.
precision_score(y,predicciones)

0.8028169014084507

In [252]:
# Recall of the predictions against the true labels y.
recall_score(y,predicciones)   

0.2686375321336761

In [250]:
# F1 score of the predictions against the true labels y.
f1_score(y,predicciones)

0.4025682182985554