# HR:
* 1. Cargar y explorar los datos
* 2. Transformar los datos
* 3. Modelar los Datos
* 4. Crear el proceso de Score y evaluar los modelos
* 5. Crear pipeline para scorear los datos (score.py)

In [3]:
# 1. Load the data:
import pandas as pd
df = pd.read_csv('HR_train.csv') 
df.head(3)

Unnamed: 0,id,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,area,salary,left
0,3903,0.78,0.79,3,203,2,0,0,sales,low,0
1,2048,0.48,0.85,3,279,4,0,0,IT,low,0
2,4181,0.19,0.93,3,110,4,0,0,technical,medium,0


In [4]:
# Target column, categorical predictors, and every remaining column
# (anything that is not the id, the target or a categorical predictor).
tar = 'left'
vard = ['area', 'salary']
non_features = {'id', tar, *vard}
varc = [col for col in df.columns if col not in non_features]

# 2. Transformar los datos:

In [5]:
seed_ = 10

In [6]:
# First, split the data into training (80%) and validation (20%) sets:
from sklearn.model_selection import train_test_split

# Features (remaining + categorical columns) and target, copied so that later
# edits do not touch df.
X = df[varc+vard].copy()
y = df[tar].copy()

# A fixed random_state keeps the split reproducible between runs.
Xt, Xv, yt, yv = train_test_split(X, y, test_size=.2, random_state=seed_)

In [7]:
Xt['area'].value_counts(1, dropna=False)

sales          0.281279
technical      0.180956
support        0.150641
IT             0.080633
product_mng    0.057610
marketing      0.057193
RandD          0.051672
accounting     0.050943
hr             0.048443
management     0.040629
Name: area, dtype: float64

In [8]:
Xt['salary'].value_counts(1, dropna=False)

low       0.486926
medium    0.430357
high      0.082717
Name: salary, dtype: float64

In [9]:
import numpy as np

def norm_cat(df, column, threshold=0.05, label='category', others_label='Others',
            new_col=True):
    """Collapse the infrequent categories of ``column`` into a single label.

    A category keeps its own name when its relative frequency in ``df`` is
    strictly greater than ``threshold``; otherwise it is mapped to
    ``others_label``. Missing values are counted as a category of their own
    (``dropna=False``).

    Note that ``df`` is modified in place and is also returned.

    Args:
        df: DataFrame that holds ``column``.
        column: Name of the categorical column to normalize.
        threshold: Relative frequency a category must exceed to be kept.
        label: Unused; retained so existing calls that pass it keep working.
        others_label: Replacement value for the infrequent categories.
        new_col: When true the result is written to ``column + '_norm'``;
            otherwise ``column`` itself is overwritten.

    Returns:
        A tuple ``(df, mapping)`` where ``mapping`` is a dict from each
        original category to its normalized value.
    """
    # Work on the frequency Series directly. Wrapping it in a DataFrame and
    # reading the values back through ``column`` fails on pandas >= 2.0,
    # where value_counts(normalize=True) names its result 'proportion'.
    freqs = df[column].value_counts(normalize=True, dropna=False)
    aux_dict = {
        category: category if freq > threshold else others_label
        for category, freq in freqs.items()
    }

    target_col = column + '_norm' if new_col else column
    df[target_col] = df[column].map(aux_dict)

    return df, aux_dict

def WoE(df, column, tar, label='_WoE'):
    """Encode ``column`` with the Weight of Evidence (WoE) of each category.

    For every category the WoE is ``log(p1 / p0)``, where ``p1`` is the share
    of all ``tar == 1`` rows that fall in the category and ``p0`` the share of
    all ``tar == 0`` rows. The target is expected to be coded as 0 / 1.

    Missing values of ``column`` are replaced by the string ``'Missings'``
    before counting. A category that has no rows for one of the two classes
    gets an infinite WoE (``-inf`` without positives, ``+inf`` without
    negatives); it no longer turns the WoE of the other categories into NaN.

    Note that ``df`` is modified in place (``column`` is filled and
    ``column + label`` is added) and is also returned.

    Args:
        df: DataFrame that holds ``column`` and ``tar``.
        column: Name of the categorical column to encode.
        tar: Name of the binary (0 / 1) target column.
        label: Suffix appended to ``column`` to name the encoded column.

    Returns:
        A tuple ``(df, mapping)`` where ``mapping`` is a dict from each
        category to its WoE value.
    """
    # Assign the filled column back: calling fillna(inplace=True) on
    # df[column] is chained assignment and does not update df under pandas
    # Copy-on-Write.
    df[column] = df[column].fillna('Missings')

    # Rows per (category, class). fill_value=0 matters: a NaN count would
    # poison the per-class totals below and make every WoE NaN.
    counts = df.groupby([column, tar]).size().unstack(fill_value=0)

    # Share of each class that falls in every category (columns sum to 1).
    shares = counts / counts.sum()
    woe = np.log(shares[1] / shares[0])
    aux_dict = woe.to_dict()

    df[column+label] = df[column].map(aux_dict)

    return df, aux_dict

In [10]:
# Learn the category normalization and the WoE tables on a copy of the
# training split with the target attached; Xt itself is left untouched here.
aux = Xt.copy()
aux[tar] = yt
norms, woes = {}, {}

for c in vard:
    # The WoE is computed on the normalized ('<column>_norm') categories.
    aux, norms[c] = norm_cat(aux, c)
    aux, woes[c] = WoE(aux, c+'_norm', tar)

aux.head(2)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,area,salary,left,area_norm,area_norm_WoE,salary_norm,salary_norm_WoE
8282,0.8,0.6,5,217,3,0,0,management,high,0,Others,-0.074886,high,-1.500416
63,0.61,0.55,5,266,2,0,0,sales,medium,0,sales,0.055122,medium,-0.182641


In [11]:
def woes_norms(df, disc_cols, woes_cols, disc_dict, woe_dict,
               label='_norm', wlabel='_woe'):
    """Apply previously built normalization and WoE mappings to ``df``.

    For each column in ``disc_cols`` a ``column + label`` column is added with
    the normalized category; values absent from the mapping become 'Others'.
    For each column in ``woes_cols`` a ``column + wlabel`` column is added with
    the WoE of the normalized category; categories absent from the mapping
    get 0.

    ``df`` is modified in place and is also returned.

    Args:
        df: DataFrame to transform.
        disc_cols: Columns to normalize.
        woes_cols: Columns to WoE-encode; their ``column + label`` column must
            exist once the normalization step has run.
        disc_dict: Dict of column name -> {raw category: normalized category}.
        woe_dict: Dict of column name -> {normalized category: WoE value}.
        label: Suffix of the normalized columns.
        wlabel: Suffix of the WoE columns.

    Returns:
        The same ``df`` with the new columns.
    """
    # Step 1: raw category -> normalized category.
    for col in disc_cols:
        categories = disc_dict.get(col)
        df[col+label] = df[col].map(
            lambda raw: categories.get(raw, 'Others'))

    # Step 2: normalized category -> WoE value.
    for col in woes_cols:
        weights = woe_dict.get(col)
        df[col+wlabel] = df[col+label].map(
            lambda cat: weights.get(cat, 0))

    return(df)

In [12]:
Xt = woes_norms(Xt, vard, vard, norms, woes)
Xv = woes_norms(Xv, vard, vard, norms, woes)

In [13]:
Xt.head(2)

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,area,salary,area_norm,salary_norm,area_woe,salary_woe
8282,0.8,0.6,5,217,3,0,0,management,high,Others,high,-0.074886,-1.500416
63,0.61,0.55,5,266,2,0,0,sales,medium,sales,medium,0.055122,-0.182641


# 3. Modelar los datos:

In [14]:
predictors = varc + ['salary_woe', 'area_woe']

In [15]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import precision_score, classification_report
from sklearn.pipeline import make_pipeline

In [16]:
# Scale every feature to unit variance without centering (with_mean=False),
# then fit a Linear Discriminant Analysis classifier. verbose makes the
# pipeline print the time spent on each step while fitting.
pipe = make_pipeline(StandardScaler(with_mean=False),
                    LinearDiscriminantAnalysis(),
                    verbose=2)

In [17]:
%%time
pipe.fit(Xt[predictors], yt)

[Pipeline] .... (step 1 of 2) Processing standardscaler, total=   0.0s
[Pipeline]  (step 2 of 2) Processing lineardiscriminantanalysis, total=   0.0s
CPU times: user 9.4 ms, sys: 1.78 ms, total: 11.2 ms
Wall time: 10.2 ms


# 4. Predecir y evaluar el modelo:

In [18]:
# Precision on the training split, then on the validation split.
print(precision_score(yt, pipe.predict(Xt[predictors])))
print(precision_score(yv, pipe.predict(Xv[predictors])))

0.5618153364632238
0.5733333333333334


In [19]:
import os

import joblib as jb

# Folder and file name of the serialized pipeline.
out_path = 'hr_models/'
pipe_name = 'pipe_hr.joblib'

# jb.dump does not create missing folders, so make sure the target exists.
os.makedirs(out_path, exist_ok=True)
jb.dump(pipe, out_path+pipe_name)

['hr_models/pipe_hr.joblib']

# 5. Construcción del score:

In [20]:
filename = 'HR_test.csv'
# NOTE(review): general_data_path is not defined in any cell shown here, and
# the training file is read without it -- confirm where it is set.
# This rebinds df, which held the training data until now.
df = pd.read_csv(general_data_path+filename)
df.head(3)

Unnamed: 0,id,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,area,salary
0,513,0.11,0.89,6,293,4,0,0,support,low
1,4942,0.28,0.51,3,124,3,0,0,technical,low
2,11233,0.7,0.85,4,142,2,0,0,support,low


In [21]:
# Proceso de score:

import pandas as pd
import numpy as np
import joblib as jb

def churn_employee(df, features, joblib_name, label='left'):
    """Score ``df`` with a pipeline stored on disk.

    The pipeline is loaded from ``joblib_name`` with joblib and its
    predictions for ``df[features]`` are stored in the ``label`` column.
    ``df`` is modified in place and is also returned.

    Args:
        df: DataFrame that contains every column listed in ``features``.
        features: Columns passed to the pipeline's ``predict``.
        joblib_name: Path of the serialized pipeline.
        label: Name of the column that receives the predictions.

    Returns:
        The same ``df`` with the prediction column added.
    """
    model = jb.load(joblib_name)
    df[label] = model.predict(df[features])
    return df

In [22]:
# Run the scoring process:

# Build the full path of the serialized pipeline. The guard keeps this cell
# idempotent: re-running it must not prepend out_path a second time.
if not pipe_name.startswith(out_path):
    pipe_name = out_path+pipe_name

# Step 1. Apply the stored normalization and WoE mappings:
df_test = woes_norms(df, vard, vard, norms, woes)

# Step 2. Score:
df_test = churn_employee(df_test, predictors, pipe_name)
df_test[['id', 'left']]

Unnamed: 0,id,left
0,513,1
1,4942,1
2,11233,0
3,10476,0
4,4193,0
...,...,...
2995,6502,0
2996,8046,0
2997,14127,0
2998,6297,0
