In [116]:
import pickle
import requests

import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import mutual_info_score

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [117]:
df = pd.read_csv("cleaned_employee_churn_data.csv")

In [118]:
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=1)

y_train = df_train.left.values
y_val = df_val.left.values
y_test = df_test.left.values

del df_train['left']
del df_val['left']
#del df_test['left']

In [119]:
len(df_train), len(df_val), len(df_test)

(5724, 1908, 1908)

**Testing if everything works**

In [120]:
categorical = ['department', 'salary']
numerical = ['promoted', 'review', 'projects', 'tenure', 'satisfaction', 'bonus', 'avg_hrs_month']

In [121]:
train_dict = df_train[categorical + numerical].to_dict(orient='records')

In [122]:
train_dict[0]

{'department': 'support',
 'salary': 'low',
 'promoted': 0,
 'review': 0.8825964808543181,
 'projects': 4,
 'tenure': 5.0,
 'satisfaction': 0.1898694691395604,
 'bonus': 0,
 'avg_hrs_month': 180.2331289811952}

In [123]:
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)

In [124]:
X_train = dv.transform(train_dict)

In [125]:
dv.get_feature_names_out()

array(['avg_hrs_month', 'bonus', 'department=admin',
       'department=engineering', 'department=finance', 'department=it',
       'department=logistics', 'department=marketing',
       'department=operations', 'department=retail', 'department=sales',
       'department=support', 'projects', 'promoted', 'review',
       'salary=high', 'salary=low', 'salary=medium', 'satisfaction',
       'tenure'], dtype=object)

## Machine learning for classification

In [126]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='liblinear', random_state=1)
model.fit(X_train, y_train)

In [127]:
val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

**Hard predictions**

In [128]:
model.predict(X_val)

array([0, 0, 0, ..., 1, 1, 0])

**Soft predictions**

In [129]:
model.predict_proba(X_val)

array([[0.92036263, 0.07963737],
       [0.63551226, 0.36448774],
       [0.67829254, 0.32170746],
       ...,
       [0.49187423, 0.50812577],
       [0.48854172, 0.51145828],
       [0.84385046, 0.15614954]])

**Selecting where the probability is churn/left**

In [130]:
y_pred = model.predict_proba(X_val)[:, 1]
y_pred

array([0.07963737, 0.36448774, 0.32170746, ..., 0.50812577, 0.51145828,
       0.15614954])

In [131]:
left = y_pred > 0.5

In [132]:
(y_val == left).mean()

0.7253668763102725

In [133]:
df_val[left]

Unnamed: 0,department,promoted,review,projects,salary,tenure,satisfaction,bonus,avg_hrs_month
4771,retail,0,0.819778,3,medium,8.0,0.495601,1,188.668634
7143,sales,0,0.777348,2,medium,8.0,0.445576,0,190.303723
8698,operations,0,0.722346,3,medium,8.0,0.668736,0,188.287870
4404,operations,0,0.821501,3,medium,5.0,0.433486,0,179.908824
5776,sales,0,0.731383,4,medium,10.0,0.552799,1,194.679151
...,...,...,...,...,...,...,...,...,...
4834,retail,0,0.789362,3,medium,7.0,0.349901,1,184.581597
8588,marketing,0,0.765292,4,medium,8.0,0.711322,0,188.009551
3681,sales,0,0.824177,4,medium,4.0,0.487978,0,176.018499
9394,retail,0,0.751656,4,medium,8.0,0.477960,0,189.218370


In [134]:
df_val[left].index

Index([4771, 7143, 8698, 4404, 5776, 7857, 7968, 9369, 4372, 2676,
       ...
       8538, 8393, 3551, 9093, 2035, 4834, 8588, 3681, 9394, 1504],
      dtype='int64', length=157)

In [135]:
len(df_val[left].index)

157

**2nd model for testing**

In [136]:
dicts_train_full = df_train_full[categorical + numerical].to_dict(orient='records')

In [137]:
dicts_train_full[3]

{'department': 'engineering',
 'salary': 'low',
 'promoted': 0,
 'review': 0.7479946598976721,
 'projects': 2,
 'tenure': 6.0,
 'satisfaction': 0.3758591881955768,
 'bonus': 0,
 'avg_hrs_month': 183.37512047619984}

In [138]:
X_train_full = dv.fit_transform(dicts_train_full)

In [139]:
y_train_full = df_train_full.left.values

In [140]:
model = LogisticRegression(solver='liblinear', random_state=1)
model.fit(X_train_full, y_train_full)

## Models for use

In [141]:
def train(df_train, y_train, C=1.0):
    dicts = df_train[categorical + numerical].to_dict(orient='records')

    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(dicts)

    model = LogisticRegression(C=C, max_iter=1000)
    model.fit(X_train, y_train)
    
    return dv, model

In [142]:
def predict(df, dv, model):
    dicts = df[categorical + numerical].to_dict(orient='records')

    X = dv.transform(dicts)
    y_pred = model.predict_proba(X)[:, 1]

    return y_pred

In [143]:
C = 1.0
n_splits = 5

In [144]:
kfold = KFold(n_splits=n_splits, shuffle=True, random_state=1)

scores = []

for train_idx, val_idx in kfold.split(df_train_full):
    df_train = df_train_full.iloc[train_idx]
    df_val = df_train_full.iloc[val_idx]

    y_train = df_train.left.values
    y_val = df_val.left.values

    dv, model = train(df_train, y_train, C=C)
    y_pred = predict(df_val, dv, model)

    auc = roc_auc_score(y_val, y_pred)
    scores.append(auc)

print('C=%s %.3f +- %.3f' % (C, np.mean(scores), np.std(scores)))

C=1.0 0.717 +- 0.005


In [145]:
scores

[0.7230942911591682,
 0.7172374600587591,
 0.7100849556412427,
 0.7207775470302372,
 0.7147296088493511]

## Saving the model

In [146]:
output_file = f'model_C={C}.bin'
output_file

'model_C=1.0.bin'

In [147]:
with open(output_file, 'wb') as f_out: 
    pickle.dump((dv, model), f_out)
f_out

<_io.BufferedWriter name='model_C=1.0.bin'>

In [148]:
input_file = 'model_C=1.0.bin'
with open(input_file, 'rb') as f_in: 
    dv, model = pickle.load(f_in)
model

In [149]:
df_train_full.T

Unnamed: 0,5746,7512,7195,758,6008,7182,7922,3633,8285,3906,...,3462,7751,4225,144,5056,2895,7813,905,5192,235
department,retail,marketing,operations,engineering,sales,support,retail,retail,admin,logistics,...,engineering,support,engineering,support,operations,logistics,admin,operations,retail,sales
promoted,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
review,0.514585,0.608014,0.458128,0.747995,0.673757,0.579624,0.622681,0.729233,0.90172,0.786362,...,0.636671,0.674503,0.840436,0.681606,0.723493,0.586277,0.657189,0.636177,0.552363,0.640193
projects,4,4,3,2,3,4,3,4,2,4,...,4,4,4,2,3,4,4,4,3,3
salary,high,medium,high,low,medium,medium,high,medium,medium,low,...,medium,medium,medium,medium,medium,medium,low,medium,medium,high
tenure,8.0,8.0,6.0,6.0,8.0,7.0,10.0,6.0,8.0,3.0,...,6.0,7.0,4.0,6.0,7.0,6.0,8.0,6.0,7.0,5.0
satisfaction,0.486957,0.5321,0.181083,0.375859,0.405226,0.342821,0.62682,0.340773,0.524451,0.602154,...,0.440692,0.449702,0.448001,0.443414,0.398189,0.554945,0.477037,0.573064,0.347974,0.613226
bonus,1,0,0,0,0,0,1,0,1,0,...,0,0,0,0,1,0,0,0,0,0
avg_hrs_month,190.332987,189.757446,182.985786,183.37512,188.574919,186.496188,193.65353,184.013618,190.088321,175.063746,...,183.619474,184.678179,176.818262,182.53386,187.245536,183.32165,190.081317,183.299195,186.936014,180.462972
left,0,0,0,0,0,0,0,1,1,1,...,1,0,1,0,0,0,0,0,0,0


In [150]:
employee = {
    'department': 'retail',
    'promoted': 0,
    'review': 0.514585,
    'projects': 4,
    'salary': 'high',
    'tenure': 8.0,
    'satisfaction': 0.486957,
    'bonus': 1,
    'avg_hrs_month': 190.332987,
    'left': 0
}

In [151]:
X = dv.transform([employee])

In [152]:
y_pred = model.predict_proba(X)[0, 1]

In [153]:
print('input:', employee)
print('output:', y_pred)

input: {'department': 'retail', 'promoted': 0, 'review': 0.514585, 'projects': 4, 'salary': 'high', 'tenure': 8.0, 'satisfaction': 0.486957, 'bonus': 1, 'avg_hrs_month': 190.332987, 'left': 0}
output: 0.08882579470349221
