# Employee Retention - Classification

## Part 4. Project Delivery

<br id = 'toc'>

**Table of Contents**
1. [Premodeling Function](#pre)
    1. [clean_data](#clean)
    2. [engineer_features](#engine)
2. [Construct Model Class](#construct)
3. [Deploy Model](#deploy)
    1. [in Jupyter Notebook](#jupyter)
    2. [Try on unseen data](#unseen)
    3. [Python script](#script)
4. [User Guide](#guide)

In [9]:
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype

import pickle as pickle

from sklearn.metrics import roc_auc_score

In [10]:
# Load training raw data
df = pd.read_csv('../data/employee_data.csv')
print(df.shape)

(14249, 10)


In [11]:
df.head()

Unnamed: 0,avg_monthly_hrs,department,filed_complaint,last_evaluation,n_projects,recently_promoted,salary,satisfaction,status,tenure
0,221,engineering,,0.932868,4,,low,0.829896,Left,5.0
1,232,support,,,3,,low,0.834544,Employed,2.0
2,184,sales,,0.78883,3,,medium,0.834988,Employed,3.0
3,206,sales,,0.575688,4,,low,0.424764,Employed,2.0
4,249,sales,,0.845217,3,,low,0.779043,Employed,3.0


In [12]:
# drop status to mimic unseen data
df.drop('status', axis = 1, inplace = True)

In [13]:
df.shape

(14249, 9)

In [14]:
# Load processed training data to validate premodeling functions
abt = pd.read_csv('../data/analytical_base_table.csv')

In [15]:
abt.shape

(14068, 25)

In [16]:
y = abt.status

In [17]:
X = abt.drop('status', axis = 1)

[back to top](#toc)
<a id='pre'></a>
### 1. Premodeling Function
<a id = 'clean'></a>
#### 1.A. clean_data

In [18]:
def clean_data(df):
    """Apply the training-time cleaning rules to raw employee records.

    Steps performed, all on a copy (the caller's frame is left untouched):

    * drop rows whose department is ``'temp'``
    * rename department ``'information_technology'`` to ``'IT'`` and label
      missing departments ``'Missing'``
    * recode salary ``low/medium/high`` as ``0/1/2``
    * treat missing ``filed_complaint`` / ``recently_promoted`` as ``0``
    * add a ``last_evaluation_missing`` indicator, then fill missing
      ``last_evaluation`` with ``0.72``

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing at least the columns ``department``, ``salary``,
        ``filed_complaint``, ``recently_promoted`` and ``last_evaluation``.

    Returns
    -------
    pandas.DataFrame
        Cleaned copy with one extra column, ``last_evaluation_missing``.
    """
    salary_levels = {'low': 0, 'medium': 1, 'high': 2}

    df = df[df['department'] != 'temp'].copy()

    # Plain column assignment (not ``df.loc[:, col] = ...``) is used on purpose:
    # since pandas 2.0 the ``.loc`` form writes into the existing column and
    # keeps its old dtype, which would leave ``salary`` as an object column.
    department = df['department'].replace('information_technology', 'IT')
    df['department'] = department.fillna('Missing')

    # Unknown labels are passed through unchanged (same as ``replace``), but
    # the result dtype is inferred from the values, so a fully recoded column
    # is integer without relying on pandas' deprecated silent downcasting.
    df['salary'] = df['salary'].map(
        lambda level: salary_levels.get(level, level))

    df['filed_complaint'] = df['filed_complaint'].fillna(0)
    df['recently_promoted'] = df['recently_promoted'].fillna(0)

    # The indicator must be computed before the missing values are filled.
    df['last_evaluation_missing'] = df['last_evaluation'].isnull().astype(int)
    df['last_evaluation'] = df['last_evaluation'].fillna(0.72)

    return df

In [19]:
cleaned_data = clean_data(df)

In [20]:
cleaned_data.shape

(14068, 10)

[back to top](#toc)
<a id='engine'></a>
#### 1.B. engineer_features

In [23]:
# the step to ensure train/test dataframe columns are aligned especially after pd.get_dummies step
X.head(0)

Unnamed: 0,avg_monthly_hrs,filed_complaint,last_evaluation,n_projects,recently_promoted,salary,satisfaction,tenure,last_evaluation_missing,underperformer,...,department_Missing,department_admin,department_engineering,department_finance,department_management,department_marketing,department_procurement,department_product,department_sales,department_support


In [26]:
def engineer_features(df, trained_df=None):
    """Add the engineered indicators and align columns with the training set.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned data (output of ``clean_data``). It is not modified.
    trained_df : pandas.DataFrame, optional
        Frame whose columns and dtypes define the model's expected input.
        Defaults to ``X.head(0)``, i.e. the notebook-level training features.

    Returns
    -------
    pandas.DataFrame
        New frame with exactly the columns of ``trained_df``, in that order
        and cast to the same dtypes.

    Raises
    ------
    ValueError
        If a non-dummy training column is absent from ``df`` and cannot be
        cast to its (integer) training dtype.
    """
    if trained_df is None:
        trained_df = X.head(0)

    # Work on a copy so the caller's cleaned frame is not given extra columns.
    df = df.copy()
    evaluation = df['last_evaluation']
    satisfaction = df['satisfaction']

    df['underperformer'] = (evaluation < 0.65).astype(int)
    df['overqualified'] = ((satisfaction < 0.2) & (evaluation > 0.7)).astype(int)
    df['overachiever'] = ((evaluation > 0.8) & (satisfaction > 0.7)).astype(int)
    df['burnout'] = ((df['avg_monthly_hrs'] > 240) & (satisfaction < 0.2)).astype(int)

    # Only ``department`` is categorical; naming it keeps any other object
    # column from being one-hot encoded by accident.
    df = pd.get_dummies(df, columns=['department'])

    # A department level seen in training but absent here simply means
    # "no row belongs to it": the dummy is 0, not missing.
    for col in trained_df.columns:
        if str(col).startswith('department_') and col not in df.columns:
            df[col] = 0

    # Keep the training columns in training order; unseen levels are dropped.
    df = df.reindex(columns=trained_df.columns)
    return df.astype(trained_df.dtypes.to_dict())

In [27]:
augmented_data = engineer_features(cleaned_data)

In [28]:
augmented_data.shape

(14068, 24)

In [29]:
augmented_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14068 entries, 0 to 14248
Data columns (total 24 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   avg_monthly_hrs          14068 non-null  int64  
 1   filed_complaint          14068 non-null  float64
 2   last_evaluation          14068 non-null  float64
 3   n_projects               14068 non-null  int64  
 4   recently_promoted        14068 non-null  float64
 5   salary                   14068 non-null  int64  
 6   satisfaction             14068 non-null  float64
 7   tenure                   14068 non-null  float64
 8   last_evaluation_missing  14068 non-null  int64  
 9   underperformer           14068 non-null  int64  
 10  overqualified            14068 non-null  int64  
 11  overachiever             14068 non-null  int64  
 12  burnout                  14068 non-null  int64  
 13  department_IT            14068 non-null  int64  
 14  department_Missing    

In [30]:
# compare with the saved ABT
abt = pd.read_csv('../data/analytical_base_table.csv',float_precision='round_trip')

In [31]:
(augmented_data.reset_index(drop=True) == abt.drop('status', axis=1).reset_index(drop=True)).all().all()

True

[back to top](#toc)
<a id='construct'></a>
### 2. Construct Model Class

In [33]:
# Load saved model
with open('../models/final_model.pkl', 'rb') as f:
    final_model = pickle.load(f)

In [34]:
with open('../models/final_model_auc.pkl', 'rb') as f:
    final_model_auc = pickle.load(f)

In [35]:
roc_auc_score(y, final_model.predict_proba(X)[:, 1])

1.0

In [36]:
final_model_auc

1.0

In [52]:
model_dict = {'final_model': final_model,
             'trained_df': X.head(0)}

In [53]:
with open('../delieverables/model_dict.pkl', 'wb') as f:
    pickle.dump(model_dict, f)

In [64]:
class EmployeeRetentionModel:
    """Bundle the pickled classifier with its cleaning and feature steps.

    The pickle at ``model_dict_location`` must contain a dict with the keys
    ``'final_model'`` (an estimator exposing ``predict_proba``) and
    ``'trained_df'`` (a frame carrying the training columns and dtypes).
    """

    def __init__(self, model_dict_location):
        """Load the estimator and the training-column template.

        NOTE: unpickling can execute arbitrary code -- only load files that
        come from a trusted source.
        """
        with open(model_dict_location, 'rb') as f:
            model_dict = pickle.load(f)
        self.model = model_dict['final_model']
        self.trained_df = model_dict['trained_df']

    @staticmethod
    def _as_flag(value):
        """Interpret a bool, or a string such as ``'True'``, as a boolean."""
        if isinstance(value, str):
            return value.strip().lower() in ('true', '1', 'yes', 'y')
        return bool(value)

    def predict_proba(self, X, clean=True, augment=True):
        """Optionally clean and/or augment ``X``, then score it.

        Parameters
        ----------
        X : pandas.DataFrame
            Raw, cleaned or fully engineered data, depending on the flags.
        clean : bool
            Run ``clean_data`` first (use for raw data).
        augment : bool
            Run ``engineer_features`` before predicting.

        Returns
        -------
        tuple
            ``(frame, probabilities)``: the frame actually passed to the
            model and the output of the model's ``predict_proba``.
        """
        if self._as_flag(clean):
            X = self.clean_data(X)
        if self._as_flag(augment):
            X = self.engineer_features(X)
        return X, self.model.predict_proba(X)

    def clean_data(self, df):
        """Return a cleaned copy of raw employee records.

        Drops ``'temp'`` department rows, normalises department labels,
        recodes salary ``low/medium/high`` as ``0/1/2``, fills missing
        complaint/promotion flags with 0, and adds a
        ``last_evaluation_missing`` indicator before filling missing
        ``last_evaluation`` with ``0.72``.
        """
        salary_levels = {'low': 0, 'medium': 1, 'high': 2}

        df = df[df['department'] != 'temp'].copy()

        # Plain column assignment (not ``df.loc[:, col] = ...``): since
        # pandas 2.0 the ``.loc`` form writes in place and keeps the old
        # dtype, which would leave ``salary`` as an object column.
        department = df['department'].replace('information_technology', 'IT')
        df['department'] = department.fillna('Missing')

        # Unknown labels pass through unchanged; the dtype is inferred from
        # the values, so a fully recoded column is integer.
        df['salary'] = df['salary'].map(
            lambda level: salary_levels.get(level, level))

        df['filed_complaint'] = df['filed_complaint'].fillna(0)
        df['recently_promoted'] = df['recently_promoted'].fillna(0)

        # The indicator must be computed before the missing values are filled.
        df['last_evaluation_missing'] = df['last_evaluation'].isnull().astype(int)
        df['last_evaluation'] = df['last_evaluation'].fillna(0.72)
        return df

    def engineer_features(self, df):
        """Add engineered indicators and align columns with ``trained_df``.

        Returns a new frame with exactly the training columns, in training
        order and cast to the training dtypes. The input is not modified.
        """
        trained_df = self.trained_df
        df = df.copy()
        evaluation = df['last_evaluation']
        satisfaction = df['satisfaction']

        df['underperformer'] = (evaluation < 0.65).astype(int)
        df['overqualified'] = ((satisfaction < 0.2) & (evaluation > 0.7)).astype(int)
        df['overachiever'] = ((evaluation > 0.8) & (satisfaction > 0.7)).astype(int)
        df['burnout'] = ((df['avg_monthly_hrs'] > 240) & (satisfaction < 0.2)).astype(int)
        df = pd.get_dummies(df, columns=['department'])

        # A department level seen in training but absent here means no row
        # belongs to it: the dummy is 0, not missing (NaN would make the
        # integer cast below fail).
        for col in trained_df.columns:
            if str(col).startswith('department_') and col not in df.columns:
                df[col] = 0

        # Keep the training columns in training order; unseen levels are dropped.
        df = df.reindex(columns=trained_df.columns)
        return df.astype(trained_df.dtypes.to_dict())

[back to top](#toc)
<a id='deploy'></a>
### 3. Model Deployment
<a id = 'jupyter'></a>
#### 3.A. in Jupyter Notebook

In [65]:
model = EmployeeRetentionModel('model_dict.pkl')

In [66]:
_, pred = model.predict_proba(X, clean = False, augment = False)

In [67]:
_, pred1 = model.predict_proba(df, clean=True, augment=True)

In [68]:
_, pred2 = model.predict_proba(cleaned_data, clean=False, augment = True)

In [69]:
_, pred3 = model.predict_proba(augmented_data, clean=False, augment=False)

In [70]:
np.array_equal(pred, pred1)

True

In [71]:
np.array_equal(pred2, pred3)

True

In [72]:
np.array_equal(pred1, pred2)

True

[back to top](#toc)
<a id='unseen'></a>
#### 3.B. Try on unseen data

In [73]:
new_df = pd.read_csv('../scr/unseen_employee_data.csv')

In [74]:
new_df.shape

(750, 9)

In [75]:
new_df.head()

Unnamed: 0,avg_monthly_hrs,department,filed_complaint,last_evaluation,n_projects,recently_promoted,salary,satisfaction,tenure
0,228,management,,0.735618,2,,high,0.805661,3.0
1,229,product,,1.0,4,,low,0.719961,4.0
2,196,sales,1.0,0.557426,4,,low,0.749835,2.0
3,207,IT,,0.715171,3,,high,0.987447,3.0
4,129,management,,0.484818,2,,low,0.441219,3.0


In [76]:
retention_model = EmployeeRetentionModel('../delieverables/model_dict.pkl')

In [77]:
new_cleaned_df = clean_data(new_df)

In [78]:
new_cleaned_df.head()

Unnamed: 0,avg_monthly_hrs,department,filed_complaint,last_evaluation,n_projects,recently_promoted,salary,satisfaction,tenure,last_evaluation_missing
0,228,management,0.0,0.735618,2,0.0,2,0.805661,3.0,0
1,229,product,0.0,1.0,4,0.0,0,0.719961,4.0,0
2,196,sales,1.0,0.557426,4,0.0,0,0.749835,2.0,0
3,207,IT,0.0,0.715171,3,0.0,2,0.987447,3.0,0
4,129,management,0.0,0.484818,2,0.0,0,0.441219,3.0,0


In [79]:
new_cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 740 entries, 0 to 749
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   avg_monthly_hrs          740 non-null    int64  
 1   department               740 non-null    object 
 2   filed_complaint          740 non-null    float64
 3   last_evaluation          740 non-null    float64
 4   n_projects               740 non-null    int64  
 5   recently_promoted        740 non-null    float64
 6   salary                   740 non-null    int64  
 7   satisfaction             740 non-null    float64
 8   tenure                   740 non-null    float64
 9   last_evaluation_missing  740 non-null    int64  
dtypes: float64(5), int64(4), object(1)
memory usage: 63.6+ KB


In [80]:
new_cleaned_df.to_csv('../data/unseen_cleaned.csv', index = None)

In [81]:
new_augmented_df = engineer_features(new_cleaned_df)

In [82]:
new_augmented_df.head()

Unnamed: 0,avg_monthly_hrs,filed_complaint,last_evaluation,n_projects,recently_promoted,salary,satisfaction,tenure,last_evaluation_missing,underperformer,...,department_Missing,department_admin,department_engineering,department_finance,department_management,department_marketing,department_procurement,department_product,department_sales,department_support
0,228,0.0,0.735618,2,0.0,2,0.805661,3.0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,229,0.0,1.0,4,0.0,0,0.719961,4.0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,196,1.0,0.557426,4,0.0,0,0.749835,2.0,0,1,...,0,0,0,0,0,0,0,0,1,0
3,207,0.0,0.715171,3,0.0,2,0.987447,3.0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,129,0.0,0.484818,2,0.0,0,0.441219,3.0,0,1,...,0,0,0,0,1,0,0,0,0,0


In [83]:
new_augmented_df.shape

(740, 24)

In [84]:
new_augmented_df.to_csv('../data/unseen_abt.csv', index = None)

In [85]:
_, pred1 = model.predict_proba(new_augmented_df, clean=False, augment=False)

In [86]:
_, pred2 = model.predict_proba(new_cleaned_df, clean = False, augment = True)

In [87]:
_, pred3 = model.predict_proba(new_df, clean=True, augment=True)

In [88]:
np.array_equal(pred1, pred2)

True

In [89]:
pred1[:5]

array([[1.   , 0.   ],
       [0.895, 0.105],
       [0.995, 0.005],
       [1.   , 0.   ],
       [0.   , 1.   ]])

In [90]:
pred2[:5]

array([[1.   , 0.   ],
       [0.895, 0.105],
       [0.995, 0.005],
       [1.   , 0.   ],
       [0.   , 1.   ]])

[back to top](#toc)
<a id='script'></a>
#### 3.C. Python script

<div class = 'alert alert-info'>

1. create `retention_model.py`
2. locate the data and model files, e.g., `unseen_data.csv` and `model_dict.pkl`
3. in a terminal, change to the directory where `retention_model.py` is saved and type: `python retention_model.py data_location output_location model_location True True`; the last two arguments are the values for the `clean=` and `augment=` parameters of the `predict_proba` method.
4. predictions will be saved in the directory specified in the above command.

For example, if data, model files are all saved under one directory, we can run the model as follows:

`python retention_model.py unseen_data.csv prediction.csv model_dict.pkl True True`

    

</div>



```python
#retention_model.py
import numpy as np
import pandas as pd

import pickle as pickle
import sklearn
import sys

class EmployeeRetentionModel:
    """Bundle the pickled classifier with its cleaning and feature steps.

    The pickle at ``model_dict_location`` must contain a dict with the keys
    ``'final_model'`` (an estimator exposing ``predict_proba``) and
    ``'trained_df'`` (a frame carrying the training columns and dtypes).
    """

    def __init__(self, model_dict_location):
        """Load the estimator and the training-column template.

        NOTE: unpickling can execute arbitrary code -- only load files that
        come from a trusted source.
        """
        with open(model_dict_location, 'rb') as f:
            model_dict = pickle.load(f)
        self.model = model_dict['final_model']
        self.trained_df = model_dict['trained_df']

    @staticmethod
    def _as_flag(value):
        """Interpret a bool, or a command-line string such as ``'True'``."""
        if isinstance(value, str):
            return value.strip().lower() in ('true', '1', 'yes', 'y')
        return bool(value)

    def predict_proba(self, X, clean=True, augment=True):
        """Optionally clean and/or augment ``X``, then score it.

        Parameters
        ----------
        X : pandas.DataFrame
            Raw, cleaned or fully engineered data, depending on the flags.
        clean : bool
            Run ``clean_data`` first (use for raw data).
        augment : bool
            Run ``engineer_features`` before predicting.

        Returns
        -------
        tuple
            ``(frame, probabilities)``: the frame actually passed to the
            model and the output of the model's ``predict_proba``.
        """
        # Flags coming from ``sys.argv`` are strings; ``'True' == True`` is
        # False in Python, so they are parsed instead of compared.
        if self._as_flag(clean):
            X = self.clean_data(X)
        if self._as_flag(augment):
            X = self.engineer_features(X)
        return X, self.model.predict_proba(X)

    def clean_data(self, df):
        """Return a cleaned copy of raw employee records.

        Drops ``'temp'`` department rows, normalises department labels,
        recodes salary ``low/medium/high`` as ``0/1/2``, fills missing
        complaint/promotion flags with 0, and adds a
        ``last_evaluation_missing`` indicator before filling missing
        ``last_evaluation`` with ``0.72``.
        """
        salary_levels = {'low': 0, 'medium': 1, 'high': 2}

        df = df[df['department'] != 'temp'].copy()

        # Plain column assignment (not ``df.loc[:, col] = ...``): since
        # pandas 2.0 the ``.loc`` form writes in place and keeps the old
        # dtype, which would leave ``salary`` as an object column.
        department = df['department'].replace('information_technology', 'IT')
        df['department'] = department.fillna('Missing')

        # Unknown labels pass through unchanged; the dtype is inferred from
        # the values, so a fully recoded column is integer.
        df['salary'] = df['salary'].map(
            lambda level: salary_levels.get(level, level))

        df['filed_complaint'] = df['filed_complaint'].fillna(0)
        df['recently_promoted'] = df['recently_promoted'].fillna(0)

        # The indicator must be computed before the missing values are filled.
        df['last_evaluation_missing'] = df['last_evaluation'].isnull().astype(int)
        df['last_evaluation'] = df['last_evaluation'].fillna(0.72)
        return df

    def engineer_features(self, df):
        """Add engineered indicators and align columns with ``trained_df``.

        Returns a new frame with exactly the training columns, in training
        order and cast to the training dtypes. The input is not modified.
        """
        trained_df = self.trained_df
        df = df.copy()
        evaluation = df['last_evaluation']
        satisfaction = df['satisfaction']

        df['underperformer'] = (evaluation < 0.65).astype(int)
        df['overqualified'] = ((satisfaction < 0.2) & (evaluation > 0.7)).astype(int)
        df['overachiever'] = ((evaluation > 0.8) & (satisfaction > 0.7)).astype(int)
        df['burnout'] = ((df['avg_monthly_hrs'] > 240) & (satisfaction < 0.2)).astype(int)
        df = pd.get_dummies(df, columns=['department'])

        # A department level seen in training but absent here means no row
        # belongs to it: the dummy is 0, not missing (NaN would make the
        # integer cast below fail).
        for col in trained_df.columns:
            if str(col).startswith('department_') and col not in df.columns:
                df[col] = 0

        # Keep the training columns in training order; unseen levels are dropped.
        df = df.reindex(columns=trained_df.columns)
        return df.astype(trained_df.dtypes.to_dict())
    
def _parse_flag(value):
    """Convert a command-line flag such as ``'True'`` or ``'false'`` to bool."""
    if isinstance(value, str):
        text = value.strip().lower()
        if text in ('true', '1', 'yes', 'y'):
            return True
        if text in ('false', '0', 'no', 'n'):
            return False
        raise ValueError(
            "expected True or False for a flag argument, got %r" % value)
    return bool(value)


def main(data_location, output_location, model_dict_location, clean=True, augment=True):
    """Score a CSV of employee records and write the result to a CSV.

    Parameters
    ----------
    data_location : str
        Path of the input CSV.
    output_location : str
        Path where the model input plus a ``prediction`` column is written.
    model_dict_location : str
        Path of the pickled model dictionary.
    clean, augment : bool or str
        Whether to clean / engineer features before predicting. Strings are
        accepted because values taken from ``sys.argv`` are always strings.

    Raises
    ------
    ValueError
        If a flag is given as a string that is not a recognised spelling of
        true or false.
    """
    # Without this conversion the string 'True' compares unequal to the
    # boolean True, so both steps would be silently skipped from the CLI.
    clean = _parse_flag(clean)
    augment = _parse_flag(augment)

    df = pd.read_csv(data_location)
    retention_model = EmployeeRetentionModel(model_dict_location)

    df, pred = retention_model.predict_proba(df, clean=clean, augment=augment)
    # The second probability column is the one reported as the prediction.
    df['prediction'] = pred[:, 1]

    df.to_csv(output_location, index=False)


if __name__ == '__main__':
    main(*sys.argv[1:])
```    

In [91]:
# after running previous command in terminal, prediction is saved in the prediction.csv file
pred4 = pd.read_csv('../data/prediction.csv')

In [92]:
pred4.head()

Unnamed: 0,avg_monthly_hrs,filed_complaint,last_evaluation,n_projects,recently_promoted,salary,satisfaction,tenure,last_evaluation_missing,underperformer,...,department_admin,department_engineering,department_finance,department_management,department_marketing,department_procurement,department_product,department_sales,department_support,prediction
0,228,0.0,0.735618,2,0.0,2,0.805661,3.0,0,0,...,0,0,0,1,0,0,0,0,0,0.0
1,229,0.0,1.0,4,0.0,0,0.719961,4.0,0,0,...,0,0,0,0,0,0,1,0,0,0.105
2,196,1.0,0.557426,4,0.0,0,0.749835,2.0,0,1,...,0,0,0,0,0,0,0,1,0,0.005
3,207,0.0,0.715171,3,0.0,2,0.987447,3.0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
4,129,0.0,0.484818,2,0.0,0,0.441219,3.0,0,1,...,0,0,0,1,0,0,0,0,0,1.0


<div class = 'alert alert-info'>
    We can see the prediction in the last column above
    </div>

**This is the end of analysis for this project. The project is summarized in the README.md document, and deliverables can be located in the delieverables folder.**

**Please feel free to reach out with further questions, improvements, and anything you would like to share.**