## Problem Statement
- We Want to predict who among our employees is likely to leave

In [50]:
import pandas as pd


In [51]:
df=pd.read_csv('emp_analytics.csv')

In [52]:
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [53]:
df.shape

(14999, 10)

In [54]:
df.isnull().sum()

satisfaction_level       0
last_evaluation          0
number_project           0
average_montly_hours     0
time_spend_company       0
Work_accident            0
left                     0
promotion_last_5years    0
Department               0
salary                   0
dtype: int64

In [55]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14999 non-null  float64
 1   last_evaluation        14999 non-null  float64
 2   number_project         14999 non-null  int64  
 3   average_montly_hours   14999 non-null  int64  
 4   time_spend_company     14999 non-null  int64  
 5   Work_accident          14999 non-null  int64  
 6   left                   14999 non-null  int64  
 7   promotion_last_5years  14999 non-null  int64  
 8   Department             14999 non-null  object 
 9   salary                 14999 non-null  object 
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB


## Feature Engineering

In [56]:
df.groupby('left').mean()

Unnamed: 0_level_0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years
left,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.66681,0.715473,3.786664,199.060203,3.380032,0.175009,0.026251
1,0.440098,0.718113,3.855503,207.41921,3.876505,0.047326,0.005321


In [57]:
df.salary.value_counts()

low       7316
medium    6446
high      1237
Name: salary, dtype: int64

In [58]:
pd.get_dummies(df.salary)

Unnamed: 0,high,low,medium
0,0,1,0
1,0,0,1
2,0,0,1
3,0,1,0
4,0,1,0
...,...,...,...
14994,0,1,0
14995,0,1,0
14996,0,1,0
14997,0,1,0


In [59]:
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


In [60]:
subdf = df[['satisfaction_level','average_montly_hours','promotion_last_5years','salary','left']]
subdf.head()

Unnamed: 0,satisfaction_level,average_montly_hours,promotion_last_5years,salary,left
0,0.38,157,0,low,1
1,0.8,262,0,medium,1
2,0.11,272,0,medium,1
3,0.72,223,0,low,1
4,0.37,159,0,low,1


In [61]:
subdf.columns

Index(['satisfaction_level', 'average_montly_hours', 'promotion_last_5years',
       'salary', 'left'],
      dtype='object')

In [62]:
salaries_dummies=pd.get_dummies(subdf.salary,prefix="salary")

In [63]:
salaries_dummies

Unnamed: 0,salary_high,salary_low,salary_medium
0,0,1,0
1,0,0,1
2,0,0,1
3,0,1,0
4,0,1,0
...,...,...,...
14994,0,1,0
14995,0,1,0
14996,0,1,0
14997,0,1,0


In [64]:
##concatenation - merge columns
df=pd.concat([subdf,salaries_dummies],axis='columns')

In [65]:
df.sample(20)

Unnamed: 0,satisfaction_level,average_montly_hours,promotion_last_5years,salary,left,salary_high,salary_low,salary_medium
10911,0.81,153,0,low,0,0,1,0
3972,0.73,243,0,medium,0,0,0,1
123,0.11,300,0,low,1,0,1,0
13639,0.64,215,0,low,0,0,1,0
11828,0.42,173,0,medium,0,0,0,1
5421,0.97,246,0,low,0,0,1,0
3930,0.89,164,0,high,0,1,0,0
6578,0.83,242,0,low,0,0,1,0
2026,0.56,201,0,medium,0,0,0,1
4100,0.66,244,0,medium,0,0,0,1


Now we need to remove salary column which is text data. It is already replaced by dummy variables so we can safely remove it

In [14]:
df.drop('salary',inplace=True,axis='columns')

In [15]:
df.sample(5)

Unnamed: 0,satisfaction_level,average_montly_hours,promotion_last_5years,left,salary_high,salary_low,salary_medium
14819,0.45,134,0,1,0,0,1
3206,0.4,280,0,0,0,1,0
8537,0.44,210,0,0,0,0,1
916,0.11,304,0,1,0,1,0
5556,0.85,203,0,0,1,0,0


In [16]:
X=df.drop('left',axis='columns')


In [17]:
X.head()

Unnamed: 0,satisfaction_level,average_montly_hours,promotion_last_5years,salary_high,salary_low,salary_medium
0,0.38,157,0,0,1,0
1,0.8,262,0,0,0,1
2,0.11,272,0,0,0,1
3,0.72,223,0,0,1,0
4,0.37,159,0,0,1,0


In [18]:
y=df[['left']]

In [19]:
y

Unnamed: 0,left
0,1
1,1
2,1
3,1
4,1
...,...
14994,1
14995,1
14996,1
14997,1


In [20]:
y.value_counts()

left
0       11428
1        3571
dtype: int64

## Imbalanced dataset

## SMOTE -Synthetic Minority Oversampling Technique

In [None]:
### Install
!pip install imbalanced-learn

In [21]:
from imblearn.over_sampling import SMOTE

In [22]:
sm=SMOTE(random_state=42)

In [23]:
X,y=sm.fit_resample(X,y)

In [24]:
y.value_counts()

left
1       11428
0       11428
dtype: int64

In [25]:
y

Unnamed: 0,left
0,1
1,1
2,1
3,1
4,1
...,...
22851,1
22852,1
22853,1
22854,1


In [26]:
X

Unnamed: 0,satisfaction_level,average_montly_hours,promotion_last_5years,salary_high,salary_low,salary_medium
0,0.380000,157,0,0,1,0
1,0.800000,262,0,0,0,1
2,0.110000,272,0,0,0,1
3,0.720000,223,0,0,1,0
4,0.370000,159,0,0,1,0
...,...,...,...,...,...,...
22851,0.100000,268,0,0,1,0
22852,0.422742,277,0,0,0,1
22853,0.740000,222,0,0,1,0
22854,0.436711,145,0,0,1,0


In [27]:
y.shape

(22856, 1)

In [28]:
X.shape

(22856, 6)

### Split Dataset

In [29]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3)

In [30]:
X_train.shape

(15999, 6)

In [31]:
y_train.shape

(15999, 1)

In [32]:
X_test.shape

(6857, 6)

In [33]:
y_test.shape

(6857, 1)

In [34]:
y_test

Unnamed: 0,left
4347,0
8407,0
19287,1
19186,1
2642,0
...,...
3032,0
20519,1
3301,0
15201,1


In [35]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

In [36]:
model.fit(X_train, y_train)

  return f(**kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [37]:
y_pred=model.predict(X_test)

In [39]:
model.score(X_test,y_test)

0.717514948228088

Our model is 72% accurate

## Evaluate Errors

In [40]:
from sklearn.metrics import classification_report

In [41]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.70      0.74      0.72      3399
           1       0.73      0.70      0.71      3458

    accuracy                           0.72      6857
   macro avg       0.72      0.72      0.72      6857
weighted avg       0.72      0.72      0.72      6857



In [42]:
#confusion Matrix
from sklearn.metrics import confusion_matrix

In [43]:
cm=confusion_matrix(y_test,y_pred)

In [44]:
cm

array([[2514,  885],
       [1052, 2406]], dtype=int64)

In [48]:
### Create a Pickle file using serialization 
import pickle
pickle_out = open("emp-model.pkl","wb")
pickle.dump(model, pickle_out)
pickle_out.close()