In [267]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_rows',200)

from sklearn.preprocessing import OneHotEncoder,LabelEncoder,MinMaxScaler

#from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import xgboost as xgb
from xgboost import XGBClassifier
import lightgbm as lgb


pd.set_option('max_columns', 100)
pd.set_option('max_rows', 100)

### Read the data

In [268]:
train = pd.read_csv("data/train_MpHjUjU.csv")
test = pd.read_csv("data/test_hXY9mYw.csv")
submission = pd.read_csv("data/sample_submission_znWiLZ4.csv")

In [269]:
train.head()

Unnamed: 0,MMM-YY,Emp_ID,Age,Gender,City,Education_Level,Salary,Dateofjoining,LastWorkingDate,Joining Designation,Designation,Total Business Value,Quarterly Rating
0,01-01-2016,1,28,Male,C23,Master,57387,24-12-2015,,1,1,2381060,2
1,01-02-2016,1,28,Male,C23,Master,57387,24-12-2015,,1,1,-665480,2
2,01-03-2016,1,28,Male,C23,Master,57387,24-12-2015,11-03-2016,1,1,0,2
3,01-11-2017,2,31,Male,C7,Master,67016,06-11-2017,,2,2,0,1
4,01-12-2017,2,31,Male,C7,Master,67016,06-11-2017,,2,2,0,1


In [270]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19104 entries, 0 to 19103
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   MMM-YY                19104 non-null  object
 1   Emp_ID                19104 non-null  int64 
 2   Age                   19104 non-null  int64 
 3   Gender                19104 non-null  object
 4   City                  19104 non-null  object
 5   Education_Level       19104 non-null  object
 6   Salary                19104 non-null  int64 
 7   Dateofjoining         19104 non-null  object
 8   LastWorkingDate       1616 non-null   object
 9   Joining Designation   19104 non-null  int64 
 10  Designation           19104 non-null  int64 
 11  Total Business Value  19104 non-null  int64 
 12  Quarterly Rating      19104 non-null  int64 
dtypes: int64(7), object(6)
memory usage: 1.9+ MB


In [271]:
test.head()

Unnamed: 0,Emp_ID
0,394
1,173
2,1090
3,840
4,308


In [272]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 741 entries, 0 to 740
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Emp_ID  741 non-null    int64
dtypes: int64(1)
memory usage: 5.9 KB


In [273]:
train.nunique()

MMM-YY                     24
Emp_ID                   2381
Age                        36
Gender                      2
City                       29
Education_Level             3
Salary                   2383
Dateofjoining             869
LastWorkingDate           493
Joining Designation         5
Designation                 5
Total Business Value    10181
Quarterly Rating            4
dtype: int64

In [274]:
train.isnull().sum()

MMM-YY                      0
Emp_ID                      0
Age                         0
Gender                      0
City                        0
Education_Level             0
Salary                      0
Dateofjoining               0
LastWorkingDate         17488
Joining Designation         0
Designation                 0
Total Business Value        0
Quarterly Rating            0
dtype: int64

In [275]:
train.dtypes

MMM-YY                  object
Emp_ID                   int64
Age                      int64
Gender                  object
City                    object
Education_Level         object
Salary                   int64
Dateofjoining           object
LastWorkingDate         object
Joining Designation      int64
Designation              int64
Total Business Value     int64
Quarterly Rating         int64
dtype: object

In [276]:
train['MMM-YY'] = pd.to_datetime(train['MMM-YY'])
train['Dateofjoining'] = pd.to_datetime(train['Dateofjoining'])
train['LastWorkingDate'] = pd.to_datetime(train['LastWorkingDate'])

In [277]:
emp_churned = train[~train.LastWorkingDate.isnull()]['Emp_ID']

In [278]:
emp_demo_details = ['Emp_ID','Age','Gender','City','Education_Level']

In [279]:
df = train[emp_demo_details]

In [280]:
df = df.drop_duplicates(subset=['Emp_ID'])

In [281]:
df.shape

(2381, 5)

In [282]:
churn = df.Emp_ID.isin(emp_churned)
df['Churn'] = churn
df['Churn'].replace({False: 0, True: 1}, inplace=True)
df.head(10)

Unnamed: 0,Emp_ID,Age,Gender,City,Education_Level,Churn
0,1,28,Male,C23,Master,1
3,2,31,Male,C7,Master,0
5,4,43,Male,C13,Master,1
10,5,29,Male,C9,College,1
13,6,31,Female,C11,Bachelor,0
18,8,34,Male,C2,College,1
21,11,28,Female,C19,Master,0
22,12,35,Male,C23,Master,1
28,13,29,Male,C19,Master,1
51,14,39,Female,C26,College,0


In [283]:
emp_work_details = ['MMM-YY','Dateofjoining','LastWorkingDate','Designation',
                    'Joining Designation','Total Business Value','Quarterly Rating']

In [284]:
# Promotion
temp = train[['Emp_ID','Designation','Joining Designation']]
temp = temp.groupby(['Emp_ID']).nth(-1)


temp['Promotion'] = temp['Designation'] - temp['Joining Designation']
temp['Promotion'] = temp['Promotion'].tolist()

df['Designation'] = temp['Designation'].tolist()
df['Promotion'] = temp['Promotion'].tolist()


In [285]:
df.head()

Unnamed: 0,Emp_ID,Age,Gender,City,Education_Level,Churn,Designation,Promotion
0,1,28,Male,C23,Master,1,1,0
3,2,31,Male,C7,Master,0,2,0
5,4,43,Male,C13,Master,1,2,0
10,5,29,Male,C9,College,1,1,0
13,6,31,Female,C11,Bachelor,0,3,0


In [286]:
df_1 = train[['Emp_ID', 'MMM-YY','Dateofjoining','LastWorkingDate', 'Salary']]
df_last_sal = df_1.groupby(['Emp_ID']).nth(-1)
df_last_sal.rename(columns={'Salary':'Last_Salary'}, inplace=True)
df = df.merge(df_last_sal, on = 'Emp_ID', how='left')
df = df[['Emp_ID', 'Age', 'Gender', 'City', 'Education_Level', 'Promotion', 'Last_Salary', 'Designation', 'Churn']]

In [287]:
# Rating
df_1 = train[['Emp_ID', 'MMM-YY','Dateofjoining','LastWorkingDate', 'Quarterly Rating']]
df_last_rating = df_1.groupby(['Emp_ID']).nth(-1)
df_second_last_rating = df_1.groupby(['Emp_ID']).nth(-2)
df_last_rating.rename(columns={'Quarterly Rating':'Last_Quarterly_Rating'}, inplace=True)
# df_second_last_rating.rename(columns={'Quarterly Rating':'Second_Last_Quarterly_Rating'}, inplace=True)
# df_second_last_rating = df_second_last_rating['Second_Last_Quarterly_Rating']
df = df.merge(df_last_rating, on = 'Emp_ID', how='left')
#df = df.merge(df_second_last_rating, on = 'Emp_ID', how='left')
df = df[['Emp_ID', 'Age', 'Gender', 'City', 'Education_Level', 'Promotion', 'Designation','Last_Salary', 'Last_Quarterly_Rating','Churn']]

In [288]:
# Tenure
df_last_sal = df_last_sal.merge(df, on='Emp_ID', how='left')
df_last_sal = df_last_sal[['Emp_ID', 'MMM-YY','Dateofjoining', 'LastWorkingDate','Churn']]

df_last_sal['Tenure_months'] = ""
df_last_sal['Tenure_months'][df_last_sal.Churn == 1] = df_last_sal.LastWorkingDate.dt.month - df_last_sal.Dateofjoining.dt.month + 12*(df_last_sal.LastWorkingDate.dt.year - df_last_sal.Dateofjoining.dt.year)
df_last_sal['Tenure_months'][df_last_sal.Churn == 0] = df_last_sal['MMM-YY'].dt.month - df_last_sal.Dateofjoining.dt.month + 12*(df_last_sal['MMM-YY'].dt.year - df_last_sal.Dateofjoining.dt.year)

df['Tenure_months'] = df_last_sal['Tenure_months']
df['Tenure_months'] = df['Tenure_months'].astype(int)

In [289]:
df.dtypes

Emp_ID                    int64
Age                       int64
Gender                   object
City                     object
Education_Level          object
Promotion                 int64
Designation               int64
Last_Salary               int64
Last_Quarterly_Rating     int64
Churn                     int64
Tenure_months             int32
dtype: object

In [290]:
# Average Business Value by tenure
df_2 = train[['Emp_ID', 'Total Business Value']]
df_2 = df_2.groupby(['Emp_ID'])['Total Business Value'].mean().reset_index()
df['Avg_Quarterly_Business'] = df_2['Total Business Value']

In [291]:
train_df = df

In [292]:
train_columns = ['Age', 'Gender','City','Education_Level','Promotion', 'Last_Salary', 'Last_Quarterly_Rating', 'Tenure_months', 'Avg_Quarterly_Business', 'Designation']

In [293]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2381 entries, 0 to 2380
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Emp_ID                  2381 non-null   int64  
 1   Age                     2381 non-null   int64  
 2   Gender                  2381 non-null   object 
 3   City                    2381 non-null   object 
 4   Education_Level         2381 non-null   object 
 5   Promotion               2381 non-null   int64  
 6   Designation             2381 non-null   int64  
 7   Last_Salary             2381 non-null   int64  
 8   Last_Quarterly_Rating   2381 non-null   int64  
 9   Churn                   2381 non-null   int64  
 10  Tenure_months           2381 non-null   int32  
 11  Avg_Quarterly_Business  2381 non-null   float64
dtypes: float64(1), int32(1), int64(7), object(3)
memory usage: 232.5+ KB


### Encodings

In [294]:
train_df.nunique()

Emp_ID                    2381
Age                         35
Gender                       2
City                        29
Education_Level              3
Promotion                    5
Designation                  5
Last_Salary               2339
Last_Quarterly_Rating        4
Churn                        2
Tenure_months               99
Avg_Quarterly_Business    1639
dtype: int64

In [295]:
#One Hot Coding:
#train_df = pd.get_dummies(train_df, columns=['Gender','City','Education_Level'])
train_df = pd.get_dummies(train_df, columns=['Gender','Education_Level','City'])

In [296]:
#Label Encoding
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

label_var = ['Promotion','Designation','Last_Quarterly_Rating']
le = LabelEncoder()
for i in label_var:
    train_df[i] = le.fit_transform(train_df[i])
    
scaler = MinMaxScaler()
scale_var = ['Age','Tenure_months','Avg_Quarterly_Business']
for k in scale_var:
    train_df[[k]] = scaler.fit_transform(train_df[[k]])

In [297]:
train_df[['Age']] = scaler.fit_transform(train_df[['Age']])

In [298]:
target = train_df[['Churn']]

In [299]:
###  Join test with train df
test_df = test.merge(train_df, on='Emp_ID', how='left')

In [300]:
train_df = train_df.drop(['Emp_ID','Churn'],axis=1)

In [301]:
train_df.columns

Index(['Age', 'Promotion', 'Designation', 'Last_Salary',
       'Last_Quarterly_Rating', 'Tenure_months', 'Avg_Quarterly_Business',
       'Gender_Female', 'Gender_Male', 'Education_Level_Bachelor',
       'Education_Level_College', 'Education_Level_Master', 'City_C1',
       'City_C10', 'City_C11', 'City_C12', 'City_C13', 'City_C14', 'City_C15',
       'City_C16', 'City_C17', 'City_C18', 'City_C19', 'City_C2', 'City_C20',
       'City_C21', 'City_C22', 'City_C23', 'City_C24', 'City_C25', 'City_C26',
       'City_C27', 'City_C28', 'City_C29', 'City_C3', 'City_C4', 'City_C5',
       'City_C6', 'City_C7', 'City_C8', 'City_C9'],
      dtype='object')

In [302]:
### Model Building
# Split data into train and test sets as well as for validation and testing
train, test, target_train, target_val = train_test_split(train_df, 
                                                         target, 
                                                         train_size= 0.80,
                                                         random_state=0);


In [303]:
train.shape, test.shape, target_train.shape, target_val.shape

((1904, 41), (477, 41), (1904, 1), (477, 1))

In [304]:
# Logistic Regression
lr = LogisticRegression()
lr.fit(train, target_train)
print("Fitting of Logistic Forest finished")

Fitting of Logistic Forest finished


In [305]:
lr_predictions = lr.predict(test)
print("Predictions finished")

Predictions finished


In [306]:
from sklearn.metrics import (accuracy_score, f1_score, log_loss, classification_report)
print("f1 score: {}".format(f1_score(target_val, lr_predictions)))
print("Accuracy: {}".format(accuracy_score(target_val, lr_predictions)))
print("="*80)
print(classification_report(target_val, lr_predictions))

f1 score: 0.819672131147541
Accuracy: 0.7463312368972747
              precision    recall  f1-score   support

           0       0.70      0.48      0.57       168
           1       0.76      0.89      0.82       309

    accuracy                           0.75       477
   macro avg       0.73      0.69      0.70       477
weighted avg       0.74      0.75      0.73       477



In [307]:
# Random Forest
seed = 123   # We set our random seed to zero for reproducibility
# Random Forest parameters
rf_params = {
    'n_jobs': -1,
    'n_estimators': 1000,
#     'warm_start': True, 
    'max_features': 0.3,
    'max_depth': 4,
    'min_samples_leaf': 2,
    'max_features' : 'sqrt',
    'random_state' : seed,
    'verbose': 0
}

In [308]:
rf = RandomForestClassifier(**rf_params)
rf.fit(train, target_train)
print("Fitting of Random Forest finished")

Fitting of Random Forest finished


In [309]:
rf_predictions = rf.predict(test)
print("Predictions finished")

Predictions finished


In [310]:
print("f1 score: {}".format(f1_score(target_val, rf_predictions)))
print("Accuracy: {}".format(accuracy_score(target_val, rf_predictions)))
print("="*80)
print(classification_report(target_val, rf_predictions))

f1 score: 0.8761061946902655
Accuracy: 0.8238993710691824
              precision    recall  f1-score   support

           0       0.89      0.57      0.70       168
           1       0.80      0.96      0.88       309

    accuracy                           0.82       477
   macro avg       0.85      0.77      0.79       477
weighted avg       0.83      0.82      0.81       477



In [311]:
# Gradient Boosting Params
gb_params ={
    'n_estimators': 1500,
    'max_features': 0.9,
    'learning_rate' : 0.25,
    'max_depth': 4,
    'min_samples_leaf': 2,
    'subsample': 1,
    'max_features' : 'sqrt',
    'random_state' : seed,
    'verbose': 0
}

In [312]:
gb = GradientBoostingClassifier(**gb_params)
gb.fit(train, target_train)
# Get our predictions
gb_predictions = gb.predict(test)
print("Predictions have finished")

Predictions have finished


In [313]:
print("f1 score: {}".format(f1_score(target_val, gb_predictions)))
print("Accuracy: {}".format(accuracy_score(target_val, gb_predictions)))
print("="*80)
print(classification_report(target_val, gb_predictions))

f1 score: 0.9142857142857143
Accuracy: 0.8867924528301887
              precision    recall  f1-score   support

           0       0.87      0.80      0.83       168
           1       0.90      0.93      0.91       309

    accuracy                           0.89       477
   macro avg       0.88      0.87      0.87       477
weighted avg       0.89      0.89      0.89       477



In [314]:
lgb_params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective':'binary',
    'metric': {'auc'},
    'num_leaves': 96,
    'learning_rate': 0.01,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 1,
    'verbose': 1,
    'min_data_in_leaf' : 1,
    'max_bin' : 255,
    'lambda_l1' : 0.00002,
    'lambda_l2' : 0.00001,
    'min_gain_to_split' : 0.001
}

In [318]:
import lightgbm as lgb
seed=123
lgb = lgb.LGBMClassifier()
lgb.fit(train, target_train)
# Get our predictions
lgb_predictions = lgb.predict(test)
print("Predictions have finished")

Predictions have finished


In [None]:
print("f1 score: {}".format(f1_score(target_val, lgb_predictions)))
print("Accuracy: {}".format(accuracy_score(target_val, lgb_predictions)))
print("="*80)
print(classification_report(target_val, lgb_predictions))

In [319]:
## Cat Boosting
seed = 123
cb = CatBoostClassifier(
    iterations=7777,
    learning_rate = 0.03,
    verbose=500,
    eval_metric = 'AUC'
)
cb.fit(train, target_train)
# Get our predictions
cb_predictions = cb.predict(test)
print("Predictions have finished")

0:	total: 10.4ms	remaining: 1m 21s
500:	total: 2.39s	remaining: 34.8s
1000:	total: 5.11s	remaining: 34.6s
1500:	total: 7.83s	remaining: 32.7s
2000:	total: 10.5s	remaining: 30.4s
2500:	total: 13s	remaining: 27.5s
3000:	total: 14.8s	remaining: 23.5s
3500:	total: 17.4s	remaining: 21.3s
4000:	total: 19.9s	remaining: 18.8s
4500:	total: 23.3s	remaining: 17s
5000:	total: 25.4s	remaining: 14.1s
5500:	total: 27.1s	remaining: 11.2s
6000:	total: 29.4s	remaining: 8.71s
6500:	total: 31.7s	remaining: 6.22s
7000:	total: 34s	remaining: 3.77s
7500:	total: 36.3s	remaining: 1.33s
7776:	total: 37.2s	remaining: 0us
Predictions have finished


In [320]:
print("f1 score: {}".format(f1_score(target_val, cb_predictions)))
print("Accuracy: {}".format(accuracy_score(target_val, cb_predictions)))
print("="*80)
print(classification_report(target_val, cb_predictions))

f1 score: 0.9102564102564101
Accuracy: 0.8825995807127882
              precision    recall  f1-score   support

           0       0.85      0.82      0.83       168
           1       0.90      0.92      0.91       309

    accuracy                           0.88       477
   macro avg       0.87      0.87      0.87       477
weighted avg       0.88      0.88      0.88       477



In [321]:
## Xg Boosting
seed = 123
xgb = XGBClassifier(n_estimators=100)
xgb.fit(train, target_train)
# Get our predictions
xgb_predictions = xgb.predict(test)
print("Predictions have finished")

Predictions have finished


In [322]:
print("f1 score: {}".format(f1_score(target_val, xgb_predictions)))
print("Accuracy: {}".format(accuracy_score(target_val, xgb_predictions)))
print("="*80)
print(classification_report(target_val, xgb_predictions))

f1 score: 0.9133858267716535
Accuracy: 0.8846960167714885
              precision    recall  f1-score   support

           0       0.87      0.79      0.83       168
           1       0.89      0.94      0.91       309

    accuracy                           0.88       477
   macro avg       0.88      0.86      0.87       477
weighted avg       0.88      0.88      0.88       477



In [323]:
from sklearn.model_selection import cross_val_score
lgb_scores = cross_val_score(lgb, train, target_train, cv=10, scoring = "f1_macro")
#print("Scores:", scores)
print("Mean lgb:", lgb_scores.mean())

Mean lgb: 0.8849664566565725


In [324]:
importances = pd.DataFrame({
    'Feature': train_df.columns,
    'Importance': xgb.feature_importances_
})
importances = importances.sort_values(by='Importance', ascending=False)
importances = importances.set_index('Feature')
importances

Unnamed: 0_level_0,Importance
Feature,Unnamed: 1_level_1
Last_Quarterly_Rating,0.261768
Tenure_months,0.09826
City_C25,0.065275
City_C1,0.059108
City_C28,0.047081
Avg_Quarterly_Business,0.032768
City_C9,0.03083
Designation,0.021347
City_C10,0.019516
City_C2,0.019137


In [325]:
test_df = test_df.drop(['Emp_ID', 'Churn'],axis=1)

In [326]:
#test_df['Churn'] = lgb.predict(test_df)
test_df['Churn'] = lgb.predict(test_df)

In [327]:
submission['Target'] = test_df['Churn']

In [328]:
test_df.head()

Unnamed: 0,Age,Promotion,Designation,Last_Salary,Last_Quarterly_Rating,Tenure_months,Avg_Quarterly_Business,Gender_Female,Gender_Male,Education_Level_Bachelor,Education_Level_College,Education_Level_Master,City_C1,City_C10,City_C11,City_C12,City_C13,City_C14,City_C15,City_C16,City_C17,City_C18,City_C19,City_C2,City_C20,City_C21,City_C22,City_C23,City_C24,City_C25,City_C26,City_C27,City_C28,City_C29,City_C3,City_C4,City_C5,City_C6,City_C7,City_C8,City_C9,Churn
0,0.297297,2,3,97722,2,0.75,0.321824,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0.432432,2,2,56174,2,0.75,0.245966,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,0.432432,2,3,96750,1,0.75,0.55088,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0.486486,3,3,88813,1,0.701923,0.218666,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1
4,0.243243,3,4,188418,1,0.730769,0.627238,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [329]:
submission.head(20)

Unnamed: 0,Emp_ID,Target
0,394,0
1,173,0
2,1090,0
3,840,1
4,308,0
5,1864,0
6,1606,0
7,954,0
8,2422,0
9,1841,0


In [330]:
submission.to_csv('submission_lgb_4.csv',index=False)

In [None]:
# Undersampling
# oversampler=SMOTE(random_state=0)
# smote_train, smote_target = oversampler.fit_sample(train,target_train)