In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn import tree
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split
%matplotlib inline

In [None]:
# Allow up to 999 columns to be rendered so wide frames are not elided with "...".
pd.set_option("display.max_columns", 999)

In [None]:
# Load the HR attrition dataset; the path is relative, so the CSV must sit in the
# kernel's working directory.
data = pd.read_csv('HR_Employee_Attrition_Data.csv')

In [None]:
# (rows, columns) of the freshly loaded frame.
data.shape

In [None]:
# Preview the first rows of the raw data.
data.head()

In [None]:
# How often each distinct Age value occurs.
data.Age.value_counts()

In [None]:
# Encode the target as integers: 'Yes' -> 1, 'No' -> 0. Any other label becomes NaN.
# NOTE: this overwrites the column, so running the cell a second time maps 1/0 to NaN.
attrition_codes = {'Yes': 1, 'No': 0}
data['Attrition'] = data['Attrition'].map(attrition_codes)

In [None]:
# Class balance of the encoded target (1 = 'Yes', 0 = 'No').
data.Attrition.value_counts()

In [None]:
# Only the last expression of a cell is rendered, so the column index has to be
# printed explicitly; otherwise it is evaluated and silently discarded.
print(data.columns)
# Per-column dtypes; the `object` columns are the ones one-hot encoded further down.
data.dtypes

In [None]:
# Names of all columns stored with the `object` dtype (the string-valued categoricals).
obj_cols = data.select_dtypes(include='object').columns

In [None]:
# One-hot encode every object column and join the indicator columns onto `data`.
# The original object columns stay in `data`; they are dropped in a later cell.
for col_name in obj_cols:
    indicators = pd.get_dummies(data[col_name], prefix=col_name)
    # Drop the last indicator of each variable: it is implied by the others,
    # so keeping it would add a perfectly collinear column.
    data = data.join(indicators.iloc[:, :-1])

In [None]:
# Preview the frame after the indicator columns were joined; the original object
# columns are still present at this point.
data.head()

In [None]:
# Numeric-only modelling frame: remove the object columns that were one-hot encoded.
data_new = data.drop(columns=obj_cols)

In [None]:
# Feature matrix (every column except the target) and the binary target.
x = data_new.drop(columns='Attrition')
y = data_new['Attrition']
# Hold out 20% of the rows for testing.
# - random_state pins the shuffle so Restart & Run All reproduces the same split.
# - stratify=y keeps the Attrition class ratio identical in train and test, which
#   matters because the target is a binary label that may be imbalanced.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

Feature Selection

In [None]:
# Forest used only to rank features by impurity-based importance.
# It is seeded so that the importances, and therefore the set of columns that pass
# the importance threshold below, are the same on every run.
rndf = RandomForestClassifier(n_estimators=150, random_state=42)
rndf.fit(x_train, y_train)


In [None]:
# One row per training column with its forest importance, most important first.
importance = (
    pd.DataFrame({'cols': x_train.columns, 'importance': rndf.feature_importances_})
    .sort_values(by='importance', ascending=False)
)


In [None]:
import seaborn as sns
%matplotlib inline
# Bar chart of the feature importances, in the (descending) order of `importance`.
plt.figure(figsize=(20, 15))
# x and y must be passed by keyword: seaborn >= 0.12 accepts only `data` positionally
# and raises a TypeError for `sns.barplot(x_series, y_series)`.
sns.barplot(data=importance, x='cols', y='importance')
# Feature names are long; rotate them so they do not overlap.
plt.xticks(rotation=90)

In [None]:
# Keep only the features whose importance reaches the cut-off; result is an ndarray of names.
min_importance = 0.005
imp_cols = importance.loc[importance['importance'] >= min_importance, 'cols'].values


In [None]:
# Display the names of the features that passed the importance threshold.
imp_cols

Model Selection

In [None]:
# Candidate models as (display label, unfitted estimator) pairs; the two parallel
# lists `names` and `classifiers` used by the following cells are derived from them.
candidate_models = [
    ("Decision Tree", DecisionTreeClassifier(max_depth=5)),
    ("Random Forest", RandomForestClassifier(n_estimators=100)),
    ("AdaBoost", AdaBoostClassifier(n_estimators=100)),
    ("Logisic", LogisticRegression()),
    ("GBM", GradientBoostingClassifier(n_estimators=100)),
]
names = [label for label, _ in candidate_models]
classifiers = [estimator for _, estimator in candidate_models]

In [None]:
# Fit every candidate on the selected features and print its mean accuracy on the test split.
for model_name, estimator in zip(names, classifiers):
    estimator.fit(x_train[imp_cols], y_train)
    test_accuracy = estimator.score(x_test[imp_cols], y_test)
    print(model_name, test_accuracy)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Per-model confusion matrix and precision/recall report on the test split.
for model_name, estimator in zip(names, classifiers):
    print(model_name)
    pred = estimator.predict(x_test[imp_cols])
    # scikit-learn metrics take (y_true, y_pred) in that order. With the arguments
    # swapped the confusion matrix is transposed and precision/recall are exchanged.
    print(confusion_matrix(y_test, pred))
    print(classification_report(y_test, pred))
    print('*' * 20)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint

Randomized Search CV

In [None]:
# Search space for the randomized search over the random forest.
# Lists are sampled uniformly; `sp_randint(low, high)` draws integers from [low, high).
# `max_features` is intentionally not part of the search.
param_dist = dict(
    max_depth=[5, None],
    n_estimators=sp_randint(50, 150),
    min_samples_split=sp_randint(2, 6),
    min_samples_leaf=sp_randint(1, 6),
    bootstrap=[True, False],
    criterion=["gini", "entropy"],
)

In [None]:
# Randomized search around the random-forest candidate (index 1 of `classifiers`):
# 10 sampled parameter settings, default cross-validation and scoring.
rscv = RandomizedSearchCV(
    estimator=classifiers[1],
    param_distributions=param_dist,
    n_iter=10,
    verbose=1,
)

In [None]:
# Run the randomized hyper-parameter search on the selected features of the training split.
rscv.fit(x_train[imp_cols], y_train)

In [None]:
# Only the last expression of a cell is rendered, so the winning parameters have to be
# printed explicitly; as a bare mid-cell expression they were silently discarded.
print(rscv.best_params_)
# Hard class predictions of the best (refitted) estimator on the test split.
y_test_pred = rscv.predict(x_test[imp_cols])
# Score of the best estimator on the test split (rendered as the cell output).
rscv.score(x_test[imp_cols], y_test)

Grid Search CV

In [None]:
# Exhaustive grid for the random forest: 2 * 3 * 4 * 5 = 120 parameter combinations.
# `max_features` is intentionally not part of the grid.
# NOTE: this rebinds `param_dist`, replacing the randomized-search space defined earlier.
param_dist = dict(
    max_depth=[5, None],
    n_estimators=[50, 100, 150],
    min_samples_split=list(range(2, 6)),
    min_samples_leaf=list(range(2, 7)),
)

In [None]:
# Grid search around the random-forest candidate (index 1 of `classifiers`),
# with default cross-validation and scoring.
rscv_grid = GridSearchCV(
    estimator=classifiers[1],
    param_grid=param_dist,
    verbose=1,
)

In [None]:
# Run the exhaustive grid search on the selected features of the training split.
rscv_grid.fit(x_train[imp_cols], y_train)

In [None]:
# Parameter combination with the best cross-validated score in the grid search.
rscv_grid.best_params_

In [None]:
# The search is built with the default `refit=True`, so `best_estimator_` has already
# been re-trained on the full training split with the winning parameters. Fitting it
# again would only repeat that work and, for an unseeded forest, replace the selected
# model with a differently randomised one.
model = rscv_grid.best_estimator_
# Show the tuned estimator as the cell output.
model

In [None]:
# Mean accuracy of the tuned model on the held-out test split.
model.score(x_test[imp_cols], y_test)

Lift, Gain, and KS Statistics

In [None]:
# Score the test split with the best estimator found by the randomized search (`rscv`):
# per-class probabilities (one column per class) and hard class predictions.
test_features = x_test[imp_cols]
predict_probas = rscv.predict_proba(test_features)
y_test_pred = rscv.predict(test_features)

In [None]:
# Display the raw probability matrix; the next cell stores column 0 as `pred_prob_0`
# and column 1 as `pred_prob_1`.
predict_probas

In [None]:
# Per-row scoring table for the test split. `y_test` is a Series, so the frame
# keeps its index; the prediction arrays are aligned to it by position.
new_df = pd.DataFrame(
    {
        'target': y_test,
        'predict': y_test_pred,
        'pred_prob_0': predict_probas[:, 0],
        'pred_prob_1': predict_probas[:, 1],
    }
)

In [None]:
# Inspect the scoring table (actual label, predicted label, class probabilities).
new_df

In [None]:
# Decile table for the gain / lift / KS analysis.
# Rank the test rows from highest to lowest predicted probability of class 1 and cut
# them into 10 near-equal bins; bin 0 holds the highest-scored rows.
new_df = new_df.sort_values(by='pred_prob_1', ascending=False)
# Look the label column up by NAME. The frame's columns follow dict insertion order
# (target, predict, pred_prob_0, pred_prob_1), so the hard-coded position 3 selected
# `pred_prob_1` and the "1s"/"0s" counts were sums of predicted probabilities instead
# of counts of actual outcomes.
target_idx = new_df.columns.get_loc('target')
splits = np.array_split(new_df.values, 10)
list_of_dict = []
for split in splits:
    cnt = split.shape[0]
    # Actual positives / negatives that fall into this decile.
    pred_target_pos = split[:, target_idx].sum()
    pred_target_neg = cnt - pred_target_pos
    list_of_dict.append({
        'count': cnt,
        '1s': pred_target_pos,
        '0s': pred_target_neg,
        'pos_per': pred_target_pos / cnt,
        'neg_per': pred_target_neg / cnt,
    })
# Build the frame once from the collected records (one row per decile).
metric_data = pd.DataFrame(list_of_dict)
metric_data

In [None]:
# Cumulative gain, lift and KS columns, one row per decile (best-scored decile first).
ones = metric_data['1s']
zeros = metric_data['0s']
total_ones = ones.sum()
total_zeros = zeros.sum()

# Running counts of each class and their share of the class total.
metric_data['0s_cumsum'] = zeros.cumsum()
metric_data['1s_cumsum'] = ones.cumsum()
metric_data['0s_cumsum_per'] = metric_data['0s_cumsum'] / total_zeros
metric_data['1s_cumsum_per'] = metric_data['1s_cumsum'] / total_ones

# Share of the population covered so far, in percent (each bin counts as 10%).
metric_data['cum_pop_%'] = 10 * (metric_data.index + 1)

# Lift in percent, relative to the 10%-per-decile baseline: per decile and cumulative.
metric_data['lift_decile'] = 1000 * ones / total_ones
metric_data['lift_total'] = 10000 * metric_data['1s_cumsum_per'] / metric_data['cum_pop_%']

# KS per decile: gap between the cumulative share of 1s and of 0s.
metric_data['ks'] = metric_data['1s_cumsum_per'] - metric_data['0s_cumsum_per']
metric_data

In [None]:
fig, ax1 = plt.subplots()

# Left axis: the per-decile '1s' column as bars.
ax1.bar(metric_data.index, metric_data['1s'])
ax1.set(xlabel='Deciles', ylabel='1s_count')

# Right axis: cumulative share of 0s (red) and of 1s (green); the vertical gap
# between the two curves is the KS value of each decile.
ax2 = ax1.twinx()
for curve_col, curve_color in (('0s_cumsum_per', 'red'), ('1s_cumsum_per', 'green')):
    ax2.plot(metric_data[curve_col], color=curve_color)
ax2.set_ylabel('ks')