In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, log_loss, classification_report)
from imblearn.over_sampling import SMOTE
import xgboost

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides library deprecation warnings,
# which makes breakage after a dependency upgrade harder to anticipate.
warnings.filterwarnings('ignore')

In [None]:
# Load the HR employee-attrition CSV from the working directory and preview
# its first five rows.
attrition = pd.read_csv(filepath_or_buffer='WA_Fn-UseC_-HR-Employee-Attrition.csv')
attrition.head(n=5)


In [None]:
# One boolean per column: True when that column holds at least one missing value.
display(attrition.isna().any(axis=0))

In [None]:
# Temporary 0/1 copy of the "Attrition" label, added as an extra column so it
# can be used as a numeric field in the exploratory plots.
# An unexpected label raises KeyError rather than producing NaN.
target_map = dict(Yes=1, No=0)
attrition["Attrition_numerical"] = attrition["Attrition"].apply(target_map.__getitem__)

In [None]:
# Columns that take part in the correlation heatmap.
numerical = [
    'Age', 'DailyRate', 'DistanceFromHome',
    'Education', 'EmployeeNumber', 'EnvironmentSatisfaction',
    'HourlyRate', 'JobInvolvement', 'JobLevel', 'JobSatisfaction',
    'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
    'PercentSalaryHike', 'PerformanceRating', 'RelationshipSatisfaction',
    'StockOptionLevel', 'TotalWorkingYears',
    'TrainingTimesLastYear', 'WorkLifeBalance', 'YearsAtCompany',
    'YearsInCurrentRole', 'YearsSinceLastPromotion', 'YearsWithCurrManager',
]

# Pairwise correlations between the selected columns, computed on float copies.
correlation_matrix = attrition.loc[:, numerical].astype('float64').corr()

plt.figure(figsize=(30, 20))
# sns.heatmap returns the Axes it drew on; the name `fig` is kept as-is.
fig = sns.heatmap(data=correlation_matrix, annot=True)
# Blank out both axis labels and move the column tick labels to the top edge.
fig.set_xlabel('')
fig.set_ylabel('')
fig.xaxis.tick_top()

In [None]:
# Subset of columns shown in the pairplot; the temporary 0/1 target is
# included so it can drive the colouring.
numerical = [
    'Age', 'DailyRate', 'JobSatisfaction',
    'MonthlyIncome', 'PerformanceRating',
    'WorkLifeBalance', 'YearsAtCompany', 'Attrition_numerical',
]

# Scatter matrix coloured by attrition, with filled KDE curves on the diagonal.
# `fill=True` replaces the deprecated `shade=True` keyword of seaborn's
# kdeplot, which newer seaborn releases no longer accept.
g = sns.pairplot(
    attrition[numerical],
    hue='Attrition_numerical',
    palette='seismic',
    diag_kind='kde',
    diag_kws=dict(fill=True),
)
# Hide the x tick labels on every panel to reduce clutter.
g.set(xticklabels=[])

In [None]:
# Remove the temporary numeric target that was only needed for plotting.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# is already gone no longer raises KeyError.
attrition = attrition.drop(columns=['Attrition_numerical'], errors='ignore')

# Names of the columns holding categorical (object-dtype) data.
# DataFrame.items() replaces DataFrame.iteritems(), which was removed in pandas 2.0.
categorical = [col for col, value in attrition.items() if value.dtype == 'object']

# Every remaining column is treated as numerical (Index.difference returns
# the names in sorted order).
numerical = attrition.columns.difference(categorical)

In [None]:
# Categorical predictors only: take the object-dtype columns and leave out
# the "Attrition" label itself.
attrition_cat = attrition[categorical].drop(columns=['Attrition'])

In [None]:
# One-hot encode the categorical predictors, then preview three rows.
attrition_cat = pd.get_dummies(data=attrition_cat)
attrition_cat.head(n=3)

In [None]:
# Numerical predictors: all rows, restricted to the columns listed in `numerical`.
attrition_num = attrition.loc[:, numerical]

In [None]:
# Final feature matrix: numerical columns followed by the one-hot encoded
# categorical columns, joined side by side on the row index.
attrition_final = pd.concat(objs=[attrition_num, attrition_cat], axis='columns')

In [None]:
# Encode the attrition label as the 0/1 target vector ('Yes' -> 1, 'No' -> 0).
# A label outside the mapping raises KeyError.
target_map = dict(Yes=1, No=0)
target = attrition["Attrition"].apply(target_map.__getitem__)
target.head(n=3)

In [None]:
# Bar chart of how many rows carry each "Attrition" label.
attrition['Attrition'].value_counts().plot.bar()

In [None]:
# sklearn.cross_validation was removed in scikit-learn 0.20; the same helpers
# now live in sklearn.model_selection.
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

# Hold out 20% of the rows for evaluation. `test` / `target_val` are the
# held-out features and labels; `train` / `target_train` are used for fitting.
# random_state is fixed so the split is reproducible.
train, test, target_train, target_val = train_test_split(
    attrition_final,
    target,
    train_size=0.80,
    random_state=0,
)

In [None]:
# Oversample the training split only, so the held-out rows stay untouched.
# SMOTE.fit_sample was removed from imbalanced-learn; fit_resample is the
# supported name for the same operation.
oversampler = SMOTE(random_state=0)
smote_train, smote_target = oversampler.fit_resample(train, target_train)

In [None]:
seed = 0   # Fixed random seed for reproducibility

# Random Forest hyper-parameters.
# The original literal listed 'max_features' twice (0.3, then 'sqrt'); in a
# dict literal the last value silently wins, so only the effective 'sqrt'
# setting is kept here. The resulting dict is unchanged.
rf_params = {
    'n_jobs': -1,
    'n_estimators': 1000,
    'max_features': 'sqrt',
    'max_depth': 4,
    'min_samples_leaf': 2,
    'random_state': seed,
    'verbose': 0,
}


In [None]:
# Build the (not yet fitted) random forest, passing every entry of rf_params
# as a keyword argument.
rf = RandomForestClassifier(**rf_params)

In [None]:
# Train the forest on the SMOTE-oversampled training data.
rf.fit(X=smote_train, y=smote_target)
print("Fitting of Random Forest finished")

In [None]:
# Predict labels for the held-out rows.
rf_predictions = rf.predict(X=test)
print("Predictions finished")

In [None]:
# Overall accuracy on the held-out rows, a separator line, then the
# per-class classification report.
print("Accuracy score: {}".format(accuracy_score(y_true=target_val, y_pred=rf_predictions)))
print(80 * "=")
print(classification_report(y_true=target_val, y_pred=rf_predictions))

In [None]:
from sklearn.metrics import confusion_matrix

# Count matrix of held-out labels (target_val) against the random forest's
# predictions.
confusion = confusion_matrix(y_true=target_val, y_pred=rf_predictions)

In [None]:
# Show the raw count matrix built from target_val and rf_predictions.
print(confusion)

In [None]:
# First-row ratio of the confusion matrix: entry [0, 0] divided by the sum of
# entries [0, 0] and [0, 1].
print(confusion[0, 0] / (confusion[0, 0] + confusion[0, 1]))

In [None]:
# Pair each feature name with the forest's importance score and order the
# rows from most to least important.
feature_importance_df = (
    pd.DataFrame({'Feature': attrition_final.columns, 'Importance': rf.feature_importances_})
    .sort_values(by='Importance', ascending=False)
)

# Horizontal bar chart, one bar per feature.
plt.figure(figsize=(20, 35))
sns.barplot(data=feature_importance_df, x='Importance', y='Feature', palette="viridis")

plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.title('Random Forest Feature Importance')

plt.show()

In [None]:
from sklearn import tree
from IPython.display import Image as PImage
from subprocess import check_call
from PIL import Image, ImageDraw, ImageFont
import re

# Shallow decision tree fitted on the original (non-oversampled) training split.
# random_state is set so the fitted tree, and therefore the rendered picture,
# is reproducible across runs.
decision_tree = tree.DecisionTreeClassifier(max_depth=4, random_state=seed)
decision_tree.fit(train, target_train)

# Predicting results for test dataset
y_pred = decision_tree.predict(test)

# Export the trained model as a .dot file.
# The original rebound `f` to the return value of export_graphviz inside the
# `with` block; the call is now made for its side effect only.
with open("tree1.dot", 'w') as f:
    tree.export_graphviz(
        decision_tree,
        out_file=f,
        max_depth=4,
        impurity=False,
        feature_names=attrition_final.columns.tolist(),
        class_names=['No', 'Yes'],
        rounded=True,
        filled=True,
    )

# Convert .dot to .png to allow display in web notebook
# (requires the Graphviz `dot` executable on PATH).
check_call(['dot', '-Tpng', 'tree1.dot', '-o', 'tree1.png'])

# Re-save the rendered tree under the file name displayed below. The context
# manager closes the source image handle; the unused ImageDraw object that
# was created here before has been dropped.
with Image.open("tree1.png") as img:
    img.save('sample-out.png')
PImage("sample-out.png", height=2000, width=1900)

In [None]:
# Gradient Boosting hyper-parameters.
# The original literal listed 'max_features' twice (0.9, then 'sqrt'); the
# last value silently wins in a dict literal, so only the effective 'sqrt'
# setting is kept. The resulting dict is unchanged.
gb_params = {
    'n_estimators': 1500,
    'max_features': 'sqrt',
    'learning_rate': 0.25,
    'max_depth': 4,
    'min_samples_leaf': 2,
    'subsample': 1,
    'random_state': seed,
    'verbose': 0,
}

In [None]:
# Build the gradient boosting model and train it on the SMOTE-oversampled
# training data (fit returns the estimator itself).
gb = GradientBoostingClassifier(**gb_params).fit(X=smote_train, y=smote_target)
# Predict labels for the held-out rows.
gb_predictions = gb.predict(X=test)
print("Predictions have finished")

In [None]:
# Held-out accuracy followed by the per-class report for gradient boosting.
print(accuracy_score(y_true=target_val, y_pred=gb_predictions))
print(classification_report(y_true=target_val, y_pred=gb_predictions))

In [None]:
# Pair each feature name with the gradient boosting importance score and
# order the rows from most to least important.
feature_importance_df = (
    pd.DataFrame({'Feature': attrition_final.columns, 'Importance': gb.feature_importances_})
    .sort_values(by='Importance', ascending=False)
)

# Horizontal bar chart, one bar per feature.
plt.figure(figsize=(20, 25))
sns.barplot(data=feature_importance_df, x='Importance', y='Feature', palette="viridis")

plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.title('Gradient Boosting Model Feature Importance')

plt.show()