In [1]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn import preprocessing


In [2]:
attrition_df = pd.read_csv("general_data.csv")
attrition_df.head()


Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeID,Gender,...,NumCompaniesWorked,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
0,51,No,Travel_Rarely,Sales,6,2,Life Sciences,1,1,Female,...,1.0,Y,11,8,0,1.0,6,1,0,0
1,31,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,1,2,Female,...,0.0,Y,23,8,1,6.0,3,5,1,4
2,32,No,Travel_Frequently,Research & Development,17,4,Other,1,3,Male,...,1.0,Y,15,8,3,5.0,2,5,0,3
3,38,No,Non-Travel,Research & Development,2,5,Life Sciences,1,4,Male,...,3.0,Y,11,8,3,13.0,5,8,7,5
4,32,No,Travel_Rarely,Research & Development,10,1,Medical,1,5,Male,...,4.0,Y,12,8,2,9.0,2,6,0,4


In [3]:
lc = preprocessing.LabelEncoder()
attrition_df["Attrition"] = lc.fit_transform(attrition_df["Attrition"])
attrition_df["Gender"] = lc.fit_transform(attrition_df["Gender"])
attrition_df["BusinessTravel"] = lc.fit_transform(attrition_df["BusinessTravel"])
attrition_df["Department"] = lc.fit_transform(attrition_df["Department"])
attrition_df["NumCompaniesWorked"] = np.where(attrition_df["NumCompaniesWorked"].isnull(), 2, attrition_df["NumCompaniesWorked"])
attrition_df["TotalWorkingYears"] = np.where(attrition_df["TotalWorkingYears"].isnull(), 2, attrition_df["TotalWorkingYears"])
attrition_df.head()


Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeID,Gender,...,NumCompaniesWorked,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
0,51,0,2,2,6,2,Life Sciences,1,1,0,...,1.0,Y,11,8,0,1.0,6,1,0,0
1,31,1,1,1,10,1,Life Sciences,1,2,0,...,0.0,Y,23,8,1,6.0,3,5,1,4
2,32,0,1,1,17,4,Other,1,3,1,...,1.0,Y,15,8,3,5.0,2,5,0,3
3,38,0,0,1,2,5,Life Sciences,1,4,1,...,3.0,Y,11,8,3,13.0,5,8,7,5
4,32,0,2,1,10,1,Medical,1,5,1,...,4.0,Y,12,8,2,9.0,2,6,0,4


In [4]:
attrition_df['TotalWorkingYears'].mean()


11.260997732426304

In [5]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators = 1000, max_features = 2, oob_score = True)
features = ["Age", "DistanceFromHome", "NumCompaniesWorked", "PercentSalaryHike", "StockOptionLevel", "TotalWorkingYears", "TrainingTimesLastYear", "YearsAtCompany", "YearsSinceLastPromotion", "YearsWithCurrManager"]


In [6]:
rf_model.fit(X = attrition_df[features], y = attrition_df['Attrition'])
print("oob score: ", rf_model.oob_score_)
for features,imp in zip(features, rf_model.feature_importances_):
    print(features, imp)

oob score:  0.999546485260771
Age 0.16701692750461408
DistanceFromHome 0.1345392362833891
NumCompaniesWorked 0.08456685953489979
PercentSalaryHike 0.1223044723612882
StockOptionLevel 0.06025755746162711
TotalWorkingYears 0.1249902926908792
TrainingTimesLastYear 0.07718193807309746
YearsAtCompany 0.09218262180465815
YearsSinceLastPromotion 0.06349828003229271
YearsWithCurrManager 0.07346181425325347


Inference:
Age, Distance from home, PercentSalaryHike and TotalWorkingYears have a high oob score and hence they are the most relevant features to consider

In [7]:
from sklearn.model_selection import train_test_split
x = pd.DataFrame([attrition_df['Age'], attrition_df['DistanceFromHome'], attrition_df['PercentSalaryHike'], attrition_df['TotalWorkingYears']]).T
y = attrition_df['Attrition']
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)


In [8]:
tree_model = tree.DecisionTreeClassifier(max_depth = 8)
tree_model.fit(X_train, y_train)
with open("AttritionTree.dot", "w") as f:
    f = tree.export_graphviz(tree_model, feature_names = ["Age", "DistanceFromHome", "PercentSalaryHike", "TotalWorkingHours"], out_file = f)
tree_model.score(X = X_train, y = y_train)    

0.8761337868480725

In [9]:
test_preds = tree_model.predict(X = X_test)
Predicted_Output = pd.DataFrame({"Attrition": test_preds})
Predicted_Output.to_csv("AttritionPrediction.csv")