## Import Packages

In [36]:
import numpy as np
import matplotlib.pyplot as pp
import pandas as pd

from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier

## Parse Data

In [39]:
# Parse training and test into data and labels
X_trO = pd.read_csv("trainEmployeeData.csv")
Y_tr = X_trO[['Attrition']]
X_tr = X_trO.drop(['Attrition','Unnamed: 0'], 1)

X_testO = pd.read_csv("testEmployeeData.csv")
Y_test = X_testO[['Attrition', 'Unnamed: 0']]
X_test = X_testO.drop(['Attrition','Unnamed: 0'], 1)

In [40]:
X_tr.head()

Unnamed: 0,Age,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,42,Travel_Rarely,201,Research & Development,1,4,Life Sciences,1,517,2,...,2,80,1,8,5,3,5,2,1,2
1,32,Travel_Rarely,128,Research & Development,2,1,Technical Degree,1,362,4,...,4,80,0,9,5,3,6,2,0,4
2,48,Travel_Rarely,715,Research & Development,1,3,Life Sciences,1,1263,4,...,3,80,0,25,3,4,1,0,0,0
3,51,Travel_Rarely,432,Research & Development,9,4,Life Sciences,1,116,4,...,2,80,2,10,4,3,4,2,0,3
4,29,Travel_Rarely,726,Research & Development,29,1,Life Sciences,1,1859,4,...,4,80,2,11,3,3,7,0,1,6


In [41]:
Y_tr.head()

Unnamed: 0,Attrition
0,No
1,No
2,No
3,No
4,No


## Data Cleaning

In [44]:
# Check for any null values
total_data = pd.concat([X_trO,X_testO]).drop('Unnamed: 0', 1)
X_tr.isnull().any()

Age                         False
BusinessTravel              False
DailyRate                   False
Department                  False
DistanceFromHome            False
Education                   False
EducationField              False
EmployeeCount               False
EmployeeNumber              False
EnvironmentSatisfaction     False
Gender                      False
HourlyRate                  False
JobInvolvement              False
JobLevel                    False
JobRole                     False
JobSatisfaction             False
MaritalStatus               False
MonthlyIncome               False
MonthlyRate                 False
NumCompaniesWorked          False
Over18                      False
OverTime                    False
PercentSalaryHike           False
PerformanceRating           False
RelationshipSatisfaction    False
StandardHours               False
StockOptionLevel            False
TotalWorkingYears           False
TrainingTimesLastYear       False
WorkLifeBalanc

In [45]:
# We see no invalid data based off summary statistics
total_data.describe(include='all')

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
count,1470.0,1470,1470,1470.0,1470,1470.0,1470.0,1470,1470.0,1470.0,...,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0
unique,,2,3,,3,,,6,,,...,,,,,,,,,,
top,,No,Travel_Rarely,,Research & Development,,,Life Sciences,,,...,,,,,,,,,,
freq,,1233,1043,,961,,,606,,,...,,,,,,,,,,
mean,36.92381,,,802.485714,,9.192517,2.912925,,1.0,1024.865306,...,2.712245,80.0,0.793878,11.279592,2.79932,2.761224,7.008163,4.229252,2.187755,4.123129
std,9.135373,,,403.5091,,8.106864,1.024165,,0.0,602.024335,...,1.081209,0.0,0.852077,7.780782,1.289271,0.706476,6.126525,3.623137,3.22243,3.568136
min,18.0,,,102.0,,1.0,1.0,,1.0,1.0,...,1.0,80.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,30.0,,,465.0,,2.0,2.0,,1.0,491.25,...,2.0,80.0,0.0,6.0,2.0,2.0,3.0,2.0,0.0,2.0
50%,36.0,,,802.0,,7.0,3.0,,1.0,1020.5,...,3.0,80.0,1.0,10.0,3.0,3.0,5.0,3.0,1.0,3.0
75%,43.0,,,1157.0,,14.0,4.0,,1.0,1555.75,...,4.0,80.0,1.0,15.0,3.0,3.0,9.0,7.0,3.0,7.0


In [59]:
def transformTrainData(data):
    categorical = []
    for col, value in data.iteritems():
        if value.dtype == 'object':
            categorical.append(col)
    numerical = data.columns.difference(categorical)
    data_cat = data[categorical]
    data_cat = pd.get_dummies(data_cat)
    data_num = data[numerical]
    return pd.concat([data_num, data_cat], axis=1)

def transformTestData(data):
    return data["Attrition"].apply(lambda x: 1 if x == "Yes" else 0)

tran_X_tr = transformTrainData(X_tr)
tran_X_test = transformTrainData(X_test)

tran_Y_tr = transformTestData(Y_tr)
tran_Y_test = transformTestData(Y_test)

tran_X_tr.head()

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,...,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single,Over18_Y,OverTime_No,OverTime_Yes
0,42,201,1,4,1,517,2,95,3,1,...,0,0,0,0,1,0,0,1,1,0
1,32,128,2,1,1,362,4,84,2,2,...,0,0,0,0,0,0,1,1,1,0
2,48,715,1,3,1,1263,4,76,2,5,...,1,0,0,0,0,0,1,1,1,0
3,51,432,9,4,1,116,4,96,3,1,...,0,0,0,0,0,1,0,1,1,0
4,29,726,29,1,1,1859,4,93,1,2,...,0,0,0,0,1,0,0,1,1,0


## Random Forest Model

In [61]:
# 50 trees, 25/34 features
rf_model = RandomForestClassifier(n_estimators=50, max_depth=25, max_features='auto')
rf_model.fit(tran_X_tr, tran_Y_tr)
print("Training accuracy: ", rf_model.score(tran_X_tr, tran_Y_tr))
print("Test accuracy: ", rf_model.score(tran_X_test, tran_Y_test))

Training accuracy:  1.0
Test accuracy:  0.853741496599
