In [1]:
import pandas as pd
import numpy as np
import warnings
# Suppress every Python warning for the rest of the session so the red warning
# boxes do not appear in the cell outputs.
# NOTE(review): no category or module is given, so this also hides deprecation
# and future-behaviour warnings raised by the libraries used below — consider
# narrowing the filter.
warnings.filterwarnings("ignore")
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,MinMaxScaler
#--------
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Read the two input tables. EmployeeNumber is left as an ordinary column in
# both frames (no index_col) because it is used as the merge key further down.
custom_features = pd.read_csv(
    '../Data/Custom_Features.csv',
)
df = pd.read_csv(
    '../Data/CleanAttrition.csv',
)

In [3]:
# Preview the first rows of the attrition table as loaded from CSV.
df.head()

Unnamed: 0,EmployeeNumber,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EnvironmentSatisfaction,...,PerformanceRating,RelationshipSatisfaction,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,1,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,2,...,3,1,0,8,0,1,6,4,0,5
1,2,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,3,...,4,4,1,10,3,3,10,7,1,7
2,4,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,4,...,3,2,0,7,3,3,0,0,0,0
3,5,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,4,...,3,3,0,8,3,3,8,7,3,0
4,7,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,...,3,4,1,6,3,3,2,2,2,2


In [4]:
# Preview the first rows of the engineered-feature table (keyed by EmployeeNumber).
custom_features.head()

Unnamed: 0,EmployeeNumber,FieldVsDept,MonthlyHours,DaysWorked,CompanyRatiotoCareer,RoleRatioToCompany,RoleRatioToCareer,AgeBin,HourlyRateBin,MonthlyRateBin,MonthlyIncomeBin,DailyRateBin,DaysWorkedBin,MonthlyHoursBin,CompanyRatiotoCareerBin,RoleRatioToCompanyBin,RoleRatioToCareerBin
0,1,Life Sciences - Sales,63.755319,5.438294,0.75,0.666667,0.5,40,90,19000,5500,1100,5.0,60.0,0.7,0.65,0.45
1,2,Life Sciences - Research & Development,84.098361,18.387097,1.0,0.7,0.7,40,60,24500,5000,200,15.0,80.0,0.95,0.65,0.65
2,4,Other - Research & Development,22.717391,1.522214,0.0,1.0,0.0,30,90,2000,2000,1300,0.0,20.0,0.0,1.0,0.0
3,5,Life Sciences - Research & Development,51.946429,2.089799,1.0,0.875,0.875,30,50,23000,2500,1300,0.0,40.0,0.95,0.85,0.85
4,7,Medical - Research & Development,86.7,5.86802,0.333333,1.0,0.333333,20,40,16500,3000,500,5.0,80.0,0.3,0.95,0.3


In [5]:
# BusinessTravel is quantified as an ordinal scale of how much travel occurs
# (0 = none, 1 = rarely, 2 = frequently) instead of being dummy / one-hot encoded.
#
# The encoded values are assigned back to the column. Calling
# `df['BusinessTravel'].replace(..., inplace=True)` mutates a selected column
# object rather than `df`; with pandas Copy-on-Write that chained in-place form
# leaves `df` unchanged, and the accompanying warning would be hidden by the
# global warnings filter set in the first cell.
travel_scale = {'Non-Travel': 0, 'Travel_Rarely': 1, 'Travel_Frequently': 2}
df['BusinessTravel'] = (
    df['BusinessTravel']
    .replace(travel_scale)
    # Make the numeric dtype explicit: newer pandas no longer downcasts the
    # result of replace(), and an object column would be one-hot encoded later.
    .infer_objects()
)

In [6]:
# Join the engineered features onto the attrition data by EmployeeNumber and
# use that key as the row index of the merged frame.
merged_df = df.merge(custom_features, on='EmployeeNumber').set_index('EmployeeNumber')

# Columns whose names end in 'Bin'; they are removed again after encoding.
bins = [col for col in merged_df.columns if col.endswith('Bin')]

# Split the column names by dtype: object (non-numeric) vs. everything else.
object_cols = [col for col in merged_df.columns if merged_df[col].dtype == 'O']
num_cols = [col for col in merged_df.columns if merged_df[col].dtype != 'O']

# Quantify the object-type columns with one-hot encoding, dropping the first
# level of each. `columns=` is passed by keyword on purpose: the second
# positional parameter of pd.get_dummies is `prefix`, so passing `object_cols`
# positionally only supplied prefixes and relied on the default column
# selection happening to pick the same columns.
# The 'Bin' columns are then dropped, followed by any row with a missing value.
hot_coded_df = (
    pd.get_dummies(merged_df, columns=object_cols, drop_first=True)
    .drop(columns=bins)
    .dropna()
)

In [7]:
# Target is the 'Attrition_Yes' dummy column; every other column is a predictor.
y = hot_coded_df['Attrition_Yes']
X = hot_coded_df.drop(columns=['Attrition_Yes'])

# Hold out 30% of the rows; the split is seeded and stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=20,
    stratify=y,
)

In [8]:
# Standardise the features: the scaler is fitted on the training split only and
# then applied to both splits. The transformed arrays are wrapped back into
# DataFrames that reuse the original column names (the row index restarts at 0).
# Reference: https://scikit-learn.org/stable/modules/preprocessing.html
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = pd.DataFrame(
    scaler.transform(X_train),
    columns=X_train.columns,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
)

In [9]:
# Write the scaled feature splits and their matching targets to CSV, without
# the index column. Files are written in the order listed.
split_outputs = {
    "../Data/TrainTest/X_train.csv": X_train_scaled,
    "../Data/TrainTest/X_test.csv": X_test_scaled,
    "../Data/TrainTest/y_train.csv": y_train,
    "../Data/TrainTest/y_test.csv": y_test,
}
for csv_path, split_data in split_outputs.items():
    split_data.to_csv(csv_path, index=False)

In [10]:
# Also write the complete (unsplit, unscaled) features and target to CSV,
# again without the index column.
full_outputs = (
    ("../Data/TrainTest/X_full.csv", X),
    ("../Data/TrainTest/y_full.csv", y),
)
for csv_path, full_data in full_outputs:
    full_data.to_csv(csv_path, index=False)

In [11]:
# Spot-check the standardised training features; the one-hot dummy columns are
# scaled along with the numeric ones.
X_train_scaled.head()

Unnamed: 0,Age,BusinessTravel,DailyRate,DistanceFromHome,Education,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,...,FieldVsDept_Marketing - Sales,FieldVsDept_Medical - Human Resources,FieldVsDept_Medical - Research & Development,FieldVsDept_Medical - Sales,FieldVsDept_Other - Human Resources,FieldVsDept_Other - Research & Development,FieldVsDept_Other - Sales,FieldVsDept_Technical Degree - Human Resources,FieldVsDept_Technical Degree - Research & Development,FieldVsDept_Technical Degree - Sales
0,0.438572,-0.146,-0.389625,-1.022327,0.094108,1.165648,0.500516,0.366493,-0.96737,1.149088,...,-0.344207,-0.10395,-0.567998,-0.259668,-0.04413,-0.208831,-0.108625,-0.06247,-0.261793,-0.164153
1,2.410706,-0.146,-0.205904,-0.898244,0.094108,0.251432,0.161425,-1.061996,1.762311,1.149088,...,-0.344207,-0.10395,-0.567998,-0.259668,-0.04413,-0.208831,-0.108625,-0.06247,-0.261793,-0.164153
2,-1.314437,-2.023932,-0.306573,-0.525995,-0.884046,-0.662785,0.936491,1.794982,-0.057476,-1.571184,...,-0.344207,-0.10395,-0.567998,-0.259668,-0.04413,-0.208831,-0.108625,-0.06247,-0.261793,-0.164153
3,0.986387,-0.146,-0.228554,-0.898244,-0.884046,1.165648,0.791166,0.366493,0.852417,1.149088,...,-0.344207,-0.10395,-0.567998,-0.259668,-0.04413,-0.208831,-0.108625,-0.06247,-0.261793,-0.164153
4,-0.218807,-0.146,-1.242793,2.203826,0.094108,0.251432,0.88805,0.366493,-0.057476,1.149088,...,-0.344207,-0.10395,-0.567998,-0.259668,-0.04413,-0.208831,-0.108625,-0.06247,-0.261793,-0.164153
