In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier 

In [2]:
train_DTR = DecisionTreeRegressor()
train_RFC = RandomForestClassifier(n_estimators = 2, random_state = 1)
train_DTC = DecisionTreeClassifier()

In [3]:
pd.set_option("display.max_rows",None)
pd.set_option("display.max_columns",None)

In [4]:
df = pd.read_csv('HR-Em.csv')

In [5]:
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,MonthlyRate,NumCompaniesWorked,Over18,OverTime,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,2,Female,94,3,2,Sales Executive,4,Single,5993,19479,8,Y,Yes,11,3,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,3,Male,61,2,2,Research Scientist,2,Married,5130,24907,1,Y,No,23,4,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,4,Male,92,2,1,Laboratory Technician,3,Single,2090,2396,6,Y,Yes,15,3,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,4,Female,56,3,1,Research Scientist,3,Married,2909,23159,1,Y,Yes,11,3,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,1,Male,40,3,1,Laboratory Technician,2,Married,3468,16632,9,Y,No,12,3,4,80,1,6,3,3,2,2,2,2


In [6]:
# df.info()

In [7]:
df.isnull().sum()

Age                         0
Attrition                   0
BusinessTravel              0
DailyRate                   0
Department                  0
DistanceFromHome            0
Education                   0
EducationField              0
EmployeeCount               0
EmployeeNumber              0
EnvironmentSatisfaction     0
Gender                      0
HourlyRate                  0
JobInvolvement              0
JobLevel                    0
JobRole                     0
JobSatisfaction             0
MaritalStatus               0
MonthlyIncome               0
MonthlyRate                 0
NumCompaniesWorked          0
Over18                      0
OverTime                    0
PercentSalaryHike           0
PerformanceRating           0
RelationshipSatisfaction    0
StandardHours               0
StockOptionLevel            0
TotalWorkingYears           0
TrainingTimesLastYear       0
WorkLifeBalance             0
YearsAtCompany              0
YearsInCurrentRole          0
YearsSince

In [8]:
enum_jobs = set(df["JobRole"])
indx_jobs = range(len(enum_jobs))
jobs = zip(enum_jobs,indx_jobs)
dict_jobs = dict(jobs)
                  
print(dict_jobs)

{'Healthcare Representative': 0, 'Sales Executive': 1, 'Research Director': 2, 'Human Resources': 3, 'Sales Representative': 4, 'Research Scientist': 5, 'Manufacturing Director': 6, 'Manager': 7, 'Laboratory Technician': 8}


In [9]:
df["IndxJobRole"] = [dict_jobs[role] for role in df['JobRole']]
print(df[['JobRole','IndxJobRole']])

                        JobRole  IndxJobRole
0               Sales Executive            1
1            Research Scientist            5
2         Laboratory Technician            8
3            Research Scientist            5
4         Laboratory Technician            8
5         Laboratory Technician            8
6         Laboratory Technician            8
7         Laboratory Technician            8
8        Manufacturing Director            6
9     Healthcare Representative            0
10        Laboratory Technician            8
11        Laboratory Technician            8
12           Research Scientist            5
13        Laboratory Technician            8
14        Laboratory Technician            8
15       Manufacturing Director            6
16           Research Scientist            5
17        Laboratory Technician            8
18                      Manager            7
19           Research Scientist            5
20       Manufacturing Director            6
21        

In [10]:
df.describe()

Unnamed: 0,Age,DailyRate,DistanceFromHome,Education,EmployeeCount,EmployeeNumber,EnvironmentSatisfaction,HourlyRate,JobInvolvement,JobLevel,JobSatisfaction,MonthlyIncome,MonthlyRate,NumCompaniesWorked,PercentSalaryHike,PerformanceRating,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager,IndxJobRole
count,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0
mean,36.92381,802.485714,9.192517,2.912925,1.0,1024.865306,2.721769,65.891156,2.729932,2.063946,2.728571,6502.931293,14313.103401,2.693197,15.209524,3.153741,2.712245,80.0,0.793878,11.279592,2.79932,2.761224,7.008163,4.229252,2.187755,4.123129,4.142857
std,9.135373,403.5091,8.106864,1.024165,0.0,602.024335,1.093082,20.329428,0.711561,1.10694,1.102846,4707.956783,7117.786044,2.498009,3.659938,0.360824,1.081209,0.0,0.852077,7.780782,1.289271,0.706476,6.126525,3.623137,3.22243,3.568136,2.774237
min,18.0,102.0,1.0,1.0,1.0,1.0,1.0,30.0,1.0,1.0,1.0,1009.0,2094.0,0.0,11.0,3.0,1.0,80.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,30.0,465.0,2.0,2.0,1.0,491.25,2.0,48.0,2.0,1.0,2.0,2911.0,8047.0,1.0,12.0,3.0,2.0,80.0,0.0,6.0,2.0,2.0,3.0,2.0,0.0,2.0,1.0
50%,36.0,802.0,7.0,3.0,1.0,1020.5,3.0,66.0,3.0,2.0,3.0,4919.0,14235.5,2.0,14.0,3.0,3.0,80.0,1.0,10.0,3.0,3.0,5.0,3.0,1.0,3.0,5.0
75%,43.0,1157.0,14.0,4.0,1.0,1555.75,4.0,83.75,3.0,3.0,4.0,8379.0,20461.5,4.0,18.0,3.0,4.0,80.0,1.0,15.0,3.0,3.0,9.0,7.0,3.0,7.0,6.0
max,60.0,1499.0,29.0,5.0,1.0,2068.0,4.0,100.0,4.0,5.0,4.0,19999.0,26999.0,9.0,25.0,4.0,4.0,80.0,3.0,40.0,6.0,4.0,40.0,18.0,15.0,17.0,8.0


In [11]:
df['JobLevel'] = df['JobLevel'].astype('object')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 36 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                

In [12]:
columns = []
for column,column_type in zip(df.columns,df.dtypes):
    if not column_type in ['int64']:
        columns.append(column)
columns.remove('JobRole')
columns.remove('OverTime')
print(columns)

['Attrition', 'BusinessTravel', 'Department', 'EducationField', 'Gender', 'JobLevel', 'MaritalStatus', 'Over18']


In [13]:
xnew = df[columns]
xnew = pd.get_dummies(xnew)
xnew = np.array(xnew.iloc[0, : ]).reshape(1,-1)
ynew = df.iloc[0]['IndxJobRole']
print(xnew)

[[0 1 0 0 1 0 0 1 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 1 1]]


In [14]:
df = df.iloc[1: , :]
x, y = df[columns], df["IndxJobRole"]

In [15]:
x = pd.get_dummies(x)

In [16]:
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size = 0.3, random_state = 1)

In [17]:
train_DTR.fit(train_x,train_y)
train_RFC.fit(train_x,train_y)
train_DTC.fit(train_x,train_y)

DecisionTreeClassifier()

In [18]:
pred_DTR = np.around(train_DTR.predict(test_x))
pred_RFC = np.around(train_RFC.predict(test_x))
pred_DTC = np.around(train_DTC.predict(test_x))

In [19]:
print("MAE of DecisionTreeRegressor = {}".format(mean_absolute_error(test_y, pred_DTR)))
print("MAE of RandomForestClassifier = {}".format(mean_absolute_error(test_y, pred_RFC)))

MAE of DecisionTreeRegressor = 1.564625850340136
MAE of RandomForestClassifier = 1.566893424036281


In [20]:
print("AS of DecisionTreeRegressor {} %".format(accuracy_score(test_y, pred_DTR) *100))
print("AS of RandomForestClassifier {} %".format(accuracy_score(test_y, pred_RFC) *100))
print("AS of DecisionTreeClassifier {} %".format(accuracy_score(test_y, pred_DTC) *100))

AS of DecisionTreeRegressor 43.53741496598639 %
AS of RandomForestClassifier 60.090702947845806 %
AS of DecisionTreeClassifier 59.863945578231295 %


In [21]:
pred_DTR_y = np.around(train_DTR.predict(xnew))
pred_RFC_y = np.around(train_RFC.predict(xnew))
pred_DTC_y = np.around(train_DTC.predict(xnew))



In [22]:
print(pred_DTR_y,pred_RFC_y,pred_DTC_y,ynew)

[1.] [1] [1] 1
