# Machine Learning

There is a problem of classification where what I am looking for is to identifying whether the employee is living the company or not.

### Import libraries

In [5]:
import pandas as pd
import numpy as np

### Load Data

In [6]:
df_cleaned=pd.read_csv('data/HR_data_cleaned.csv')

In [7]:
df_cleaned.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.8,0.86,5,262,6,0,1,0,sales,medium
2,0.11,0.88,7,272,4,0,1,0,sales,medium
3,0.72,0.87,5,223,5,0,1,0,sales,low
4,0.37,0.52,2,159,3,0,1,0,sales,low


### Data quality issues 

In [9]:
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11991 entries, 0 to 11990
Data columns (total 10 columns):
satisfaction_level       11991 non-null float64
last_evaluation          11991 non-null float64
number_project           11991 non-null int64
average_montly_hours     11991 non-null int64
time_spend_company       11991 non-null int64
Work_accident            11991 non-null int64
left                     11991 non-null int64
promotion_last_5years    11991 non-null int64
department               11991 non-null object
salary                   11991 non-null object
dtypes: float64(2), int64(6), object(2)
memory usage: 936.9+ KB


After a quick review of the dataset I realise some issues such that:

* To deal better with ML algorithms and statistics the features must have a NUMERICAL value. Therefore ,some **features (sales and salary)** will be transformed to numerical.


- **left** : as "target" column, it should occupy the last position in the dataframe. It will be moved to there.

**1. Convert to numerical** : "salary" and "sales"

In [11]:
df_cleaned['salary'].unique()

array(['low', 'medium', 'high'], dtype=object)

In [13]:
#Dummy Coding
hr_salary = pd.get_dummies(df_cleaned['salary'])
hr_salary.sample(10)

Unnamed: 0,high,low,medium
4581,1,0,0
6462,0,1,0
9847,0,0,1
4441,0,1,0
5316,1,0,0
8632,0,1,0
6772,0,1,0
2519,0,0,1
8957,0,0,1
7621,0,1,0


In [14]:
#join
hr2=pd.merge(df_cleaned,hr_salary,left_index=True, right_index=True)
hr2.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,department,salary,high,low,medium
0,0.38,0.53,2,157,3,0,1,0,sales,low,0,1,0
1,0.8,0.86,5,262,6,0,1,0,sales,medium,0,0,1
2,0.11,0.88,7,272,4,0,1,0,sales,medium,0,0,1
3,0.72,0.87,5,223,5,0,1,0,sales,low,0,1,0
4,0.37,0.52,2,159,3,0,1,0,sales,low,0,1,0


In [20]:
#Now it is the turn of "department" 
hr2['department'].unique()

array(['sales', 'accounting', 'hr', 'technical', 'support', 'management',
       'IT', 'product_mng', 'marketing', 'RandD'], dtype=object)

In [21]:
hr_department= hr2['department'].replace(['sales', 'accounting', 'hr', 'technical', 'support', 'management','IT', 'product_mng', 'marketing', 'RandD'], 
                                  [0,1,2,3,4,5,6,7,8,9],#new values
                                 inplace = False) #it will be modified in place
hr_department

0        0
1        0
2        0
3        0
4        0
5        0
6        0
7        0
8        0
9        0
10       0
11       0
12       0
13       0
14       0
15       0
16       0
17       0
18       0
19       0
20       0
21       0
22       0
23       0
24       0
25       0
26       0
27       0
28       1
29       1
        ..
11961    8
11962    1
11963    1
11964    6
11965    6
11966    5
11967    8
11968    8
11969    8
11970    0
11971    0
11972    0
11973    0
11974    0
11975    0
11976    0
11977    6
11978    7
11979    7
11980    7
11981    7
11982    6
11983    6
11984    5
11985    5
11986    5
11987    5
11988    5
11989    8
11990    6
Name: department, Length: 11991, dtype: int64

In [28]:
#finally, we include this values into the dataFrame as "department_numeric" columnn
hr2['department_numeric']=hr_department
hr2.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,department,salary,high,low,medium,department_numeric
0,0.38,0.53,2,157,3,0,1,0,sales,low,0,1,0,0
1,0.8,0.86,5,262,6,0,1,0,sales,medium,0,0,1,0
2,0.11,0.88,7,272,4,0,1,0,sales,medium,0,0,1,0
3,0.72,0.87,5,223,5,0,1,0,sales,low,0,1,0,0
4,0.37,0.52,2,159,3,0,1,0,sales,low,0,1,0,0


In [29]:
#Now, I can remove "sales" and "salary" columns from DataFrame.Therefore it will include only numeric features
hr_numeric=hr2.drop(columns=['department','salary'])
hr_numeric.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,high,low,medium,department_numeric
0,0.38,0.53,2,157,3,0,1,0,0,1,0,0
1,0.8,0.86,5,262,6,0,1,0,0,0,1,0
2,0.11,0.88,7,272,4,0,1,0,0,0,1,0
3,0.72,0.87,5,223,5,0,1,0,0,1,0,0
4,0.37,0.52,2,159,3,0,1,0,0,1,0,0


**2.Change the order of columns**: "left" column as "target" will be moved to the final of the data frame

In [30]:
cols = hr_numeric.columns.tolist()
cols

['satisfaction_level',
 'last_evaluation',
 'number_project',
 'average_montly_hours',
 'time_spend_company',
 'Work_accident',
 'left',
 'promotion_last_5years',
 'high',
 'low',
 'medium',
 'department_numeric']

In [31]:
#Now "left" column will be the last one
cols_new = cols[:6]+cols[7:]+cols[6:7]
cols_new

['satisfaction_level',
 'last_evaluation',
 'number_project',
 'average_montly_hours',
 'time_spend_company',
 'Work_accident',
 'promotion_last_5years',
 'high',
 'low',
 'medium',
 'department_numeric',
 'left']

In [32]:
#finally, it's included in the new dataframe called: hr_data
hr_data= hr_numeric[cols_new]
hr_data.head()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,high,low,medium,department_numeric,left
0,0.38,0.53,2,157,3,0,0,0,1,0,0,1
1,0.8,0.86,5,262,6,0,0,0,0,1,0,1
2,0.11,0.88,7,272,4,0,0,0,0,1,0,1
3,0.72,0.87,5,223,5,0,0,0,1,0,0,1
4,0.37,0.52,2,159,3,0,0,0,1,0,0,1


In [33]:
#check values' type: 
hr_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11991 entries, 0 to 11990
Data columns (total 12 columns):
satisfaction_level       11991 non-null float64
last_evaluation          11991 non-null float64
number_project           11991 non-null int64
average_montly_hours     11991 non-null int64
time_spend_company       11991 non-null int64
Work_accident            11991 non-null int64
promotion_last_5years    11991 non-null int64
high                     11991 non-null uint8
low                      11991 non-null uint8
medium                   11991 non-null uint8
department_numeric       11991 non-null int64
left                     11991 non-null int64
dtypes: float64(2), int64(7), uint8(3)
memory usage: 878.3 KB


### Saving cleaned dataset

In [None]:
hr_data.to_csv('data/HR_data_numeric.csv',index=False)

### Load Data

In [39]:
df_numeric=pd.read_csv('data/HR_data_numeric.csv')

In [40]:
df_numeric.shape

(11991, 12)

In [41]:
df_numeric.columns

Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident',
       'promotion_last_5years', 'high', 'low', 'medium', 'department_numeric',
       'left'],
      dtype='object')

In [42]:
x=df_numeric[['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident',
       'promotion_last_5years', 'high', 'low', 'medium', 'department_numeric']] #X will be the features
y=df_numeric['left'] #left column will be the target one (1=left, 0 =stay)

In [43]:
#Split the dataset into two parts: train and test
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

In [44]:
#checking we have the desired 70% train, 30% split of the data
print("{0:0.2f}% in training set".format((len(x_train)/len(df_numeric.index)) * 100))
print("{0:0.2f}% in test set".format((len(x_test)/len(df_numeric.index)) * 100))

66.99% in training set
33.01% in test set


In [45]:
### Preprocesado de variables

## Choosing the model

### Looking for the best accuracy, I will analyse some prediction models 

### Let's start with _K-nearest Neighbors_

In [46]:
from sklearn.neighbors import KNeighborsClassifier
clf1 = KNeighborsClassifier(n_neighbors=10)

In [47]:
clf1.fit(x_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=10, p=2,
           weights='uniform')

In [48]:
from sklearn.metrics import accuracy_score

In [49]:
accuracy_score(y_test,clf1.predict(x_test))

0.9295098534613441

In [50]:
from sklearn.model_selection import GridSearchCV

In [51]:
clf1=GridSearchCV(estimator =KNeighborsClassifier(n_neighbors=5),
                  param_grid={"n_neighbors":np.arange(1,50)})

clf1.fit(x,y) #training the model with the best_params
clf1.best_params_

{'n_neighbors': 4}

In [52]:
from sklearn.model_selection import cross_val_score

In [53]:
cross_val_score(KNeighborsClassifier(n_neighbors=4),x,y).mean()

0.9298633941450096