# Machine Learning for Predictive Analytics Mini Sprint

In [3]:
# Here we import all our libraries
import pandas as pd
from pandas.api import types 

In [1]:
# Set the path to the data and unzip it
data_path = '../data/predictive-analytics-sprint/'
zip_file = 'data.zip'
!unzip -o {data_path + zip_file} -d {data_path}

Archive:  ../data/predictive-analytics-sprint/data.zip
  inflating: ../data/predictive-analytics-sprint/data_dictionary.xlsx  
  inflating: ../data/predictive-analytics-sprint/employee_survey_data.csv  
  inflating: ../data/predictive-analytics-sprint/general_data.csv  
  inflating: ../data/predictive-analytics-sprint/in_time.csv  
  inflating: ../data/predictive-analytics-sprint/manager_survey_data.csv  
  inflating: ../data/predictive-analytics-sprint/out_time.csv  


## Cleaning and Preprocessing Data

In [6]:
# Load the data
general_data = pd.read_csv(data_path + 'general_data.csv')
# We swap the columns 'EmployeeID' and 'Age' to make the data more readable
cols = list(general_data.columns)
a, b = cols.index('EmployeeID'), cols.index('Age')
cols[b], cols[a] = cols[a], cols[b]
general_data = general_data[cols]

employee_survey_data = pd.read_csv(data_path + 'employee_survey_data.csv')
manager_survey_data = pd.read_csv(data_path + 'manager_survey_data.csv')

display(general_data.head() , employee_survey_data.head() , manager_survey_data.head())

Unnamed: 0,EmployeeID,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,Age,Gender,JobLevel,JobRole,MaritalStatus,MonthlyIncome,NumCompaniesWorked,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
0,1,No,Travel_Rarely,Sales,6,2,Life Sciences,1,51,Female,1,Healthcare Representative,Married,131160,1.0,Y,11,8,0,1.0,6,1,0,0
1,2,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,1,31,Female,1,Research Scientist,Single,41890,0.0,Y,23,8,1,6.0,3,5,1,4
2,3,No,Travel_Frequently,Research & Development,17,4,Other,1,32,Male,4,Sales Executive,Married,193280,1.0,Y,15,8,3,5.0,2,5,0,3
3,4,No,Non-Travel,Research & Development,2,5,Life Sciences,1,38,Male,3,Human Resources,Married,83210,3.0,Y,11,8,3,13.0,5,8,7,5
4,5,No,Travel_Rarely,Research & Development,10,1,Medical,1,32,Male,1,Sales Executive,Single,23420,4.0,Y,12,8,2,9.0,2,6,0,4


Unnamed: 0,EmployeeID,EnvironmentSatisfaction,JobSatisfaction,WorkLifeBalance
0,1,3.0,4.0,2.0
1,2,3.0,2.0,4.0
2,3,2.0,2.0,1.0
3,4,4.0,4.0,3.0
4,5,4.0,1.0,3.0


Unnamed: 0,EmployeeID,JobInvolvement,PerformanceRating
0,1,3,3
1,2,2,4
2,3,3,3
3,4,2,3
4,5,3,3


In [7]:
# Here we merge all three data frames into oneon the employee column
data = pd.merge(general_data, employee_survey_data, on='EmployeeID')
data = pd.merge(data, manager_survey_data, on='EmployeeID')
data.head()

Unnamed: 0,EmployeeID,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,Age,Gender,JobLevel,JobRole,MaritalStatus,MonthlyIncome,NumCompaniesWorked,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager,EnvironmentSatisfaction,JobSatisfaction,WorkLifeBalance,JobInvolvement,PerformanceRating
0,1,No,Travel_Rarely,Sales,6,2,Life Sciences,1,51,Female,1,Healthcare Representative,Married,131160,1.0,Y,11,8,0,1.0,6,1,0,0,3.0,4.0,2.0,3,3
1,2,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,1,31,Female,1,Research Scientist,Single,41890,0.0,Y,23,8,1,6.0,3,5,1,4,3.0,2.0,4.0,2,4
2,3,No,Travel_Frequently,Research & Development,17,4,Other,1,32,Male,4,Sales Executive,Married,193280,1.0,Y,15,8,3,5.0,2,5,0,3,2.0,2.0,1.0,3,3
3,4,No,Non-Travel,Research & Development,2,5,Life Sciences,1,38,Male,3,Human Resources,Married,83210,3.0,Y,11,8,3,13.0,5,8,7,5,4.0,4.0,3.0,2,3
4,5,No,Travel_Rarely,Research & Development,10,1,Medical,1,32,Male,1,Sales Executive,Single,23420,4.0,Y,12,8,2,9.0,2,6,0,4,4.0,1.0,3.0,3,3


In [None]:
# We can save the data to a csv file  for later use
data.to_csv(data_path + 'employee_data.csv', index=False)

In [8]:
# Loop through the columns and convert the non-numeric columns to numeric
for column in data.columns:
  # Check if the data type is not an integer or a float
  if not types.is_numeric_dtype(data[column]):
    # One hot encode the column with numerical categories
    data[column] = pd.Categorical(data[column])
    data[column] = data[column].cat.codes
data.head()

Unnamed: 0,EmployeeID,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,Age,Gender,JobLevel,JobRole,MaritalStatus,MonthlyIncome,NumCompaniesWorked,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager,EnvironmentSatisfaction,JobSatisfaction,WorkLifeBalance,JobInvolvement,PerformanceRating
0,1,0,2,2,6,2,1,1,51,0,1,0,1,131160,1.0,0,11,8,0,1.0,6,1,0,0,3.0,4.0,2.0,3,3
1,2,1,1,1,10,1,1,1,31,0,1,6,2,41890,0.0,0,23,8,1,6.0,3,5,1,4,3.0,2.0,4.0,2,4
2,3,0,1,1,17,4,4,1,32,1,4,7,1,193280,1.0,0,15,8,3,5.0,2,5,0,3,2.0,2.0,1.0,3,3
3,4,0,0,1,2,5,1,1,38,1,3,1,1,83210,3.0,0,11,8,3,13.0,5,8,7,5,4.0,4.0,3.0,2,3
4,5,0,2,1,10,1,3,1,32,1,1,7,2,23420,4.0,0,12,8,2,9.0,2,6,0,4,4.0,1.0,3.0,3,3


## Model Setup

## Training and Evaluation