# Notebook Projet - HumanForYou

### Auteurs :
- BRASSEUR **Louis**
- BOUIC **Nathan**
- TANTON **Quentin**
- FRIEDRICH **Kevin**

#### Date : 14/03/2023

In [60]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Paths des différents fichiers
general_data_csv = "./dataset/general_data.csv"
employee_survey_data_csv = "./dataset/employee_survey_data.csv"
manager_survey_data_csv = "./dataset/manager_survey_data.csv"
working_hours_data_csv = "./dataset/working_hours.csv"



# Chargement des données
data = pd.read_csv(general_data_csv, sep=',')
employee_survey_data = pd.read_csv(employee_survey_data_csv, sep=',')
manager_survey_data = pd.read_csv(manager_survey_data_csv, sep=',')
working_hours_data = pd.read_csv(working_hours_data_csv, sep=',')

# Merge des données
data = pd.merge(data, employee_survey_data, on='EmployeeID')
data = pd.merge(data, manager_survey_data, on='EmployeeID')
data = pd.merge(data, working_hours_data, on='EmployeeID')

data

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeID,Gender,...,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager,EnvironmentSatisfaction,JobSatisfaction,WorkLifeBalance,JobInvolvement,PerformanceRating,daily_working_hours
0,51,No,Travel_Rarely,Sales,6,2,Life Sciences,1,1,Female,...,6,1,0,0,3.0,4.0,2.0,3,3,6.55
1,31,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,1,2,Female,...,3,5,1,4,3.0,2.0,4.0,2,4,6.98
2,32,No,Travel_Frequently,Research & Development,17,4,Other,1,3,Male,...,2,5,0,3,2.0,2.0,1.0,3,3,6.50
3,38,No,Non-Travel,Research & Development,2,5,Life Sciences,1,4,Male,...,5,8,7,5,4.0,4.0,3.0,2,3,6.48
4,32,No,Travel_Rarely,Research & Development,10,1,Medical,1,5,Male,...,2,6,0,4,4.0,1.0,3.0,3,3,7.52
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4405,42,No,Travel_Rarely,Research & Development,5,4,Medical,1,4406,Female,...,5,3,0,2,4.0,1.0,3.0,3,3,7.93
4406,29,No,Travel_Rarely,Research & Development,2,4,Medical,1,4407,Male,...,2,3,0,2,4.0,4.0,3.0,2,3,5.63
4407,25,No,Travel_Rarely,Research & Development,25,2,Life Sciences,1,4408,Male,...,4,4,1,2,1.0,3.0,3.0,3,4,6.82
4408,42,No,Travel_Rarely,Sales,18,2,Medical,1,4409,Male,...,2,9,7,8,4.0,1.0,3.0,2,3,8.77


In [61]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4410 entries, 0 to 4409
Data columns (total 30 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Age                      4410 non-null   int64  
 1   Attrition                4410 non-null   object 
 2   BusinessTravel           4410 non-null   object 
 3   Department               4410 non-null   object 
 4   DistanceFromHome         4410 non-null   int64  
 5   Education                4410 non-null   int64  
 6   EducationField           4410 non-null   object 
 7   EmployeeCount            4410 non-null   int64  
 8   EmployeeID               4410 non-null   int64  
 9   Gender                   4410 non-null   object 
 10  JobLevel                 4410 non-null   int64  
 11  JobRole                  4410 non-null   object 
 12  MaritalStatus            4410 non-null   object 
 13  MonthlyIncome            4410 non-null   int64  
 14  NumCompaniesWorked      

In [62]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

# On sépare les données en train et test
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

for train_index, test_index in split.split(data,data['Gender']):
    strat_train_set = data.loc[train_index]
    strat_test_set = data.loc[test_index]

data_train_set = strat_train_set.copy()
data_test_set = strat_test_set.copy()

data = data_train_set 

# On a donc nos variables data_train_set et data_test_set pour le train et le test

In [63]:
# On créer des groupes pour certaines variables catégorielles
from sklearn.impute import SimpleImputer

data['Age'] = pd.cut(data['Age'], [17, 24, 34, 44, 54, 60], labels=['18-24', '25-34', '35-44', '45-54', '55-60'])
data['DistanceFromHome'] = pd.cut(data['DistanceFromHome'], [0,10,20,30], labels=['0-10', '11-20', '21-30'])
data['MonthlyIncome'] = pd.cut(data['MonthlyIncome'], [10000, 50000, 90000, 130000, 170000, 200000], labels=['10000-50000', '50001-90000', '90001-130000', '130001-170000', '170001-200000'])
data['PercentSalaryHike'] = pd.cut(data['PercentSalaryHike'], [10,15,20,25], labels=['10-15', '16-20', '21-26'])

imputer = SimpleImputer(strategy="median")
data['TotalWorkingYears'] = imputer.fit_transform(data[['TotalWorkingYears']])
data['TotalWorkingYears'] = pd.cut(data['TotalWorkingYears'], [-1,10,20,30,40], labels=['0-10', '11-20', '21-30', '31-40'])

data['YearsAtCompany'] = pd.cut(data['YearsAtCompany'], [-1,10,20,30,40], labels=['0-10', '11-20', '21-30', '31-40'])
data['YearsSinceLastPromotion'] = pd.cut(data['YearsSinceLastPromotion'],[-1,2,5,10,15], labels=['0-2', '3-5', '6-10', '11-15'])
data['YearwithCurrManager'] = pd.cut(data['YearsWithCurrManager'],[-1,2,5,10,17], labels=['0-2', '3-5', '6-10', '11-17'])
data['daily_working_hours'] = pd.cut(data['daily_working_hours'],[5,6,7,8,9,11], labels=['5-6', '6-7', '7-8', '8-9', '9-11'])

# On supprime les colonnes inutiles
data = data_train_set.drop(['EmployeeCount','Over18','StandardHours','EmployeeID'], axis=1)

In [64]:


from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
import sklearn
sklearn.set_config(transform_output="pandas")

# Ici on va créer des pipelines pour transformer nos données

# ----- Age pipeline -----
age_pipeline = Pipeline([
        ('ordinal', OrdinalEncoder()),
        ('std_scaler', StandardScaler()),
    ])

# ----- Attrition pipeline -----
attrition_pipeline = Pipeline([
        ('ordinal', OrdinalEncoder()),
        ('std_scaler', StandardScaler()),
    ])

# ----- BusinessTravel pipeline -----
businessTravel_pipeline = Pipeline([
        ('ordinal', OrdinalEncoder()),
        ('std_scaler', StandardScaler()),
    ])

# ----- DailyWorkHours pipeline -----
dailyWorkHours_pipeline = Pipeline([
        ('ordinal', OrdinalEncoder()),
        ('std_scaler', StandardScaler()),
    ])

# ----- Department pipeline -----
department_pipeline = Pipeline([
        ('ordinal', OneHotEncoder(sparse=False))
    ])

# ----- DistanceFromHome pipeline -----
DistanceFromHome_pipeline = Pipeline([
        ('ordinal', OrdinalEncoder()),
        ('std_scaler', StandardScaler()),
    ])

# ----- Education pipeline -----
education_pipeline = Pipeline([
        ('std_scaler', StandardScaler())
    ])

# ----- EducationField pipeline -----
educationField_pipeline = Pipeline([
    ('nominal', OneHotEncoder(sparse=False))
])

# ----- Gender pipeline -----
gender_pipeline = Pipeline([
        ('ordinal', OrdinalEncoder()),
        ('std_scaler', StandardScaler()),
    ])

# ----- JobLevel pipeline -----
jobLevel_pipeline = Pipeline([
        ('std_scaler', StandardScaler())
    ])

# ----- JobRole pipeline -----
jobRole_pipeline = Pipeline([
    ('nominal', OneHotEncoder(sparse=False))
])

# ----- MaritalStatus pipeline -----
maritalStatus_pipeline = Pipeline([
        ('ordinal', OrdinalEncoder()),
        ('std_scaler', StandardScaler()),
    ])

# ----- MonthlyIncome pipeline -----
monthlyIncome_pipeline = Pipeline([
        ('ordinal', OrdinalEncoder()),
        ('std_scaler', StandardScaler()),
    ])

# ----- NumCompaniesWorked pipeline -----
numCompaniesWorked_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler())
    ])

# ----- PercentSalaryHike pipeline -----
percentSalaryHike_pipeline = Pipeline([
        ('ordinal', OrdinalEncoder()),
        ('std_scaler', StandardScaler()),
    ])

# ----- StockOptionLevel pipeline -----
stockOptionLevel_pipeline = Pipeline([
        ('std_scaler', StandardScaler())
    ])

# ----- TotalWorkingYears pipeline -----
totalWorkingYears_pipeline = Pipeline([
        ('ordinal', OrdinalEncoder()),
        ('std_scaler', StandardScaler())
    ])

# ----- TrainingTimesLastYear pipeline -----
trainingTimesLastYear_pipeline = Pipeline([
        ('std_scaler', StandardScaler())
    ])

# ----- YearsAtCompany pipeline -----
yearsAtCompany_pipeline = Pipeline([
        ('ordinal', OrdinalEncoder()),
        ('std_scaler', StandardScaler())
    ])

# ----- YearsSinceLastPromotion pipeline -----
yearsSinceLastPromotion_pipeline = Pipeline([
        ('ordinal', OrdinalEncoder()),
        ('std_scaler', StandardScaler())
    ])

# ----- YearsWithCurrManager pipeline -----
yearsWithCurrManager_pipeline = Pipeline([
        ('ordinal', OrdinalEncoder()),
        ('std_scaler', StandardScaler())
    ])

# ----- JobInvolvement pipeline -----
jobInvolvement_pipeline = Pipeline([
        ('std_scaler', StandardScaler())
    ])

# ----- PerformanceRating pipeline -----
performanceRating_pipeline = Pipeline([
        ('std_scaler', StandardScaler())
    ])

# ----- EnvironmentSatisfaction pipeline -----
environmentSatisfaction_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler())
    ])

# ----- JobSatisfaction pipeline -----
jobSatisfaction_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler())
    ])

# ----- WorkLifeBalance pipeline -----
workLifeBalance_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler())
    ])

full_pipeline = ColumnTransformer(transformers=[
        ("Age", age_pipeline, ['Age']),
        ("Attrition", attrition_pipeline, ['Attrition']),
        ("BusinessTravel", businessTravel_pipeline, ['BusinessTravel']),
        ("Department", department_pipeline, ['Department']),
        ("DistanceFromHome", DistanceFromHome_pipeline, ['DistanceFromHome']),
        ("Education", education_pipeline, ['Education']),
        ("EducationField", educationField_pipeline, ['EducationField']),
        ("Gender", gender_pipeline, ['Gender']),
        ("JobLevel", jobLevel_pipeline, ['JobLevel']),
        ("JobRole", jobRole_pipeline, ['JobRole']),
        ("MaritalStatus", maritalStatus_pipeline, ['MaritalStatus']),
        ("MonthlyIncome", monthlyIncome_pipeline, ['MonthlyIncome']),
        ("NumCompaniesWorked", numCompaniesWorked_pipeline, ['NumCompaniesWorked']),
        ("PercentSalaryHike", percentSalaryHike_pipeline, ['PercentSalaryHike']),
        ("StockOptionLevel", stockOptionLevel_pipeline, ['StockOptionLevel']),
        ("TotalWorkingYears", totalWorkingYears_pipeline, ['TotalWorkingYears']),
        ("TrainingTimesLastYear", trainingTimesLastYear_pipeline, ['TrainingTimesLastYear']),
        ("YearsAtCompany", yearsAtCompany_pipeline, ['YearsAtCompany']),
        ("YearsSinceLastPromotion", yearsSinceLastPromotion_pipeline, ['YearsSinceLastPromotion']),
        ("YearsWithCurrManager", yearsWithCurrManager_pipeline, ['YearsWithCurrManager']),
        ("JobInvolvement", jobInvolvement_pipeline, ['JobInvolvement']),
        ("PerformanceRating", performanceRating_pipeline, ['PerformanceRating']),
        ("EnvironmentSatisfaction", environmentSatisfaction_pipeline, ['EnvironmentSatisfaction']),
        ("JobSatisfaction", jobSatisfaction_pipeline, ['JobSatisfaction']),
        ("WorkLifeBalance", workLifeBalance_pipeline, ['WorkLifeBalance']),
        ("DailyWorkHours", dailyWorkHours_pipeline, ['daily_working_hours'])
    ], verbose_feature_names_out=False)

# ----- Pipeline Global -----
data_prepared = full_pipeline.fit_transform(data)

data_prepared.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 3528 entries, 171 to 465
Data columns (total 41 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Age                                3528 non-null   float64
 1   Attrition                          3528 non-null   float64
 2   BusinessTravel                     3528 non-null   float64
 3   Department_Human Resources         3528 non-null   float64
 4   Department_Research & Development  3528 non-null   float64
 5   Department_Sales                   3528 non-null   float64
 6   DistanceFromHome                   3528 non-null   float64
 7   Education                          3528 non-null   float64
 8   EducationField_Human Resources     3528 non-null   float64
 9   EducationField_Life Sciences       3528 non-null   float64
 10  EducationField_Marketing           3528 non-null   float64
 11  EducationField_Medical             3528 non-null   floa



In [65]:
# full_pipeline.named_transformers_['Age'].named_steps['ordinal'].categories_

In [66]:
data_prepared = pd.DataFrame(data_prepared)
data_prepared

Unnamed: 0,Age,Attrition,BusinessTravel,Department_Human Resources,Department_Research & Development,Department_Sales,DistanceFromHome,Education,EducationField_Human Resources,EducationField_Life Sciences,...,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager,JobInvolvement,PerformanceRating,EnvironmentSatisfaction,JobSatisfaction,WorkLifeBalance,daily_working_hours
171,1.316253,-0.446757,0.592024,0.0,0.0,1.0,-0.607734,-1.853640,0.0,0.0,...,0.154556,-0.401755,-0.544679,-0.868644,-1.024356,2.352304,-1.574961,0.246081,-2.499207,-0.356909
2430,0.270130,-0.446757,0.592024,0.0,0.0,1.0,-0.607734,1.058476,0.0,0.0,...,0.154556,-0.401755,2.352247,0.813802,0.380748,-0.425115,1.171684,0.246081,0.340015,-0.356909
3006,-0.775993,-0.446757,0.592024,0.0,1.0,0.0,0.764920,0.087771,0.0,1.0,...,0.154556,-0.401755,-0.544679,1.374617,0.380748,-0.425115,1.171684,1.156115,0.340015,-1.149840
2187,2.362376,2.238352,0.592024,0.0,1.0,0.0,0.764920,0.087771,0.0,0.0,...,-2.155928,1.427139,1.386605,1.094210,1.785852,-0.425115,-1.574961,0.246081,-1.079596,1.228954
2186,-0.775993,-0.446757,-0.916035,0.0,1.0,0.0,2.137574,1.058476,0.0,0.0,...,0.154556,-0.401755,-0.544679,-1.149051,1.785852,-0.425115,1.171684,1.156115,0.340015,-0.356909
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2262,0.270130,-0.446757,-0.916035,0.0,1.0,0.0,-0.607734,1.058476,0.0,0.0,...,-0.615605,1.427139,1.386605,1.094210,0.380748,2.352304,-1.574961,1.156115,0.340015,-0.356909
4370,1.316253,-0.446757,-2.424094,0.0,1.0,0.0,-0.607734,0.087771,0.0,0.0,...,-0.615605,1.427139,0.420963,1.094210,0.380748,-0.425115,1.171684,1.156115,0.340015,-1.149840
4182,-0.775993,-0.446757,0.592024,0.0,0.0,1.0,-0.607734,0.087771,0.0,0.0,...,0.154556,-0.401755,-0.544679,-1.149051,0.380748,-0.425115,0.256136,1.156115,0.340015,-1.149840
4274,2.362376,2.238352,0.592024,0.0,1.0,0.0,0.764920,0.087771,0.0,1.0,...,-1.385767,-0.401755,1.386605,-0.307828,-2.429459,-0.425115,-0.659413,-0.663954,0.340015,1.228954
