# Observations

- No missing values (good)
- There are features with slight colleration w.r.t. the Attrition (they are also explainable)
- There are three features immediately to be dropped because of 1 distinct value
- We are left with 19 categorical and 13 numerical features
- ID type of feature is also excluded (numerical features down to 12)
- Number of numerical features gone up to 23, nominals down to 7, because of instructions

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler

from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier as KNC
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.tree import ExtraTreeClassifier as ETC
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression as LRC
from sklearn.ensemble import GradientBoostingClassifier as GBC
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier as RFC
from xgboost import XGBRFClassifier
from sklearn.ensemble import AdaBoostClassifier as ABC
from sklearn.neural_network import MLPClassifier as MLP

import sweetviz as sv

from utils.evaluation import *
from utils.dataprep import *

In [4]:
df = pd.read_csv("data/use_case_employee-attrition.csv")
df = df.drop(["EmployeeCount", "Over18", "StandardHours", "EmployeeNumber"], axis=1)

nominals = ["BusinessTravel", "Department", "EducationField", "Gender", "JobRole", "MaritalStatus", "OverTime"]
numericals = ["Age", "DailyRate", "DistanceFromHome", "Education", "EnvironmentSatisfaction", "HourlyRate", "JobInvolvement", "JobLevel", "JobSatisfaction", "MonthlyIncome", "MonthlyRate", "NumCompaniesWorked",
              "PercentSalaryHike", "PerformanceRating", "RelationshipSatisfaction", "StockOptionLevel", "TotalWorkingYears", "TrainingTimesLastYear", "WorkLifeBalance", "YearsAtCompany", "YearsInCurrentRole", 
              "YearsSinceLastPromotion", "YearsWithCurrManager"]

df_t = onehot_columns(df, nominals)
df_n = merge_onehotted(df, df_t, nominals)

X = df_n.drop("Attrition", axis=1)
y = df_n["Attrition"]
y = LabelEncoder().fit_transform(y)

scaler = MinMaxScaler()

pca = PCA(n_components=15)
resampler = RandomUnderSampler(random_state=42)
model = MLP(hidden_layer_sizes=(100, 10), max_iter=5000)

column_transformer = ColumnTransformer(
    transformers=[
        ("pca", pca, df_t.columns),
        ("passthrough", "passthrough", numericals)
    ])

pipeline = Pipeline(steps=[("column_transformer", column_transformer), ("classifier", model)])
pipeline_s = Pipeline(steps=[("column_transformer", column_transformer), ("scaler", scaler), ("classifier", model)])

steps = [
    ("undersample", resampler),
    ("pca", pca),
    ("classifier", model)
]

ipipeline = ImbPipeline(steps=steps)

steps = [
    ("undersample", resampler),
    ("scaler", scaler),
    ("pca", pca),
    ("classifier", model)
]

ipipeline_s = ImbPipeline(steps=steps)

steps = [
    ("undersample", resampler),
    ("scaler", scaler),
    ("pca", pca),
    ("classifier", model)
]

ipipeline_s_f = ImbPipeline(steps=steps)

print("Model: ")
compute_acc_rec_prec_f1_with_cv(model, X, y, cv_n=10)
print("Model with PCA:")
compute_acc_rec_prec_f1_with_cv(pipeline, X, y, cv_n=10)
print("Model with PCA and undersampling:")
compute_acc_rec_prec_f1_with_cv(ipipeline, X, y, cv_n=10)
print("Model with PCA and scaling:")
compute_acc_rec_prec_f1_with_cv(pipeline_s, X, y, cv_n=10)
print("Model with PCA, scaling and undersampling:")
compute_acc_rec_prec_f1_with_cv(ipipeline_s, X, y, cv_n=10)

Model: 
Accuracy score: 81.36054421768706 +/- 4.239574544457928
Recall score: 5.579710144927537 +/- 11.50663260875477


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Precision score: 8.710907704042716 +/- 11.492060309459307
F1 score: 12.819837160810794 +/- 14.146490752790836
Model with PCA:
Accuracy score: 82.5170068027211 +/- 2.2572326561955203
Recall score: 28.786231884057965 +/- 37.12905962137999


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Precision score: 23.890380410595768 +/- 29.7974978703201
F1 score: 12.007619414377864 +/- 12.44538271031943
Model with PCA and undersampling:
Accuracy score: 55.646258503401356 +/- 10.955549463508788
Recall score: 70.5072463768116 +/- 19.974877889285615
Precision score: 19.09639072752651 +/- 2.7804454584221947
F1 score: 26.952979600654718 +/- 3.7177934114955997
Model with PCA and scaling:
Accuracy score: 85.03401360544217 +/- 2.2766259225417
Recall score: 43.09782608695652 +/- 8.631043949588909
Precision score: 56.715436696086854 +/- 12.61326702411128
F1 score: 50.72651658152402 +/- 8.959586290048344
Model with PCA, scaling and undersampling:
Accuracy score: 73.33333333333334 +/- 3.999166926992944
Recall score: 65.018115942029 +/- 11.794146110790374
Precision score: 33.040805759061 +/- 4.142390485125199
F1 score: 43.50106064575519 +/- 5.1291379430814565


({'accuracy': 73.33333333333334,
  'recall': 65.018115942029,
  'precision': 33.040805759061,
  'f1': 43.50106064575519},
 {'accuracy': 3.999166926992944,
  'recall': 11.794146110790374,
  'precision': 4.142390485125199,
  'f1': 5.1291379430814565})

In [3]:
report = sv.analyze(df)
report.show_html("reports/report.html", open_browser=False)

                                             |          | [  0%]   00:00 -> (? left)

Report report.html was generated.
