# Observations

- No missing values (good)
- Some features show a slight correlation with Attrition (and these correlations are explainable)
- Three features can be dropped immediately because they have only one distinct value
- We are left with 19 categorical and 13 numerical features
- The ID-type feature (`EmployeeNumber`) is also excluded, bringing the numerical features down to 12
- Following the instructions, the number of numerical features goes up to 23 and the number of nominal features goes down to 7

In [47]:
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier as KNC
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler

from xgboost import XGBClassifier
from xgboost import XGBRFClassifier

import sweetviz as sv

from utils.evaluation import *
from utils.dataprep import *

In [54]:
# Load the raw attrition data and remove the columns excluded from modelling
# (the constant-valued and ID-type columns mentioned in the Observations notes).
df = pd.read_csv("data/use_case_employee-attrition.csv").drop(
    columns=["EmployeeCount", "Over18", "StandardHours", "EmployeeNumber"]
)

# Nominal (unordered categorical) columns: these are one-hot encoded below.
nominals = [
    "BusinessTravel",
    "Department",
    "EducationField",
    "Gender",
    "JobRole",
    "MaritalStatus",
    "OverTime",
]

# Columns treated as numerical. The order matters: it is the order in which the
# ColumnTransformer's passthrough branch emits them.
numericals = [
    "Age",
    "DailyRate",
    "DistanceFromHome",
    "Education",
    "EnvironmentSatisfaction",
    "HourlyRate",
    "JobInvolvement",
    "JobLevel",
    "JobSatisfaction",
    "MonthlyIncome",
    "MonthlyRate",
    "NumCompaniesWorked",
    "PercentSalaryHike",
    "PerformanceRating",
    "RelationshipSatisfaction",
    "StockOptionLevel",
    "TotalWorkingYears",
    "TrainingTimesLastYear",
    "WorkLifeBalance",
    "YearsAtCompany",
    "YearsInCurrentRole",
    "YearsSinceLastPromotion",
    "YearsWithCurrManager",
]

# Project helpers from utils.dataprep.
# NOTE(review): presumably `df_t` holds only the one-hot columns and `df_n` is
# `df` with the nominal columns replaced by them -- confirm against the helpers.
df_t = onehot_columns(df, nominals)
df_n = merge_onehotted(df, df_t, nominals)

# Feature matrix = everything except the target; target encoded as integers.
X = df_n.drop(columns="Attrition")
y = LabelEncoder().fit_transform(df_n["Attrition"])

# --- Shared building blocks --------------------------------------------------
# These estimator instances are reused across several pipelines below.
scaler = MinMaxScaler()

pca = PCA(n_components=15)
resampler = RandomUnderSampler(random_state=42)
model = GBC(random_state=42)

# Recursive feature elimination for `ipipeline_s_f`. This cell previously used
# `rfe` without defining it anywhere in the notebook, so it raised NameError on
# a fresh kernel (Restart & Run All).
# NOTE(review): the estimator choice and the default `n_features_to_select`
# (half of the incoming features) are assumptions -- confirm the intended setup.
rfe = RFE(estimator=GBC(random_state=42))

# PCA is applied only to the one-hot columns (`df_t.columns`); the numerical
# columns are passed through unchanged.
column_transformer = ColumnTransformer(
    transformers=[
        ("pca", pca, df_t.columns),
        ("passthrough", "passthrough", numericals)
    ])

# Plain sklearn pipelines (no resampling), without and with min-max scaling.
# NOTE(review): in `pipeline_s` the scaler runs *after* the PCA branch -- confirm
# that this ordering is intended.
pipeline = Pipeline(steps=[("column_transformer", column_transformer), ("classifier", model)])
pipeline_s = Pipeline(steps=[("column_transformer", column_transformer), ("scaler", scaler), ("classifier", model)])

# Imbalanced-learn pipelines: random undersampling, then PCA over all of X.
# NOTE(review): `ipipeline` feeds unscaled features into PCA -- confirm intended.
steps = [
    ("undersample", resampler),
    ("pca", pca),
    ("classifier", model)
]

ipipeline = ImbPipeline(steps=steps)

# Same as above, with min-max scaling before PCA.
steps = [
    ("undersample", resampler),
    ("scaler", scaler),
    ("pca", pca),
    ("classifier", model)
]

ipipeline_s = ImbPipeline(steps=steps)

# Scaled variant with an additional feature-selection step after PCA.
steps = [
    ("undersample", resampler),
    ("scaler", scaler),
    ("pca", pca),
    ("feature_selection", rfe),
    ("classifier", model)
]

ipipeline_s_f = ImbPipeline(steps=steps)

# 10-fold cross-validated evaluation of the bare model and each pipeline, in
# this order: model, pipeline, ipipeline, pipeline_s, ipipeline_s.
for estimator in (model, pipeline, ipipeline, pipeline_s):
    compute_acc_rec_prec_f1_with_cv(estimator, X, y, cv_n=10)

# Kept as the cell's final expression so its return value is still displayed.
compute_acc_rec_prec_f1_with_cv(ipipeline_s, X, y, cv_n=10)

Accuracy score: 87.14285714285715 +/- 1.8617594807352393
Recall score: 33.80434782608695 +/- 13.670323945458904
Precision score: 73.72739541160594 +/- 11.562727743534257
F1 score: 44.3336622537789 +/- 13.12109296645759
Accuracy score: 87.34693877551021 +/- 1.557214032960491
Recall score: 33.87681159420289 +/- 9.841809596363106
Precision score: 74.05982905982907 +/- 8.483528872768533
F1 score: 45.552949537469665 +/- 10.092157138693283
Accuracy score: 64.76190476190477 +/- 2.5957529290257013
Recall score: 66.26811594202898 +/- 8.781580271733164
Precision score: 26.395277544711565 +/- 3.0313099710984956
F1 score: 37.703465953516094 +/- 4.247354884182273
Accuracy score: 87.34693877551021 +/- 1.557214032960491
Recall score: 33.87681159420289 +/- 9.841809596363106
Precision score: 74.05982905982907 +/- 8.483528872768533
F1 score: 45.552949537469665 +/- 10.092157138693283
Accuracy score: 87.07482993197277 +/- 1.326094468681493
Recall score: 33.42391304347826 +/- 8.836142418127853
Precision sc

({'accuracy': 72.78911564625851,
  'recall': 65.79710144927537,
  'precision': 32.97561394818052,
  'f1': 43.791763155324475},
 {'accuracy': 2.316923315092028,
  'recall': 6.685150808759202,
  'precision': 2.572433875427693,
  'f1': 2.8365283616735333})

In [3]:
# Build a sweetviz report for `df` and write it to disk as HTML without
# opening a browser window.
report = sv.analyze(df)
report.show_html(
    filepath="report.html",
    open_browser=False,
)

                                             |          | [  0%]   00:00 -> (? left)

Report report.html was generated.
