# Observations

- No missing values (good)
- There are features with slight colleration w.r.t. the Attrition (they are also explainable)
- There are three features immediately to be dropped because of 1 distinct value
- We are left with 19 categorical and 13 numerical features
- ID type of feature is also excluded (numerical features down to 12)
- Number of numerical features gone up to 23, nominals down to 7, because of instructions

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler

from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier as KNC
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.tree import ExtraTreeClassifier as ETC
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression as LRC
from sklearn.ensemble import GradientBoostingClassifier as GBC
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier as RFC
from xgboost import XGBRFClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier

import sweetviz as sv

from utils.evaluation import *
from utils.dataprep import *

In [15]:
df = pd.read_csv("data/use_case_employee-attrition.csv")
df = df.drop(["EmployeeCount", "Over18", "StandardHours", "EmployeeNumber"], axis=1)

nominals = ["BusinessTravel", "Department", "EducationField", "Gender", "JobRole", "MaritalStatus", "OverTime"]
numericals = ["Age", "DailyRate", "DistanceFromHome", "Education", "EnvironmentSatisfaction", "HourlyRate", "JobInvolvement", "JobLevel", "JobSatisfaction", "MonthlyIncome", "MonthlyRate", "NumCompaniesWorked",
              "PercentSalaryHike", "PerformanceRating", "RelationshipSatisfaction", "StockOptionLevel", "TotalWorkingYears", "TrainingTimesLastYear", "WorkLifeBalance", "YearsAtCompany", "YearsInCurrentRole", 
              "YearsSinceLastPromotion", "YearsWithCurrManager"]

df_t = onehot_columns(df, nominals)
df_n = merge_onehotted(df, df_t, nominals)

X = df_n.drop("Attrition", axis=1)
y = df_n["Attrition"]
y = LabelEncoder().fit_transform(y)

scaler = MinMaxScaler()

pca = PCA(n_components=15)
resampler = RandomUnderSampler(random_state=42)
model = LRC(random_state=42, penalty="elasticnet", solver="saga", l1_ratio=0.5)

column_transformer = ColumnTransformer(
    transformers=[
        ("pca", pca, df_t.columns),
        ("passthrough", "passthrough", numericals)
    ])

pipeline = Pipeline(steps=[("column_transformer", column_transformer), ("classifier", model)])
pipeline_s = Pipeline(steps=[("column_transformer", column_transformer), ("scaler", scaler), ("classifier", model)])

steps = [
    ("undersample", resampler),
    ("pca", pca),
    ("classifier", model)
]

ipipeline = ImbPipeline(steps=steps)

steps = [
    ("undersample", resampler),
    ("scaler", scaler),
    ("pca", pca),
    ("classifier", model)
]

ipipeline_s = ImbPipeline(steps=steps)

steps = [
    ("undersample", resampler),
    ("scaler", scaler),
    ("pca", pca),
    ("classifier", model)
]

ipipeline_s_f = ImbPipeline(steps=steps)

print("Model: ")
#compute_acc_rec_prec_f1_with_cv(model, X, y, cv_n=10)
print("Model with PCA:")
#compute_acc_rec_prec_f1_with_cv(pipeline, X, y, cv_n=10)
print("Model with PCA and undersampling:")
#compute_acc_rec_prec_f1_with_cv(ipipeline, X, y, cv_n=10)
print("Model with PCA and scaling:")
compute_acc_rec_prec_f1_with_cv(pipeline_s, X, y, cv_n=10)
print("Model with PCA, scaling and undersampling:")
compute_acc_rec_prec_f1_with_cv(ipipeline_s, X, y, cv_n=10)

Model: 
Model with PCA:
Model with PCA and undersampling:
Model with PCA and scaling:
Accuracy score: 88.50340136054422 +/- 1.8367346938775533
Recall score: 38.858695652173914 +/- 9.242770666787301
Precision score: 79.91816516816516 +/- 11.228336441424013
F1 score: 51.6780697909106 +/- 9.382382472009791
Model with PCA, scaling and undersampling:
Accuracy score: 73.87755102040816 +/- 2.5850340136054415
Recall score: 74.69202898550724 +/- 4.907414425245215
Precision score: 35.535737534104754 +/- 2.855893696338728
F1 score: 48.04205272840555 +/- 2.7668686004296434


({'accuracy': 73.87755102040816,
  'recall': 74.69202898550724,
  'precision': 35.535737534104754,
  'f1': 48.04205272840555},
 {'accuracy': 2.5850340136054415,
  'recall': 4.907414425245215,
  'precision': 2.855893696338728,
  'f1': 2.7668686004296434})

In [3]:
report = sv.analyze(df)
report.show_html("report.html", open_browser=False)

                                             |          | [  0%]   00:00 -> (? left)

Report report.html was generated.
