In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler

from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression as LRC

import sweetviz as sv

from utils.evaluation import *
from utils.dataprep import *

In [2]:
# Load the raw attrition data and discard the columns this analysis does not use.
df = pd.read_csv("data/use_case_employee-attrition.csv").drop(
    columns=["EmployeeCount", "Over18", "StandardHours", "EmployeeNumber"]
)

# Columns handed to the one-hot helpers below.
nominals = [
    "BusinessTravel",
    "Department",
    "EducationField",
    "Gender",
    "JobRole",
    "MaritalStatus",
    "OverTime",
]

# Columns later passed through the ColumnTransformer untouched.
numericals = [
    "Age",
    "DailyRate",
    "DistanceFromHome",
    "Education",
    "EnvironmentSatisfaction",
    "HourlyRate",
    "JobInvolvement",
    "JobLevel",
    "JobSatisfaction",
    "MonthlyIncome",
    "MonthlyRate",
    "NumCompaniesWorked",
    "PercentSalaryHike",
    "PerformanceRating",
    "RelationshipSatisfaction",
    "StockOptionLevel",
    "TotalWorkingYears",
    "TrainingTimesLastYear",
    "WorkLifeBalance",
    "YearsAtCompany",
    "YearsInCurrentRole",
    "YearsSinceLastPromotion",
    "YearsWithCurrManager",
]

# Project helpers (star-imported above); presumably df_t holds the one-hot
# encoded nominal columns and df_n is df with those merged back in -- TODO confirm.
df_t = onehot_columns(df, nominals)
df_n = merge_onehotted(df, df_t, nominals)

# Feature matrix and integer-encoded target.
X = df_n.drop(columns="Attrition")
y = LabelEncoder().fit_transform(df_n["Attrition"])

# Write the sweetviz report for the (pre-encoding) frame to disk without opening a browser.
report = sv.analyze(df)
report.show_html("reports/report.html", open_browser=False)

                                             |          | [  0%]   00:00 -> (? left)

Report reports/report.html was generated.


In [3]:
# Preprocessing components. The same instances are deliberately reused by
# every pipeline defined in this cell, exactly as before.
scaler = MinMaxScaler()

pca = PCA(n_components=15)
resampler = RandomUnderSampler(random_state=42)

# Classifiers, named after the experiment they belong to.
model_xiii = LRC(random_state=42)
model_ix = SVC(random_state=42, kernel="poly")

model_viii = SVC(random_state=42, C=0.9)
model_xiv = LRC(random_state=42, penalty=None)
model_xv = LRC(random_state=42, penalty="elasticnet", solver="saga", l1_ratio=0.5)

# PCA is applied to the df_t columns only; the numerical columns pass through as-is.
column_transformer = ColumnTransformer(
    transformers=[
        ("pca", pca, df_t.columns),
        ("passthrough", "passthrough", numericals),
    ]
)

# Reusable (name, estimator) steps shared between the pipelines.
undersample_step = ("undersample", resampler)
scaler_step = ("scaler", scaler)
pca_step = ("pca", pca)
column_step = ("column_transformer", column_transformer)

# Undersampling pipelines: XIII scales before PCA, IX feeds PCA directly.
pipeline_xiii = ImbPipeline(
    steps=[undersample_step, scaler_step, pca_step, ("classifier", model_xiii)]
)
pipeline_ix = ImbPipeline(
    steps=[undersample_step, pca_step, ("classifier", model_ix)]
)

# Column-transformer pipelines: identical preprocessing, different classifier.
pipeline_viii = Pipeline(steps=[column_step, scaler_step, ("classifier", model_viii)])
pipeline_xiv = Pipeline(steps=[column_step, scaler_step, ("classifier", model_xiv)])
pipeline_xv = Pipeline(steps=[column_step, scaler_step, ("classifier", model_xv)])

# Evaluate every experiment in the original order; the header is printed
# before each run so the helper's own output appears underneath it.
experiments = [
    ("XIII:", pipeline_xiii),
    ("IX:", pipeline_ix),
    ("VIII:", pipeline_viii),
    ("XIV:", pipeline_xiv),
    ("XV:", pipeline_xv),
]

for header, experiment_pipeline in experiments:
    print(header)
    last_result = compute_acc_rec_prec_f1_with_cv(experiment_pipeline, X, y, cv_n=10)

# Keep the return value of the final evaluation as the cell's displayed output.
last_result

XIII:
	Accuracy score: 74.01360544217687 +/- 2.467803693498918
	Recall score: 74.69202898550724 +/- 4.907414425245215
	Precision score: 35.65803387055716 +/- 2.7777390264058766
	F1 score: 48.161587415276394 +/- 2.72361708330449
IX:
	Accuracy score: 29.455782312925166 +/- 5.425569899752493
	Recall score: 92.8985507246377 +/- 6.46046090735605
	Precision score: 17.790634675350717 +/- 0.815330514503049
	F1 score: 29.823768383381594 +/- 1.0830790734853406
VIII:
	Accuracy score: 86.87074829931973 +/- 1.0112971936951376
	Recall score: 21.15942028985507 +/- 7.267089961830495
	Precision score: 91.71428571428572 +/- 8.518886580500327
	F1 score: 33.52096360772668 +/- 9.325217040701375
XIV:
	Accuracy score: 88.02721088435375 +/- 1.9995834634964718
	Recall score: 42.22826086956522 +/- 10.844309426787827
	Precision score: 73.0021645021645 +/- 12.056772489546304
	F1 score: 52.61612975857162 +/- 9.69508578429857
XV:
	Accuracy score: 88.50340136054422 +/- 1.8367346938775533
	Recall score: 38.8586956521

({'accuracy': 88.50340136054422,
  'recall': 38.858695652173914,
  'precision': 79.91816516816516,
  'f1': 51.6780697909106},
 {'accuracy': 1.8367346938775533,
  'recall': 9.242770666787301,
  'precision': 11.228336441424013,
  'f1': 9.382382472009791})