In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler

from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier as KNC
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.tree import ExtraTreeClassifier as ETC
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression as LRC
from sklearn.ensemble import GradientBoostingClassifier as GBC
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier as RFC
from xgboost import XGBRFClassifier
from sklearn.ensemble import AdaBoostClassifier as ABC
from sklearn.neural_network import MLPClassifier as MLP

import sweetviz as sv

from utils.evaluation import *
from utils.dataprep import *

In [5]:
df = pd.read_csv("data/use_case_employee-attrition.csv")
df = df.drop(["EmployeeCount", "Over18", "StandardHours", "EmployeeNumber"], axis=1)

nominals = ["BusinessTravel", "Department", "EducationField", "Gender", "JobRole", "MaritalStatus", "OverTime"]
numericals = ["Age", "DailyRate", "DistanceFromHome", "Education", "EnvironmentSatisfaction", "HourlyRate", "JobInvolvement", "JobLevel", "JobSatisfaction", "MonthlyIncome", "MonthlyRate", "NumCompaniesWorked",
              "PercentSalaryHike", "PerformanceRating", "RelationshipSatisfaction", "StockOptionLevel", "TotalWorkingYears", "TrainingTimesLastYear", "WorkLifeBalance", "YearsAtCompany", "YearsInCurrentRole", 
              "YearsSinceLastPromotion", "YearsWithCurrManager"]

df_t = onehot_columns(df, nominals)
df_n = merge_onehotted(df, df_t, nominals)

X = df_n.drop("Attrition", axis=1)
y = df_n["Attrition"]
y = LabelEncoder().fit_transform(y)

scaler = MinMaxScaler()

pca = PCA(n_components=15)
resampler = RandomUnderSampler(random_state=42)
model = SVC(random_state=42, C=0.9)

column_transformer = ColumnTransformer(
    transformers=[
        ("pca", pca, df_t.columns),
        ("passthrough", "passthrough", numericals)
    ])

pipeline = Pipeline(steps=[("column_transformer", column_transformer), ("classifier", model)])
pipeline_s = Pipeline(steps=[("column_transformer", column_transformer), ("scaler", scaler), ("classifier", model)])

steps = [
    ("undersample", resampler),
    ("pca", pca),
    ("classifier", model)
]

ipipeline = ImbPipeline(steps=steps)

steps = [
    ("undersample", resampler),
    ("scaler", scaler),
    ("pca", pca),
    ("classifier", model)
]

ipipeline_s = ImbPipeline(steps=steps)

steps = [
    ("undersample", resampler),
    ("scaler", scaler),
    ("pca", pca),
    ("classifier", model)
]

ipipeline_s_f = ImbPipeline(steps=steps)

print("Model: ")
compute_acc_rec_prec_f1_with_cv(model, X, y, cv_n=10)
print("Model with PCA:")
compute_acc_rec_prec_f1_with_cv(pipeline, X, y, cv_n=10)
print("Model with PCA and undersampling:")
compute_acc_rec_prec_f1_with_cv(ipipeline, X, y, cv_n=10)
print("Model with PCA and scaling:")
compute_acc_rec_prec_f1_with_cv(pipeline_s, X, y, cv_n=10)
print("Model with PCA, scaling and undersampling:")
compute_acc_rec_prec_f1_with_cv(ipipeline_s, X, y, cv_n=10)

Model: 
Accuracy score: 83.87755102040816 +/- 0.31173984319427517
Recall score: 0.0 +/- 0.0
Precision score: 0.0 +/- 0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


F1 score: 0.0 +/- 0.0
Model with PCA:
Accuracy score: 83.87755102040816 +/- 0.31173984319427517
Recall score: 0.0 +/- 0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Precision score: 0.0 +/- 0.0
F1 score: 0.0 +/- 0.0
Model with PCA and undersampling:
Accuracy score: 53.80952380952382 +/- 3.0943035144337836
Recall score: 65.76086956521739 +/- 7.85196004973015
Precision score: 20.69542170481229 +/- 2.145013460370438
F1 score: 31.446456111064634 +/- 3.2045042552683487
Model with PCA and scaling:
Accuracy score: 86.87074829931973 +/- 1.0112971936951376
Recall score: 21.15942028985507 +/- 7.267089961830495
Precision score: 91.71428571428572 +/- 8.518886580500327
F1 score: 33.52096360772668 +/- 9.325217040701375
Model with PCA, scaling and undersampling:
Accuracy score: 75.85034013605441 +/- 3.0913109571433357
Recall score: 69.18478260869566 +/- 5.71181029382088
Precision score: 37.08081968809462 +/- 4.590086106350273
F1 score: 48.15925181440757 +/- 4.674509643317559


({'accuracy': 75.85034013605441,
  'recall': 69.18478260869566,
  'precision': 37.08081968809462,
  'f1': 48.15925181440757},
 {'accuracy': 3.0913109571433357,
  'recall': 5.71181029382088,
  'precision': 4.590086106350273,
  'f1': 4.674509643317559})

In [3]:
report = sv.analyze(df)
report.show_html("reports/report.html", open_browser=False)

                                             |          | [  0%]   00:00 -> (? left)

Report report.html was generated.
