In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings(action='ignore')

# Load the prepared dataset and discard its first column.
# NOTE(review): the first column is presumably a saved index — confirm.
md = pd.read_excel("./buffer/1-main_dataset.xlsx")
md = md.drop(columns=md.columns[0])

# Features are every column except the target; the target becomes a plain
# list of booleans (True where Attrition is "Yes").
X = md.drop(columns="Attrition")
y = (md["Attrition"] == "Yes").tolist()

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)


In [18]:
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Candidate classifiers, keyed by the display name used when reporting results.
model_dict = {
    'Random Forest': RandomForestClassifier(random_state=0),
    'Logistic Regression': LogisticRegression(C=0.15, penalty="l2"),
    'Multi Layered Perceptron': MLPClassifier(random_state=0)
}

# Get the list of columns depending on their type.
# NOTE: columns of any other dtype (bool, category, datetime, ...) end up in
# neither list, and ColumnTransformer's default remainder='drop' discards them.
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Numeric columns: fill gaps with the median, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Categorical columns: fill gaps with a 'missing' label, then one-hot encode.
# Categories not seen during fit are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Unfitted template that routes each column group to its transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)])

# One pipeline (preprocessing + classifier) per model.
# Each pipeline receives its own clone of the preprocessor: sharing a single
# instance would make every fit() refit the same object, so pipelines fitted
# earlier would silently pick up the state of whichever one was fitted last.
# As a consequence `preprocessor` itself stays unfitted; the fitted copy is
# available as full_pipeline_dict[name].named_steps['preprocessor'].
full_pipeline_dict = {}
for name, model in model_dict.items():
    full_pipeline_dict[name] = Pipeline(steps=[
        ('preprocessor', clone(preprocessor)),
        (name, model)])

In [19]:
# Test-set predictions of every fitted pipeline, keyed by model name.
y_pred_dict = {}

for name, pipeline in full_pipeline_dict.items():
    # fit() hands back the fitted pipeline, so scoring on the held-out set
    # can be chained directly onto it.
    score = pipeline.fit(X_train, y_train).score(X_test, y_test)

    # Keep the raw predictions for comparison in later cells.
    y_pred_dict[name] = pipeline.predict(X_test)

    print(f"{name} score: {score}")

Random Forest score: 0.9852607709750567
Logistic Regression score: 0.8503401360544217
Multi Layered Perceptron score: 0.9829931972789115


In [21]:
# What-if scenario: shift two features on a copy of the test set and compare
# how many positive (attrition) predictions each fitted model makes.
AVERAGE_DECREASE = 0.2   # amount subtracted from every 'Average' value
TRAINING_INCREASE = 2    # amount added to every 'TrainingTimesLastYear' value

# Work on a copy so the original X_test is left untouched.
X_test_mod = X_test.copy()

# Scalars broadcast over the whole column; no constant array is needed.
X_test_mod['Average'] -= AVERAGE_DECREASE
X_test_mod['TrainingTimesLastYear'] += TRAINING_INCREASE

# Predictions of every fitted pipeline on the modified test set.
y_pred_mod_dict = {
    name: pipeline.predict(X_test_mod)
    for name, pipeline in full_pipeline_dict.items()
}

# Count positives as plain Python ints. The built-in sum() over a NumPy array
# returns a NumPy scalar, which prints as `np.int64(...)` inside a dict under
# NumPy >= 2.
predicted_counts = {k: int(np.sum(v)) for k, v in y_pred_dict.items()}
improved_counts = {k: int(np.sum(v)) for k, v in y_pred_mod_dict.items()}

print(f"Real Attrition: {sum(y_test)}\nPredicted Attrition: {predicted_counts}\nImproved Attrition: {improved_counts}")

Real Attrition: 145
Predicted Attrition: {'Random Forest': 132, 'Logistic Regression': 59, 'Multi Layered Perceptron': 138}
Improved Attrition: {'Random Forest': 55, 'Logistic Regression': 0, 'Multi Layered Perceptron': 4}
