In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings(action='ignore')

md = pd.read_excel("./buffer/1-main_dataset.xlsx")
md.drop(columns=md.columns[0], axis=1, inplace=True)

X = md.drop(columns="Attrition", axis=1)
y = [i == "Yes" for i in md["Attrition"]]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2,  random_state=0)


In [2]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Get the list of columns depending on there type
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

numeric_transformer = Pipeline(steps=[
      ('imputer', SimpleImputer(strategy='median')),
      ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
      ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
      ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(transformers=[
      ('num', numeric_transformer, numeric_features),
      ('cat', categorical_transformer, categorical_features)])

full_pipeline = Pipeline(steps=[
      ('preprocessor', preprocessor),
      ('modelRF', MLPClassifier())])

In [3]:

full_pipeline.fit(X_train, y_train)
y_predicted = full_pipeline.predict(X_test)
full_pipeline.score(X_test, y_test)

0.9863945578231292

In [6]:
print(f"Real Attrition: {sum(y_test)}, Predicted Attrition: {sum(y_predicted)}")

Real Attrition: 145, Predicted Attrition: 139


In [14]:
modifiable_columns = ['Average', 'TrainingTimesLastYear', 'YearsWithCurrManager', 'JobSatisfaction', 'Standard deviation', 'EnvironmentSatisfaction']

dividor = 15
modifiable_dict = {}

for i in modifiable_columns:
    if type(X_test[i].values[0]) != str:
        mini = min(X_test[i].values)
        maxi = max(X_test[i].values)

        modifiable_dict[i] = np.arange(mini, maxi, (maxi-mini)/dividor)
    else:
        modifiable_dict[i] = np.unique(X_test[i].values)

True

In [24]:
X_test_mod = X_test.copy()
X_test_mod['Average'] -= np.array( [0.2] * len(X_test_mod['Average']))
X_test_mod['TrainingTimesLastYear'] += np.array( [2] * len(X_test_mod['TrainingTimesLastYear']))

In [25]:
y_predicted_mod = full_pipeline.predict(X_test_mod)

print(f"Real Attrition: {sum(y_test)}, Predicted Attrition: {sum(y_predicted)}, Improved Attrition: {sum(y_predicted_mod)}")

Real Attrition: 145, Predicted Attrition: 139, Improved Attrition: 3
