In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

# Load the prepared dataset and discard the first column of the sheet
# (presumably the index written when the file was exported -- TODO confirm).
main_dataset_prepared = (
    pd.read_excel("./buffer/2-main_dataset_prepared.xlsx")
    .pipe(lambda frame: frame.drop(columns=frame.columns[0]))
)

In [27]:
# Both "Attrition_*" columns are removed from the feature matrix; only
# "Attrition_Yes" is kept as the label.
# NOTE(review): the two columns look like complementary dummy variables -- confirm.
target_columns = ["Attrition_No", "Attrition_Yes"]
X = main_dataset_prepared.drop(columns=target_columns)
y = main_dataset_prepared["Attrition_Yes"]

In [28]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Choix du modèle

On note les modèles selon leurs performances et selon que les coefficients des paramètres d'entrée sont disponibles ou non.

In [29]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier, Perceptron, RidgeClassifier

# Single seed shared by every estimator that accepts `random_state`.
RANDOM_STATE = 0

# Candidate classifiers, keyed by the display name used in the printed reports
# and in the hyper-parameter grids.
dict_models = {
    "Nearest Neighbors": KNeighborsClassifier(),
    "Linear SVM": SVC(kernel="linear", random_state=RANDOM_STATE),
    "RBF SVM": SVC(random_state=RANDOM_STATE),
    "Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE),
    # "Neural Net": MLPClassifier(random_state=RANDOM_STATE),  # left disabled, as in the original run
    "AdaBoost": AdaBoostClassifier(random_state=RANDOM_STATE),
    "Logistic Regression": LogisticRegression(random_state=RANDOM_STATE),
    "Passive Aggressive Classifier": PassiveAggressiveClassifier(random_state=RANDOM_STATE),
    "Perceptron": Perceptron(random_state=RANDOM_STATE),
    "Ridge Classifier": RidgeClassifier(random_state=RANDOM_STATE),
}

# pertinent_score: test score of every model, keyed by display name.
# pertinent_model: for the models that expose coefficients, a mapping
#                  feature name -> coefficient, sorted from largest to smallest.
pertinent_model = {}
pertinent_score = {}

for name, model in dict_models.items():
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    pertinent_score[name] = score
    print(f"{name}: {score}  |  ", end='')
    # hasattr() actually evaluates the attribute, whereas `"coef_" in dir(model)`
    # only checks that the name is declared. SVC declares `coef_` as a property
    # that raises AttributeError for non-linear kernels, so hasattr() is False
    # for the RBF SVM without needing a special case on the model's name.
    if hasattr(model, "coef_"):
        print("coeff available")
        # coef_[0] is the coefficient row used for this two-class problem;
        # feature_names_in_ lists the training columns in the same order.
        dict_coef = dict(zip(model.feature_names_in_, model.coef_[0]))
        pertinent_model[name] = dict(
            sorted(dict_coef.items(), key=lambda kv: kv[1], reverse=True)
        )
    else:
        print("coeff unavailable")

Nearest Neighbors: 0.8775510204081632  |  coeff unavailable
Linear SVM: 0.8356009070294784  |  coeff available
RBF SVM: 0.9149659863945578  |  coeff unavailable
Decision Tree: 0.9750566893424036  |  coeff unavailable
Random Forest: 0.9852607709750567  |  coeff unavailable
AdaBoost: 0.8537414965986394  |  coeff unavailable
Logistic Regression: 0.8469387755102041  |  coeff available
Passive Aggressive Classifier: 0.7494331065759637  |  coeff available
Perceptron: 0.8253968253968254  |  coeff available
Ridge Classifier: 0.854875283446712  |  coeff available


In [30]:
# Recap the test score of only those models that exposed coefficients.
for model_name in pertinent_model:
    print(f"{model_name}: {pertinent_score[model_name]}")

Linear SVM: 0.8356009070294784
Logistic Regression: 0.8469387755102041
Passive Aggressive Classifier: 0.7494331065759637
Perceptron: 0.8253968253968254
Ridge Classifier: 0.854875283446712


## Choix des paramètres

In [31]:
from sklearn.model_selection import GridSearchCV

def GridSearch(param_grid_dict, dict_models):
    """Run a grid search for every model that has an entry in ``param_grid_dict``.

    Each search is fitted on the module-level ``X_train`` / ``y_train`` and then
    scored on the module-level ``X_test`` / ``y_test``; ``GridSearchCV`` is used
    with its default settings.

    Parameters
    ----------
    param_grid_dict : dict
        Maps a model name to the parameter grid handed to ``GridSearchCV``.
    dict_models : dict
        Maps a model name to the estimator to tune. Every key of
        ``param_grid_dict`` must be present here.

    Returns
    -------
    dict
        Maps each model name to a ``(test score, best parameters)`` tuple.

    Raises
    ------
    KeyError
        If a name in ``param_grid_dict`` is missing from ``dict_models``.
    """
    results = {}
    for model_name, grid in param_grid_dict.items():
        estimator = dict_models[model_name]
        search = GridSearchCV(estimator, grid)
        search.fit(X_train, y_train)
        results[model_name] = (search.score(X_test, y_test), search.best_params_)
    return results

In [49]:
# Candidate values reused by the three regularisation-strength searches below.
fine_grid = np.arange(0.0001, 0.001, 0.0001)

# Hyper-parameter grid per model; the keys must match those of `dict_models`.
# Trailing comments record alternatives that were tried and then narrowed down.
param_grid_dict = {
    'Linear SVM': {
        'C': fine_grid,
        'gamma': ['scale'],  # 'auto' also considered
    },
    'Logistic Regression': {
        'penalty': ['l2'],  # other candidates: 'l1', 'elasticnet', 'none'
        'C': [0.15],  # previously searched over np.arange(0.05, 0.3, 0.05)
    },
    'Passive Aggressive Classifier': {
        'C': fine_grid,
    },
    'Perceptron': {
        'penalty': ['l2'],  # other candidates: 'l1', 'elasticnet'
    },
    'Ridge Classifier': {
        'alpha': fine_grid,
    },
}

In [50]:
# Tune every model that has a parameter grid. Despite its name, the result is
# a dict mapping model name -> (test score, best parameters).
best_estimator_list = GridSearch(
    param_grid_dict=param_grid_dict,
    dict_models=dict_models,
)

In [51]:
# Display, per model, the (test score, best parameters) pair returned by GridSearch.
best_estimator_list

{'Linear SVM': (0.8356009070294784, {'C': 0.0001, 'gamma': 'scale'}),
 'Logistic Regression': (0.8503401360544217, {'C': 0.15, 'penalty': 'l2'}),
 'Passive Aggressive Classifier': (0.8356009070294784, {'C': 0.0001}),
 'Perceptron': (0.8174603174603174, {'penalty': 'l2'}),
 'Ridge Classifier': (0.854875283446712, {'alpha': 0.0001})}

In [81]:
# Rank the input features of the fitted Ridge classifier by coefficient,
# largest first.
# NOTE(review): this reads the estimator stored in `dict_models`, i.e. the one
# fitted in the model-comparison cell, not one refit with the grid-search
# parameters -- confirm this is intended.
ridge_model = dict_models['Ridge Classifier']
coefs = ridge_model.coef_
# Feature names are taken from the fitted model itself so they always line up
# with coef_, even if the global `X` was changed by out-of-order execution.
dict_coef = dict(zip(ridge_model.feature_names_in_, coefs[0]))
dict(sorted(dict_coef.items(), key=lambda item: item[1], reverse=True))

{'MaritalStatus_Single': 0.1780875006597886,
 'JobRole_Research Director': 0.15936865978921688,
 'BusinessTravel_Travel_Frequently': 0.15458506486145607,
 'Department_Human Resources': 0.13113971868694266,
 'EducationField_Human Resources': 0.11709982166249042,
 'YearsSinceLastPromotion': 0.07924621822945982,
 'NumCompaniesWorked': 0.07197777955160167,
 'Average': 0.0713808518657956,
 'Standard deviation': 0.06769341775964487,
 'JobRole_Sales Executive': 0.038264195137925616,
 'YearsAtCompany': 0.032982758029673244,
 'JobRole_Research Scientist': 0.027532049168694273,
 'EducationField_Life Sciences': 0.02166180636065175,
 'JobRole_Laboratory Technician': 0.02158549562184055,
 'PercentSalaryHike': 0.01733664497359336,
 'Gender_Male': 0.014886853216827414,
 'JobRole_Human Resources': 0.006182213630697917,
 'EducationField_Marketing': -0.0015951897875500268,
 'DistanceFromHome': -0.0017953655326150052,
 'PerformanceRating': -0.0043904931214509835,
 'nb days off': -0.005667997858628496,
 '