## Final Project - Road Accidents in France in 2019
## N°3 / Fine-Tuning de Balanced Random Forest

In [1]:
import os
import pandas as pd
import numpy as np
import joblib
import sklearn
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, RandomizedSearchCV
from imblearn.ensemble import BalancedRandomForestClassifier, BalancedBaggingClassifier, RUSBoostClassifier, EasyEnsembleClassifier
from sklearn.metrics import balanced_accuracy_score, accuracy_score

### Import des fichiers de données

In [2]:
os.getcwd()

'C:\\Users\\mailb\\Documents_Administrateur\\Ironhack\\DA_BOOTCAMP\\3.Labs\\dataV2-labs\\module-3\\Final-Project'

In [3]:
acc = pd.read_csv('../Final-Project/data/victml_clean_dummies_2CLASSES.csv')

In [4]:
acc.shape

(130901, 141)

## Machine Learning

In [5]:
y = acc.pop('grav')
X = acc

In [6]:
# Pour l'évaluation des modèles, nous utiliserons le score de Balanced Accuracy ("exactitude pondérée") 
# plutôt que le score d'Accuracy, car nos classes cibles sont de tailles respectives déséquilibrées. 

def display_scores(scores):
    meansc = round(scores.mean(),3)
    print(meansc)

#### Fine-Tuning de Balanced Random Forest

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

#### Réglage des Hyperparamètres avec RandomSearch

Essayons d'améliorer le modèle Gradient Boosting en jouant sur ses hyper-paramètres.

In [9]:
model = BalancedRandomForestClassifier()

parameters = {"bootstrap": [True, False],
              "n_estimators" : [100, 200, 500, 1000, 1500, 2000],
              "max_depth"    : [2, 3, 10, 15, 20],
              "max_features" : ['sqrt','log2',2,50,141],
              "random_state" : [0, 42],
              'min_samples_leaf': [1, 2, 4],
              'min_samples_split': [2, 5, 10]
              }

randm = RandomizedSearchCV(model, parameters, n_jobs=-1, scoring = 'balanced_accuracy')
randm.fit(X_train, y_train)

        nan 0.71731629 0.7702071  0.75229914]


RandomizedSearchCV(estimator=BalancedRandomForestClassifier(), n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [2, 3, 10, 15, 20],
                                        'max_features': ['sqrt', 'log2', 2, 50,
                                                         141],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [100, 200, 500, 1000,
                                                         1500, 2000],
                                        'random_state': [0, 42]},
                   scoring='balanced_accuracy')

In [10]:
randm.best_estimator_

BalancedRandomForestClassifier(max_depth=20, max_features=50,
                               min_samples_split=5, n_estimators=1500,
                               random_state=42)

In [11]:
print(f"The mean cross-validated score of the best estimator is: {randm.best_score_}")

The mean cross-validated score of the best estimator is: 0.7919631647829775


#### Evaluation Finale sur le Jeu de Test

In [12]:
final_model = randm.best_estimator_

final_pred_test = final_model.predict(X_test)

final_score_test = balanced_accuracy_score(y_test, final_pred_test)

print("Final Balanced Accuracy Score on Test Set =", round(final_score_test,3))

Final Balanced Accuracy Score on Test Set = 0.785


#### Variables les plus importantes
Etudions les variables qui sont plus déterminantes que les autres dans la classification par notre modèle.

In [13]:
feature_importances = randm.best_estimator_.feature_importances_
list_feat = list(feature_importances)
list_col = list(X_train.columns)
sorted([t for t in zip(list_feat, list_col)], key=lambda t: t[0], reverse=True)

[(0.05609852351229347, 'type_veh_7'),
 (0.04713224707005582, 'type_route_3'),
 (0.04348311271837383, 'obst_mobile_IRLVT'),
 (0.0404320632016042, 'vitesse_autorisee'),
 (0.03434572645739244, 'agglo_2'),
 (0.030904598334980094, 'obst_fixe_IRLVT'),
 (0.029792256369172432, 'nb_voies'),
 (0.026950115715606675, 'type_veh_33'),
 (0.02356831075523367, 'type_circu_2'),
 (0.021243651427606817, 'type_veh_10'),
 (0.016513531028459868, 'obst_mobile_2'),
 (0.015102187612750399, 'dep_Nord Est'),
 (0.01385171281868869, 'dep_Sud Est'),
 (0.013844377222826687, 'type_usager_3'),
 (0.013801528522196333, 'loc_pieton_IRLVT'),
 (0.013670827464469728, 'type_veh_autre'),
 (0.01342673100866057, 'pieton_seul_IRLVT'),
 (0.01312729311522477, 'place_veh_10'),
 (0.012714116647623607, 'dep_Sud Ouest'),
 (0.012427470241164305, 'manoeuvre_veh_13'),
 (0.012108309711479475, 'situ_acc_3'),
 (0.01206977957847713, 'an_nais_Moins de 35 ans'),
 (0.010856617324914697, 'dep_Nord-Ouest'),
 (0.0104483219637275, 'type_veh_2'),
 (0

In [17]:
# Pour sauvegarder le modèle
joblib.dump(randm.best_estimator_, "my_model_2021-08-10.pkl")
# Pour le réutiliser
#my_model_loaded = joblib.load("my_model.pkl")

['my_model_2021-08-10.pkl']

In [15]:
print("Done!!")

Done!!
