## Final Project - Road Accidents in France in 2019
## N°2 / Module 'Imbalanced-Learn'

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 
import seaborn as sns
import joblib

import sklearn
from sklearn import svm
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import balanced_accuracy_score

from imblearn.ensemble import BalancedRandomForestClassifier, BalancedBaggingClassifier, RUSBoostClassifier, EasyEnsembleClassifier


### Import des fichiers de données

In [2]:
acc = pd.read_csv('../Final-Project/data/victime_clean_dummies.csv')

In [3]:
acc.shape

(130901, 141)

## Machine Learning

In [4]:
from sklearn.model_selection import train_test_split
y = acc.pop('grav')
X = acc

In [5]:
# Pour l'évaluation des modèles, nous utiliserons le score de Balanced Accuracy ("exactitude pondérée") 
# plutôt que le score d'Accuracy, car nos classes cibles sont de tailles respectives déséquilibrées. 

def display_scores(scores):
    print("Mean Balanced Accuracy Score:", round(scores.mean(),3))
    print("Standard deviation:", round(scores.std(),5))

#### Choix et Entraînement de divers modèles initialisés par défaut

On essaie plusieurs modèles de classification, initialisés par défaut.
https://scikit-learn.org/stable/modules/multiclass.html

Dans un second temps, on utilise la solution proposée par le module imblearn.ensemble : des modèles d'ensembles entraînés à chaque étape sur un échantillon rééquilibré automatiquement entre les différentes classes. Ce qui permet de se passer de méthodes de rééchantillonnage avant l'entraînement.

In [6]:
# Modèle : BALANCED FOREST
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [7]:
brf1 = BalancedRandomForestClassifier()

In [8]:
brf1_scores = cross_val_score(brf1, X_train, y_train, scoring='balanced_accuracy', cv=10)
display_scores(brf1_scores)

Mean Balanced Accuracy Score: 0.559
Standard deviation: 0.00701


In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

In [10]:
X_train, X_val, y_train, y_val= train_test_split(X_train, y_train, test_size=0.1)

In [11]:
brf1.fit(X_train, y_train)

BalancedRandomForestClassifier()

In [12]:
y_pred_val = brf1.predict(X_val)

In [13]:
balanced_accuracy_score(y_val, y_pred_val)

0.5601577149423805

In [14]:
from sklearn.metrics import classification_report

In [15]:
print(classification_report(y_val, y_pred_val))

              precision    recall  f1-score   support

           1       0.75      0.77      0.76      4964
           2       0.11      0.64      0.19       301
           3       0.37      0.36      0.36      1833
           4       0.70      0.47      0.56      4683

    accuracy                           0.58     11781
   macro avg       0.48      0.56      0.47     11781
weighted avg       0.65      0.58      0.61     11781



In [16]:
from sklearn.metrics import confusion_matrix

In [17]:
confusion_matrix(y_val, y_pred_val, normalize="true")

array([[0.7719581 , 0.06466559, 0.04774376, 0.11563255],
       [0.05980066, 0.63787375, 0.24252492, 0.05980066],
       [0.09056192, 0.34969995, 0.35951991, 0.20021822],
       [0.23361093, 0.11787316, 0.17723681, 0.47127909]])

In [18]:
y_val.shape

(11781,)

In [19]:
np.intersect1d(np.where(y_val == 2)[0], np.where(y_pred_val == 1)[0])

array([  641,  1032,  1271,  2313,  3549,  3945,  4479,  4776,  5345,
        6380,  7103,  7295,  7372,  8209,  9613, 10003, 10257, 10673],
      dtype=int64)

In [20]:
df = pd.read_csv('./data/victime_clean_forTableau.csv')

In [21]:
df.iloc[2019]

mois                              11
lum                                5
dep                    Ile de france
agglo                              2
intersection                   IRLVT
meteo                              2
collision                          2
type_route                         3
type_circu                         2
nb_voies                           4
declivite                          1
rectitude                          1
surface                            2
infrastructure                 IRLVT
situ_acc                           1
vitesse_autorisee                 50
sens_circu                         2
type_veh                           7
obst_fixe                      IRLVT
obst_mobile                        2
choc                               1
manoeuvre_veh                      1
num_veh                          A01
place_veh                          1
type_usager                        1
sexe                               1
an_nais              Moins de 35 ans
l

In [22]:
df.iloc[3324]

mois                             6
lum                              1
dep                  Ile de france
agglo                            2
intersection                     2
meteo                            1
collision                        6
type_route                       3
type_circu                       3
nb_voies                         4
declivite                        1
rectitude                        1
surface                          1
infrastructure                   5
situ_acc                         1
vitesse_autorisee               70
sens_circu                       1
type_veh                         7
obst_fixe                    IRLVT
obst_mobile                      2
choc                             4
manoeuvre_veh                    1
num_veh                        B01
place_veh                        1
type_usager                      1
sexe                             2
an_nais                  36-75 ans
loc_pieton                   IRLVT
pieton_seul         

In [23]:
# Modèle : BALANCED BAGGING
bbc1 = BalancedBaggingClassifier()

In [24]:
bbc1_scores = cross_val_score(bbc1, X_train, y_train, scoring='balanced_accuracy', cv=10)
display_scores(bbc1_scores)

Mean Balanced Accuracy Score: 0.523
Standard deviation: 0.00579


In [25]:
# Modèle : RUS BOOST
rusboost1 = RUSBoostClassifier()
rusboost1.fit(X_train, y_train) 

RUSBoostClassifier()

In [26]:
rusboost1_scores = cross_val_score(rusboost1, X_train, y_train, scoring='balanced_accuracy', cv=10)
display_scores(rusboost1_scores)

Mean Balanced Accuracy Score: 0.482
Standard deviation: 0.00973


In [27]:
# Modèle : EASY ENSEMBLE
ee1 = EasyEnsembleClassifier()

In [28]:
ee1_scores = cross_val_score(ee1, X_train, y_train, scoring='balanced_accuracy', cv=10)
display_scores(ee1_scores)

Mean Balanced Accuracy Score: 0.528
Standard deviation: 0.0028


#### RESUME

In [29]:
scordict = { "Model Name":["BalancedRandomForestClassifier", 'BalancedBaggingClassifier', 'RUSBoostClassifier', 'EasyEnsembleClassifier'],
            "Balanced Accuracy on Train": [brf1_scores.mean(), bbc1_scores.mean(), rusboost1_scores.mean(), ee1_scores.mean()]
           }
df_scores = pd.DataFrame(scordict)
df_scores.sort_values(by='Balanced Accuracy on Train',ascending=False).style.hide_index()

Model Name,Balanced Accuracy on Train
BalancedRandomForestClassifier,0.559266
EasyEnsembleClassifier,0.527648
BalancedBaggingClassifier,0.522711
RUSBoostClassifier,0.482152


#### CONCLUSION
Avec un score de 55% sur le jeu d'entraînement, le modèle "Balanced Forest Classifier" est le plus prometteur, mais reste insuffisant.

Pistes d'amélioration : undersampling des 2 classes majoritaires, rajout de features pour complexifier le jeu d'apprentissage, sélection d'autres algorithmes plus complexes.

In [30]:
print('finished!')

finished!


#### Réglage des Hyperparamètres avec RandomSearch

Essayons d'améliorer le modèle Gradient Boosting en jouant sur ses hyper-paramètres.

In [31]:
model = -----()

from sklearn.model_selection import RandomizedSearchCV

parameters = {"learning_rate": [0.001, 0.01, 0.1, 0.2],
              "n_estimators" : [100, 500, 1000, 1500],
              "subsample"    : [0.5, 0.7, 1.0, 1.5],
              "max_features" : ['sqrt','log2',2,50,140],
              #'min_samples_split':[2,4,6],
              #'min_samples_leaf':[3,5,7],
              "max_depth"    : [2, 3, 10, 15, 20]
              }

randm = RandomizedSearchCV(model, parameters, n_jobs=-1, scoring = 'balanced_accuracy')
randm.fit(X_train, y_train)

TypeError: bad operand type for unary -: 'tuple'

In [None]:
randm.best_estimator_

In [None]:
print(f"The mean cross-validated score of the best estimator is: {randm.best_score_}")

#### Evaluation Finale sur le Jeu de Test

In [None]:
final_model = randm.best_estimator_

final_pred_test = final_model.predict(X_test)

final_score_test = balanced_accuracy_score(y_test, final_pred_test)

print("Final Balanced Accuracy Score on Test Set =", round(final_score_test,3))

#### Variables les plus importantes
Etudions les variables qui sont plus déterminantes que les autres dans la classification par notre modèle.

In [None]:
#feature_importances = randm.best_estimator_.feature_importances_
#list_feat = list(feature_importances)
#list_col = list(X_train.columns)
#sorted([t for t in zip(list_feat, list_col)], key=lambda t: t[0], reverse=True)

In [None]:
# Pour sauvegarder le modèle
#joblib.dump(randm.best_estimator_, "my_model_2021-08-05.pkl")
# Pour le réutiliser
#my_model_loaded = joblib.load("my_model.pkl")

In [None]:
print("Done!!")