## Final Project - Road Accidents in France in 2019
## N°2 / Module 'Imbalanced-Learn'

In [None]:
import os
import pandas as pd
import numpy as np
import joblib
import sklearn
from sklearn import model_selection
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_validate
from imblearn.ensemble import BalancedRandomForestClassifier, BalancedBaggingClassifier, RUSBoostClassifier, EasyEnsembleClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import balanced_accuracy_score, accuracy_score, classification_report

### Import des fichiers de données

In [2]:
os.getcwd()

'C:\\Users\\mailb\\Documents_Administrateur\\Ironhack\\DA_BOOTCAMP\\3.Labs\\dataV2-labs\\module-3\\Final-Project'

In [3]:
acc = pd.read_csv('../Final-Project/data/victml_clean_dummies_2CLASSES.csv')

In [4]:
acc.shape

(130901, 141)

## Machine Learning

In [5]:
from sklearn.model_selection import train_test_split
y = acc.pop('grav')
X = acc

In [None]:
# Pour l'évaluation des modèles, nous utiliserons le score de Balanced Accuracy ("exactitude pondérée") 
# plutôt que le score d'Accuracy, car nos classes cibles sont de tailles respectives déséquilibrées. 

def display_scores(scores):
    meansc = round(scores.mean(),3)
    print(meansc)

#### Choix et Entraînement de divers modèles initialisés par défaut

On essaie plusieurs modèles de classification, initialisés par défaut.
https://scikit-learn.org/stable/modules/multiclass.html

Dans un second temps, on utilise la solution proposée par le module imblearn.ensemble : des modèles d'ensembles entraînés à chaque étape sur un échantillon rééquilibré automatiquement entre les différentes classes. Ce qui permet de se passer de méthodes de rééchantillonnage avant l'entraînement.

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [8]:
# Modèle : BALANCED FOREST
brf1 = BalancedRandomForestClassifier()

In [9]:
brf1_bacc = cross_val_score(brf1, X_train, y_train, scoring='balanced_accuracy', cv=5)
display_scores(brf1_bacc)

Mean score: 0.785
Standard deviation: 0.00421


In [10]:
brf1_acc = cross_val_score(brf1, X_train, y_train, scoring='accuracy', cv=5)
display_scores(brf1_acc)

Mean score: 0.76
Standard deviation: 0.00357


In [11]:
brf1.fit(X_train, y_train)

BalancedRandomForestClassifier()

In [72]:
[(X.columns[i], brf1.feature_importances_[i]) for i in np.argsort(brf1.feature_importances_)[::-1]][:15]

[('vitesse_autorisee', 0.03939756288207424),
 ('type_veh_7', 0.035636661092275854),
 ('nb_voies', 0.03181377601076886),
 ('obst_mobile_IRLVT', 0.022961019413465265),
 ('type_route_3', 0.02271141628758067),
 ('agglo_2', 0.02241197372056967),
 ('type_veh_33', 0.021362405037182848),
 ('obst_fixe_IRLVT', 0.019504183556215842),
 ('type_circu_2', 0.018938820666266544),
 ('sexe_2', 0.01650735120242056),
 ('sens_circu_2', 0.016078501905895),
 ('an_nais_Moins de 35 ans', 0.015704200515267587),
 ('type_route_4', 0.015087112706916582),
 ('dep_Sud Est', 0.014993629475947814),
 ('num_veh_B01', 0.014353678402190635)]

In [13]:
# Modèle : BALANCED BAGGING
bbc1 = BalancedBaggingClassifier()

In [14]:
bbc1_bacc = cross_val_score(bbc1, X_train, y_train, scoring='balanced_accuracy', cv=5)
display_scores(bbc1_bacc)

Mean score: 0.765
Standard deviation: 0.00421


In [15]:
bbc1_acc = cross_val_score(bbc1, X_train, y_train, scoring='accuracy', cv=5)
display_scores(bbc1_acc)

Mean score: 0.765
Standard deviation: 0.00285


In [16]:
bbc1.fit(X_train, y_train)

BalancedBaggingClassifier()

In [73]:
forest = ExtraTreesClassifier(n_estimators=bbc1.n_estimators,
                              random_state=bbc1.random_state)

forest.fit(X_train, y_train)
bbc1.feature_importances_ = forest.feature_importances_
[(X.columns[i], bbc1.feature_importances_[i]) for i in np.argsort(bbc1.feature_importances_)[::-1]][:15]

[('vitesse_autorisee', 0.02611764427264131),
 ('type_veh_33', 0.024229403880799253),
 ('agglo_2', 0.021827297132591964),
 ('sexe_2', 0.02142362966010733),
 ('type_veh_7', 0.021385365593542836),
 ('sens_circu_2', 0.01969383352975583),
 ('nb_voies', 0.019556313710346877),
 ('an_nais_Moins de 35 ans', 0.01847014211364689),
 ('type_route_3', 0.01846318037156607),
 ('type_circu_2', 0.017926843214778405),
 ('obst_fixe_IRLVT', 0.01768272218956795),
 ('an_nais_36-75 ans', 0.01762115306905414),
 ('type_usager_2', 0.016693111490846534),
 ('dep_Sud Est', 0.01559246974652756),
 ('place_veh_2', 0.015145032161745411)]

In [28]:
# Modèle : RUS BOOST
rb1 = RUSBoostClassifier()

In [30]:
rb1_bacc = cross_val_score(rb1, X_train, y_train, scoring='balanced_accuracy', cv=5)
display_scores(rb1_bacc)

Mean score: 0.763
Standard deviation: 0.00289


In [31]:
rb1_acc = cross_val_score(rb1, X_train, y_train, scoring='accuracy', cv=5)
display_scores(rb1_acc)

Mean score: 0.754
Standard deviation: 0.00264


In [32]:
rb1.fit(X_train, y_train)

RUSBoostClassifier()

In [75]:
[(X.columns[i], rb1.feature_importances_[i]) for i in np.argsort(rb1.feature_importances_)[::-1]][:15]

[('type_veh_7', 0.06),
 ('dep_Nord Est', 0.06),
 ('type_route_3', 0.04),
 ('dep_Nord-Ouest', 0.04),
 ('dep_Sud Ouest', 0.04),
 ('place_veh_10', 0.04),
 ('dep_Sud Est', 0.04),
 ('vitesse_autorisee', 0.04),
 ('obst_mobile_2', 0.04),
 ('choc_4', 0.02),
 ('obst_fixe_6', 0.02),
 ('obst_mobile_IRLVT', 0.02),
 ('agglo_2', 0.02),
 ('type_veh_31', 0.02),
 ('choc_2', 0.02)]

In [34]:
# Modèle : EASY ENSEMBLE
ee1 = EasyEnsembleClassifier()

In [35]:
ee1_bacc = cross_val_score(ee1, X_train, y_train, scoring='balanced_accuracy', cv=5)
display_scores(ee1_bacc)

Mean score: 0.764
Standard deviation: 0.00405


In [36]:
ee1_acc = cross_val_score(ee1, X_train, y_train, scoring='accuracy', cv=5)
display_scores(ee1_acc)

Mean score: 0.755
Standard deviation: 0.00225


In [37]:
ee1.fit(X_train, y_train)

EasyEnsembleClassifier()

In [74]:
forest = ExtraTreesClassifier(n_estimators=ee1.n_estimators,
                              random_state=ee1.random_state)

forest.fit(X_train, y_train)
ee1.feature_importances_ = forest.feature_importances_
[(X.columns[i], ee1.feature_importances_[i]) for i in np.argsort(ee1.feature_importances_)[::-1]][:15]

[('vitesse_autorisee', 0.024961130263324492),
 ('agglo_2', 0.021883996222515002),
 ('sexe_2', 0.0212536375293672),
 ('type_route_3', 0.02102554193281265),
 ('nb_voies', 0.02065761420262817),
 ('sens_circu_2', 0.020399401205653842),
 ('type_veh_7', 0.020346456657038735),
 ('type_veh_33', 0.020292122644312818),
 ('an_nais_Moins de 35 ans', 0.0182735977235779),
 ('obst_fixe_IRLVT', 0.01801493666916875),
 ('type_circu_2', 0.01776367967087498),
 ('an_nais_36-75 ans', 0.017360114682035552),
 ('type_usager_2', 0.016697630379306515),
 ('dep_Sud Est', 0.015806308526349656),
 ('place_veh_2', 0.015056172358565215)]

#### RESUME

In [40]:
scordict = { "Model Name":["BalancedRandomForestClassifier", 'BalancedBaggingClassifier', 'RUSBoostClassifier', 'EasyEnsembleClassifier'],
            "Mean Cross-Val Balanced Accuracy": [brf1_bacc.mean(), bbc1_bacc.mean(), rb1_bacc.mean(), ee1_bacc.mean()],
            "Mean Cross-Val Accuracy": [brf1_acc.mean(), bbc1_acc.mean(), rb1_acc.mean(), ee1_acc.mean()]
           }
df_scores = pd.DataFrame(scordict)
df_scores.sort_values(by='Mean Cross-Val Balanced Accuracy',ascending=False).style.hide_index()

Model Name,Mean Cross-Val Balanced Accuracy,Mean Cross-Val Accuracy
BalancedRandomForestClassifier,0.78536,0.760294
BalancedBaggingClassifier,0.764916,0.765336
EasyEnsembleClassifier,0.763927,0.75508
RUSBoostClassifier,0.762812,0.753829


#### CONCLUSION
Avec un score de 78,5% sur le jeu d'entraînement, le modèle "Balanced Forest Classifier", initialisé par défaut, est le plus prometteur. A la prochaine étape, nous procéderons au réglage des hyper-paramètres.

In [42]:
print('finished! See next file.')

finished! See next file.
