## Final Project - Road Accidents in France in 2019
## N°1 / "SciKit-Learn'

In [1]:
import os
import pandas as pd
import numpy as np
import joblib
import sklearn
from sklearn import model_selection
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.svm import SVC
from sklearn.metrics import balanced_accuracy_score, accuracy_score, classification_report
import catboost
from catboost import CatBoostClassifier
import xgboost 
from xgboost import XGBClassifier
import lightgbm
from lightgbm import LGBMClassifier

### Import des fichiers de données

In [2]:
os.getcwd()

'C:\\Users\\mailb\\Documents_Administrateur\\Ironhack\\DA_BOOTCAMP\\3.Labs\\dataV2-labs\\module-3\\Final-Project'

In [3]:
acc = pd.read_csv('../Final-Project/data/victime_clean_dummies.csv')

In [4]:
acc.shape

(130901, 141)

## Machine Learning

In [5]:
from sklearn.model_selection import train_test_split
y = acc.pop('grav')
X = acc

In [6]:
# Pour l'évaluation des modèles, nous utiliserons le score de Balanced Accuracy ("exactitude pondérée") 
# plutôt que le score d'Accuracy, car nos classes cibles sont de tailles respectives déséquilibrées. 

def display_scores(scores):
    meansc = round(scores.mean(),3)
    print(meansc)

#### Choix et Entraînement de divers modèles initialisés par défaut

On essaie plusieurs modèles de classification, initialisés par défaut.
https://scikit-learn.org/stable/modules/multiclass.html

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

**Inherently Multiclass**

In [8]:
#Premier modèle : RANDOM FOREST
rf1 = RandomForestClassifier()

In [9]:
# Cross-validation (Le jeu d'apprentissage est scindé en "sous-jeux", et l'apprentissage a lieu n fois d'affilée sur n-1 sous-jeux différents avec une évaluation sur le nème sous-jeu ("pli de validation").)
rf1_bacc = cross_val_score(rf1, X_train, y_train, scoring='balanced_accuracy', cv=5)
display_scores(rf1_bacc)

0.473


In [10]:
rf1_acc = cross_val_score(rf1, X_train, y_train, scoring='accuracy', cv=5)
display_scores(rf1_acc)

0.665


In [11]:
rf1.fit(X_train, y_train)

RandomForestClassifier()

In [12]:
[(X.columns[i], rf1.feature_importances_[i]) for i in np.argsort(rf1.feature_importances_)[::-1]][:15]

[('type_veh_7', 0.03301963657582155),
 ('vitesse_autorisee', 0.03063209684341509),
 ('nb_voies', 0.030480655838332368),
 ('sens_circu_2', 0.019635091687577374),
 ('sexe_2', 0.019335973430766724),
 ('num_veh_B01', 0.018645806655334603),
 ('obst_fixe_IRLVT', 0.0162515647888668),
 ('dep_Sud Est', 0.015891639075574712),
 ('type_usager_2', 0.01570314511767067),
 ('obst_mobile_IRLVT', 0.015486179297941365),
 ('obst_mobile_2', 0.014696369622052317),
 ('type_veh_33', 0.014612567281410398),
 ('declivite_2', 0.014173262972065299),
 ('type_veh_10', 0.013975955985319845),
 ('an_nais_Moins de 35 ans', 0.0139022146161148)]

In [13]:
# Modèle : EXTRA-TREES
etc1 = ExtraTreesClassifier()

In [14]:
# Cross-evaluation 
etc1_bacc = cross_val_score(etc1, X_train, y_train, scoring='balanced_accuracy', cv=5)
display_scores(etc1_bacc)

0.476


In [15]:
# Cross-evaluation 
etc1_acc = cross_val_score(etc1, X_train, y_train, scoring='accuracy', cv=5)
display_scores(etc1_acc)

0.659


In [16]:
etc1.fit(X_train, y_train)

ExtraTreesClassifier()

In [17]:
[(X.columns[i], etc1.feature_importances_[i]) for i in np.argsort(etc1.feature_importances_)[::-1]][:15]

[('type_veh_7', 0.030869811880863282),
 ('vitesse_autorisee', 0.02399364945806883),
 ('nb_voies', 0.023729800574725516),
 ('sens_circu_2', 0.0206336392603733),
 ('sexe_2', 0.019370044840830467),
 ('num_veh_B01', 0.01813940875340833),
 ('obst_mobile_IRLVT', 0.016849235359733653),
 ('obst_fixe_IRLVT', 0.016758927955151267),
 ('type_veh_33', 0.016695703416682334),
 ('type_usager_2', 0.016332573015431423),
 ('dep_Sud Est', 0.015749953186197353),
 ('declivite_2', 0.014571885614955083),
 ('agglo_2', 0.014389395905987026),
 ('obst_mobile_2', 0.013861692429197148),
 ('type_veh_10', 0.013409542563586474)]

In [18]:
# Modèle : GRADIENT BOOSTING
gbc11 = GradientBoostingClassifier()

In [19]:
# Cross-validation 
gbc11_bacc = cross_val_score(gbc11, X_train, y_train, scoring='balanced_accuracy', cv=5)
display_scores(gbc11_bacc)

0.448


In [20]:
# Cross-validation
gbc11_acc = cross_val_score(gbc11, X_train, y_train, scoring='accuracy', cv=5)
display_scores(gbc11_acc)

0.653


In [21]:
gbc11.fit(X_train, y_train)

GradientBoostingClassifier()

In [22]:
[(X.columns[i], gbc11.feature_importances_[i]) for i in np.argsort(gbc11.feature_importances_)[::-1]][:15]

[('type_veh_7', 0.1234594715105385),
 ('type_veh_10', 0.09966840837786956),
 ('obst_fixe_IRLVT', 0.07434117432815562),
 ('place_veh_10', 0.06388817690928968),
 ('type_veh_autre', 0.04921001919162119),
 ('obst_mobile_IRLVT', 0.0459610008675671),
 ('obst_mobile_2', 0.04571566008098366),
 ('agglo_2', 0.04536871352161132),
 ('pieton_seul_IRLVT', 0.040245049080619204),
 ('type_usager_3', 0.03985143423099702),
 ('num_veh_B01', 0.03004666143306421),
 ('type_usager_2', 0.025781138781970633),
 ('loc_pieton_IRLVT', 0.02531769038563318),
 ('sexe_2', 0.024746988544572295),
 ('type_veh_33', 0.020852317640815722)]

In [23]:
# Modèle : XGBoost
xgb1 = XGBClassifier()

In [24]:
# Cross-validation
xgb1_bacc = cross_val_score(xgb1, X_train, y_train, scoring='balanced_accuracy', cv=5)
display_scores(xgb1_bacc)



















0.486


In [25]:
# Cross-validation
xgb1_acc = cross_val_score(xgb1, X_train, y_train, scoring='accuracy', cv=5)
display_scores(xgb1_acc)



















0.67


In [26]:
# Modèle : LigthGBM
lgbm1 = LGBMClassifier()

In [27]:
# Cross-validation
lgbm1_bacc = cross_val_score(lgbm1, X_train, y_train, scoring='balanced_accuracy', cv=5)
display_scores(lgbm1_bacc)

0.479


In [28]:
# Cross-validation
lgbm1_acc = cross_val_score(lgbm1, X_train, y_train, scoring='accuracy', cv=5)
display_scores(lgbm1_acc)

0.668


In [29]:
# Modèle : CatBoost
cat1 = CatBoostClassifier(verbose=0, early_stopping_rounds=50)

In [30]:
# Cross-validation
cat1_bacc = cross_val_score(cat1, X_train, y_train, scoring='balanced_accuracy', cv=5, error_score='raise', verbose=0)
display_scores(cat1_bacc)

0.485


In [31]:
# Cross-validation
cat1_acc = cross_val_score(cat1, X_train, y_train, scoring='accuracy', cv=5, error_score='raise', verbose=0)
display_scores(cat1_acc)

0.673


In [32]:
# Modèle : LOGISTIC REGRESSION (MULTI-CLASS)
logreg1 = LogisticRegression(multi_class= "multinomial", max_iter=10000)

In [33]:
# Cross-validation
logreg1_bacc = cross_val_score(logreg1, X_train, y_train, scoring='balanced_accuracy', cv=5)
display_scores(logreg1_bacc)

0.456


In [34]:
# Cross-validation
logreg1_acc = cross_val_score(logreg1, X_train, y_train, scoring='accuracy', cv=5)
display_scores(logreg1_acc)

0.652


**Multiclass as One-Vs-The-Rest**

In [35]:
# Modèle : Multiclass as One-Vs-The-Rest / PERCEPTRON
pclf = Perceptron()

In [36]:
# Cross-validation
pclf_bacc = cross_val_score(pclf, X_train, y_train, scoring='balanced_accuracy', cv=5)
display_scores(pclf_bacc)

0.349


In [37]:
# Cross-validation
pclf_acc = cross_val_score(pclf, X_train, y_train, scoring='accuracy', cv=5)
display_scores(pclf_acc)

0.55


Dans un second fichier, on utilisera la solution proposée par le module imblearn.ensemble : des modèles d'ensembles destinés aux classes déséquilibrées.

#### RESUME

In [39]:
scordict = { "Model Name":["Random Forest", 'Extra-Trees', 'Gradient Boosting', 'XGBoost', 'LightGBM', 'CatBoost','Logistic Regression', 'Perceptron'],
            "Mean Cross-Val Balanced Accuracy": [rf1_bacc.mean(), etc1_bacc.mean(), gbc11_bacc.mean(), xgb1_bacc.mean(), lgbm1_bacc.mean(), cat1_bacc.mean(), logreg1_bacc.mean(), pclf_bacc.mean()],
            "Mean Cross-Val Accuracy": [rf1_acc.mean(), etc1_acc.mean(), gbc11_acc.mean(), xgb1_acc.mean(), lgbm1_acc.mean(), cat1_acc.mean(), logreg1_acc.mean(), pclf_acc.mean()]
           }
df_scores = pd.DataFrame(scordict)
df_scores.sort_values(by='Mean Cross-Val Balanced Accuracy', ascending=False).style.hide_index()

Model Name,Mean Cross-Val Balanced Accuracy,Mean Cross-Val Accuracy
XGBoost,0.485788,0.670149
CatBoost,0.484618,0.673348
LightGBM,0.478885,0.668478
Extra-Trees,0.475785,0.65889
Random Forest,0.47281,0.665346
Logistic Regression,0.456103,0.651881
Gradient Boosting,0.448286,0.653104
Perceptron,0.34882,0.549656


#### CONCLUSION
Le modèle "XGBoost" initialisé par défaut est le plus performant, mais reste insufisant avec un score inférieur à 50% (phénomène d'"underfitting").
Nous allons tester dans un second temps les algorithmes spécifiquement destinées aux classifications déséquilibrées.

In [40]:
print('finished! See next file.')

finished! See next file.
