## Final Project - Road Accidents in France in 2019
## N°1 / SKLEARN

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 
import seaborn as sns
import joblib

import sklearn
from sklearn import svm
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import balanced_accuracy_score

from imblearn.ensemble import BalancedRandomForestClassifier, BalancedBaggingClassifier, RUSBoostClassifier, EasyEnsembleClassifier


### Import des fichiers de données

In [2]:
acc = pd.read_csv('../data/victime_clean_dummies.csv')

In [3]:
acc.shape

(130901, 141)

## Machine Learning

In [4]:
from sklearn.model_selection import train_test_split
y = acc.pop('grav')
X = acc

#### Choix et Entraînement de divers modèles initialisés par défaut

Pour évaluer nos modèles, nous utiliserons le score de Balanced Accuracy ("exactitude pondérée") plutôt que le score d'Accuracy, car nos classes cibles sont de tailles respectives déséquilibrées. 

On essaie plusieurs modèles de classification, initialisés par défaut.
https://scikit-learn.org/stable/modules/multiclass.html
Dans un premier temps, on essaie des classifieurs intrinsèquement multi-classes : RF, ETC, GB. 

In [5]:
#Premier modèle : RANDOM FOREST
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
rf1 = RandomForestClassifier()
# On lance l'apprentissage :
rf1.fit(X_train, y_train)

RandomForestClassifier()

In [6]:
rf1_pred_train = rf1.predict(X_train)
rf1_train_score = balanced_accuracy_score(y_train, rf1_pred_train)
print("Final Balanced Accuracy Score on Train Set =", round(rf1_train_score,3))

Final Balanced Accuracy Score on Train Set = 0.997


In [7]:
rf1_pred_test = rf1.predict(X_test)
rf1_test_score = balanced_accuracy_score(y_test, rf1_pred_test)
print("Final Balanced Accuracy Score on Test Set =", round(rf1_test_score,3))

Final Balanced Accuracy Score on Test Set = 0.484


In [8]:
# Modèle : EXTRA-TREES
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
etc1 = ExtraTreesClassifier()
etc1.fit(X_train, y_train)

ExtraTreesClassifier()

In [9]:
etc1_pred_train = etc1.predict(X_train)
etc1_train_score = balanced_accuracy_score(y_train, etc1_pred_train)
print("Final Balanced Accuracy Score on Train Set =", round(etc1_train_score,3))

Final Balanced Accuracy Score on Train Set = 0.998


In [10]:
etc1_pred_test = etc1.predict(X_test)
etc1_test_score = balanced_accuracy_score(y_test, etc1_pred_test)
print("Final Balanced Accuracy Score on Test Set =", round(etc1_test_score,3))

Final Balanced Accuracy Score on Test Set = 0.474


In [11]:
# Modèle : GRADIENT BOOSTING
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
gbc1 = GradientBoostingClassifier()
gbc1.fit(X_train, y_train)

GradientBoostingClassifier()

In [12]:
gbc1_pred_train = gbc1.predict(X_train)
gbc1_train_score = balanced_accuracy_score(y_train, gbc1_pred_train)
print("Final Balanced Accuracy Score on Train Set =", round(gbc1_train_score,3))

Final Balanced Accuracy Score on Train Set = 0.453


In [13]:
gbc1_pred_test = gbc1.predict(X_test)
gbc1_test_score = balanced_accuracy_score(y_test, gbc1_pred_test)
print("Final Balanced Accuracy Score on Test Set =", round(gbc1_test_score,3))

Final Balanced Accuracy Score on Test Set = 0.446


In [14]:
# Modèle : LOGISTIC REGRESSION (MULTI-CLASS)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
logreg1 = LogisticRegression(multi_class= "multinomial", max_iter=10000)
logreg1.fit(X_train, y_train)

LogisticRegression(max_iter=10000, multi_class='multinomial')

In [15]:
logreg1_pred_train = logreg1.predict(X_train)
logreg1_train_score = balanced_accuracy_score(y_train, logreg1_pred_train)
print("Final Balanced Accuracy Score on Train Set =", round(logreg1_train_score,3))

Final Balanced Accuracy Score on Train Set = 0.457


In [16]:
logreg1_pred_test = logreg1.predict(X_test)
logreg1_test_score = balanced_accuracy_score(y_test, logreg1_pred_test)
print("Final Balanced Accuracy Score on Test Set =", round(logreg1_test_score,3))

Final Balanced Accuracy Score on Test Set = 0.458


In [None]:
# Modèle : MULTI-LAYER PERCEPTRON
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
mlp = MLPClassifier(()
mlp.fit(X_train, y_train)

In [None]:
mlp_pred_train = mlp.predict(X_train)
mlp_train_score = balanced_accuracy_score(y_train, mlp_pred_train)
print("Final Balanced Accuracy Score on Train Set =", round(mlp_train_score,3))

In [None]:
mlp_pred_test = mlp.predict(X_test)
mlp_test_score = balanced_accuracy_score(y_test, mlp_pred_test)
print("Final Balanced Accuracy Score on Test Set =", round(mlp_test_score,3))

Multiclass as One-Vs-One

In [None]:
# Modèle : SVM SVC
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
svmsvc = svm.SVC()
svmsvc.fit(X_train, y_train)

In [None]:
svmsvc_pred_train = svmsvc.predict(X_train)
svmsvc_train_score = balanced_accuracy_score(y_train, svmsvc_pred_train)
print("Final Balanced Accuracy Score on Train Set =", round(svmsvc_train_score,3))

In [None]:
svmsvc_pred_test = svmsvc.predict(X_test)
svmsvc_test_score = balanced_accuracy_score(y_test, svmsvc_pred_test)
print("Final Balanced Accuracy Score on Test Set =", round(svmsvc_test_score,3))

Multiclass as One-Vs-The-Rest

In [None]:
# Modèle : Multiclass as One-Vs-The-Rest / PERCEPTRON
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
pclf = Perceptron(()
pclf.fit(X_train, y_train)

In [None]:
pclf_pred_train = pclf.predict(X_train)
pclf_train_score = balanced_accuracy_score(y_train, pclf_pred_train)
print("Final Balanced Accuracy Score on Train Set =", round(pclf_train_score,3))

In [None]:
pclf_pred_test = pclf.predict(X_test)
pclf_test_score = balanced_accuracy_score(y_test, pclf_pred_test)
print("Final Balanced Accuracy Score on Test Set =", round(pclf_test_score,3))

**Conclusions :**
Avec un score très élevé sur le train mais inférieur à 50% sur le test, les modèles "Random Forest" et "Extra Trees Classifier" sont overfit.
Avec un score inférieur à 50% sur le jeu d'entraînement, les modèles "Gradient Boosting" et "Logistic Regression Multi-Class" sont décevants. On est dans un cas d'underfitting.

Dans un second fichier, on utilisera la solution proposée par le module imblearn.ensemble : des modèles d'ensembles destinés aux classes déséquilibrées.

In [None]:
print('finished! see next file')