Ce notebook regroupe les modèles de forêts alétoires permettant de prédire la congestion à t+30 minutes.

# Imports

In [1]:
# base modules
import os
import sys
import copy
import logging
import pandas as pd 

# custom module
from emlyon_module.imports import *
from emlyon_module.structured import *

# for manipulating data
from pandas_summary import DataFrameSummary
#!pip install dill
import dill

# for Machine Learning
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, BaggingRegressor
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, plot_tree
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.inspection import permutation_importance
from scipy.cluster import hierarchy

# for visualization
from IPython.display import display
from matplotlib import pyplot as plt
#!pip install -U plotnine
from plotnine import ggplot, aes
from plotnine.stats import stat_smooth
from pdpbox import pdp
# plotly
# seaborn
# altair

# Pre processing

In [2]:
df_lapil42 = pd.read_csv('/Users/hugo/neovya/Data/LAPIL42_final_selection.csv',parse_dates=['date'])

In [3]:
df_lapil42

Unnamed: 0,speed,hour,TV/h,Dayofweek,PL/h,Vitesse du vent moyen 10 mn,VL/h,Variation de pression en 3 heures,Etat du sol,Température,Température minimale du sol sur 12 heures,date,congestion+30min
0,87.0,0,434.0,0,141.0,2.700000,293.0,80.000000,1.000000,287.950000,286.450000,2019-09-23 00:00:00,0
1,87.0,0,434.0,0,141.0,2.672726,293.0,76.713049,0.997336,287.919104,286.424860,2019-09-23 00:06:00,1
2,87.0,0,520.0,0,170.0,2.642294,350.0,73.491523,0.995162,287.886406,286.398089,2019-09-23 00:12:00,0
3,87.0,0,520.0,0,170.0,2.608704,350.0,70.335422,0.993478,287.851905,286.369686,2019-09-23 00:18:00,0
4,86.0,0,390.0,0,120.0,2.571957,270.0,67.244747,0.992285,287.815602,286.339653,2019-09-23 00:24:00,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6715,88.0,23,520.0,6,110.0,2.652161,410.0,201.973674,1.001384,283.560409,282.890517,2019-10-20 23:30:00,1
6716,85.0,23,690.0,6,140.0,2.650523,550.0,197.604785,1.001217,283.553549,282.858177,2019-10-20 23:36:00,1
6717,90.0,23,590.0,6,120.0,2.644488,470.0,193.222973,1.000995,283.549078,282.827955,2019-10-20 23:42:00,1
6718,86.0,23,510.0,6,100.0,2.634056,410.0,188.828238,1.000718,283.546996,282.799852,2019-10-20 23:48:00,1


In [4]:
#drop des colonnes VL & PL car ces données ne sont pas présentes pour les autres capteurs
df_lapil42 = df_lapil42.drop(['PL/h', 'VL/h', 'date'], axis = 1)

In [5]:
#Transformation en catéogires - Utilisation de la fonction train_cats pour convertir les colonnes en catégories
train_cats(df_lapil42)

In [6]:
#Utilisation de Proc_df pour séparer la variable à prédire des autres colonnes
df, y, nas = proc_df(df_lapil42, 'congestion+30min')

In [7]:
#Split du jeu de données en conservant l'odre
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(df, y, test_size=0.33, shuffle=False)

print ("len x_train is", len(x_train))
print ("len x_test is", len(x_test))

len x_train is 4502
len x_test is 2218


In [9]:
#Vérification du nombre de classes dans le jeu de validation
unique, counts = np.unique(y_test, return_counts=True)
print (np.asarray((unique, counts)).T)

[[   0 1850]
 [   1  368]]


In [324]:
#Vérification du nombre de classes dans le jeu d'entrainement
unique, counts = np.unique(y_train, return_counts=True)
print (np.asarray((unique, counts)).T)

[[   0 3425]
 [   1 1077]]


# Premier modèle

In [11]:
def classifier_metrics (y_test, y_preds, average='weighted'):
    """Return Accuracy, Recall, Precision and F-1 score. 
    Average can take two arguments : macro or weighted """

    acc = metrics.accuracy_score(y_test, y_preds)
    rec = metrics.recall_score(y_test, y_preds, sample_weight = None)
    prc = metrics.precision_score(y_test, y_preds, sample_weight = None)
    f1  = metrics.f1_score(y_test, y_preds, sample_weight = None)

    print('Accuracy : {:.2f}%'.format(acc*100))
    print('Recall : {:.2f}%'.format(rec*100))
    print('Precision : {:.2f}%'.format(prc*100))
    print('F1-score : {:.2f}%'.format(f1*100))

## Premier modèle

Test d'un premier modèle naif

In [12]:
classifier = RandomForestClassifier(
    n_estimators = 20, 
    class_weight = None, # classifier specific
    criterion = 'gini',  # classifier specific
    max_depth = 3, 
    min_samples_split = 2, 
    min_samples_leaf = 1, 
    min_weight_fraction_leaf = 0.0, 
    max_features = 'auto', 
    max_leaf_nodes = None, 
    min_impurity_decrease = 0.0, 
    min_impurity_split = None, 
    ccp_alpha = 0.0, 
    random_state = 42, 
    bootstrap = True, 
    oob_score = True, 
    max_samples = None,
    warm_start = False, 
    n_jobs = -1, 
    verbose = 0, 
)

In [13]:
classifier.fit(x_train, y_train)

RandomForestClassifier(max_depth=3, n_estimators=20, n_jobs=-1, oob_score=True,
                       random_state=42)

Ayant une quantité de données assez limitée, nous utiliserons uniquement la cross validation pour vérifier nos résultats. Nous conservons le jeu de données test pour la phase finale de notre travail.

## Cross validation

In [14]:
from sklearn.metrics import classification_report, confusion_matrix

In [15]:
cv_score = cross_val_score(classifier, x_train, y_train, cv=10, scoring= 'f1')
cv_score

array([0.79803, 0.78378, 0.73514, 0.70659, 0.80357, 0.82126, 0.87685, 0.77157, 0.77143, 0.77885])

In [16]:
cv_score.mean()

0.7847058389111276

# Tuning hyperparameters

Pour améliorer le résultat, nous allons faire varier les différents paramètres de notre forêt aléatoire.  


Après différents tests à la main, nous avons utilisé la fonction Grid_search pour déterminer les meilleurs paramètres du modèle.

## Grid searchCV

Définition des paramètres à tester

In [17]:
param_grid = [
    {'n_estimators': [10, 30, 50, 70, 90, 110, 130, 150],
     'max_depth': [2,3,5,7,10,15, 20, None],
     'min_samples_split': [2,4,6,8,10,15,20,30,40, 50],
     'max_features': [0.25,0.4, 0.5, 0.7, 0.8],
     'max_samples': [0.2,0.4, 0.5,0.6,0.7,0.8,0.9, 1],
    },
]

model = RandomForestClassifier(
    random_state = 42,
    bootstrap = True,
)



In [18]:
tuned_model = GridSearchCV(
    estimator = model, 
    param_grid = param_grid, 
    scoring = None, # uses estimator's default score method
    n_jobs = -1, 
    refit = True, # keep a fitted version of the overall best model
    cv = 5, 
    return_train_score = True,
    verbose = 2,
)

In [307]:
tuned_model.fit(x_train, y_train)

Fitting 5 folds for each of 19200 candidates, totalling 96000 fits


GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42), n_jobs=-1,
             param_grid=[{'max_depth': [2, 3, 5, 7, 10, 15, 20, None],
                          'max_features': [0.25, 0.4, 0.5, 0.7, 0.8],
                          'max_samples': [0.2, 0.4, 0.6, 0.8, 0.9, 1],
                          'min_samples_split': [2, 4, 6, 8, 10, 15, 20, 30, 40,
                                                50],
                          'n_estimators': [10, 30, 50, 70, 90, 110, 130, 150]}],
             return_train_score=True, verbose=2)

In [19]:
tuned_model.best_params_

AttributeError: 'GridSearchCV' object has no attribute 'best_params_'

## Test des meilleurs hyperparamètres issus de la fonction Grid_search:

In [28]:
classifier = RandomForestClassifier(
    n_estimators = 10, 
    class_weight = "balanced", # Utilisation du paramètre "balanced" car nos classes sont inégales
    criterion = 'gini',  
    max_depth = 10, 
    min_samples_split = 40, 
    min_samples_leaf = 1, 
    min_weight_fraction_leaf = 0.0, 
    max_features = 0.5, 
    max_leaf_nodes = None, 
    min_impurity_decrease = 0.0, 
    min_impurity_split = None, 
    ccp_alpha = 0.0, 
    random_state = 42, 
    bootstrap = True, 
    oob_score = True, 
    max_samples = 0.2,
    warm_start = False, 
    n_jobs = -1, 
    verbose = 0, 
)

In [26]:
classifier.fit(x_train, y_train)
classifier_metrics(y_train, classifier.predict(x_train))

Accuracy : 92.80%
Recall : 87.65%
Precision : 83.17%
F1-score : 85.35%


In [27]:
cv_score = cross_val_score(classifier, x_train, y_train, cv=5, scoring= 'f1')
cv_score.mean()

0.814244441826836

Ce modèle est le meilleur modèle que nous ayons trouvé en faisant des forêts aléatoires avec un score f1 de 0.814 par cross validation.