## Imports

In [9]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import math
from sklearn import metrics

In [49]:
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import cross_val_score

In [11]:
df_lapil42 = pd.read_csv('data/LAPIL42_final_selection.csv')

## Tri des variables

Pour être efficace un modèle bayésien ne doit reposer sur des variables explicatives indépendantes. Vérifions cette hypothèse, et supprimons les variables qui ne le sont pas.

In [24]:
data = df_lapil42.drop(['date', 'congestion+30min'], axis = 1)
data.corr(method = 'spearman')

Unnamed: 0,speed,hour,TV/h,Dayofweek,PL/h,Vitesse du vent moyen 10 mn,VL/h,Variation de pression en 3 heures,Etat du sol,Température,Température minimale du sol sur 12 heures
speed,1.0,0.064036,-0.364981,0.40382,-0.578953,-0.089506,-0.299871,-0.146394,-0.023157,-0.173294,0.046123
hour,0.064036,1.0,0.439234,0.0,0.102023,0.065962,0.485779,0.200906,-0.316235,0.229533,0.155208
TV/h,-0.364981,0.439234,1.0,-0.038127,0.730651,0.309847,0.986358,0.187637,-0.312847,0.540111,0.005789
Dayofweek,0.40382,0.0,-0.038127,1.0,-0.386415,0.125439,0.032691,-0.017733,-0.033602,-0.00238,-0.11533
PL/h,-0.578953,0.102023,0.730651,-0.386415,1.0,0.252208,0.628168,0.085358,-0.199844,0.397695,-0.08273
Vitesse du vent moyen 10 mn,-0.089506,0.065962,0.309847,0.125439,0.252208,1.0,0.291816,0.157771,-0.118781,0.532747,0.239525
VL/h,-0.299871,0.485779,0.986358,0.032691,0.628168,0.291816,1.0,0.193759,-0.321655,0.537074,0.027295
Variation de pression en 3 heures,-0.146394,0.200906,0.187637,-0.017733,0.085358,0.157771,0.193759,1.0,0.18305,-0.044886,0.09436
Etat du sol,-0.023157,-0.316235,-0.312847,-0.033602,-0.199844,-0.118781,-0.321655,0.18305,1.0,-0.483623,0.008462
Température,-0.173294,0.229533,0.540111,-0.00238,0.397695,0.532747,0.537074,-0.044886,-0.483623,1.0,0.333007


In [27]:
cor = data.corr(method = 'spearman')

In [32]:
columns = np.full((cor.shape[0],), True, dtype=bool)
print(columns)
for i in range(cor.shape[0]):
    for j in range(i+1, cor.shape[0]):
        if cor.iloc[i,j] >= 0.45:
            print(i,j,cor.iloc[i,j])
            if columns[j]:
                columns[j] = False
selected_columns = df_lapil42.drop(['date', 'congestion+30min'], axis = 1).columns[columns]
#data = polynomial_df[selected_columns]
selected_columns

[ True  True  True  True  True  True  True  True  True  True  True]
1 6 0.48577925178411707
2 4 0.7306514112477317
2 6 0.9863576138252252
2 9 0.5401114904579603
4 6 0.6281681993854236
5 9 0.5327472126561941
6 9 0.5370740295596701


Index(['speed', 'hour', 'TV/h', 'Dayofweek', 'Vitesse du vent moyen 10 mn',
       'Variation de pression en 3 heures', 'Etat du sol',
       'Température minimale du sol sur 12 heures'],
      dtype='object')

In [35]:
selected_columns

Index(['speed', 'hour', 'TV/h', 'Dayofweek', 'Vitesse du vent moyen 10 mn',
       'Variation de pression en 3 heures', 'Etat du sol',
       'Température minimale du sol sur 12 heures'],
      dtype='object')

In [36]:
Df = df_lapil42[selected_columns]

## Echantillonage des données

In [38]:
def split_vals(df, n): 
    return df[:n].copy(), df[n:].copy()

Df = df_lapil42[selected_columns]
y = df_lapil42['congestion+30min']

n_total = len(df_lapil42)
n_test = 1500 #arbitrary
n_train = n_total - n_test

X_train, X_valid = split_vals(Df, n_train)
y_train, y_valid = split_vals(y, n_train)

print('Number of full training data points: X = {}, y = {}'.format(X_train.shape, y_train.shape))
print('Number of validation data points: X = {}, y = {}'.format(X_valid.shape, y_valid.shape))

Number of full training data points: X = (5220, 8), y = (5220,)
Number of validation data points: X = (1500, 8), y = (1500,)


## Modèle Gaussien

In [41]:
def classifier_metrics (y_test, y_preds, average='weighted'):
    """Return Accuracy, Recall, Precision and F-1 score. 
    Average can take two arguments : macro or weighted """

    acc = metrics.accuracy_score(y_test, y_preds)
    rec = metrics.recall_score(y_test, y_preds, sample_weight = None)
    prc = metrics.precision_score(y_test, y_preds, sample_weight = None)
    f1  = metrics.f1_score(y_test, y_preds, sample_weight = None)

    print('Accuracy : {:.2f}%'.format(acc*100))
    print('Recall : {:.2f}%'.format(rec*100))
    print('Precision : {:.2f}%'.format(prc*100))
    print('F1-score : {:.2f}%'.format(f1*100))

In [39]:
#instanciation
model_Gaussian = GaussianNB()
#training
model_Gaussian.fit(X_train, y_train)

GaussianNB()

In [43]:
classifier_metrics(y_train, model_Gaussian.predict(X_train))

Accuracy : 89.54%
Recall : 62.14%
Precision : 86.55%
F1-score : 72.34%


In [46]:
mean_score = cross_val_score(model_Gaussian, X_train, y_train, scoring="f1", cv = 7).mean()
mean_score

0.7083374002225532

## Modèle Bernouilli

In [51]:
model_Bernouilli = BernoulliNB()
model_Bernouilli = model_Bernouilli.fit(X_train, y_train)

In [52]:
classifier_metrics(y_train, model_Bernouilli.predict(X_train))

Accuracy : 79.35%
Recall : 7.75%
Precision : 83.18%
F1-score : 14.17%


In [53]:
mean_score = cross_val_score(model_Bernouilli, X_train, y_train, scoring="f1", cv = 7).mean()
mean_score

0.036989887469984155

Le modèle Gaussien semble prometteur. Essayons d'affiner le choix des variables.

## Choix des variables sur le modèle gaussien

In [67]:
model_Gaussian = GaussianNB()
l = 0
global_F1 = dict()

for k in range(0,95,2):
    columns = np.full((cor.shape[0],), True, dtype=bool)
    #print(columns)
    for i in range(cor.shape[0]):
        for j in range(i+1, cor.shape[0]):
            if cor.iloc[i,j] >= k/100:
                #print(i,j,cor.iloc[i,j])
                if columns[j]:
                    columns[j] = False
    selected_columns = df_lapil42.drop(['date', 'congestion+30min'], axis = 1).columns[columns]
    
    if len(selected_columns) > l:
        l = len(selected_columns)
    
        Df = df_lapil42[selected_columns]
        y = df_lapil42['congestion+30min']

        n_total = len(df_lapil42)
        n_test = 1500 #arbitrary
        n_train = n_total - n_test

        X_train, X_valid = split_vals(Df, n_train)

        model_Gaussian.fit(X_train, y_train)
        mean_score = cross_val_score(model_Gaussian, X_train, y_train, scoring="f1", cv = 7).mean()
        print(l, mean_score, k/100)
        print('  ')
        global_F1[l] = mean_score

global_F1

1 0.7041922667252126 0.0
  
2 0.7054769792673422 0.08
  
3 0.703101832747822 0.2
  
4 0.7028159224880357 0.22
  
5 0.702614875310654 0.32
  
6 0.7006868372149399 0.34
  
7 0.7056873977436443 0.42
  
8 0.7083374002225532 0.44
  
9 0.7122921358884008 0.56
  
10 0.7479538294555576 0.74
  


{1: 0.7041922667252126,
 2: 0.7054769792673422,
 3: 0.703101832747822,
 4: 0.7028159224880357,
 5: 0.702614875310654,
 6: 0.7006868372149399,
 7: 0.7056873977436443,
 8: 0.7083374002225532,
 9: 0.7122921358884008,
 10: 0.7479538294555576}

L'amélioration des résultats semble minime. Par soucis d'explicatbilité nous allons donc garder seulement deux variables et un seuil de corrélation de 0.08.

## Conclusion

Le meilleur modèle est le modèle Gaussien couplé au jeu de données ayant pour seules variables explicatives la vitesse et l'heure.