In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import root_mean_squared_error

from sklearn.svm import SVR

%matplotlib inline


In [None]:
## Chargement des données

In [2]:
# Load the processed per-counter dataset; the first CSV column is used as the
# DataFrame index. `info()` prints the column dtypes and memory footprint.
df = pd.read_csv(
    "../../data/processed/compteur_name_dataset.csv",
    index_col=0,
)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1797907 entries, 12886 to 708012
Data columns (total 9 columns):
 #   Column              Dtype 
---  ------              ----- 
 0   Nom du compteur     object
 1   Comptage horaire    int64 
 2   Jour                int64 
 3   Mois                int64 
 4   Année               int64 
 5   Heure               int64 
 6   Jour_semaine        int64 
 7   Jour férié          int64 
 8   Vacances scolaires  int64 
dtypes: int64(8), object(1)
memory usage: 137.2+ MB


In [12]:
# Distinct counter names, in order of first appearance in `df`.
all_compteurs = df["Nom du compteur"].unique().tolist()

In [6]:
# Registry of fitted models, keyed by counter name (filled by svr_compteur).
compteurs_models3 = dict()

In [None]:
## Entraînement d'un modèle par compteur (l'entraînement d'un SVR sur le dataset complet étant trop coûteux)

In [9]:
def svr_compteur(name):
    """Train and evaluate a polynomial-kernel SVR for a single counter.

    Selects the rows of the global ``df`` whose "Nom du compteur" equals
    ``name``, one-hot encodes every remaining feature column, fits an SVR on
    an 80 % training split and stores the fitted model in the global
    ``compteurs_models3`` dict under ``name``. The scores are also printed.

    Args:
        name: Counter name, matched against the "Nom du compteur" column.

    Returns:
        Tuple ``(score_train, score_test, rmse)``: the value of ``clf.score``
        (R² for scikit-learn regressors) on the training split, the same on
        the test split, and the root mean squared error on the test split.

    Raises:
        ValueError: If no row of ``df`` matches ``name``.
    """
    df_t = df.loc[df["Nom du compteur"] == name].drop(columns=["Nom du compteur"])
    if df_t.empty:
        # Fail early with a readable message instead of an error raised from
        # inside scikit-learn on an empty array.
        raise ValueError(f"No rows found for compteur {name!r}")

    y = df_t["Comptage horaire"]
    X = df_t.drop(columns=["Comptage horaire"])

    # Split BEFORE encoding so the encoder is fitted on training rows only and
    # no information from the test split leaks into the preprocessing.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # handle_unknown="ignore": a category that appears only in the test split
    # is encoded as all zeros instead of raising at transform time.
    enc = OneHotEncoder(handle_unknown="ignore")
    X_train = enc.fit_transform(X_train)
    X_test = enc.transform(X_test)

    print(name)
    clf = SVR(cache_size=300, kernel='poly', gamma=0.5, C=1, degree=3)
    clf.fit(X_train, y_train)
    compteurs_models3[name] = clf

    score_train = clf.score(X_train, y_train)
    score_test = clf.score(X_test, y_test)
    y_pred = clf.predict(X_test)
    rmse = root_mean_squared_error(y_test, y_pred)

    print(f"   Score on train set: {score_train}")
    print(f"   Score on test set: {score_test}")
    print(f"   RMSE on test set: {rmse}")
    print()
    return score_train, score_test, rmse

In [13]:
# Running sums of the per-counter metrics; they are divided by the number of
# counters in a later cell to obtain the averages.
avg_train, avg_test, avg_rmse = 0, 0, 0
for name in all_compteurs:
    train, test, rmse = svr_compteur(name)
    avg_train, avg_test, avg_rmse = avg_train + train, avg_test + test, avg_rmse + rmse


106 avenue Denfert Rochereau NE-SO
   Score on train set: 0.38843868049434793
   Score on test set: 0.305332989989491
   RMSE on test set: 86.56660271832696

Quai d'Orsay O-E
   Score on train set: 0.8508632903004018
   Score on test set: 0.7307539173319146
   RMSE on test set: 72.1786202907005

Totem Cours la Reine O-E
   Score on train set: 0.8140205452585797
   Score on test set: 0.6497508054154302
   RMSE on test set: 70.82494343217911

132 rue Lecourbe NE-SO
   Score on train set: 0.5769211492454969
   Score on test set: 0.47658368042150234
   RMSE on test set: 42.46046731362929

Totem 64 Rue de Rivoli O-E
   Score on train set: 0.9022342532121537
   Score on test set: 0.8817130671639145
   RMSE on test set: 71.2371414583742

Totem 73 boulevard de Sébastopol S-N
   Score on train set: 0.9242664538521804
   Score on test set: 0.9108417353361946
   RMSE on test set: 78.77334247573907

Quai d'Orsay E-O
   Score on train set: 0.8407097870129467
   Score on test set: 0.8037077533867082

In [14]:
# Mean train score, mean test score and mean test RMSE across all counters,
# printed in that order.
n_compteurs = len(all_compteurs)
for total in (avg_train, avg_test, avg_rmse):
    print(total / n_compteurs)

0.9079092316286067
0.8659536559975083
26.587977034154402
