In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler

%matplotlib inline


In [None]:
## Chargement des données

In [2]:
# Load the pre-processed counter dataset; the first CSV column becomes the row index.
csv_path = "../../data/processed/lieu-compteur-one-hot-encoded.csv"
df = pd.read_csv(csv_path, index_col=0)

# Summarise the column count, dtypes and memory footprint of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1797907 entries, 12886 to 708012
Columns: 109 entries, Comptage horaire to Nom du compteur_Voie Georges Pompidou SO-NE
dtypes: int64(109)
memory usage: 1.5 GB


In [None]:
## Encodage one-hot des variables catégorielles

In [3]:
# Separate the regression target from the feature matrix.
target_name = "Comptage horaire"
X = df.drop(columns=[target_name])
y = df[target_name]

# Columns that are expanded into one indicator column per observed value.
col_norm = ["Jour", "Mois", "Année", "Heure", "Jour_semaine", "Jour férié", "Vacances scolaires"]

# Dense integer output so the result can be wrapped directly in a DataFrame.
encoder = preprocessing.OneHotEncoder(sparse_output=False, dtype=int)
array = encoder.fit_transform(X[col_norm])

# Carry over the row labels of X so the column-wise concat below lines up
# with the remaining feature columns.
encoded_df_clean = pd.DataFrame(
    array,
    columns=encoder.get_feature_names_out(col_norm),
    index=X.index,
)

# Replace the raw columns listed in col_norm with their one-hot expansion.
X_clean = pd.concat([X.drop(columns=col_norm), encoded_df_clean], axis=1)

In [4]:
# Preview the first five rows of the fully encoded feature matrix.
X_clean.head(n=5)

Unnamed: 0,Nom du compteur_10 avenue de la Grande Armée SE-NO,Nom du compteur_10 boulevard Auguste Blanqui NE-SO,Nom du compteur_102 boulevard de Magenta SE-NO,Nom du compteur_106 avenue Denfert Rochereau NE-SO,Nom du compteur_129 rue Lecourbe SO-NE,Nom du compteur_132 rue Lecourbe NE-SO,Nom du compteur_135 avenue Daumesnil SE-NO,Nom du compteur_147 avenue d'Italie S-N,Nom du compteur_152 boulevard du Montparnasse E-O,Nom du compteur_152 boulevard du Montparnasse O-E,...,Jour_semaine_2,Jour_semaine_3,Jour_semaine_4,Jour_semaine_5,Jour_semaine_6,Jour_semaine_7,Jour férié_0,Jour férié_1,Vacances scolaires_0,Vacances scolaires_1
12886,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0
685401,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,1
667057,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,1,0
12622,0,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,1,0
12680,0,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,1,0


In [None]:
## Séparation des données d'entraînement et de test

In [5]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the
# partition identical from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    X_clean,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Hyper-parameter grid for the SGD regressor: three loss functions crossed
# with six regularisation strengths, from 1e-1 down to 1e-6 in decades.
params = dict(
    loss=['squared_error', 'huber', 'epsilon_insensitive'],
    alpha=np.power(10.0, -np.arange(1, 7)),
)

In [None]:
## GridSearch CV

In [8]:
# Seed the estimator: SGD shuffles the training rows after each epoch, so without a
# fixed random_state the selected parameters and the scores drift from run to run.
# 42 matches the seed already used for the train/test split.
sgd = SGDRegressor(random_state=42)

# Exhaustive search over `params`, ranked by R² using GridSearchCV's default
# cross-validation; n_jobs=2 runs two fits in parallel.
clf = GridSearchCV(sgd, params, scoring='r2', n_jobs=2)
clf.fit(X_train, y_train)

# best_score_ is the mean cross-validated R² of the winning combination; the two
# score() calls evaluate the refitted best estimator with the same R² metric.
print("best params", clf.best_params_)
print("best score", clf.best_score_)
print("train score", clf.score(X_train, y_train))
print("test score", clf.score(X_test, y_test))

best params {'alpha': np.float64(1e-05), 'loss': 'squared_error'}
best score 0.5709845998948111
train score 0.5711562267757032
test score 0.5715165931378903
