In [1]:
from lazypredict.Supervised import LazyRegressor
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingRegressor

In [2]:
# Chargement des données sans en-tête
data_path = './../DATA/IN/extract_gold_dvf_11_04_24_true_gold.csv'
data = pd.read_csv(data_path, header=None, sep=';')

# Préparation des données
data.drop(data.columns[[6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17]], axis=1, inplace=True)
data[1] = pd.to_datetime(data[1]).dt.year
for col in [0, 2, 3, 4]:
    data[col] = data[col].astype(str).str.replace(',', '.').astype(float)
    if col in [3, 4]:
        data[col].fillna(0, inplace=True)

# Exclusion des outliers
for col in [0, 2, 3, 4]:
    q1 = data[col].quantile(0.25)
    q3 = data[col].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    data = data[(data[col] >= lower_bound) & (data[col] <= upper_bound)]

# Définition des caractéristiques
numeric_features = [0, 1, 2, 3, 4]
categorical_features = [5, 6]

# Configuration du préprocesseur
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', RobustScaler())]), numeric_features),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))]), categorical_features)])

# Configuration du modèle avec pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', GradientBoostingRegressor())])

# Séparation des données
X = data.drop(0, axis=1)
y = data[0]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [3]:
reg = LazyRegressor(verbose=0,ignore_warnings=False, custom_metric=None )

In [4]:
models,predictions = reg.fit(X_train, X_test, y_train, y_test)

100%|██████████| 42/42 [00:00<00:00, 1732.76it/s]

AdaBoostRegressor model failed to execute
all features must be in [0, 6] or [-7, 0]
BaggingRegressor model failed to execute
all features must be in [0, 6] or [-7, 0]
BayesianRidge model failed to execute
all features must be in [0, 6] or [-7, 0]
DecisionTreeRegressor model failed to execute
all features must be in [0, 6] or [-7, 0]
DummyRegressor model failed to execute
all features must be in [0, 6] or [-7, 0]
ElasticNet model failed to execute
all features must be in [0, 6] or [-7, 0]
ElasticNetCV model failed to execute
all features must be in [0, 6] or [-7, 0]
ExtraTreeRegressor model failed to execute
all features must be in [0, 6] or [-7, 0]
ExtraTreesRegressor model failed to execute
all features must be in [0, 6] or [-7, 0]
GammaRegressor model failed to execute
all features must be in [0, 6] or [-7, 0]
GaussianProcessRegressor model failed to execute
all features must be in [0, 6] or [-7, 0]
GradientBoostingRegressor model failed to execute
all features must be in [0, 6] or [




In [5]:
models

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingRegressor
from lazypredict.Supervised import LazyRegressor
import numpy as np
import datetime
import time

# Début du comptage du temps
start_time = time.time()

# Chargement des données sans en-tête
data_path = './../DATA/IN/extract_gold_dvf_11_04_24_true_gold.csv'
data = pd.read_csv(data_path, header=None, sep=';')

# Préparation des données
data.drop(data.columns[[6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17]], axis=1, inplace=True)
data[1] = pd.to_datetime(data[1]).dt.year
for col in [0, 2, 3, 4]:
    data[col] = data[col].astype(str).str.replace(',', '.').astype(float)

# Définition des colonnes numériques et catégorielles par leurs indices
numeric_features = [0, 1, 2, 3, 4]
categorical_features = [5, 6]

# Préprocesseur
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', RobustScaler())
        ]), numeric_features),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
        ]), categorical_features)
    ])

# Application du prétraitement
X = data.drop(0, axis=1)
y = data[0]
X_preprocessed = preprocessor.fit_transform(X)

# Séparation des données transformées
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.2, random_state=42)

# Utilisation de LazyPredict pour trouver le meilleur modèle
reg = LazyRegressor(verbose=0, ignore_warnings=False, custom_metric=None)
models, predictions = reg.fit(X_train, X_test, y_train, y_test)

models

# Fin du comptage du temps
end_time = time.time()
training_duration_seconds = end_time - start_time
minutes = int(training_duration_seconds // 60)
seconds = int(training_duration_seconds % 60)
training_duration_str = f"{minutes} minutes and {seconds} seconds"
print(f"Training Duration: {training_duration_str}")


 26%|██▌       | 11/42 [09:10<38:48, 75.11s/it]  

GaussianProcessRegressor model failed to execute
Unable to allocate 145. GiB for an array with shape (139692, 139692) and data type float64


 38%|███▊      | 16/42 [10:38<08:49, 20.35s/it]

KernelRidge model failed to execute
Unable to allocate 145. GiB for an array with shape (139692, 139692) and data type float64


 62%|██████▏   | 26/42 [12:16<06:13, 23.37s/it]