In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder, RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import VotingRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import warnings


In [2]:
# Silence only pandas' DtypeWarning; every other warning category stays visible.
warnings.filterwarnings(action="ignore", category=pd.errors.DtypeWarning)

# Read the CSV file into the working DataFrame.
data = pd.read_csv(
    filepath_or_buffer='C:/Users/KRISTIAN/Desktop/ISM PARIS/M2/cours/Projet/cie_pour_entrainement.csv'
)


In [8]:
# Preview the first five rows (5 is the default row count of head()).
# NOTE(review): the stored output carries execution count 8, i.e. it was
# produced after the log-transform cell, so it presumably shows transformed
# values. Run the notebook top to bottom to preview the file as loaded.
data.head(n=5)

Unnamed: 0,Ref_Contrat_v3,DR,codexp,reglageDisjoncteur,TypeBranchement,Type_Tarif,Type_Abonne_libelle,genreAbon_libelle,periode,KWH,Montant,Log_KWH
0,21000005000,20,21,5,Monophasé,01 Domestique Social 5A,Privé,Particulier,202403,7.0,6.872128,1.94591
1,21000005000,20,21,5,Monophasé,01 Domestique Social 5A,Privé,Particulier,202405,1.0,6.620073,0.0
2,21000006000,20,21,5,Monophasé,01 Domestique Social 5A,Privé,Particulier,202407,185.0,9.316051,5.220356
3,21000009000,20,21,5,Monophasé,01 Domestique Social 5A,Privé,Particulier,202407,69.0,8.098643,4.234107
4,21000012000,20,21,5,Monophasé,01 Domestique Social 5A,Privé,Particulier,202403,3.0,6.709304,1.098612


In [4]:
# Log-transform the consumption column (KWH) and the amount column (Montant).
# Both use the natural log (np.log, not log1p), so inputs must be strictly
# positive: a 0 would become -inf and a negative value NaN.
data['Log_KWH'] = np.log(data['KWH'])

# 'Montant' is overwritten in place, so running this cell a second time would
# apply the log twice. A marker stored on the DataFrame itself makes the cell
# safe to re-run; reloading the CSV creates a fresh frame without the marker,
# so the transform is applied again exactly once.
if not data.attrs.get('montant_is_log', False):
    data['Montant'] = np.log(data['Montant'])
    data.attrs['montant_is_log'] = True

In [5]:
# Split the frame into features (X) and target (y).
X = data.drop("Montant", axis=1)
y = data["Montant"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Columns used by the model, grouped by the encoding they receive.
# Any column of X not listed here is dropped by the ColumnTransformer below
# (its default is remainder='drop').
categorical_ordinal = ['reglageDisjoncteur']
categorical_onehot = ['TypeBranchement', 'Type_Tarif', 'Type_Abonne_libelle', 'genreAbon_libelle']
numeric_features = ['Log_KWH']

# Per-group preprocessing. No explicit imputation step is applied here.
numeric_transformer = Pipeline(steps=[("scaler", RobustScaler())])

# Categories that were not seen during fit are encoded as -1 instead of raising
# at transform time (the OrdinalEncoder default is handle_unknown='error'),
# so the fitted preprocessor can be applied to new data containing unseen values.
ordinal_transformer = Pipeline(steps=[
    ("ordinal", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1))
])

# Unknown categories are encoded as an all-zero row; dense output is requested.
onehot_transformer = Pipeline(steps=[("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))])

# Combine the three transformations into a single preprocessor.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("ord", ordinal_transformer, categorical_ordinal),
        ("cat", onehot_transformer, categorical_onehot)
    ]
)

# Fit on the training rows only, then apply the fitted transform to the test rows.
# X_train / X_test are rebound to the transformed matrices, so re-run this whole
# cell (not just these two lines) when the preprocessing changes.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)


In [6]:
# Base learners for the ensemble. They are intentionally NOT fitted here:
# VotingRegressor.fit trains its own clones of these estimators, so fitting
# them beforehand would only double the training time without affecting
# the ensemble.
catboost_model = CatBoostRegressor(verbose=0, random_state=42)
lgbm_model = LGBMRegressor(learning_rate=0.1, max_depth=7, n_estimators=200, random_state=42)
xgb_model = XGBRegressor(random_state=42)

# Ensemble that averages the predictions of the three base learners.
voting_model = VotingRegressor([
    ('catboost', catboost_model),
    ('lgbm', lgbm_model),
    ('xgb', xgb_model)
])

# Train the ensemble. The fitted base learners are available afterwards through
# voting_model.named_estimators_ (e.g. voting_model.named_estimators_['lgbm']).
voting_model.fit(X_train, y_train)

# Predict on the held-out rows.
y_pred_voting = voting_model.predict(X_test)

# Evaluation metrics. The target was log-transformed earlier in the notebook,
# so MSE / RMSE / MAE are expressed in log units, not in the original amount.
mse_voting = mean_squared_error(y_test, y_pred_voting)
rmse_voting = np.sqrt(mse_voting)
mae_voting = mean_absolute_error(y_test, y_pred_voting)
r2_voting = r2_score(y_test, y_pred_voting)

# Report the metrics for the Voting Regressor.
print("\n### Voting Regressor ###")
print(f'Mean Squared Error (MSE): {mse_voting:.4f}')
print(f'Root Mean Squared Error (RMSE): {rmse_voting:.4f}')
print(f'Mean Absolute Error (MAE): {mae_voting:.4f}')
print(f'R^2 Score: {r2_voting:.4f}')



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.139580 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 298
[LightGBM] [Info] Number of data points in the train set: 11557196, number of used features: 18
[LightGBM] [Info] Start training from score 9.537664
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.007061 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 298
[LightGBM] [Info] Number of data points in the train set: 11557196, number of used features: 18
[LightGBM] [Info] Start training from score 9.537664

### Voting Regressor ###
Mean Squared Error (MSE): 0.0025
Root Mean Squared Error (RMSE): 0.0499
Mean Absolute Error (MAE): 0.0307
R^2 Score: 0.9972


In [7]:
from joblib import dump

# Destination files for the serialized artifacts.
model_path = 'C:/Users/KRISTIAN/Desktop/ISM PARIS/M2/cours/Projet/voting_regressor_model.pkl'
preprocessor_path = 'C:/Users/KRISTIAN/Desktop/ISM PARIS/M2/cours/Projet/preprocessor.pkl'

# Persist the VotingRegressor first, then the preprocessor.
for artifact, destination in (
    (voting_model, model_path),
    (preprocessor, preprocessor_path),
):
    dump(artifact, destination)

print("Modèle et préprocesseur enregistrés avec succès !")


Modèle et préprocesseur enregistrés avec succès !
