In [18]:
# -----------------------------
# 1. Import libraries
# -----------------------------
import pandas as pd
import numpy as np
import os
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Optional: XGBoost is only added to the candidates when the package can be imported.
try:
    from xgboost import XGBRegressor
    xgb_available = True
except Exception:
    # Catches ImportError and any other error raised while importing the package.
    # A bare `except:` would additionally swallow KeyboardInterrupt and SystemExit.
    xgb_available = False

# -----------------------------
# 2. Load the dataset
# -----------------------------
# NOTE(review): absolute, machine-specific path; the cell only runs where this exact file exists.
chemin_fichier = r"C:\Users\lynou\OneDrive\Documents\Master 2\web_scraping\Projet\data\autoscrap_FIN_clean.csv"
# The CSV is parsed with ';' as the field separator.
df = pd.read_csv(chemin_fichier, sep=';')
# Normalise the column names: trim surrounding whitespace, apply the two literal replacements below.
# NOTE(review): the first replaced literal looks like a mis-encoded accented 'e' — confirm the notebook's encoding.
df.columns = df.columns.str.strip().str.replace('√©','e').str.replace(' ','_')
# Derived feature: hard-coded reference year 2026 minus the `annee` column
# (presumably the vehicle's year — confirm against the dataset).
df['age_voiture'] = 2026 - df['annee']

# Outlier detection
def detect_outliers_iqr(series, factor=1.5):
    """Return the values of ``series`` that fall outside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1 and
    Q3 are the 25th and 75th percentiles and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to inspect.
    factor : float, default 1.5
        Multiplier applied to the IQR to position the fences. The default
        reproduces the previously hard-coded value.

    Returns
    -------
    pandas.Series
        The subset of ``series`` (original index preserved) strictly below the
        lower fence or strictly above the upper fence. Missing values never
        compare as outliers, so they are not returned.

    Raises
    ------
    ValueError
        If ``factor`` is negative, since the fences would then cross.
    """
    if factor < 0:
        raise ValueError("factor must be non-negative")

    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - factor * iqr
    upper = q3 + factor * iqr
    return series[(series < lower) | (series > upper)]

# Keep the rows that are not flagged as an IQR outlier on any of the three
# numeric columns (a row is dropped as soon as one column flags it).
df_clean = df.loc[
    ~(
        df['prix'].isin(detect_outliers_iqr(df['prix']))
        | df['kilometrage'].isin(detect_outliers_iqr(df['kilometrage']))
        | df['puissance_cv'].isin(detect_outliers_iqr(df['puissance_cv']))
    )
].copy()

# -----------------------------
# 3. Features and target
# -----------------------------
numerical_features = ['age_voiture', 'kilometrage', 'puissance_cv']
categorical_features = ['carburant', 'boite_vitesse']

# Feature matrix: numeric columns first, then the categorical ones; target is `prix`.
X = df_clean.loc[:, numerical_features + categorical_features]
y = df_clean.loc[:, 'prix']

# Hold out 20% of the rows for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Scale the numeric columns and one-hot encode the categorical ones
# (first level of each categorical column dropped).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features),
    ]
)

# -----------------------------
# 4. Define the models
# -----------------------------
# Candidate regressors keyed by the name used later for the output files.
models = dict(
    LinearRegression=LinearRegression(),
    Ridge=Ridge(alpha=1.0),
    Lasso=Lasso(alpha=0.1),
    DecisionTree=DecisionTreeRegressor(random_state=42),
    RandomForest=RandomForestRegressor(n_estimators=200, random_state=42),
    GradientBoosting=GradientBoostingRegressor(n_estimators=200, random_state=42),
)

# XGBoost joins the candidates only when its import succeeded.
if xgb_available:
    models.update(
        XGBoost=XGBRegressor(n_estimators=200, random_state=42, verbosity=0)
    )

# -----------------------------
# 5. Create the output folders
# -----------------------------
# One folder per artefact type, created relative to the current working directory.
folders = ["metrics", "models", "params", "results"]
for f in folders:
    # exist_ok=True: no error when the folder already exists, so the cell can be re-run.
    os.makedirs(f, exist_ok=True)

# -----------------------------
# 6. Loop: train, evaluate, save
# -----------------------------
# For every candidate model: fit a preprocessing+model pipeline on the training
# split, then write the fitted pipeline, its hyperparameters, its test metrics
# and its predictions to the models/, params/, metrics/ and results/ folders.
for name, model_instance in models.items():
    print(f"\nüîπ Traitement du mod√®le : {name}")

    # Build the pipeline; the same `preprocessor` object is reused for every model
    pipe = Pipeline([
        ('preprocess', preprocessor),
        ('model', model_instance)
    ])
    
    # Training
    pipe.fit(X_train, y_train)

    # Save the fitted pipeline (preprocessing + model) with joblib
    model_file = f"models/{name}.pkl"
    joblib.dump(pipe, model_file)

    # Save the estimator's hyperparameters as a one-row CSV
    params = model_instance.get_params()
    params_file = f"params/param_{name}.csv"
    pd.DataFrame([params]).to_csv(params_file, index=False)

    # Evaluation on the held-out test set
    y_pred_test = pipe.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
    r2 = r2_score(y_test, y_pred_test)
    
    # One-row CSV holding MAE, RMSE and R2 for this model
    metrics_file = f"metrics/metrics_{name}.csv"
    pd.DataFrame([{"MAE": mae, "RMSE": rmse, "R2": r2}]).to_csv(metrics_file, index=False)

    # Prediction on the full dataset X (training and test rows together) to save the results;
    # `diff` is predicted minus actual, so it is not a purely out-of-sample error.
    y_pred_all = pipe.predict(X)
    results_file = f"results/result_{name}.csv"
    pd.DataFrame({
        "prix_reel": y,
        "prix_pred": y_pred_all,
        "diff": y_pred_all - y
    }).to_csv(results_file, index=False)

    print(f"‚úÖ Sauvegard√© : mod√®le, m√©triques, params et r√©sultats pour {name}")

print("\nüéâ Tous les mod√®les et fichiers ont √©t√© sauvegard√©s dans leurs dossiers respectifs !")



üîπ Traitement du mod√®le : LinearRegression
‚úÖ Sauvegard√© : mod√®le, m√©triques, params et r√©sultats pour LinearRegression

üîπ Traitement du mod√®le : Ridge
‚úÖ Sauvegard√© : mod√®le, m√©triques, params et r√©sultats pour Ridge

üîπ Traitement du mod√®le : Lasso
‚úÖ Sauvegard√© : mod√®le, m√©triques, params et r√©sultats pour Lasso

üîπ Traitement du mod√®le : DecisionTree
‚úÖ Sauvegard√© : mod√®le, m√©triques, params et r√©sultats pour DecisionTree

üîπ Traitement du mod√®le : RandomForest
‚úÖ Sauvegard√© : mod√®le, m√©triques, params et r√©sultats pour RandomForest

üîπ Traitement du mod√®le : GradientBoosting
‚úÖ Sauvegard√© : mod√®le, m√©triques, params et r√©sultats pour GradientBoosting

üéâ Tous les mod√®les et fichiers ont √©t√© sauvegard√©s dans leurs dossiers respectifs !


In [19]:
import pandas as pd
import os

# Select the model with the highest test-set R2 among the saved metrics files.
#
# Only files named "metrics_<model>.csv" are considered, and they are sorted so
# that ties on R2 are resolved the same way on every run (os.listdir returns
# entries in arbitrary order).
metrics_prefix, metrics_suffix = "metrics_", ".csv"
metrics_files = sorted(
    f for f in os.listdir("metrics")
    if f.startswith(metrics_prefix) and f.endswith(metrics_suffix)
)

best_model_name = None
# Built-in float instead of numpy's infinity: this cell imports pandas and os
# only, so it must not rely on `np` being left over from an earlier cell.
best_r2 = float("-inf")

for file in metrics_files:
    df_metrics = pd.read_csv(os.path.join("metrics", file))
    # Strip exactly the leading prefix and trailing suffix; str.replace would
    # also remove these substrings from the middle of a model name.
    model_name = file[len(metrics_prefix):-len(metrics_suffix)]
    r2 = df_metrics.loc[0, "R2"]
    # Strict comparison: the first file (alphabetically) wins a tie, and a NaN
    # score never replaces the current best.
    if r2 > best_r2:
        best_r2 = r2
        best_model_name = model_name

# Fail here with a clear message instead of letting a later cell try to load
# "models/None.pkl".
if best_model_name is None:
    raise FileNotFoundError(
        "No usable metrics_<model>.csv file found in the 'metrics' folder; "
        "run the training cell first."
    )

print(f"‚úÖ Meilleur mod√®le s√©lectionn√© : {best_model_name} avec R¬≤ = {best_r2:.2f}")


‚úÖ Meilleur mod√®le s√©lectionn√© : GradientBoosting avec R¬≤ = 0.78


In [20]:
import joblib

# Reload the pipeline that was saved under the selected model's name.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code —
# only load model files produced by a trusted run of this notebook.
model_path = f"models/{best_model_name}.pkl"
model = joblib.load(model_path)
