In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Global plotting defaults: seaborn "whitegrid" theme, then the default figure size
sns.set_theme(style="whitegrid")
plt.rcParams.update({"figure.figsize": (10, 6)})

# Read the training data from the relative path below
df = pd.read_csv("datas/train.csv")

# Display the first rows as the cell output
df.head()

In [None]:
# Helper that pulls the numeric part out of a string
def extract_numeric(value):
    """Convert a string containing a number (plus other text) into a float.

    Every character that is neither a digit nor a dot is discarded and the
    remaining characters are parsed with ``float``.

    Args:
        value: Any object. Only ``str`` inputs are converted.

    Returns:
        The parsed ``float`` for string inputs, or ``None`` when the string
        contains no digit/dot characters or when what is left is not a valid
        float (for example ``"."`` or ``"1.2.3"``). Non-string inputs are
        returned unchanged.
    """
    if not isinstance(value, str):
        return value

    kept = "".join(ch for ch in value if ch.isdigit() or ch == ".")
    if not kept:
        return None

    try:
        return float(kept)
    except ValueError:
        # Leftovers such as ".", "1.2.3" or digit-like characters that
        # float() rejects (e.g. "²") are treated as missing, the same way an
        # empty result is, instead of aborting the whole column conversion.
        return None

In [None]:
# Run extract_numeric over each of the three columns, replacing them in df
for unit_col in ("Mileage", "Engine", "Power"):
    df[unit_col] = df[unit_col].apply(extract_numeric)

In [None]:
# Discard the rows whose Mileage is missing
df = df.dropna(subset=["Mileage"])

# Fill the remaining gaps in numeric columns with that column's median
# (medians are computed after the Mileage rows above were removed)
numeric_medians = df.select_dtypes(include=["number"]).median()
df = df.fillna(numeric_medians)

In [None]:
# Work on a copy; df itself is not modified by this cell
df_cleaned = df.copy()

# Brand is the text before the first space of Name; non-string names become "Unknown"
df_cleaned["Brand"] = df_cleaned["Name"].map(
    lambda name: name.split(" ")[0] if isinstance(name, str) else "Unknown"
)

# Remove the columns that are not used further
df_cleaned = df_cleaned.drop(columns=["Name", "New_Price", "Seats"])

# IQR outlier filter, applied BEFORE the median encoding below.
# Columns are processed one after another, so each column's quartiles are
# computed on the rows that survived the filters of the previous columns.
numerical_cols = df_cleaned.select_dtypes(include=["number"]).columns

for col in numerical_cols:
    column_values = df_cleaned[col]
    first_quartile = column_values.quantile(0.25)
    third_quartile = column_values.quantile(0.75)
    whisker = 1.5 * (third_quartile - first_quartile)
    # Keep rows inside [Q1 - 1.5*IQR, Q3 + 1.5*IQR], bounds included
    within_fences = column_values.between(first_quartile - whisker, third_quartile + whisker)
    df_cleaned = df_cleaned[within_fences]

# Encode each categorical column by the median Price of its category.
# NOTE(review): the medians are taken over every row of df_cleaned; if this
# frame is split into train/test afterwards, the encoded features have seen
# test-set prices (target leakage) - confirm whether that is acceptable.
categorical_cols = ["Fuel_Type", "Transmission", "Owner_Type", "Location", "Brand"]

for col in categorical_cols:
    price_by_category = df_cleaned["Price"].groupby(df_cleaned[col]).median()
    df_cleaned[col] = df_cleaned[col].map(price_by_category)

# Correlation of every column with Price, highest value first
corr_matrix = df_cleaned.corr()
correlations = corr_matrix["Price"].sort_values(ascending=False)
correlations

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

# Use the cleaned dataset as modelling input; stop early if it was never built
if 'df_cleaned' not in locals():
    raise NameError("Le dataset nettoyé (`df_cleaned`) n'est pas disponible.")
df = df_cleaned.copy()

# Target (y) is "Price"; features (X) are all remaining columns
y = df["Price"]
X = df.drop(columns=["Price"])

# Hold out 20% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardised variants of the features; the scaler is fitted on the training part only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Candidate regressors, keyed by the label used in the result tables
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(alpha=1.0),
    "Lasso Regression": Lasso(alpha=0.1),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
    "Support Vector Regression (SVR)": SVR(kernel="rbf"),
    "Neural Network (MLP)": MLPRegressor(hidden_layer_sizes=(50, 50), max_iter=1000, random_state=42),
    "XGBoost": XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42),
}


def _score_predictions(y_true, y_hat):
    """Return MAE, RMSE and R² of ``y_hat`` against ``y_true`` as a dict."""
    return {
        "MAE": mean_absolute_error(y_true, y_hat),
        "RMSE": np.sqrt(mean_squared_error(y_true, y_hat)),
        "R² Score": r2_score(y_true, y_hat),
    }


# First pass: raw features (WITHOUT scaling); second pass: scaled features.
# The same estimator objects are fitted in both passes, so once the loop is
# done the objects in `models` carry the fit from the scaled pass.
results_no_norm, results_norm = {}, {}
evaluation_runs = (
    (results_no_norm, X_train, X_test),
    (results_norm, X_train_scaled, X_test_scaled),
)
for run_results, X_fit, X_eval in evaluation_runs:
    for name, model in models.items():
        model.fit(X_fit, y_train)
        y_pred = model.predict(X_eval)
        run_results[name] = _score_predictions(y_test, y_pred)

# One row per model, one column per metric
df_results_no_norm = pd.DataFrame(results_no_norm).T
df_results_norm = pd.DataFrame(results_norm).T

# Print both result tables
print("🔴 Résultats modèles SANS normalisation:")
print(df_results_no_norm)
print("\n🟢 Résultats modèles AVEC normalisation:")
print(df_results_norm)


In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter values to explore: 3 * 3 * 3 * 2 * 2 = 108 combinations
param_grid = {
    "n_estimators": [50, 100, 200],
    "learning_rate": [0.01, 0.1, 0.2],
    "max_depth": [3, 5, 7],
    "subsample": [0.8, 1],
    "colsample_bytree": [0.8, 1]
}

# Base XGBoost model whose parameters are varied by the search
model_xgb = XGBRegressor(random_state=42)

# Exhaustive search scored by R² with 3-fold cross-validation, using all cores
grid_search = GridSearchCV(
    estimator=model_xgb,
    param_grid=param_grid,
    scoring="r2",
    cv=3,
    verbose=1,
    n_jobs=-1
)

# Run the search on the unscaled training features
grid_search.fit(X_train, y_train)

# Best parameter combination and its mean cross-validated R²
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Report the search outcome
print("✅ Meilleurs paramètres trouvés :")
print(best_params)
print("\n🎯 Meilleur score R² obtenu :", best_score)

# GridSearchCV is left at its default refit=True, so after fit() it already
# holds the base model (random_state=42) with best_params applied and
# retrained on all of X_train. Reuse it instead of fitting an identical
# XGBRegressor a second time.
best_model = grid_search.best_estimator_

# Final predictions on the held-out test set
y_pred_best = best_model.predict(X_test)

# Test-set metrics of the tuned model
results_best = {
    "MAE": mean_absolute_error(y_test, y_pred_best),
    "RMSE": np.sqrt(mean_squared_error(y_test, y_pred_best)),
    "R² Score": r2_score(y_test, y_pred_best)
}

# Single-row DataFrame holding the metrics
df_results_best = pd.DataFrame([results_best])

# Print the final results
print("\n🔵 Performance de XGBoost après GridSearch :")
print(df_results_best)