In [1075]:
# Import dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline

import math
from sklearn.metrics import r2_score, mean_squared_error

import os

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt

from sklearn.model_selection import KFold




In [1076]:
# Lecture CSV
cars_data = pd.read_csv('../data/voiture.csv')

In [1077]:
print(len(cars_data))


11914


In [1078]:
cars_data.head()


Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [1079]:
# Data cleaning

cars_data.columns = cars_data.columns.str.lower().str.replace(" ", "_")
cars_data.rename(columns = {'engine_fuel_type' : 'fuel_type', 'engine_hp' : 'hp', 'engine_cylinders' : 'cylinders', 'transmission_type' : 'transmission', 'driven_wheels' : 'drive', 'number_of_doors' : 'doors', 'market_category' : 'market', 'vehicle_size' : 'size', 'vehicle_style' : 'style', 'msrp' : 'price'}, inplace = True)

In [1080]:
cars_data.head()


Unnamed: 0,make,model,year,fuel_type,hp,cylinders,transmission,drive,doors,market,size,style,highway_mpg,city_mpg,popularity,price
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [1081]:
print('Number of duplicates are : ', cars_data.duplicated().sum())
# exemple de doublon
print(cars_data[cars_data.duplicated(keep=False)].head(10))
# nombre de doublon 

print('Nb avant suppression de doublon : ', len(cars_data))
cars_data = cars_data.drop_duplicates()

print('Nb après suppression de doublon : ', len(cars_data))

Number of duplicates are :  715
      make     model  year                    fuel_type     hp  cylinders  \
11     BMW  1 Series  2013  premium unleaded (required)  230.0        6.0   
14     BMW  1 Series  2013  premium unleaded (required)  230.0        6.0   
17    Audi       100  1992             regular unleaded  172.0        6.0   
18    Audi       100  1992             regular unleaded  172.0        6.0   
20    Audi       100  1992             regular unleaded  172.0        6.0   
22    Audi       100  1993             regular unleaded  172.0        6.0   
24    Audi       100  1993             regular unleaded  172.0        6.0   
25    Audi       100  1993             regular unleaded  172.0        6.0   
87  Nissan     200SX  1996             regular unleaded  115.0        4.0   
88  Nissan     200SX  1996             regular unleaded  115.0        4.0   

   transmission              drive  doors              market     size  style  \
11       MANUAL   rear wheel drive    2

In [1082]:
print('Number of missing values in each columns are below : ')
print(cars_data.isnull().sum())

Number of missing values in each columns are below : 
make               0
model              0
year               0
fuel_type          3
hp                69
cylinders         30
transmission       0
drive              0
doors              6
market          3376
size               0
style              0
highway_mpg        0
city_mpg           0
popularity         0
price              0
dtype: int64


In [1083]:
cars_data.drop('market', axis = 1, inplace = True)

cars_data.drop('popularity', axis = 1, inplace = True)



In [1084]:
null_values = cars_data[cars_data.isnull().any(axis = 1)]
null_values


Unnamed: 0,make,model,year,fuel_type,hp,cylinders,transmission,drive,doors,size,style,highway_mpg,city_mpg,price
539,FIAT,500e,2015,electric,,0.0,DIRECT_DRIVE,front wheel drive,2.0,Compact,2dr Hatchback,108,122,31800
540,FIAT,500e,2016,electric,,0.0,DIRECT_DRIVE,front wheel drive,2.0,Compact,2dr Hatchback,103,121,31800
541,FIAT,500e,2017,electric,,0.0,DIRECT_DRIVE,front wheel drive,2.0,Compact,2dr Hatchback,103,121,31800
1983,Chevrolet,Bolt EV,2017,electric,200.0,,DIRECT_DRIVE,front wheel drive,4.0,Compact,4dr Hatchback,110,128,40905
1984,Chevrolet,Bolt EV,2017,electric,200.0,,DIRECT_DRIVE,front wheel drive,4.0,Compact,4dr Hatchback,110,128,36620
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9853,Kia,Soul EV,2016,electric,,0.0,DIRECT_DRIVE,front wheel drive,4.0,Compact,Wagon,92,120,31950
9854,Kia,Soul EV,2016,electric,,0.0,DIRECT_DRIVE,front wheel drive,4.0,Compact,Wagon,92,120,35950
11321,Suzuki,Verona,2004,,155.0,6.0,AUTOMATIC,front wheel drive,4.0,Midsize,Sedan,25,17,17199
11322,Suzuki,Verona,2004,,155.0,6.0,AUTOMATIC,front wheel drive,4.0,Midsize,Sedan,25,17,20199


In [1085]:


cars_data['fuel_type'] = cars_data['fuel_type'].fillna('regular unleaded')

cars_data['hp'] = cars_data['hp'].fillna(cars_data['hp'].mean())

cars_data['cylinders'] = cars_data['cylinders'].fillna(0.0)

cars_data['doors'] = cars_data['doors'].fillna(cars_data['doors'].mean())

In [1086]:
num_col = cars_data.select_dtypes(include = [np.number])
cat_col = cars_data.select_dtypes(exclude = [np.number])

In [1087]:
# Supression des voitures ayant unknown

cars_data.drop(cars_data[cars_data['transmission']=='UNKNOWN'].index, axis='index', inplace = True)

print(len(cars_data))
print('Number of missing values in each columns are below : ')
print(cars_data.isnull().sum())


11187
Number of missing values in each columns are below : 
make            0
model           0
year            0
fuel_type       0
hp              0
cylinders       0
transmission    0
drive           0
doors           0
size            0
style           0
highway_mpg     0
city_mpg        0
price           0
dtype: int64


In [1088]:
s1 = cars_data.shape
clean = cars_data[['hp', 'cylinders', 'highway_mpg', 'city_mpg', 'price']]
for i in clean.columns:
    qt1 = cars_data[i].quantile(0.25)
    qt3 = cars_data[i].quantile(0.75)
    iqr =  qt3 - qt1
    lower = qt1-(1.5*iqr)
    upper = qt3+(1.5*iqr)
    min_in = cars_data[cars_data[i]<lower].index
    max_in = cars_data[cars_data[i]>upper].index
    
    cars_data.drop(min_in, inplace = True)
    cars_data.drop(max_in, inplace = True)
s2 = cars_data.shape
outliers = s1[0] - s2[0]
print("Deleted outliers are : ", outliers)

Deleted outliers are :  1407


In [1089]:
# Nombre de ligne restante dans le dataset après nettoyage

print(len(cars_data))


9780


Partie IA KNN
Deviner le prix en fonction des caractéristiques


In [1090]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

In [1091]:
# Fonction d'encodage des variables catégoriques en fonction du prix moyen
def encode_categorical(df, col, target='price'):
    encoding = df.groupby(col)[target].mean() / df[target].mean()
    df[col] = df[col].map(encoding).fillna(0)
    return df, encoding

# Fonction de normalisation des variables numériques entre 0 et 1
def normalize_numerical(df, col):
    min_val = df[col].min()
    max_val = df[col].max()
    df[col] = (df[col] - min_val) / (max_val - min_val)
    return df, (min_val, max_val)

In [1092]:
# Fonction d'encodage et de normalisation dynamique
def encode_and_normalize(df, encoders, normalizers, categorical_cols, numerical_cols):
    for col in categorical_cols:
        if col in df.columns:
            df[col] = df[col].map(encoders[col]).fillna(0)

    for col in numerical_cols:
        if col in df.columns:
            min_val, max_val = normalizers[col]
            df[col] = (df[col] - min_val) / (max_val - min_val)

    return df

In [1093]:
# Variables
input_features = [
    'make', 'model', 'year', 'fuel_type', 'hp', 'cylinders',
    'transmission', 'drive', 'doors', 'size', 'style',
    'highway_mpg', 'city_mpg'
]

In [1094]:
def train_knn_model(cars_data, k_neighbors=4):
    categorical_cols = ['make', 'model', 'fuel_type', 'transmission', 'drive', 'size', 'style']
    numerical_cols = ['year', 'hp', 'cylinders', 'doors', 'highway_mpg', 'city_mpg']

    encoders = {}
    normalizers = {}

    # Target encoding pour les colonnes catégoriques
    for col in categorical_cols:
        if col in cars_data.columns:
            cars_data, encoder = encode_categorical(cars_data, col)
            encoders[col] = encoder

    # Normalisation pour les colonnes numériques
    for col in numerical_cols:
        if col in cars_data.columns:
            cars_data, normalizer = normalize_numerical(cars_data, col)
            normalizers[col] = normalizer

    available_cols = [col for col in input_features if col in cars_data.columns]
    print(f"Available columns for training: {available_cols}")

    X = cars_data[available_cols]
    y = cars_data['price']

    model = KNeighborsRegressor(n_neighbors=k_neighbors)
    model.fit(X, y)

    return model, encoders, normalizers, X


In [1095]:
def find_best_k(cars_data, input_features, max_k=30, test_size=0.3):
    categorical_cols = ['make', 'model', 'fuel_type', 'transmission', 'drive', 'size', 'style']
    numerical_cols = ['year', 'hp', 'cylinders', 'doors', 'highway_mpg', 'city_mpg']


    # Copie du dataset pour éviter modification originale
    data = cars_data.copy()

    encoders = {}
    normalizers = {}

    for col in categorical_cols:
        if col in data.columns:
            data, encoder = encode_categorical(data, col)
            encoders[col] = encoder

    for col in numerical_cols:
        if col in data.columns:
            data, normalizer = normalize_numerical(data, col)
            normalizers[col] = normalizer

    available_cols = [col for col in input_features if col in data.columns]
    X = data[available_cols]
    y = data['price']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42, shuffle=True)

    rmse_list = []

    for k in range(1, max_k + 1):
        model = KNeighborsRegressor(n_neighbors=k)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        rmse_list.append(rmse)

    best_k = np.argmin(rmse_list) + 1

    # Affichage de la courbe
    #plt.figure(figsize=(10, 5))
    #plt.plot(range(1, max_k + 1), rmse_list, marker='o')
    #plt.title('Erreur RMSE en fonction du nombre de voisins (k)')
    #plt.xlabel('k (nombre de voisins)')
    #plt.ylabel('RMSE')
    #plt.grid(True)
    #plt.axvline(x=best_k, color='red', linestyle='--', label=f'Meilleur k = {best_k}')
    #plt.legend()
    #plt.show()

    print(f"✅ Meilleur k trouvé : {best_k} avec RMSE = {rmse_list[best_k - 1]:.2f}")

    return best_k

In [1096]:
# Charger et entraîner le modèle avec les données disponibles
#best_k = find_best_k(cars_data, input_features)
#knn_model, encoders, normalizers, X_train = train_knn_model(cars_data, k_neighbors=best_k)

In [1097]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder

Partie IA RF
Deviner le prix en fonction des caractéristiques


In [1098]:
from sklearn.preprocessing import LabelEncoder

def train_rf_model(cars_data, test_size=0.3):
    categorical_cols = ['make', 'model', 'fuel_type', 'transmission', 'drive', 'size', 'style']
    numerical_cols = ['year', 'hp', 'cylinders', 'doors', 'highway_mpg', 'city_mpg']

    encoders = {}
    normalizers = {}

    # LabelEncoding for categorical columns
    for col in categorical_cols:
        if col in cars_data.columns:
            le = LabelEncoder()
            cars_data[col] = le.fit_transform(cars_data[col])
            encoders[col] = le

    # Normalization for numerical columns
    for col in numerical_cols:
        if col in cars_data.columns:
            cars_data, normalizer = normalize_numerical(cars_data, col)
            normalizers[col] = normalizer


    available_cols = [col for col in cars_data.columns if col != 'price']
    X = cars_data[available_cols]
    y = cars_data['price']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

    rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)

    y_train_pred = rf_model.predict(X_train)

    train_r2 = r2_score(y_train, y_train_pred)
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))

    print("\n--- Random Forest Training Results ---")
    print(f"Train R²: {train_r2:.4f}")
    print(f"Train RMSE: {train_rmse:.2f}")
    print(f"Train RMSE %: {(train_rmse / y_train.mean()) * 100:.2f}%")

    return rf_model, encoders, normalizers, X_train

In [1099]:
rf_model, rf_encoders, rf_normalizers, rf_X_train = train_rf_model(cars_data)

def preprocess_input_data(input_df, encoders, normalizers):
    # Encodage des colonnes catégoriques avec LabelEncoder
    for col, encoder in encoders.items():
        if col in input_df.columns:
            input_df[col] = input_df[col].apply(
                lambda x: encoder.transform([x])[0] if x in encoder.classes_ else -1
            )

    # Normalisation des colonnes numériques
    for col, (min_val, max_val) in normalizers.items():
        if col in input_df.columns:
            input_df[col] = (input_df[col] - min_val) / (max_val - min_val)

    return input_df



# Fonction pour prédire le prix d'une voiture simulant une entrée utilisateur

def predict_price_rf(new_car, rf_model, encoders, normalizers, X_train):
    new_car_df = pd.DataFrame([new_car])

    # ✅ Prétraitement complet
    new_car_df = preprocess_input_data(new_car_df, encoders, normalizers)

    # Réalignement des colonnes (en cas de colonne manquante)
    for col in X_train.columns:
        if col not in new_car_df.columns:
            new_car_df[col] = 0
    new_car_df = new_car_df[X_train.columns]

    # Prédiction
    predicted_price = rf_model.predict(new_car_df)
    print(f"Predicted price for the new car: ${predicted_price[0]:,.2f}")


# Exemple d'entrée utilisateur

input_data = {
    'make': 'Ford',
    'model': 'Mustang',
    'year': 2020,
    'fuel_type': 'premium unleaded (required)',
    'hp': 450,
    'cylinders': 8,
    'transmission': 'AUTOMATIC',
    'drive': 'rear wheel drive',
    'doors': 2,
    'size': 'Midsize',
    'style': 'Convertible',
    'highway_mpg': 28,
    'city_mpg': 16
}

# Autre exemple d'entrée utilisateur

input_data2 = { 
    'make': 'Kia',
    'model': 'Sportage',
    'year': 2017,
    'fuel_type': 'regular unleaded',
    'hp': 181,
    'cylinders': 4,
    'transmission': 'AUTOMATIC',
    'drive': 'front wheel drive',
    'doors': 4,
    'size': 'Midsize',
    'style': 'Convertible',
    'highway_mpg': 29,
    'city_mpg': 22
}


input_data3 = {
    'make': 'Audi',
    'model': '100',
    'year': 1993,
    'fuel_type': 'regular unleaded',
    'hp': 172,
    'cylinders': 6,
    'transmission': 'MANUAL',
    'drive': 'front wheel drive',
    'doors': 4,
    'size': 'Midsize',
    'style': 'Sedan',
    'highway_mpg': 24,
    'city_mpg': 17
}

input_data4 = {
    'make': 'BMW',
    'model': '5 Series',
    'year': 2016,
    'fuel_type': 'premium unleaded (required)',
    'hp': 240,
    'cylinders': 4,
    'transmission': 'AUTOMATIC',
    'drive': 'all wheel drive',
    'doors': 4,
    'size': 'Large',
    'style': 'Sedan',
    'highway_mpg': 34,
    'city_mpg': 22
}

# Prédiction du prix

predicted_price = predict_price_rf(input_data, rf_model, rf_encoders, rf_normalizers, rf_X_train)

predicted_price2 = predict_price_rf(input_data2, rf_model, rf_encoders, rf_normalizers, rf_X_train)

predicted_price3 = predict_price_rf(input_data3, rf_model, rf_encoders, rf_normalizers, rf_X_train)

predicted_price4 = predict_price_rf(input_data4, rf_model, rf_encoders, rf_normalizers, rf_X_train)



--- Random Forest Training Results ---
Train R²: 0.9815
Train RMSE: 2114.67
Train RMSE %: 7.21%
Predicted price for the new car: $54,884.28
Predicted price for the new car: $25,433.40
Predicted price for the new car: $2,000.00
Predicted price for the new car: $47,283.97


In [1100]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
import numpy as np

def kfold_random_forest(cars_data, n_splits=5):
    categorical_cols = ['make', 'model', 'fuel_type', 'transmission', 'drive', 'size', 'style']
    numerical_cols = ['year', 'hp', 'cylinders', 'doors', 'highway_mpg', 'city_mpg']

    encoders = {}
    normalizers = {}
    data = cars_data.copy()

    # LabelEncoding pour les colonnes catégoriques
    for col in categorical_cols:
        if col in data.columns:
            le = LabelEncoder()
            data[col] = le.fit_transform(data[col])
            encoders[col] = le

    # Normalisation des colonnes numériques
    for col in numerical_cols:
        if col in data.columns:
            data, normalizer = normalize_numerical(data, col)
            normalizers[col] = normalizer

    available_cols = [col for col in data.columns if col != 'price']
    X = data[available_cols]
    y = data['price']

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    r2_scores = []
    rmse_scores = []

    for fold, (train_idx, test_idx) in enumerate(kf.split(X), start=1):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        model = RandomForestRegressor(n_estimators=100, random_state=42)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)

        r2 = r2_score(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))

        r2_scores.append(r2)
        rmse_scores.append(rmse)

        print(f"Fold {fold}: R² = {r2:.4f}, RMSE = {rmse:.2f}")

    print("\n--- K-Fold Results for Random Forest ---")
    print(f"Average R²: {np.mean(r2_scores):.4f}")
    print(f"Average RMSE: {np.mean(rmse_scores):.2f}")
    print(f"Average RMSE %: {(np.mean(rmse_scores) / y.mean()) * 100:.2f}%")

    return model, encoders, normalizers


# Execute K-Fold Cross-Validation for Random Forest
rf_model, rf_encoders, rf_normalizers = kfold_random_forest(cars_data)

Fold 1: R² = 0.9537, RMSE = 3370.31
Fold 2: R² = 0.9521, RMSE = 3383.02
Fold 3: R² = 0.9497, RMSE = 3498.97
Fold 4: R² = 0.9534, RMSE = 3411.61
Fold 5: R² = 0.9526, RMSE = 3349.10

--- K-Fold Results for Random Forest ---
Average R²: 0.9523
Average RMSE: 3402.60
Average RMSE %: 11.66%


In [1101]:
from sklearn.model_selection import KFold

def kfold_knn_model(cars_data, n_splits=5, k_neighbors=4):
    categorical_cols = ['make', 'model', 'fuel_type', 'transmission', 'drive', 'size', 'style']
    numerical_cols = ['year', 'hp', 'cylinders', 'doors', 'highway_mpg', 'city_mpg']

    encoders = {}
    normalizers = {}
    data = cars_data.copy()

    # Target encoding
    for col in categorical_cols:
        if col in data.columns:
            data, encoder = encode_categorical(data, col)
            encoders[col] = encoder

    # Normalisation
    for col in numerical_cols:
        if col in data.columns:
            data, normalizer = normalize_numerical(data, col)
            normalizers[col] = normalizer

    available_cols = [col for col in input_features if col in data.columns]
    X = data[available_cols]
    y = data['price']

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    r2_scores = []
    rmse_scores = []

    for fold, (train_idx, test_idx) in enumerate(kf.split(X), start=1):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        model = KNeighborsRegressor(n_neighbors=k_neighbors)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))

        r2_scores.append(r2)
        rmse_scores.append(rmse)

        print(f"Fold {fold}: R² = {r2:.4f}, RMSE = {rmse:.2f}")

    print("\n--- K-Fold Results for KNN ---")
    print(f"Average R²: {np.mean(r2_scores):.4f}")
    print(f"Average RMSE: {np.mean(rmse_scores):.2f}")
    print(f"Average RMSE %: {(np.mean(rmse_scores) / y.mean()) * 100:.2f}%")

    return model, encoders, normalizers

# Execute K-Fold Cross-Validation for KNN
knn_model, knn_encoders, knn_normalizers = kfold_knn_model(cars_data, n_splits=5, k_neighbors=find_best_k(cars_data, input_features, max_k=30, test_size=0.3))


✅ Meilleur k trouvé : 5 avec RMSE = 3543.47
Fold 1: R² = 0.9498, RMSE = 3510.31
Fold 2: R² = 0.9506, RMSE = 3436.55
Fold 3: R² = 0.9450, RMSE = 3659.12
Fold 4: R² = 0.9502, RMSE = 3526.58
Fold 5: R² = 0.9473, RMSE = 3529.24

--- K-Fold Results for KNN ---
Average R²: 0.9486
Average RMSE: 3532.36
Average RMSE %: 12.10%


Partie IA SVM
Deviner le prix en fonction des caractéristiques



In [1102]:
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV


# Sélection des colonnes utiles
features = ['year', 'hp', 'cylinders', 'highway_mpg', 'city_mpg']
X = cars_data[features]
y = cars_data['price']

# Nettoyage : suppression des lignes avec valeurs manquantes
X = X.dropna()
y = y.loc[X.index]

# Split des données en train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Pipeline avec standardisation + SVM
svm_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVR(kernel='rbf', C=1000, epsilon=100))
])

# Entraînement
svm_pipeline.fit(X_train, y_train)

# Prédictions
y_pred = svm_pipeline.predict(X_test)

# Évaluation
print("R2 Score :", r2_score(y_test, y_pred))
print("RMSE :", np.sqrt(mean_squared_error(y_test, y_pred)))

#plt.figure(figsize=(8, 6))
#plt.scatter(y_test, y_pred, alpha=0.5)
#plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')  # ligne idéale
#plt.xlabel("Prix réel")
#plt.ylabel("Prix prédit")
#plt.title("SVR : Prédiction vs Réalité")
#plt.grid(True)
#plt.tight_layout()
#plt.show()


R2 Score : 0.7898302269754142
RMSE : 7182.462173668957


Nous avons testé une approche par SVM (Support Vector Machine) pour prédire le prix des véhicules à partir de caractéristiques techniques. Malgré son élégance théorique, la méthode s’est révélée nettement moins performante que les alternatives comme KNN ou Random Forest.

Avec un R² de seulement 0.79 et une erreur moyenne de plus de 7 000 $, le modèle SVM peine à capturer la complexité des relations non linéaires dans les données, même avec une transformation RBF (Radial Basis Function).

Trop fragile face à des données riches et variées
La SVM reste intéressante sur des problèmes bien cadrés, avec peu de variables ou des relations simples. Mais dans notre cas, avec des données riches, hétérogènes.

Partie IA Réseau de neurones
Deviner le prix en fonction des caractéristiques



In [1103]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import r2_score, mean_squared_error

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

def train_neural_network(cars_data, test_size=0.3, epochs=100, batch_size=128):
    categorical_cols = ['make', 'model', 'fuel_type', 'transmission', 'drive', 'size', 'style']
    numerical_cols = ['year', 'hp', 'cylinders', 'doors', 'highway_mpg', 'city_mpg']

    encoders = {}
    normalizers = {}
    data = cars_data.copy()

    # Encodage des colonnes catégoriques
    for col in categorical_cols:
        if col in data.columns:
            le = LabelEncoder()
            data[col] = le.fit_transform(data[col])
            encoders[col] = le

    # Normalisation des colonnes numériques
    for col in numerical_cols:
        if col in cars_data.columns:
            cars_data, normalizer = normalize_numerical(cars_data, col)
            normalizers[col] = normalizer

    # Affichage après encodage et normalisation

    # Cible
    y_scaler = MinMaxScaler()
    y = y_scaler.fit_transform(data[['price']])
    X = data.drop(columns='price')

    X_train, X_test, y_train, y_test = train_test_split(X.values, y, test_size=test_size, random_state=42)

    # Convertir en tenseurs PyTorch
    X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
    X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
    y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Modèle PyTorch
    class NeuralNetwork(nn.Module):
        def __init__(self, input_dim):
            super(NeuralNetwork, self).__init__()
            self.model = nn.Sequential(
                nn.Linear(input_dim, 128),
                nn.ReLU(),
                nn.Dropout(0.2),
                nn.Linear(128, 64),
                nn.ReLU(),
                nn.Dropout(0.2),
                nn.Linear(64, 1)
            )

        def forward(self, x):
            return self.model(x)

    model = NeuralNetwork(X_train.shape[1])
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    train_losses = []
    val_losses = []

    # Add metrics calculation and print statements for R², RMSE, and RMSE % at each epoch
    for epoch in range(epochs):
        model.train()
        batch_losses = []

        for X_batch, y_batch in train_loader:
            optimizer.zero_grad()
            outputs = model(X_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()
            batch_losses.append(loss.item())

        train_losses.append(np.mean(batch_losses))

        model.eval()
        with torch.no_grad():
            val_output = model(X_test_tensor)
            val_loss = criterion(val_output, y_test_tensor).item()
            val_losses.append(val_loss)

            # Calculate R², RMSE, and RMSE %
            y_test_pred = val_output.numpy()
            y_test_actual = y_test_tensor.numpy()
            r2 = r2_score(y_test_actual, y_test_pred)
            rmse = np.sqrt(mean_squared_error(y_test_actual, y_test_pred))
            rmse_percent = (rmse / y_test_actual.mean()) * 100

        # Print metrics for each epoch
        print(f"Epoch {epoch + 1}/{epochs} - Training Loss: {train_losses[-1]:.4f}, Validation Loss: {val_loss:.4f}, R²: {r2:.4f}, RMSE: {rmse:.2f}, RMSE %: {rmse_percent:.2f}%")

    # Generate predictions for the test set
    model.eval()
    with torch.no_grad():
        y_test_pred = model(X_test_tensor).numpy()

    # Inverse transform predictions and actual values
    y_pred = y_scaler.inverse_transform(y_test_pred)
    y_test = y_scaler.inverse_transform(y_test_tensor.numpy())
    y_pred = y_pred.flatten()
    y_test = y_test.flatten()

    # Calculate metrics
    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print("\n--- Neural Network Training Results ---")
    print(f"R²: {r2:.4f}")
    print(f"RMSE: {rmse:.2f}")
    print(f"RMSE %: {(rmse / y_test.mean()) * 100:.2f}%")
    return model, encoders, normalizers, y_scaler



In [None]:
# Entraînement du modèle de réseau de neurones


nn_model, nn_encoders, nn_normalizers, y_scaler = train_neural_network(cars_data, test_size=0.3, epochs=100, batch_size=128)

# Fonction pour prédire le prix d'une voiture avec le modèle de réseau de neurones

def predict_price_nn(new_car, nn_model, encoders, normalizers, y_scaler):
    new_car_df = pd.DataFrame([new_car])

    # Prétraitement complet
    new_car_df = preprocess_input_data(new_car_df, encoders, normalizers)

    print(new_car_df)

    # Prédiction
    with torch.no_grad():
        predicted_price = nn_model(torch.tensor(new_car_df.values, dtype=torch.float32)).numpy()

    # Inverse transform the predicted price
    predicted_price = y_scaler.inverse_transform(predicted_price)
    print(f"Predicted price for the new car: ${predicted_price[0][0]:,.2f}")


# Exemple d'entrée utilisateur

input_data = {
    'make': 'BMW',
    'model': '1 Series',
    'year': 2013,
    'fuel_type': 'premium unleaded (required)',
    'hp': 320,
    'cylinders': 6,
    'transmission': 'MANUAL',
    'drive': 'rear wheel drive',
    'doors': 2,
    'size': 'Compact',
    'style': 'Convertible',
    'highway_mpg': 25,
    'city_mpg': 18
}

# exec

predicted_price = predict_price_nn(input_data, nn_model, nn_encoders, nn_normalizers, y_scaler)

Epoch 1/100 - Training Loss: 19.6779, Validation Loss: 0.0951, R²: -0.8407, RMSE: 0.31, RMSE %: 78.89%
Epoch 2/100 - Training Loss: 0.7955, Validation Loss: 0.0802, R²: -0.5517, RMSE: 0.28, RMSE %: 72.44%
Epoch 3/100 - Training Loss: 0.2235, Validation Loss: 0.0874, R²: -0.6919, RMSE: 0.30, RMSE %: 75.64%
Epoch 4/100 - Training Loss: 0.1271, Validation Loss: 0.0819, R²: -0.5854, RMSE: 0.29, RMSE %: 73.22%
Epoch 5/100 - Training Loss: 0.1045, Validation Loss: 0.0771, R²: -0.4932, RMSE: 0.28, RMSE %: 71.06%
Epoch 6/100 - Training Loss: 0.0937, Validation Loss: 0.0644, R²: -0.2458, RMSE: 0.25, RMSE %: 64.91%
Epoch 7/100 - Training Loss: 0.0875, Validation Loss: 0.0635, R²: -0.2298, RMSE: 0.25, RMSE %: 64.49%
Epoch 8/100 - Training Loss: 0.0800, Validation Loss: 0.0689, R²: -0.3332, RMSE: 0.26, RMSE %: 67.14%
Epoch 9/100 - Training Loss: 0.0766, Validation Loss: 0.0577, R²: -0.1170, RMSE: 0.24, RMSE %: 61.46%
Epoch 10/100 - Training Loss: 0.0757, Validation Loss: 0.0585, R²: -0.1321, RMSE: