In [416]:
# Import dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import make_pipeline

import math
from sklearn.metrics import r2_score, mean_squared_error

import os

import warnings
warnings.filterwarnings('ignore')

In [417]:
# Lecture CSV
cars_data = pd.read_csv('../data/voiture.csv')

In [418]:
print(len(cars_data))


11914


In [419]:
cars_data.head()


Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [420]:
# Data cleaning

cars_data.columns = cars_data.columns.str.lower().str.replace(" ", "_")
cars_data.rename(columns = {'engine_fuel_type' : 'fuel_type', 'engine_hp' : 'hp', 'engine_cylinders' : 'cylinders', 'transmission_type' : 'transmission', 'driven_wheels' : 'drive', 'number_of_doors' : 'doors', 'market_category' : 'market', 'vehicle_size' : 'size', 'vehicle_style' : 'style', 'msrp' : 'price'}, inplace = True)

In [421]:
cars_data.head()


Unnamed: 0,make,model,year,fuel_type,hp,cylinders,transmission,drive,doors,market,size,style,highway_mpg,city_mpg,popularity,price
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


In [422]:
print('Number of duplicates are : ', cars_data.duplicated().sum())
cars_data = cars_data.drop_duplicates()

Number of duplicates are :  715


In [423]:
print('Number of missing values in each columns are below : ')
print(cars_data.isnull().sum())

Number of missing values in each columns are below : 
make               0
model              0
year               0
fuel_type          3
hp                69
cylinders         30
transmission       0
drive              0
doors              6
market          3376
size               0
style              0
highway_mpg        0
city_mpg           0
popularity         0
price              0
dtype: int64


In [424]:
cars_data.drop('market', axis = 1, inplace = True)

cars_data.drop('popularity', axis = 1, inplace = True)



In [425]:
null_values = cars_data[cars_data.isnull().any(axis = 1)]
null_values


Unnamed: 0,make,model,year,fuel_type,hp,cylinders,transmission,drive,doors,size,style,highway_mpg,city_mpg,price
539,FIAT,500e,2015,electric,,0.0,DIRECT_DRIVE,front wheel drive,2.0,Compact,2dr Hatchback,108,122,31800
540,FIAT,500e,2016,electric,,0.0,DIRECT_DRIVE,front wheel drive,2.0,Compact,2dr Hatchback,103,121,31800
541,FIAT,500e,2017,electric,,0.0,DIRECT_DRIVE,front wheel drive,2.0,Compact,2dr Hatchback,103,121,31800
1983,Chevrolet,Bolt EV,2017,electric,200.0,,DIRECT_DRIVE,front wheel drive,4.0,Compact,4dr Hatchback,110,128,40905
1984,Chevrolet,Bolt EV,2017,electric,200.0,,DIRECT_DRIVE,front wheel drive,4.0,Compact,4dr Hatchback,110,128,36620
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9853,Kia,Soul EV,2016,electric,,0.0,DIRECT_DRIVE,front wheel drive,4.0,Compact,Wagon,92,120,31950
9854,Kia,Soul EV,2016,electric,,0.0,DIRECT_DRIVE,front wheel drive,4.0,Compact,Wagon,92,120,35950
11321,Suzuki,Verona,2004,,155.0,6.0,AUTOMATIC,front wheel drive,4.0,Midsize,Sedan,25,17,17199
11322,Suzuki,Verona,2004,,155.0,6.0,AUTOMATIC,front wheel drive,4.0,Midsize,Sedan,25,17,20199


In [426]:


cars_data['fuel_type'] = cars_data['fuel_type'].fillna('regular unleaded')

cars_data['hp'] = cars_data['hp'].fillna(0)

cars_data['cylinders'] = cars_data['cylinders'].fillna(0)

cars_data['doors'] = cars_data['doors'].fillna(cars_data['doors'].mean())

In [427]:
num_col = cars_data.select_dtypes(include = [np.number])
cat_col = cars_data.select_dtypes(exclude = [np.number])

In [428]:
# Supression des voitures ayant unknown

cars_data.drop(cars_data[cars_data['transmission']=='UNKNOWN'].index, axis='index', inplace = True)

print('Number of missing values in each columns are below : ')
print(cars_data.isnull().sum())


Number of missing values in each columns are below : 
make            0
model           0
year            0
fuel_type       0
hp              0
cylinders       0
transmission    0
drive           0
doors           0
size            0
style           0
highway_mpg     0
city_mpg        0
price           0
dtype: int64


In [429]:
s1 = cars_data.shape
clean = cars_data[['hp', 'cylinders', 'highway_mpg', 'city_mpg', 'price']]
for i in clean.columns:
    qt1 = cars_data[i].quantile(0.25)
    qt3 = cars_data[i].quantile(0.75)
    iqr =  qt3 - qt1
    lower = qt1-(1.5*iqr)
    upper = qt3+(1.5*iqr)
    min_in = cars_data[cars_data[i]<lower].index
    max_in = cars_data[cars_data[i]>upper].index
    
    cars_data.drop(min_in, inplace = True)
    cars_data.drop(max_in, inplace = True)
s2 = cars_data.shape
outliers = s1[0] - s2[0]
print("Deleted outliers are : ", outliers)

Deleted outliers are :  1403


In [430]:
# Nombre de ligne restante dans le dataset après nettoyage

print(len(cars_data))


9784


Partie IA KNN
Deviner le prix en fonction des caractéristiques
Certaines caractèristiques sont obligatoires à rentré tel que : make, year, hp, cylinders.
Puis le reste des caractèristiques sont optionnelles.


In [431]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split

# Fonction d'encodage des variables catégoriques en fonction du prix moyen
def encode_categorical(df, col, target='price'):
    encoding = df.groupby(col)[target].mean() / df[target].mean()
    df[col] = df[col].map(encoding).fillna(0)
    return df, encoding

# Fonction de normalisation des variables numériques entre 0 et 1
def normalize_numerical(df, col):
    min_val = df[col].min()
    max_val = df[col].max()
    df[col] = (df[col] - min_val) / (max_val - min_val)
    return df, (min_val, max_val)

# Fonction d'encodage et de normalisation dynamique
def encode_and_normalize(df, encoders, normalizers, categorical_cols, numerical_cols):
    for col in categorical_cols:
        if col in df.columns:
            df[col] = df[col].map(encoders[col]).fillna(0)

    for col in numerical_cols:
        if col in df.columns:
            min_val, max_val = normalizers[col]
            df[col] = (df[col] - min_val) / (max_val - min_val)

    return df

# Variables à utiliser (obligatoires désormais)
input_features = ['make', 'model', 'year', 'fuel_type', 'transmission']


# Fonction pour entraîner un modèle KNN en fonction des colonnes disponibles

def train_knn_model(cars_data, test_size=0.4, k_neighbors=4):
    categorical_cols = ['make', 'model', 'fuel_type', 'transmission', 'drive', 'size', 'style']
    numerical_cols = ['year', 'hp', 'cylinders']

    encoders = {}
    normalizers = {}

    for col in categorical_cols:
        if col in cars_data.columns:
            cars_data, encoder = encode_categorical(cars_data, col)
            encoders[col] = encoder

    for col in numerical_cols:
        if col in cars_data.columns:
            cars_data, normalizer = normalize_numerical(cars_data, col)
            normalizers[col] = normalizer

    available_cols = [col for col in input_features if col in cars_data.columns]

    X = cars_data[available_cols]
    y = cars_data['price']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

    knn_model = KNeighborsRegressor(n_neighbors=k_neighbors)
    knn_model.fit(X_train, y_train)

    # Évaluation sur l'ensemble de test
    y_test_pred = knn_model.predict(X_test)
    test_r2 = r2_score(y_test, y_test_pred)
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    test_rmse_percentage = (test_rmse / y_test.mean()) * 100

    # Évaluation sur l'ensemble d'entraînement
    y_train_pred = knn_model.predict(X_train)
    train_r2 = r2_score(y_train, y_train_pred)
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    train_rmse_percentage = (train_rmse / y_train.mean()) * 100

    # Affichage structuré des résultats
    print("\n--- Model Training Results ---")
    print(f"Train R-squared: {train_r2:.4f}")
    print(f"Train RMSE: {train_rmse:.2f}")
    print(f"Train RMSE Percentage: {train_rmse_percentage:.2f}%")
    print(f"Test R-squared: {test_r2:.4f}")
    print(f"Test RMSE: {test_rmse:.2f}")
    print(f"Test RMSE Percentage: {test_rmse_percentage:.2f}%")

    # Vérification de l'overfitting
    if train_r2 > test_r2 and (train_r2 - test_r2) > 0.1:
        print("\nWarning: The model might be overfitting. The training performance is significantly better than the test performance.")
    else:
        print("\nThe model does not show significant signs of overfitting.")

    return knn_model, encoders, normalizers, X_train

# Fonction pour faire une prédiction
def predict_price(new_car, knn_model, encoders, normalizers, X_train):
    new_car_df = pd.DataFrame([new_car])

    categorical_cols = list(encoders.keys())
    numerical_cols = list(normalizers.keys())

    new_car_df = encode_and_normalize(new_car_df, encoders, normalizers, categorical_cols, numerical_cols)

    available_cols = [col for col in input_features if col in cars_data.columns]

    new_car_df = new_car_df[available_cols]

    missing_cols = set(X_train.columns) - set(new_car_df.columns)
    for col in missing_cols:
        new_car_df[col] = 0

    new_car_df = new_car_df[X_train.columns]

    predicted_price = knn_model.predict(new_car_df)
    print(f"Predicted price for the new car: ${predicted_price[0]:,.2f}")

# Charger et entraîner le modèle avec les données disponibles
knn_model, encoders, normalizers, X_train = train_knn_model(cars_data)

# Définir le véhicule Audi 90 (1995) à tester
new_car_audi = {
    'make': 'Audi',
    'model': '90',
    'year': 1995,
    'fuel_type': 'regular unleaded',
    'hp': 172,
    'cylinders': 6,
    'transmission': 'MANUAL',
    'drive': 'front wheel drive',
    'size': 'Compact',
    'style': 'Luxury,Sedan'
}

# Tester la prédiction du prix
predict_price(new_car_audi, knn_model, encoders, normalizers, X_train)

# Définir le véhicule Chevrolet Silverado 1500 (2015) à tester
new_car_chevrolet = {
    'make': 'Chevrolet',
    'model': 'Silverado 1500',
    'year': 2015,
    'fuel_type': 'regular unleaded',
    'hp': 355,
    'cylinders': 8,
    'transmission': 'AUTOMATIC',
    'drive': 'rear wheel drive',
    'size': 'Large',
    'style': 'Crew Cab Pickup'
}

# Tester la prédiction du prix
predict_price(new_car_chevrolet, knn_model, encoders, normalizers, X_train)

# Définir le véhicule Volkswagen Golf GTI (2015) à tester
new_car_vw_gti = {
    'make': 'Volkswagen',
    'model': 'Golf GTI',
    'year': 2015,
    'fuel_type': 'premium unleaded (recommended)',
    'hp': 220,
    'cylinders': 4,
    'transmission': 'AUTOMATED_MANUAL',
    'drive': 'front wheel drive',
    'size': 'Compact',
    'style': 'Hatchback,Performance,4dr Hatchback'
}

# Tester la prédiction du prix
predict_price(new_car_vw_gti, knn_model, encoders, normalizers, X_train)

# Définir le véhicule Volkswagen Beetle (2015) à tester
new_car_vw_beetle = {
    'make': 'Volkswagen',
    'model': 'Beetle',
    'year': 2015,
    'fuel_type': 'premium unleaded (recommended)',
    'hp': 210,
    'cylinders': 4,
    'transmission': 'AUTOMATED_MANUAL',
    'drive': 'front wheel drive',
    'size': 'Compact',
    'style': 'Hatchback,Performance,2dr Hatchback'
}

# Tester la prédiction du prix
predict_price(new_car_vw_beetle, knn_model, encoders, normalizers, X_train)

# Définir le véhicule BMW 8 Series (1995) à tester
new_car_bmw_8series = {
    'make': 'BMW',
    'model': '8 Series',
    'year': 1995,
    'fuel_type': 'regular unleaded',
    'hp': 372,
    'cylinders': 12,
    'transmission': 'MANUAL',
    'drive': 'rear wheel drive',
    'size': 'Midsize',
    'style': 'Factory Tuner,Luxury,Performance,Coupe'
}

# Tester la prédiction du prix
predict_price(new_car_bmw_8series, knn_model, encoders, normalizers, X_train)



--- Model Training Results ---
Train R-squared: 0.9476
Train RMSE: 3546.98
Train RMSE Percentage: 12.04%
Test R-squared: 0.9267
Test RMSE: 4265.09
Test RMSE Percentage: 14.80%

The model does not show significant signs of overfitting.
Predicted price for the new car: $2,000.00
Predicted price for the new car: $46,848.75
Predicted price for the new car: $30,162.50
Predicted price for the new car: $29,503.75
Predicted price for the new car: $2,000.00
