# Prédiction du prix des voitures d'occasion – Modèles avec XGBoost

In [10]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error

from xgboost import XGBRegressor


## Chargement du dataset

In [11]:
# Read the raw used-car listings from the project's data folder
# (path is relative to the notebook's working directory).
raw_csv_path = "../data/raw/used_car_dataset.csv"
df = pd.read_csv(raw_csv_path)

# Preview the first rows as the cell output.
df.head()

Unnamed: 0.1,Unnamed: 0,car_name,registration_year,insurance_validity,fuel_type,seats,kms_driven,ownsership,transmission,manufacturing_year,mileage(kmpl),engine(cc),max_power(bhp),torque(Nm),price(in lakhs)
0,0,2017 Mercedes-Benz S-Class S400,Jul-17,Comprehensive,Petrol,5,56000,First Owner,Automatic,2017,7.81,2996.0,2996.0,333.0,63.75
1,1,2020 Nissan Magnite Turbo CVT XV Premium Opt BSVI,Jan-21,Comprehensive,Petrol,5,30615,First Owner,Automatic,2020,17.4,999.0,999.0,9863.0,8.99
2,2,2018 BMW X1 sDrive 20d xLine,Sep-18,Comprehensive,Diesel,5,24000,First Owner,Automatic,2018,20.68,1995.0,1995.0,188.0,23.75
3,3,2019 Kia Seltos GTX Plus,Dec-19,Comprehensive,Petrol,5,18378,First Owner,Manual,2019,16.5,1353.0,1353.0,13808.0,13.56
4,4,2019 Skoda Superb LK 1.8 TSI AT,Aug-19,Comprehensive,Petrol,5,44900,First Owner,Automatic,2019,14.67,1798.0,1798.0,17746.0,24.0


## Nettoyage du dataset (à adapter si besoin)

In [12]:
# The loaded frame carries two unnamed leading columns ("Unnamed: 0.1" and
# "Unnamed: 0"); both just repeat the row number. Drop every "Unnamed*" column
# so that none of them reaches the model as a numeric feature. Nothing happens
# when no such column exists.
unnamed_cols = [c for c in df.columns if str(c).startswith("Unnamed")]
df = df.drop(columns=unnamed_cols)

def clean_registration_year(x):
    """Parse a raw registration value into a four-digit year.

    The value is converted with ``str()`` and stripped of surrounding
    whitespace, then interpreted as one of:

    * ``"<something>-YY"`` (e.g. ``"Jul-17"``)   -> two-digit year, expanded
      with the usual ``%y`` convention: 00-68 -> 2000-2068, 69-99 -> 1969-1999.
    * ``"<something>-YYYY"`` (e.g. ``"Jul-2017"``) -> the year as written.
    * a bare four-digit year (e.g. ``"2019"``)     -> the year as written.

    Parameters
    ----------
    x : any
        Raw cell value; missing values (NaN/None) are accepted.

    Returns
    -------
    int or float
        The year as ``int``, or ``np.nan`` when the value is not recognised.
    """
    text = str(x).strip()

    if "-" in text:
        year_part = text.rsplit("-", 1)[-1].strip()
        if not year_part.isdigit():
            return np.nan
        if len(year_part) == 4:
            # Already a full year: do not add a century on top of it.
            return int(year_part)
        if len(year_part) <= 2:
            yy = int(year_part)
            # Pivot so that e.g. "99" becomes 1999 instead of 2099.
            century = 2000 if yy <= 68 else 1900
            return century + yy
        # Three digits, or more than four: not a plausible year.
        return np.nan

    if text.isdigit() and len(text) == 4:
        return int(text)

    return np.nan

# Year against which the vehicle age is measured.
REFERENCE_YEAR = 2025

# Normalise the registration date to a plain year; unparseable values become NaN.
df["registration_year"] = df["registration_year"].apply(clean_registration_year)

# Discard rows without a usable registration year. The explicit copy makes the
# result an independent frame, so the assignments below do not trigger
# SettingWithCopyWarning on pandas versions without copy-on-write.
df = df.dropna(subset=["registration_year"]).copy()

# Use an explicit 64-bit integer: plain `int` can resolve to int32 on some
# platforms, which would change the column's dtype between machines.
df["registration_year"] = df["registration_year"].astype("int64")

# Non-numeric manufacturing years are coerced to NaN, which makes `age` NaN too.
df["manufacturing_year"] = pd.to_numeric(df["manufacturing_year"], errors="coerce")
df["age"] = REFERENCE_YEAR - df["manufacturing_year"]

def extract_brand(name):
    """Return the brand token of a listing title.

    A title such as ``"2017 Mercedes-Benz S-Class S400"`` may start with a
    four-digit model year; that token is skipped and the next
    whitespace-separated token is returned. Only a single token is returned,
    so a brand written as several words is reduced to its first word.

    Parameters
    ----------
    name : any
        Listing title; missing or non-string values are accepted.

    Returns
    -------
    str
        The brand token, or ``"Unknown"`` when ``name`` is not a string, is
        blank, or contains nothing besides the leading year.
    """
    if not isinstance(name, str):
        # Missing titles arrive as NaN (a float), which has no .split().
        return "Unknown"

    parts = name.split()
    if parts and parts[0].isdigit() and len(parts[0]) == 4:
        parts = parts[1:]  # skip the leading model year

    return parts[0] if parts else "Unknown"

# Derive the brand from the listing title.
df["brand"] = df["car_name"].apply(extract_brand)

# Fill gaps in the technical specs with each column's median.
# NOTE(review): the medians are computed on the whole dataset, i.e. before the
# train/test split, so held-out rows influence the imputed values. Consider
# moving the imputation into the modelling pipeline.
impute_cols = ["mileage(kmpl)", "engine(cc)", "max_power(bhp)", "torque(Nm)"]
df[impute_cols] = df[impute_cols].fillna(df[impute_cols].median())

# Exclusive upper bounds used to discard implausible records.
# Rows with NaN in any of these columns fail the comparison and are dropped.
UPPER_BOUNDS = {
    "engine(cc)": 10000,
    "max_power(bhp)": 2000,
    "torque(Nm)": 1500,
    "price(in lakhs)": 300,
    "kms_driven": 500000,
}

# A negative (or NaN) age is rejected as well.
keep = df["age"] >= 0
for col, bound in UPPER_BOUNDS.items():
    keep &= df[col] < bound

df = df[keep].reset_index(drop=True)

# info() prints its summary and returns None, so call it on its own line and
# leave head() as the cell's displayed value.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 650 entries, 0 to 649
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   car_name            650 non-null    object 
 1   registration_year   650 non-null    int64  
 2   insurance_validity  650 non-null    object 
 3   fuel_type           650 non-null    object 
 4   seats               650 non-null    int64  
 5   kms_driven          650 non-null    int64  
 6   ownsership          650 non-null    object 
 7   transmission        650 non-null    object 
 8   manufacturing_year  650 non-null    float64
 9   mileage(kmpl)       650 non-null    float64
 10  engine(cc)          650 non-null    float64
 11  max_power(bhp)      650 non-null    float64
 12  torque(Nm)          650 non-null    float64
 13  price(in lakhs)     650 non-null    float64
 14  age                 650 non-null    float64
 15  brand               650 non-null    object 
dtypes: float

(                               car_name  registration_year  \
 0          2018 BMW X1 sDrive 20d xLine               2018   
 1          2017 Honda Jazz 1.2 S i VTEC               2017   
 2             2018 Hyundai Creta 1.6 SX               2018   
 3  2019 Tata Nexon 1.2 Revotron XZ Plus               2019   
 4                2020 Honda City VX CVT               2020   
 
       insurance_validity fuel_type  seats  kms_driven   ownsership  \
 0          Comprehensive    Diesel      5       24000  First Owner   
 1  Third Party insurance    Petrol      5       42000  First Owner   
 2          Comprehensive    Petrol      5       76000  First Owner   
 3          Comprehensive    Petrol      5       28783  First Owner   
 4          Comprehensive    Petrol      5       40000  First Owner   
 
   transmission  manufacturing_year  mileage(kmpl)  engine(cc)  max_power(bhp)  \
 0    Automatic              2018.0          20.68      1995.0          1995.0   
 1       Manual             

## Séparation X / y et Train/Test Split

In [13]:
# Separate the regression target (price, in lakhs) from the feature columns.
target_col = "price(in lakhs)"
y = df[target_col]
X = df.drop(columns=[target_col])

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


## Préprocessing

In [14]:
# Route columns by dtype. "number" matches every numeric dtype (int32, int64,
# float32, ...), not just int64/float64. ColumnTransformer drops any column
# that is not listed (default remainder="drop"), so a numeric column stored
# with another width would otherwise vanish from the features without warning.
num_cols = X.select_dtypes(include="number").columns
cat_cols = X.select_dtypes(include=["object"]).columns

# NOTE(review): `car_name` is an object column, so it is one-hot encoded along
# with the other categoricals; it looks close to unique per row — confirm it
# is meant to be a feature.
preprocessor = ColumnTransformer(
    transformers=[
        # Standardise numeric features.
        ("num", StandardScaler(), num_cols),
        # One-hot encode categoricals; categories unseen during fit are
        # encoded as all zeros instead of raising.
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ]
)


## Modèle : XGBoost Regressor

In [15]:
# Hyper-parameters of the gradient-boosted regressor, gathered in one place.
xgb_params = dict(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)
regressor = XGBRegressor(**xgb_params)

# Chain preprocessing and regressor so the transformers are fitted on the
# training rows only.
model = Pipeline(steps=[("preprocessing", preprocessor), ("xgb", regressor)])
model.fit(X_train, y_train)

# Evaluate on the held-out rows: mean absolute error and root mean squared error.
pred = model.predict(X_test)
mae = mean_absolute_error(y_test, pred)
mse = mean_squared_error(y_test, pred)
rmse = mse ** 0.5

mae, rmse


(0.8212055940994851, 1.772336185162157)