## Imports

In [69]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split


## Chargement des données

In [70]:
# Load the raw used-car listings; the path is relative to the notebook's
# working directory.
df = pd.read_csv("../data/raw/Used_Car_Dataset.csv")

# Summary statistics of the numeric columns. Left as the last expression of the
# cell so Jupyter renders the frame itself instead of a printed string.
df.describe()

        Unnamed: 0         seats     kms_driven  mileage(kmpl)    engine(cc)  \
count  1553.000000   1553.000000    1553.000000    1550.000000  1.550000e+03   
mean    776.000000     91.480361   52841.931101     236.927277  1.471857e+10   
std     448.456798   2403.424060   40067.800347     585.964295  2.185629e+11   
min       0.000000      4.000000     620.000000       7.810000  5.000000e+00   
25%     388.000000      5.000000   30000.000000      16.342500  1.197000e+03   
50%     776.000000      5.000000   49134.000000      18.900000  1.462000e+03   
75%    1164.000000      5.000000   70000.000000      22.000000  1.995000e+03   
max    1552.000000  67000.000000  810000.000000    3996.000000  3.258640e+12   

       max_power(bhp)    torque(Nm)  price(in lakhs)  
count    1.550000e+03  1.549000e+03      1553.000000  
mean     1.471857e+10  1.423989e+04       166.141494  
std      2.185629e+11  9.666241e+04      3478.855090  
min      5.000000e+00  5.000000e+00         1.000000  
25% 

In [71]:
# First rows of the raw data; a bare last expression gets Jupyter's rich
# table rendering, which print() bypasses.
df.head()

   Unnamed: 0                                           car_name  \
0           0                    2017 Mercedes-Benz S-Class S400   
1           1  2020 Nissan Magnite Turbo CVT XV Premium Opt BSVI   
2           2                       2018 BMW X1 sDrive 20d xLine   
3           3                           2019 Kia Seltos GTX Plus   
4           4                    2019 Skoda Superb LK 1.8 TSI AT   

  registration_year insurance_validity fuel_type  seats  kms_driven  \
0            Jul-17      Comprehensive    Petrol      5       56000   
1            Jan-21      Comprehensive    Petrol      5       30615   
2            Sep-18      Comprehensive    Diesel      5       24000   
3            Dec-19      Comprehensive    Petrol      5       18378   
4            Aug-19      Comprehensive    Petrol      5       44900   

    ownsership transmission manufacturing_year  mileage(kmpl)  engine(cc)  \
0  First Owner    Automatic               2017           7.81      2996.0   
1  First O

In [72]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None").
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1553 entries, 0 to 1552
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0          1553 non-null   int64  
 1   car_name            1553 non-null   object 
 2   registration_year   1553 non-null   object 
 3   insurance_validity  1553 non-null   object 
 4   fuel_type           1553 non-null   object 
 5   seats               1553 non-null   int64  
 6   kms_driven          1553 non-null   int64  
 7   ownsership          1553 non-null   object 
 8   transmission        1553 non-null   object 
 9   manufacturing_year  1553 non-null   object 
 10  mileage(kmpl)       1550 non-null   float64
 11  engine(cc)          1550 non-null   float64
 12  max_power(bhp)      1550 non-null   float64
 13  torque(Nm)          1549 non-null   float64
 14  price(in lakhs)     1553 non-null   float64
dtypes: float64(5), int64(3), object(7)
memory usage: 182.1+

## Nettoyage

In [73]:
# Discard the "Unnamed: 0" column when it is present; errors="ignore" keeps
# this line safe to re-run once the column is already gone.
df = df.drop(columns="Unnamed: 0", errors="ignore")

def clean_registration_year(x):
    """Convert a raw registration value to a four-digit year.

    Accepted forms (surrounding whitespace is ignored):

    * ``"<prefix>-YY"`` such as ``"Jul-17"`` -> 2017. Two-digit years follow
      the POSIX ``%y`` convention: 00-68 map to 2000-2068 and 69-99 map to
      1969-1999.
    * ``"<prefix>-YYYY"`` such as ``"Jul-2017"`` -> 2017.
    * ``"YYYY"`` such as ``"2019"`` (or the int ``2019``) -> 2019.

    Parameters
    ----------
    x : object
        Raw cell value. It is converted with ``str()`` first, so non-string
        inputs (including missing values) are accepted.

    Returns
    -------
    int or float
        The year as an ``int``, or ``np.nan`` when the value cannot be parsed.
    """
    text = str(x).strip()
    has_dash = "-" in text
    # Only the part after the last dash carries the year ("Jul-17" -> "17").
    year_part = text.rsplit("-", 1)[-1].strip() if has_dash else text

    # isdecimal() only accepts characters that int() can parse, whereas
    # isdigit() also lets through e.g. superscript digits that make int() raise.
    if not year_part.isdecimal():
        return np.nan

    if len(year_part) == 4:
        # Already a full year; adding 2000 here would yield e.g. 4017.
        return int(year_part)

    if has_dash and len(year_part) <= 2:
        short_year = int(year_part)
        century = 1900 if short_year >= 69 else 2000
        return century + short_year

    # Any other digit count is ambiguous and treated as missing.
    return np.nan

# Normalise the year columns in one chain:
#   1. parse registration_year with clean_registration_year,
#   2. drop the rows whose registration year could not be parsed,
#   3. store the remaining years as integers,
#   4. coerce manufacturing_year to a number (unparseable values become NaN),
#   5. derive the vehicle age relative to 2025.
# Within a single assign() call, later keywords see the columns produced by
# earlier ones, so `age` is computed from the already-numeric manufacturing_year.
df = (
    df.assign(
        registration_year=lambda frame: frame["registration_year"].apply(
            clean_registration_year
        )
    )
    .dropna(subset=["registration_year"])
    .astype({"registration_year": int})
    .assign(
        manufacturing_year=lambda frame: pd.to_numeric(
            frame["manufacturing_year"], errors="coerce"
        ),
        age=lambda frame: 2025 - frame["manufacturing_year"],
    )
)

def extract_brand(name):
    """Return the brand token of a listing title.

    Titles look like ``"2017 Mercedes-Benz S-Class S400"``: when the first
    whitespace-separated token is a four-digit number it is skipped, and the
    next token is returned. Otherwise the first token is returned.

    Only a single token is returned, so a brand written as several words is
    reduced to its first word.

    Parameters
    ----------
    name : str
        Listing title.

    Returns
    -------
    str or float
        The brand token, or ``np.nan`` when ``name`` is not a string, is
        blank, or holds nothing but the leading year.
    """
    # Missing titles arrive as float NaN, which has no .split().
    if not isinstance(name, str):
        return np.nan

    tokens = name.split()
    # Skip a leading model year such as "2017".
    if tokens and tokens[0].isdigit() and len(tokens[0]) == 4:
        tokens = tokens[1:]

    # Nothing left means there is no brand to report (instead of IndexError).
    return tokens[0] if tokens else np.nan

# Brand taken from the listing title.
df["brand"] = df["car_name"].apply(extract_brand)

# Fill missing technical specs with each column's median. The medians are
# computed here, before the outlier filter below is applied.
num_cols = ["mileage(kmpl)", "engine(cc)", "max_power(bhp)", "torque(Nm)"]
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# Exclusive upper bounds used to discard implausible rows (the raw summary
# shows magnitudes such as engine(cc) around 3e12).
upper_bounds = {
    "engine(cc)": 10000,
    "max_power(bhp)": 2000,
    "torque(Nm)": 1500,
    "price(in lakhs)": 300,
    "kms_driven": 500000,
}

# A row is kept only if it passes every bound and has a non-negative age.
# NaN compares False, so rows with a missing age are dropped as well.
keep = df["age"] >= 0
for column, bound in upper_bounds.items():
    keep &= df[column] < bound

df = df[keep].reset_index(drop=True)

# info() prints its report and returns None, so it is called on its own line;
# head() stays the last expression so the preview is rendered as a table
# rather than as a "(DataFrame, None)" tuple.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 650 entries, 0 to 649
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   car_name            650 non-null    object 
 1   registration_year   650 non-null    int64  
 2   insurance_validity  650 non-null    object 
 3   fuel_type           650 non-null    object 
 4   seats               650 non-null    int64  
 5   kms_driven          650 non-null    int64  
 6   ownsership          650 non-null    object 
 7   transmission        650 non-null    object 
 8   manufacturing_year  650 non-null    float64
 9   mileage(kmpl)       650 non-null    float64
 10  engine(cc)          650 non-null    float64
 11  max_power(bhp)      650 non-null    float64
 12  torque(Nm)          650 non-null    float64
 13  price(in lakhs)     650 non-null    float64
 14  age                 650 non-null    float64
 15  brand               650 non-null    object 
dtypes: float

(                               car_name  registration_year  \
 0          2018 BMW X1 sDrive 20d xLine               2018   
 1          2017 Honda Jazz 1.2 S i VTEC               2017   
 2             2018 Hyundai Creta 1.6 SX               2018   
 3  2019 Tata Nexon 1.2 Revotron XZ Plus               2019   
 4                2020 Honda City VX CVT               2020   
 
       insurance_validity fuel_type  seats  kms_driven   ownsership  \
 0          Comprehensive    Diesel      5       24000  First Owner   
 1  Third Party insurance    Petrol      5       42000  First Owner   
 2          Comprehensive    Petrol      5       76000  First Owner   
 3          Comprehensive    Petrol      5       28783  First Owner   
 4          Comprehensive    Petrol      5       40000  First Owner   
 
   transmission  manufacturing_year  mileage(kmpl)  engine(cc)  max_power(bhp)  \
 0    Automatic              2018.0          20.68      1995.0          1995.0   
 1       Manual             

## Préparation des variables

In [74]:
# Separate the regression target (the price) from the explanatory columns.
target = "price(in lakhs)"
y = df[target]
X = df.drop(columns=[target])

# NOTE(review): X still contains both `manufacturing_year` and `age`
# (age = 2025 - manufacturing_year, i.e. perfectly collinear) as well as the
# free-text `car_name`; confirm that these are meant to be model inputs.
print("X :", X.shape)
print("y :", y.shape)

X : (650, 15)
y : (650,)


## Encodage

In [75]:
# Partition the feature columns by dtype: int64/float64 columns are treated as
# numeric, object columns as categorical. Columns of any other dtype match
# neither list and are therefore not passed to a transformer.
num_cols = X.select_dtypes(include=["int64", "float64"]).columns
cat_cols = X.select_dtypes(include=["object"]).columns

# Numeric features are standardised; categorical features are one-hot encoded,
# with categories unseen during fit ignored at transform time.
numeric_step = ("num", StandardScaler(), num_cols)
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

## Séparation entraînement / test

In [76]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the held-out frame under its real name: `X_val` is never defined, so
# referencing it raises NameError on a fresh kernel.
print(f"X_train : {X_train.shape}, X_test : {X_test.shape}")

X_train : (520, 15), X_test : (130, 15)


## Modèle de régression linéaire multiple

In [77]:
# Chain the column preprocessing and a linear regression into one estimator,
# so the scaler and encoder are fitted on the training rows only.
pipeline_steps = [
    ("preprocessing", preprocessor),
    ("regression", LinearRegression()),
]
model = Pipeline(steps=pipeline_steps)

# fit() returns the fitted pipeline; as the last expression it is displayed
# by the notebook.
model.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessing', ...), ('regression', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


## Évaluer

In [78]:
# Predict prices for the held-out rows.
pred = model.predict(X_test)

# Mean absolute error and mean squared error of those predictions.
mae = mean_absolute_error(y_true=y_test, y_pred=pred)
mse = mean_squared_error(y_true=y_test, y_pred=pred)

# The square root brings the squared error back to the unit of the target.
rmse = mse ** 0.5

(mae, rmse)


(1.2015061028241913, 2.320309989081376)