In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import os, sys


##### Load the dataset

In [4]:
# Make the sibling ../Scripts directory importable. The location is resolved
# relative to the current working directory, so the notebook has to be run
# from its own folder for the path to be correct.
# NOTE(review): presumably this is where the `path` module imported in the
# next cell lives -- confirm.
scripts_path = os.path.abspath(os.path.join(os.getcwd(), '../Scripts'))

# Only register the directory once: an unconditional append would add a
# duplicate sys.path entry every time this cell is re-run.
if scripts_path not in sys.path:
    sys.path.append(scripts_path)

In [None]:
from path import PATH_PROCESSED_CSV

In [6]:
# Load the processed dataset from the CSV location exported by the `path` module.
data = pd.read_csv(filepath_or_buffer=PATH_PROCESSED_CSV)

In [7]:
# Preview the first three rows to sanity-check the load.
data.head(n=3)

Unnamed: 0,Etat,Milieage,Carburant,CarVendeur,Portes,seats,fillcareName,Price,etatCategory,Milieage.1,...,Puissance,TransmissionV2,Cylindrée,Vitesses,Cylindres,HorsePower,ClasseEmission,Typedecarburant,Consommationdecarburant,ÉmissionsdeCO2
0,Très bon prix,250 000 km,Diesel,Professionnel,5.0,5.0,FIAT-FIORINO,1450,3,250000,...,55,Boîte manuelle,1248.0,5,1090.0,75,Euro 4,Diesel,4.5,123.0
1,Bon prix,27 500 km,Electrique,Professionnel,5.0,4.0,AUDI-E-TRON GT,599501,2,27500,...,350,Boîte automatique,,3,2254.0,476,,Électrique,,0.0
2,Très bon prix,73 409 km,Essence,Professionnel,5.0,3.0,VOLKSWAGEN-GOLF,4000,3,73409,...,55,Boîte manuelle,1390.0,5,1154.0,75,Euro 4,Essence,6.8,166.0


In [8]:
# Inspect column names, non-null counts and dtypes of the freshly loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397 entries, 0 to 396
Data columns (total 22 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0    Etat                    397 non-null    object 
 1    Milieage                397 non-null    object 
 2    Carburant               397 non-null    object 
 3    CarVendeur              397 non-null    object 
 4    Portes                  386 non-null    float64
 5    seats                   392 non-null    float64
 6   fillcareName             397 non-null    object 
 7   Price                    397 non-null    int64  
 8   etatCategory             397 non-null    int64  
 9   Milieage                 397 non-null    int64  
 10   Month                   395 non-null    float64
 11   Year                    395 non-null    float64
 12  Puissance                397 non-null    int64  
 13  TransmissionV2           391 non-null    object 
 14  Cylindrée                3

In [9]:
# Discard the space-prefixed ' Milieage' column (object dtype in the listing
# above); the integer-typed 'Milieage' column is kept.
data = data.drop(' Milieage', axis='columns')

##### Rename columns for better handling

In [11]:
# Remove every space character from the column labels (leading as well as
# interior) so the columns can be referenced by compact names below.
data = data.rename(columns=lambda label: label.replace(' ', ''))
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397 entries, 0 to 396
Data columns (total 21 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Etat                     397 non-null    object 
 1   Carburant                397 non-null    object 
 2   CarVendeur               397 non-null    object 
 3   Portes                   386 non-null    float64
 4   seats                    392 non-null    float64
 5   fillcareName             397 non-null    object 
 6   Price                    397 non-null    int64  
 7   etatCategory             397 non-null    int64  
 8   Milieage                 397 non-null    int64  
 9   Month                    395 non-null    float64
 10  Year                     395 non-null    float64
 11  Puissance                397 non-null    int64  
 12  TransmissionV2           391 non-null    object 
 13  Cylindrée                382 non-null    float64
 14  Vitesses                 3

In [12]:
# Columns used as continuous numeric features. They are all cast to float so
# they share one dtype before imputation and scaling.
num_cols = [
    'Milieage',
    'Year',
    'Puissance',
    'Cylindrée',
    'HorsePower',
    'Consommationdecarburant',
    'ÉmissionsdeCO2',
]
data = data.astype({name: float for name in num_cols})

In [13]:
# Re-check the dtypes after the float cast of the numeric feature columns.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397 entries, 0 to 396
Data columns (total 21 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Etat                     397 non-null    object 
 1   Carburant                397 non-null    object 
 2   CarVendeur               397 non-null    object 
 3   Portes                   386 non-null    float64
 4   seats                    392 non-null    float64
 5   fillcareName             397 non-null    object 
 6   Price                    397 non-null    int64  
 7   etatCategory             397 non-null    int64  
 8   Milieage                 397 non-null    float64
 9   Month                    395 non-null    float64
 10  Year                     395 non-null    float64
 11  Puissance                397 non-null    float64
 12  TransmissionV2           391 non-null    object 
 13  Cylindrée                382 non-null    float64
 14  Vitesses                 3

# Feature Engineering

In [14]:
# Rows without a target value cannot be used for training or evaluation.
data = data.dropna(axis='index', how='any', subset=['Price'])

#### Creating Age Column

In [15]:
from datetime import datetime

# Vehicle age in years, measured against the calendar year in which the
# notebook is executed. Rows with a missing Year end up with a missing Age.
current_year = datetime.now().year
data['Age'] = current_year - data['Year']

#### Handling Missing Values

In [16]:
# Imputers used by the preprocessing pipelines defined further down.
# Categorical gaps are filled with the most frequent value ...
imputer_cat = SimpleImputer(strategy='most_frequent')
# ... and numeric gaps with the column median.
imputer_num = SimpleImputer(strategy='median')

#### Scaling numeric and encoding categorical variables

In [17]:
# Categorical features: imputed with the most frequent value, then one-hot encoded.
cat_cols = [
    'Etat',
    'Carburant',
    'CarVendeur',
    'TransmissionV2',
    'ClasseEmission',
    'Typedecarburant',
]

# Numeric branch: median imputation followed by standardisation.
num_pipeline = Pipeline(steps=[
    ('imputer', imputer_num),
    ('scaler', StandardScaler()),
])

# Categorical branch: handle_unknown='ignore' keeps transform() from failing
# on categories that were not present when the encoder was fitted.
cat_pipeline = Pipeline(steps=[
    ('imputer', imputer_cat),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its branch. Columns listed in neither group
# are not passed to any transformer (ColumnTransformer's default remainder).
numeric_features = num_cols + ['Age']
preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipeline, numeric_features),
    ('cat', cat_pipeline, cat_cols),
])


In [18]:
# Separate the regression target (Price) from the feature frame.
y = data['Price']
X = data.drop(columns=['Price'])

In [19]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [20]:
# Fit the preprocessor on the training split only, then apply the fitted
# transformation to the test split so no test-set statistics leak into training.
# NOTE(review): X_train / X_test are rebound to the transformed output here, so
# this cell is not idempotent -- re-run the split cell before re-running it.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [21]:
# Report the train/test shapes, first for the feature matrices and then for the targets.
for train_part, test_part in ((X_train, X_test), (y_train, y_test)):
    print(train_part.shape, test_part.shape)


(317, 32) (80, 32)
(317,) (80,)


In [22]:
# Fit a random forest regressor on the preprocessed training data.
# The fixed seed keeps the fitted forest repeatable between runs.
forest_params = {'n_estimators': 100, 'random_state': 42}
model = RandomForestRegressor(**forest_params)
model.fit(X_train, y_train)

In [23]:
# Predict prices for the held-out test split.
y_pred = model.predict(X_test)

# Score the predictions against the true prices with three regression metrics.
mae, mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)


In [24]:
# Print the three evaluation metrics, one per line.
print(
    f"Mean Absolute Error: {mae}",
    f"Mean Squared Error: {mse}",
    f"R-squared Score: {r2}",
    sep="\n",
)

Mean Absolute Error: 53576.818999999996
Mean Squared Error: 8312970958.582663
R-squared Score: 0.6021332399425305


Preprocessing, model training, and evaluation completed. The baseline random forest reaches an R² of about 0.60 on the held-out test set.