In [86]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme to the figures drawn later in the notebook.
sns.set()

from joblib import dump, load
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import Ridge, Lasso
import tensorflow as tf
import warnings
from TransformationPipeline import TransformationPipeline
# NOTE(review): this silences every warning for the rest of the session, including
# pandas diagnostics such as SettingWithCopyWarning - consider narrowing the filter
# to the specific categories that are known to be noise.
warnings.filterwarnings('ignore')

### **1. Loading Data**

In [87]:
# Load the raw listings file; the path is resolved relative to the working directory.
df = pd.read_csv(filepath_or_buffer="Ballouchi.csv")

In [55]:
# Preview the first 15 rows of the raw frame.
df.head(n=15)

Unnamed: 0.1,Unnamed: 0,name,price,marque,modele,Transmission,Carburant,Annee,Kilométrage
0,0,Peugeot 301 très bon état,125 000,Peugeot,301,22193320,5 CV,144000,Essence
1,1,BMW serie 5 a vendre,125 000,BMW,520,Automatique,Essence,2017,80000
2,2,peugeot 301,25 000,Peugeot,301,Manuelle,Essence,2018,78000
3,3,Golf 7 1.2 TSI,53 000,Volkswagen,,Manuelle,Essence,2016,115000
4,4,av mazda 2,37 000,Mazda,2,Manuelle,Essence,2013,150000
5,5,205 JUNIOR,4 900,Peugeot,205,,4 CV,1985,Essence
6,6,ford fiesta,26 000,Ford,,Manuelle,Essence,2010,121000
7,7,mercedes Benz GL 2016,125 000,Mercedes-Benz,,,Contacter l'annonceur,Manuelle,Rolan
8,8,a vendre 406,21 000,Peugeot,,,6 CV,310000,Diesel
9,9,Mazda bt 50 4×4,125 000,Mazda,BT-50,,becem13,160000,Manuelle


In [56]:
# Drop, in place, every row that has a missing value in any column.
df.dropna(axis=0, how="any", inplace=True)

In [57]:
# Print the index range, per-column non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2876 entries, 0 to 7589
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0    2876 non-null   int64 
 1   name          2876 non-null   object
 2   price         2876 non-null   object
 3   marque        2876 non-null   object
 4   modele        2876 non-null   object
 5   Transmission  2876 non-null   object
 6   Carburant     2876 non-null   object
 7   Annee         2876 non-null   object
 8   Kilométrage   2876 non-null   object
dtypes: int64(1), object(8)
memory usage: 224.7+ KB


In [115]:
# Keep only the rows whose mileage is a purely numeric string.
# ``str.isnumeric()`` yields NaN (not False) for missing or non-string cells, and a
# boolean mask that contains NaN makes the indexing raise. ``.eq(True)`` maps those
# cells to False, so such rows are dropped instead of aborting the cell.
mileage_is_numeric = df['Kilométrage'].str.isnumeric().eq(True)
filtered_df = df[mileage_is_numeric]
def is_valid_year(value, min_year=1900, max_year=2023):
    """Return True when ``value`` is a whole-number year inside the accepted range.

    Parameters
    ----------
    value : any
        Candidate year. It is converted with ``str()`` before being checked, so
        both ``2017`` and ``"2017"`` are accepted.
    min_year, max_year : int, optional
        Inclusive bounds of the accepted range. The defaults (1900 and 2023)
        are the limits this notebook has always used.

    Returns
    -------
    bool
        True if the text consists solely of decimal digits and the resulting
        integer lies in ``[min_year, max_year]``; False otherwise.
    """
    text = str(value)
    # ``isdecimal`` rather than ``isnumeric``: the latter also accepts characters
    # such as '²' or '½' that ``int()`` cannot parse, which would raise ValueError
    # instead of simply rejecting the value.
    if not text.isdecimal():
        return False
    return min_year <= int(text) <= max_year

# Narrow the frame one criterion at a time: plausible year, then a known
# transmission type, then a known fuel type.
allowed_transmissions = ["Manuelle", "Automatique"]
allowed_fuels = ["Essence", "Diesel", "Electrique"]

year_is_valid = filtered_df['Annee'].apply(is_valid_year)
filtered_df = filtered_df[year_is_valid]

transmission_is_known = filtered_df["Transmission"].isin(allowed_transmissions)
filtered_df = filtered_df[transmission_is_known]

fuel_is_known = filtered_df["Carburant"].isin(allowed_fuels)
filtered_df = filtered_df[fuel_is_known]

In [116]:
# Print the index range, per-column non-null counts, dtypes and memory usage
# of the filtered frame.
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1178 entries, 1 to 1657
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0    1178 non-null   int64 
 1   name          1178 non-null   object
 2   price         1178 non-null   object
 3   marque        1178 non-null   object
 4   modele        1159 non-null   object
 5   Transmission  1178 non-null   object
 6   Carburant     1178 non-null   object
 7   Annee         1178 non-null   object
 8   Kilométrage   1178 non-null   object
dtypes: int64(1), object(8)
memory usage: 92.0+ KB


In [117]:
# Display the filtered frame (pandas abbreviates long frames in the rendered output).
filtered_df

Unnamed: 0.1,Unnamed: 0,name,price,marque,modele,Transmission,Carburant,Annee,Kilométrage
1,1,BMW serie 5 a vendre,125 000,BMW,520,Automatique,Essence,2017,80000
2,2,peugeot 301,25 000,Peugeot,301,Manuelle,Essence,2018,78000
3,3,Golf 7 1.2 TSI,53 000,Volkswagen,,Manuelle,Essence,2016,115000
4,4,av mazda 2,37 000,Mazda,2,Manuelle,Essence,2013,150000
5,5,205 JUNIOR,4 900,Peugeot,205,Manuelle,Essence,1996,232000
...,...,...,...,...,...,...,...,...,...
1647,7570,golf 6 (7CV) essence 1.6,38 000,Volkswagen,Golf,Manuelle,Essence,2010,98000
1648,7571,polo for,10 000,Volkswagen,Polo,Manuelle,Essence,2001,130000
1652,7577,4X4 PAJERO,18 000,Mitsubishi,Pajero,Manuelle,Diesel,1995,348000
1653,7578,Polo 5 en bonne etat,13 500,Volkswagen,Polo,Manuelle,Essence,2005,145000


In [166]:
# Load the cleaned listings file.
# NOTE(review): no cell visible in this notebook writes "BallouchiCleaned.csv";
# presumably it was produced outside the notebook - confirm and record its provenance.
clean_df = pd.read_csv(filepath_or_buffer="BallouchiCleaned.csv")

In [167]:
# Display the cleaned frame as loaded (pandas abbreviates long frames in the rendered output).
clean_df

Unnamed: 0,Titre,Prix,Marque,Modele,Transmission,Carburant,Annee,Kilométrage
0,BMW serie 5 a vendre,110 000,BMW,520,Automatique,Essence,2017,80000
1,peugeot 301,25 000,Peugeot,301,Manuelle,Essence,2018,78000
2,Golf 7 1.2 TSI,53 000,Volkswagen,,Manuelle,Essence,2016,115000
3,av mazda 2,37 000,Mazda,2,Manuelle,Essence,2013,150000
4,205 JUNIOR,4 900,Peugeot,205,Manuelle,Essence,1996,232000
...,...,...,...,...,...,...,...,...
1172,golf 6 (7CV) essence 1.6,38 000,Volkswagen,Golf,Manuelle,Essence,2010,98000
1173,polo for,10 000,Volkswagen,Polo,Manuelle,Essence,2001,130000
1174,4X4 PAJERO,18 000,Mitsubishi,Pajero,Manuelle,Diesel,1995,348000
1175,Polo 5 en bonne etat,13 500,Volkswagen,Polo,Manuelle,Essence,2005,145000


In [168]:
# Drop, in place, every row that has a missing value in any column.
clean_df.dropna(axis=0, how="any", inplace=True)

In [169]:
# Print the index range, per-column non-null counts, dtypes and memory usage
# of the cleaned frame.
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1158 entries, 0 to 1176
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Titre         1158 non-null   object
 1   Prix          1158 non-null   object
 2   Marque        1158 non-null   object
 3   Modele        1158 non-null   object
 4   Transmission  1158 non-null   object
 5   Carburant     1158 non-null   object
 6   Annee         1158 non-null   int64 
 7   Kilométrage   1158 non-null   int64 
dtypes: int64(2), object(6)
memory usage: 81.4+ KB


In [170]:
def clean_numeric(column):
    """Convert a Series of formatted numbers (e.g. ``"125 000"``) to numeric values.

    Parameters
    ----------
    column : pandas.Series
        Values to convert. Strings may contain whitespace used as a digit-group
        separator; values that are already numeric are passed through.

    Returns
    -------
    pandas.Series
        Numeric Series aligned with ``column``. Entries that cannot be parsed
        as a number become NaN (``errors='coerce'``).
    """
    # Cast to str first so the ``.str`` accessor also works on a column that is
    # already numeric; this keeps the conversion safe to run more than once.
    as_text = column.astype(str)
    # Strip every whitespace character, not only the ASCII space: formatted
    # numbers may use a no-break space (U+00A0) or narrow no-break space (U+202F)
    # as separator, and leaving those in would coerce the value to NaN. Both are
    # listed explicitly so the result does not depend on how the regex engine
    # defines ``\s``.
    whitespace = '[\\s\u00a0\u202f]+'
    compact = as_text.str.replace(whitespace, '', regex=True)
    return pd.to_numeric(compact, errors='coerce')

# Replace the formatted price strings with their numeric values.
clean_df['Prix'] = clean_df['Prix'].pipe(clean_numeric)

In [173]:
# Discard listings priced at 100 or less. Rows whose price is NaN are removed as
# well, because ``NaN > 100`` evaluates to False.
# ``.copy()`` makes the result an independent frame, so later column assignments
# do not write to a slice of the previous ``clean_df`` (the SettingWithCopyWarning
# situation).
clean_df = clean_df[clean_df["Prix"] > 100].copy()

In [174]:
# Display the frame after the price filter (pandas abbreviates long frames in the rendered output).
clean_df

Unnamed: 0,Titre,Prix,Marque,Modele,Transmission,Carburant,Annee,Kilométrage
0,BMW serie 5 a vendre,110000.0,BMW,520,Automatique,Essence,2017,80000
1,peugeot 301,25000.0,Peugeot,301,Manuelle,Essence,2018,78000
3,av mazda 2,37000.0,Mazda,2,Manuelle,Essence,2013,150000
4,205 JUNIOR,4900.0,Peugeot,205,Manuelle,Essence,1996,232000
7,belle polo7,50000.0,Volkswagen,Polo,Manuelle,Essence,2010,195000
...,...,...,...,...,...,...,...,...
1172,golf 6 (7CV) essence 1.6,38000.0,Volkswagen,Golf,Manuelle,Essence,2010,98000
1173,polo for,10000.0,Volkswagen,Polo,Manuelle,Essence,2001,130000
1174,4X4 PAJERO,18000.0,Mitsubishi,Pajero,Manuelle,Diesel,1995,348000
1175,Polo 5 en bonne etat,13500.0,Volkswagen,Polo,Manuelle,Essence,2005,145000


In [175]:
# Store the prices as integers instead of floats.
clean_df = clean_df.astype({"Prix": int})

In [176]:
# Display the frame again to check that Prix is now shown as whole numbers.
clean_df

Unnamed: 0,Titre,Prix,Marque,Modele,Transmission,Carburant,Annee,Kilométrage
0,BMW serie 5 a vendre,110000,BMW,520,Automatique,Essence,2017,80000
1,peugeot 301,25000,Peugeot,301,Manuelle,Essence,2018,78000
3,av mazda 2,37000,Mazda,2,Manuelle,Essence,2013,150000
4,205 JUNIOR,4900,Peugeot,205,Manuelle,Essence,1996,232000
7,belle polo7,50000,Volkswagen,Polo,Manuelle,Essence,2010,195000
...,...,...,...,...,...,...,...,...
1172,golf 6 (7CV) essence 1.6,38000,Volkswagen,Golf,Manuelle,Essence,2010,98000
1173,polo for,10000,Volkswagen,Polo,Manuelle,Essence,2001,130000
1174,4X4 PAJERO,18000,Mitsubishi,Pajero,Manuelle,Diesel,1995,348000
1175,Polo 5 en bonne etat,13500,Volkswagen,Polo,Manuelle,Essence,2005,145000


In [177]:
# Write the final frame to disk as CSV.
# NOTE(review): the target name has no ".csv" extension and is spelled "Bellouchi"
# while the input files are spelled "Ballouchi" - confirm this is intended.
# NOTE(review): ``index`` is left at its default, so the row index is written as the
# first column of the file.
clean_df.to_csv(path_or_buf="BellouchiFinal")