In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('cleaned_cars.csv')

df

Unnamed: 0,url,brand,model,price,mileage,fuel_type,color,gearbox,power,engine_size,...,body_type,doors,seats,drivetrain,emission_class,condition,upholstery,upholstery_color,year,country
0,https://www.autoscout24.com/offers/alfa-romeo-...,alfa romeo,159,5500.0,233685.0,diesel,red,manual,136.0,1956.0,...,sedan,4.0,5.0,front,134.0,used,grey,,2011.0,it
1,https://www.autoscout24.com/offers/alfa-romeo-...,alfa romeo,159,13990.0,134000.0,gasoline,red,manual,200.0,1742.0,...,sedan,4.0,5.0,front,,used,,,2010.0,es
2,https://www.autoscout24.com/offers/renault-oth...,renault,,2300.0,27000.0,gasoline,yellow,manual,80.0,800.0,...,sedan,,,,,used,,alcantara,1963.0,be
3,https://www.autoscout24.com/offers/opel-antara...,opel,antara,3499.0,212575.0,gasoline,black,manual,140.0,2405.0,...,off-road/pick-up,5.0,5.0,4wd,229.0,used,,metallic,2008.0,nl
4,https://www.autoscout24.com/offers/alfa-romeo-...,alfa romeo,giulia,39800.0,7100.0,gasoline,white,automatic,280.0,1995.0,...,sedan,4.0,4.0,front,,employee's car,full leather,black,2023.0,de
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15412,https://www.autoscout24.com/offers/tesla-model...,tesla,model 3,31000.0,24000.0,electric,white,automatic,120.0,,...,sedan,5.0,5.0,rear,,used,full leather,black,2020.0,it
15413,https://www.autoscout24.com/offers/tesla-model...,tesla,model 3,28590.0,49393.0,electric,white,automatic,325.0,,...,sedan,4.0,5.0,rear,,used,,metallic,2020.0,es
15414,https://www.autoscout24.com/offers/tesla-model...,tesla,model y,57900.0,7749.0,electric,black,automatic,534.0,,...,off-road/pick-up,5.0,5.0,4wd,,used,full leather,black,2023.0,nl
15415,https://www.autoscout24.com/offers/tesla-model...,tesla,model y,59900.0,2500.0,electric,grey,,480.0,,...,sedan,5.0,5.0,4wd,,used,full leather,black,2023.0,fr


## Preprocessing

In [3]:
# Drop the url column
df = df.drop('url', axis=1)

df

Unnamed: 0,brand,model,price,mileage,fuel_type,color,gearbox,power,engine_size,seller,body_type,doors,seats,drivetrain,emission_class,condition,upholstery,upholstery_color,year,country
0,alfa romeo,159,5500.0,233685.0,diesel,red,manual,136.0,1956.0,dealer,sedan,4.0,5.0,front,134.0,used,grey,,2011.0,it
1,alfa romeo,159,13990.0,134000.0,gasoline,red,manual,200.0,1742.0,dealer,sedan,4.0,5.0,front,,used,,,2010.0,es
2,renault,,2300.0,27000.0,gasoline,yellow,manual,80.0,800.0,dealer,sedan,,,,,used,,alcantara,1963.0,be
3,opel,antara,3499.0,212575.0,gasoline,black,manual,140.0,2405.0,dealer,off-road/pick-up,5.0,5.0,4wd,229.0,used,,metallic,2008.0,nl
4,alfa romeo,giulia,39800.0,7100.0,gasoline,white,automatic,280.0,1995.0,dealer,sedan,4.0,4.0,front,,employee's car,full leather,black,2023.0,de
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15412,tesla,model 3,31000.0,24000.0,electric,white,automatic,120.0,,private seller,sedan,5.0,5.0,rear,,used,full leather,black,2020.0,it
15413,tesla,model 3,28590.0,49393.0,electric,white,automatic,325.0,,dealer,sedan,4.0,5.0,rear,,used,,metallic,2020.0,es
15414,tesla,model y,57900.0,7749.0,electric,black,automatic,534.0,,private seller,off-road/pick-up,5.0,5.0,4wd,,used,full leather,black,2023.0,nl
15415,tesla,model y,59900.0,2500.0,electric,grey,,480.0,,dealer,sedan,5.0,5.0,4wd,,used,full leather,black,2023.0,fr


In [4]:
# Check if model column has any missing values
print(df['model'].isnull().sum())

# Replace the missing values with the most frequent value for the corresponding brand
df['model'] = df.groupby('brand')['model'].transform(lambda x: x.fillna(x.mode()[0]))

print(df['model'].isnull().sum())

219
0


In [5]:
# Use a unique id for each model (brand + model)
df['model'] = df['brand'] + '_' + df['model']

df

Unnamed: 0,brand,model,price,mileage,fuel_type,color,gearbox,power,engine_size,seller,body_type,doors,seats,drivetrain,emission_class,condition,upholstery,upholstery_color,year,country
0,alfa romeo,alfa romeo_159,5500.0,233685.0,diesel,red,manual,136.0,1956.0,dealer,sedan,4.0,5.0,front,134.0,used,grey,,2011.0,it
1,alfa romeo,alfa romeo_159,13990.0,134000.0,gasoline,red,manual,200.0,1742.0,dealer,sedan,4.0,5.0,front,,used,,,2010.0,es
2,renault,renault_clio,2300.0,27000.0,gasoline,yellow,manual,80.0,800.0,dealer,sedan,,,,,used,,alcantara,1963.0,be
3,opel,opel_antara,3499.0,212575.0,gasoline,black,manual,140.0,2405.0,dealer,off-road/pick-up,5.0,5.0,4wd,229.0,used,,metallic,2008.0,nl
4,alfa romeo,alfa romeo_giulia,39800.0,7100.0,gasoline,white,automatic,280.0,1995.0,dealer,sedan,4.0,4.0,front,,employee's car,full leather,black,2023.0,de
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15412,tesla,tesla_model 3,31000.0,24000.0,electric,white,automatic,120.0,,private seller,sedan,5.0,5.0,rear,,used,full leather,black,2020.0,it
15413,tesla,tesla_model 3,28590.0,49393.0,electric,white,automatic,325.0,,dealer,sedan,4.0,5.0,rear,,used,,metallic,2020.0,es
15414,tesla,tesla_model y,57900.0,7749.0,electric,black,automatic,534.0,,private seller,off-road/pick-up,5.0,5.0,4wd,,used,full leather,black,2023.0,nl
15415,tesla,tesla_model y,59900.0,2500.0,electric,grey,,480.0,,dealer,sedan,5.0,5.0,4wd,,used,full leather,black,2023.0,fr


### Null handling

In [6]:
# Check if price column has any missing values
print(df['price'].isnull().sum())

# Check if price column has any zero values
print(df['price'].eq(0).sum())

0
0


In [7]:
# Check if mileage column has any missing values
print(df['mileage'].isnull().sum())

# Use the median value to replace the missing values in the mileage column
df['mileage'] = df['mileage'].fillna(df['mileage'].median())

print(df['mileage'].isnull().sum())

df

181
0


Unnamed: 0,brand,model,price,mileage,fuel_type,color,gearbox,power,engine_size,seller,body_type,doors,seats,drivetrain,emission_class,condition,upholstery,upholstery_color,year,country
0,alfa romeo,alfa romeo_159,5500.0,233685.0,diesel,red,manual,136.0,1956.0,dealer,sedan,4.0,5.0,front,134.0,used,grey,,2011.0,it
1,alfa romeo,alfa romeo_159,13990.0,134000.0,gasoline,red,manual,200.0,1742.0,dealer,sedan,4.0,5.0,front,,used,,,2010.0,es
2,renault,renault_clio,2300.0,27000.0,gasoline,yellow,manual,80.0,800.0,dealer,sedan,,,,,used,,alcantara,1963.0,be
3,opel,opel_antara,3499.0,212575.0,gasoline,black,manual,140.0,2405.0,dealer,off-road/pick-up,5.0,5.0,4wd,229.0,used,,metallic,2008.0,nl
4,alfa romeo,alfa romeo_giulia,39800.0,7100.0,gasoline,white,automatic,280.0,1995.0,dealer,sedan,4.0,4.0,front,,employee's car,full leather,black,2023.0,de
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15412,tesla,tesla_model 3,31000.0,24000.0,electric,white,automatic,120.0,,private seller,sedan,5.0,5.0,rear,,used,full leather,black,2020.0,it
15413,tesla,tesla_model 3,28590.0,49393.0,electric,white,automatic,325.0,,dealer,sedan,4.0,5.0,rear,,used,,metallic,2020.0,es
15414,tesla,tesla_model y,57900.0,7749.0,electric,black,automatic,534.0,,private seller,off-road/pick-up,5.0,5.0,4wd,,used,full leather,black,2023.0,nl
15415,tesla,tesla_model y,59900.0,2500.0,electric,grey,,480.0,,dealer,sedan,5.0,5.0,4wd,,used,full leather,black,2023.0,fr


In [8]:
# Check if fuel_type column has any missing values
print(df['fuel_type'].isnull().sum())

# Replace the missing values with the most frequent value for the corresponding model 

print(df['fuel_type'].isnull().sum())

26
26


In [9]:
# Check if color column has any missing values
print(df['color'].isnull().sum())

# Replace the missing values with the most frequent value for the color column
df['color'] = df['color'].fillna(df['color'].mode()[0])

print(df['color'].isnull().sum())

110
0


In [10]:
# Step 1: Calculate the most common gearbox for each model
most_common_gearbox = df.groupby('model')['gearbox'].agg(pd.Series.mode)

# Step 2: Define a function to apply to each row which fills missing gearbox values
def fill_gearbox(row):
    if pd.isna(row['gearbox']):
        return most_common_gearbox[row['model']]
    else:
        return row['gearbox']

# Step 3: Apply the function to fill missing gearbox values
df['gearbox'] = df.apply(fill_gearbox, axis=1)

# Check the result
print(df['gearbox'].isnull().sum())


0


In [11]:
# Check if power column has any missing values
print(df['power'].isnull().sum())

# Step 1: Calculate the median power for each model
median_power = df.groupby('model')['power'].median()

# Step 2: Define a function to apply to each row which fills missing power values
def fill_power(row):
    if pd.isna(row['power']):
        return median_power[row['model']]
    else:
        return row['power']

# Step 3: Apply the function to fill missing power values
df['power'] = df.apply(fill_power, axis=1)

# Check the result
print(df['power'].isnull().sum())

236
9


In [12]:
# For the remaining missing values in the power column, replace them with the median power of the brand
df['power'] = df.groupby('brand')['power'].transform(lambda x: x.fillna(x.median()))

print(df['power'].isnull().sum())

0


In [13]:
# Check if engine_size column has any missing values
print(df['engine_size'].isnull().sum())

# Step 1: Calculate the median engine_size for each model
median_engine_size = df.groupby('model')['engine_size'].median()

# Step 2: Define a function to apply to each row which fills missing engine_size values
def fill_engine_size(row):
    if pd.isna(row['engine_size']):
        return median_engine_size[row['model']]
    else:
        return row['engine_size']

# Step 3: Apply the function to fill missing engine_size values
df['engine_size'] = df.apply(fill_engine_size, axis=1)

# Check the result
print(df['engine_size'].isnull().sum())

1626
139


In [14]:
# For the remaining missing values in the engine_size column, replace them with the median engine_size of the brand
df['engine_size'] = df.groupby('brand')['engine_size'].transform(lambda x: x.fillna(x.median()))

print(df['engine_size'].isnull().sum())

0


In [15]:
# Check if seller column has any missing values
print(df['seller'].isnull().sum())

0


In [16]:
# Check if body_type column has any missing values
print(df['body_type'].isnull().sum())

0


In [17]:
# Check if doors column has any missing values
print(df['doors'].isnull().sum())

# Step 1: Calculate the most common doors for each model 
most_common_doors = df.groupby('model')['doors'].agg(pd.Series.mode)

# Step 2: Define a function to apply to each row which fills missing doors values
def fill_doors(row):
    if pd.isna(row['doors']):
        return most_common_doors[row['model']]
    else:
        return row['doors']

# Step 3: Apply the function to fill missing doors values
df['doors'] = df.apply(fill_doors, axis=1)

print(df['doors'].isnull().sum())

4264


0


In [18]:
# Check if seats column has any missing values
print(df['seats'].isnull().sum())

# Step 1: Calculate the most common seats for each model
most_common_seats = df.groupby('model')['seats'].agg(pd.Series.mode)

# Step 2: Define a function to apply to each row which fills missing seats values
def fill_seats(row):
    if pd.isna(row['seats']):
        return most_common_seats[row['model']]
    else:
        return row['seats']

# Step 3: Apply the function to fill missing seats values
df['seats'] = df.apply(fill_seats, axis=1)

print(df['seats'].isnull().sum())

1329
0


In [19]:
# Check if drivetrain column has any missing values
print(df['drivetrain'].isnull().sum())

# Step 1: Calculate the most common drivetrain for each model
most_common_drivetrain = df.groupby('model')['drivetrain'].agg(pd.Series.mode)

# Step 2: Define a function to apply to each row which fills missing drivetrain values
def fill_drivetrain(row):
    if pd.isna(row['drivetrain']):
        return most_common_drivetrain[row['model']]
    else:
        return row['drivetrain']

# Step 3: Apply the function to fill missing drivetrain values
df['drivetrain'] = df.apply(fill_drivetrain, axis=1)

print(df['drivetrain'].isnull().sum())


3886
0


In [20]:
# Check if emission_class column has any missing values
print(df['emission_class'].isnull().sum())

# Step 1: Calculate the most common emission_class for each model
most_common_emission_class = df.groupby('model')['emission_class'].agg(pd.Series.mean)

# Step 2: Define a function to apply to each row which fills missing emission_class values
def fill_emission_class(row):
    if pd.isna(row['emission_class']):
        return most_common_emission_class[row['model']]
    else:
        return row['emission_class']

# Step 3: Apply the function to fill missing emission_class values
df['emission_class'] = df.apply(fill_emission_class, axis=1)

print(df['emission_class'].isnull().sum())

# Replace the remaining missing values in the emission_class column with the most frequent value for the corresponding brand
df['emission_class'] = df.groupby('brand')['emission_class'].transform(lambda x: x.fillna(x.mode()[0]))

print(df['emission_class'].isnull().sum())

6048
455
0


In [21]:
# Check if condition column has any missing values
print(df['condition'].isnull().sum())

0


In [22]:
# Check if upholstery column has any missing values
print(df['upholstery'].isnull().sum())

# Step 1: Calculate the most common upholstery for each model
most_common_upholstery = df.groupby('model')['upholstery'].agg(pd.Series.mode)

# Step 2: Define a function to apply to each row which fills missing upholstery values
def fill_upholstery(row):
    if pd.isna(row['upholstery']):
        return most_common_upholstery[row['model']]
    else:
        return row['upholstery']
        
# Step 3: Apply the function to fill missing upholstery values
df['upholstery'] = df.apply(fill_upholstery, axis=1)

print(df['upholstery'].isnull().sum())

4414
0


In [23]:
# Check if year column has any missing values
print(df['year'].isnull().sum())

# Step 1: Calculate the median year for each model
median_year = df.groupby('model')['year'].median()

# Step 2: Define a function to apply to each row which fills missing year values
def fill_year(row):
    if pd.isna(row['year']):
        return median_year[row['model']]
    else:
        return row['year']

# Step 3: Apply the function to fill missing year values
df['year'] = df.apply(fill_year, axis=1)

print(df['year'].isnull().sum())

# For the remaining missing values in the year column, replace them with the median year of the brand
df['year'] = df.groupby('brand')['year'].transform(lambda x: x.fillna(x.median()))

718
2


In [24]:
# Check if country column has any missing values
print(df['country'].isnull().sum())

0


### Normalization and one-hot encoding