## Imports

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge,Lasso 
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import glob
import math

### https://www.kaggle.com/datasets/adityadesai13/used-car-dataset-ford-and-mercedes
This dataset is used in both notebooks, after deleting its two uncleaned CSV files.

## Get all CSV files and combine them

In [2]:
# Collect every CSV file in the data folder.
# glob returns paths in arbitrary (filesystem-dependent) order, so sort them:
# the row order of the combined frame feeds the seeded train/val/test split
# further down, and must be stable for the results to be reproducible.
csv_files = sorted(glob.glob("cars/*.csv"))
if not csv_files:
    # pd.concat would raise a ValueError on an empty list anyway; fail early
    # with a message that points at the actual problem.
    raise ValueError("No CSV files found matching 'cars/*.csv'")

# Read each CSV file into its own DataFrame
dfs = [pd.read_csv(file) for file in csv_files]

# Stack all DataFrames into one, renumbering rows 0..n-1
df_combined = pd.concat(dfs, ignore_index=True)

print(df_combined.shape)  # Check total rows & columns
print(df_combined.head())  # Show first 5 rows


(108540, 10)
  model  year  price transmission  mileage fuelType    tax   mpg  engineSize  \
0    A1  2017  12500       Manual    15735   Petrol  150.0  55.4         1.4   
1    A6  2016  16500    Automatic    36203   Diesel   20.0  64.2         2.0   
2    A1  2016  11000       Manual    29946   Petrol   30.0  55.4         1.4   
3    A4  2017  16800    Automatic    25952   Diesel  145.0  67.3         2.0   
4    A3  2019  17300       Manual     1998   Petrol  145.0  49.6         1.0   

   tax(£)  
0     NaN  
1     NaN  
2     NaN  
3     NaN  
4     NaN  


## Dropping what's not important (or what's unclean)

In [3]:
# Discard both tax columns in a single call; everything else is kept.
df_combined = df_combined.drop(columns=["tax(£)", "tax"])

# Confirm the new shape and preview the first 5 rows
print(df_combined.shape)
print(df_combined.head())

(108540, 8)
  model  year  price transmission  mileage fuelType   mpg  engineSize
0    A1  2017  12500       Manual    15735   Petrol  55.4         1.4
1    A6  2016  16500    Automatic    36203   Diesel  64.2         2.0
2    A1  2016  11000       Manual    29946   Petrol  55.4         1.4
3    A4  2017  16800    Automatic    25952   Diesel  67.3         2.0
4    A3  2019  17300       Manual     1998   Petrol  49.6         1.0


## MPG has null values; fill them with the median (or the mean)

In [4]:
# Create an imputer that replaces missing values with the column median
# NOTE(review): the imputer is fitted on the full dataset, before the
# train/validation/test split performed later in this notebook, so the median
# also reflects held-out rows. Consider fitting it on the training rows only.
imputer = SimpleImputer(strategy="median")

# Fit and transform only the 'mpg' column.
# fit_transform returns a 2-D array of shape (n_rows, 1); flatten it so a
# plain 1-D vector is assigned to the single column.
df_combined["mpg"] = imputer.fit_transform(df_combined[["mpg"]]).ravel()

## Any categorical column can be one-hot encoded

In [5]:
# Categorical columns to encode (df_combined must already be loaded)
categorical_cols = ["model", "transmission", "fuelType"]

# One-Hot Encoding (drop first category to avoid multicollinearity)
encoder = OneHotEncoder(sparse_output=False, drop="first")
encoded_data = encoder.fit_transform(df_combined[categorical_cols])

# Convert to DataFrame with proper column names.
# Reuse df_combined's index so the axis=1 concat below lines rows up by
# label; a fresh RangeIndex would only match while df_combined happens to
# have the default 0..n-1 index.
encoded_df = pd.DataFrame(
    encoded_data,
    columns=encoder.get_feature_names_out(categorical_cols),
    index=df_combined.index,
)

# Drop original categorical columns and concatenate new encoded columns
df_combined = df_combined.drop(categorical_cols, axis=1)
df_combined = pd.concat([df_combined, encoded_df], axis=1)
print(df_combined.head())

   year  price  mileage   mpg  engineSize  model_ 2 Series  model_ 3 Series  \
0  2017  12500    15735  55.4         1.4              0.0              0.0   
1  2016  16500    36203  64.2         2.0              0.0              0.0   
2  2016  11000    29946  55.4         1.4              0.0              0.0   
3  2017  16800    25952  67.3         2.0              0.0              0.0   
4  2019  17300     1998  49.6         1.0              0.0              0.0   

   model_ 4 Series  model_ 5 Series  model_ 6 Series  ...  model_200  \
0              0.0              0.0              0.0  ...        0.0   
1              0.0              0.0              0.0  ...        0.0   
2              0.0              0.0              0.0  ...        0.0   
3              0.0              0.0              0.0  ...        0.0   
4              0.0              0.0              0.0  ...        0.0   

   model_220  model_230  transmission_Manual  transmission_Other  \
0        0.0        0.0 

## Car age is better than year and more descriptive

In [6]:
# Year that car age is measured from. Kept as a named constant so the
# assumption is visible and easy to update.
REFERENCE_YEAR = 2025

# Derive the age in years, then drop 'year' since car_age carries the same
# information.
df_combined["car_age"] = REFERENCE_YEAR - df_combined["year"]
df_combined = df_combined.drop("year", axis=1)  # Drop the original year column
print(df_combined.head())  # Show first 5 rows

   price  mileage   mpg  engineSize  model_ 2 Series  model_ 3 Series  \
0  12500    15735  55.4         1.4              0.0              0.0   
1  16500    36203  64.2         2.0              0.0              0.0   
2  11000    29946  55.4         1.4              0.0              0.0   
3  16800    25952  67.3         2.0              0.0              0.0   
4  17300     1998  49.6         1.0              0.0              0.0   

   model_ 4 Series  model_ 5 Series  model_ 6 Series  model_ 7 Series  ...  \
0              0.0              0.0              0.0              0.0  ...   
1              0.0              0.0              0.0              0.0  ...   
2              0.0              0.0              0.0              0.0  ...   
3              0.0              0.0              0.0              0.0  ...   
4              0.0              0.0              0.0              0.0  ...   

   model_220  model_230  transmission_Manual  transmission_Other  \
0        0.0        0.0 

## Separate features (X) and target (y)

In [7]:
# Target: the sale price. Features: every remaining column.
y = df_combined["price"]
X = df_combined.drop(columns=["price"])

# Central tendency of the target: mean is printed, median is the cell output
print("Mean price:", y.mean())
y.median()

Mean price: 16890.124046434496


14698.0

## Split data: 60% train, 20% validation, 20% test

In [8]:
# Stage 1: keep 60% of the rows for training, set 40% aside as a holdout pool
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

# Stage 2: halve the holdout pool -> 20% validation, 20% test
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
)

## Scaling the data (zero mean, unit variance)

In [9]:
# Learn the scaling statistics from the training rows only
scaler = StandardScaler().fit(X_train)

# Apply that same training-derived scaling to every split
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)


## Linear Regression Model

In [10]:
# Per-column count of missing values (per the original note, this check was
# used before the MPG imputation)
print(df_combined.isnull().sum())

# Fit plain linear regression on the scaled training features
lr_model = LinearRegression().fit(X_train_scaled, y_train)

# Validation split: predictions and mean squared error
y_val_pred = lr_model.predict(X_val_scaled)
val_mse = mean_squared_error(y_val, y_val_pred)

# Test split: predictions and mean squared error
y_test_pred = lr_model.predict(X_test_scaled)
test_mse = mean_squared_error(y_test, y_test_pred)

print(f"Linear Regression - Validation MSE: {val_mse:.2f}")
print(f"Linear Regression - Test MSE: {test_mse:.2f}")


price                0
mileage              0
mpg                  0
engineSize           0
model_ 2 Series      0
                    ..
fuelType_Electric    0
fuelType_Hybrid      0
fuelType_Other       0
fuelType_Petrol      0
car_age              0
Length: 206, dtype: int64
Linear Regression - Validation MSE: 14043075.13
Linear Regression - Test MSE: 14211374.03


## L1 Regularization

In [11]:
# Lasso = linear regression with an L1 penalty on the coefficients
lasso_model = Lasso(alpha=1.0)  # Alpha = Regularization Strength
lasso_model.fit(X_train_scaled, y_train)

# Predict on the validation and test splits
y_val_lasso = lasso_model.predict(X_val_scaled)
y_test_lasso = lasso_model.predict(X_test_scaled)

# Mean squared error on each split
lasso_val_mse = mean_squared_error(y_val, y_val_lasso)
lasso_test_mse = mean_squared_error(y_test, y_test_lasso)

# Label the results as Lasso; these lines previously said "Ridge Regression",
# which made them indistinguishable from the Ridge cell's output.
print(f"Lasso Regression - Validation MSE: {lasso_val_mse:.2f}")
print(f"Lasso Regression - Test MSE: {lasso_test_mse:.2f}")


Ridge Regression - Validation MSE: 14043009.11
Ridge Regression - Test MSE: 14214127.53


## L2 Regularization

In [12]:
# Ridge = linear regression with an L2 penalty; alpha sets the penalty strength
ridge_model = Ridge(alpha=1.0).fit(X_train_scaled, y_train)

# Validation split: predictions and mean squared error
y_val_ridge = ridge_model.predict(X_val_scaled)
ridge_val_mse = mean_squared_error(y_val, y_val_ridge)

# Test split: predictions and mean squared error
y_test_ridge = ridge_model.predict(X_test_scaled)
ridge_test_mse = mean_squared_error(y_test, y_test_ridge)

print(f"Ridge Regression - Validation MSE: {ridge_val_mse:.2f}")
print(f"Ridge Regression - Test MSE: {ridge_test_mse:.2f}")


Ridge Regression - Validation MSE: 14043056.36
Ridge Regression - Test MSE: 14211397.68


In [13]:
# Ridge test RMSE expressed as a fraction of the median price
ridge_test_rmse = math.sqrt(ridge_test_mse)
ridge_test_rmse / y.median()

0.256483926234425

In [14]:
# Lasso test RMSE expressed as a fraction of the median price
lasso_test_rmse = math.sqrt(lasso_test_mse)
lasso_test_rmse / y.median()

0.2565085588850643

These results can be improved: see version two of this notebook for a better model, and review the changes made there.