In [1]:
import os
import pandas as pd
import numpy as np

df = pd.read_csv(r'./CarsData.csv')

In [2]:
df.head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,Manufacturer
0,I10,2017,7495,Manual,11630,Petrol,145,60.1,1.0,hyundi
1,Polo,2017,10989,Manual,9200,Petrol,145,58.9,1.0,volkswagen
2,2 Series,2019,27990,Semi-Auto,1614,Diesel,145,49.6,2.0,BMW
3,Yeti Outdoor,2017,12495,Manual,30960,Diesel,150,62.8,2.0,skoda
4,Fiesta,2017,7999,Manual,19353,Petrol,125,54.3,1.2,ford


In [3]:
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, mean_squared_error

df = pd.get_dummies(df, columns=['transmission'])

model_mean_prices = df.groupby('model')['price'].mean()
df['model_mean_price'] = df['model'].map(model_mean_prices)

manufacturer_mean_prices = df.groupby('Manufacturer')['price'].mean()
df['manufacturer_mean_price'] = df['Manufacturer'].map(manufacturer_mean_prices)

df = pd.get_dummies(df, columns=['fuelType'])

df.head()

Unnamed: 0,model,year,price,mileage,tax,mpg,engineSize,Manufacturer,transmission_Automatic,transmission_Manual,transmission_Other,transmission_Semi-Auto,model_mean_price,manufacturer_mean_price,fuelType_Diesel,fuelType_Electric,fuelType_Hybrid,fuelType_Other,fuelType_Petrol
0,I10,2017,7495,11630,145,60.1,1.0,hyundi,False,True,False,False,7718.184731,12727.809384,False,False,False,False,True
1,Polo,2017,10989,9200,145,58.9,1.0,volkswagen,False,True,False,False,11318.725647,16807.898073,False,False,False,False,True
2,2 Series,2019,27990,1614,145,49.6,2.0,BMW,False,False,False,True,19445.345118,22692.888691,True,False,False,False,False
3,Yeti Outdoor,2017,12495,30960,150,62.8,2.0,skoda,False,True,False,False,12756.576497,14284.802683,True,False,False,False,False
4,Fiesta,2017,7999,19353,125,54.3,1.2,ford,False,True,False,False,10190.856199,12269.779238,False,False,False,False,True


In [9]:
min_threshold = 10
max_threshold = 120
df_cleaned = df[(df['mpg'] < max_threshold)]
df_cleaned = df_cleaned[(df_cleaned['mpg'] > min_threshold)]
df_cleaned.describe()

Unnamed: 0,year,price,mileage,tax,mpg,engineSize,model_mean_price,manufacturer_mean_price
count,97139.0,97139.0,97139.0,97139.0,97139.0,97139.0,97139.0,97139.0
mean,2017.06501,16731.439607,23188.005786,120.408435,54.518845,1.664819,16744.685539,16749.40012
std,2.121469,9846.748583,21035.371032,63.209165,11.355634,0.558563,7750.858173,5264.537719
min,1970.0,450.0,1.0,0.0,11.0,0.0,1295.0,10314.259541
25%,2016.0,9999.0,7672.0,125.0,47.1,1.2,10548.84034,12269.779238
50%,2017.0,14386.0,17652.0,145.0,54.3,1.6,15810.910805,14284.802683
75%,2019.0,20695.0,32457.0,145.0,62.8,2.0,20836.05814,22692.888691
max,2024.0,159999.0,323000.0,580.0,117.7,6.6,98934.2,24636.426361


In [11]:
min_threshold_eg = 0.6
max_threshold_eg = 6.5
df_cleaned = df_cleaned[(df_cleaned['engineSize'] < max_threshold_eg)]
df_cleaned = df_cleaned[(df_cleaned['engineSize'] > min_threshold_eg)]
df_cleaned.describe()

Unnamed: 0,year,price,mileage,tax,mpg,engineSize,model_mean_price,manufacturer_mean_price
count,96906.0,96906.0,96906.0,96906.0,96906.0,96906.0,96906.0,96906.0
mean,2017.06498,16731.324562,23196.602904,120.40142,54.521414,1.668686,16748.755651,16751.035085
std,2.115785,9844.979316,21043.620034,63.19778,11.348004,0.552828,7752.21681,5264.610286
min,1970.0,450.0,1.0,0.0,11.0,1.0,1295.0,10314.259541
25%,2016.0,9999.0,7674.0,125.0,47.1,1.2,10548.84034,12269.779238
50%,2017.0,14390.0,17651.5,145.0,54.3,1.6,15810.910805,14284.802683
75%,2019.0,20698.0,32469.0,145.0,62.8,2.0,20836.05814,22692.888691
max,2024.0,159999.0,323000.0,580.0,117.7,6.3,98934.2,24636.426361


In [12]:
df_cleaned[['transmission_Automatic', 'transmission_Manual', 'transmission_Other', 'transmission_Semi-Auto']] = df_cleaned[['transmission_Automatic', 'transmission_Manual', 'transmission_Other', 'transmission_Semi-Auto']].astype(float)
df_cleaned[['fuelType_Diesel', 'fuelType_Electric', 'fuelType_Hybrid', 'fuelType_Other', 'fuelType_Petrol']] = df_cleaned[['fuelType_Diesel', 'fuelType_Electric', 'fuelType_Hybrid', 'fuelType_Other', 'fuelType_Petrol']].astype(float)
df_cleaned['price'] = df_cleaned['price'].astype(int)

df_cleaned.head()

Unnamed: 0,model,year,price,mileage,tax,mpg,engineSize,Manufacturer,transmission_Automatic,transmission_Manual,transmission_Other,transmission_Semi-Auto,model_mean_price,manufacturer_mean_price,fuelType_Diesel,fuelType_Electric,fuelType_Hybrid,fuelType_Other,fuelType_Petrol
0,I10,2017,7495,11630,145,60.1,1.0,hyundi,0.0,1.0,0.0,0.0,7718.184731,12727.809384,0.0,0.0,0.0,0.0,1.0
1,Polo,2017,10989,9200,145,58.9,1.0,volkswagen,0.0,1.0,0.0,0.0,11318.725647,16807.898073,0.0,0.0,0.0,0.0,1.0
2,2 Series,2019,27990,1614,145,49.6,2.0,BMW,0.0,0.0,0.0,1.0,19445.345118,22692.888691,1.0,0.0,0.0,0.0,0.0
3,Yeti Outdoor,2017,12495,30960,150,62.8,2.0,skoda,0.0,1.0,0.0,0.0,12756.576497,14284.802683,1.0,0.0,0.0,0.0,0.0
4,Fiesta,2017,7999,19353,125,54.3,1.2,ford,0.0,1.0,0.0,0.0,10190.856199,12269.779238,0.0,0.0,0.0,0.0,1.0


In [13]:
df_cleaned.drop(['model', 'Manufacturer'], axis=1, inplace=True)

In [14]:
X = df_cleaned.drop('price', axis=1)
y = df_cleaned['price']

X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)  # 0.25 x 0.8 = 0.2

In [15]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)
X_train_scaled

array([[-1.42752194,  1.5286799 , -1.42243676, ..., -0.16544   ,
        -0.04490365, -1.11750198],
       [-0.4919582 , -0.10079655, -1.42243676, ..., -0.16544   ,
        -0.04490365, -1.11750198],
       [-0.4919582 ,  0.91034421, -1.5799873 , ..., -0.16544   ,
        -0.04490365, -1.11750198],
       ...,
       [-0.02417633, -1.01168576,  0.46816978, ..., -0.16544   ,
        -0.04490365,  0.894853  ],
       [ 0.91138741, -0.99250269,  0.3893945 , ..., -0.16544   ,
        -0.04490365,  0.894853  ],
       [ 0.91138741, -0.8051968 ,  0.3893945 , ..., -0.16544   ,
        -0.04490365, -1.11750198]])

In [16]:
from sklearn.metrics import mean_squared_error, r2_score

svr = SVR(kernel='rbf', C=1000, epsilon=0.5, verbose=3)
svr.fit(X_train_scaled, y_train)

y_val_pred = svr.predict(X_val_scaled)
initial_mse = mean_squared_error(y_val, y_val_pred)
initial_r2 = r2_score(y_val, y_val_pred)

print(f"Initial MSE: {initial_mse}")
print(f"Initial R²: {initial_r2}")

[LibSVM]Initial MSE: 10838171.246303327
Initial R²: 0.8897905767560066


In [17]:
best_svr = SVR(kernel='rbf', C=750000, epsilon=1000)
best_svr.fit(X_train_scaled, y_train)

In [18]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

y_test_pred = best_svr.predict(X_test_scaled)

final_mse = mean_squared_error(y_test, y_test_pred)
final_mae = mean_absolute_error(y_test, y_test_pred)
final_r2 = r2_score(y_test, y_test_pred)

print(f"Final MSE: {final_mse}")
print(f"Final MAE: {final_mae}")
print(f"Final R²: {final_r2}")

Final MSE: 5619117.5178378485
Final MAE: 1395.6160079254976
Final R²: 0.9419197135613324
