In [234]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error


car_filepath = 'C:/Users/luisd/Downloads/used_car_datasets/car_dataset.csv'
car_df = pd.read_csv(car_filepath)

car_df.head(5)



Unnamed: 0,make,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,audi,A1,2017,12500,Manual,15735,Petrol,150,55.4,1.4
1,audi,A6,2016,16500,Automatic,36203,Diesel,20,64.2,2.0
2,audi,A1,2016,11000,Manual,29946,Petrol,30,55.4,1.4
3,audi,A4,2017,16800,Automatic,25952,Diesel,145,67.3,2.0
4,audi,A3,2019,17300,Manual,1998,Petrol,145,49.6,1.0


In [235]:
car_df.describe()

Unnamed: 0,year,price,mileage,tax,mpg,engineSize
count,85555.0,85555.0,85555.0,85555.0,85555.0,85555.0
mean,2017.108305,17824.921793,22988.745076,118.950733,55.745505,1.702485
std,2.127201,10162.376307,21312.435922,64.404267,16.839501,0.584701
min,1970.0,495.0,1.0,0.0,0.3,0.0
25%,2016.0,10790.0,7041.0,125.0,47.1,1.2
50%,2017.0,15700.0,17232.0,145.0,55.4,1.6
75%,2019.0,21998.0,32345.0,145.0,62.8,2.0
max,2060.0,159999.0,323000.0,580.0,470.8,6.6


In [236]:
#Shows the total models of each make
car_df.groupby(['make'])['model'].count()

make
Ford        17965
Hyundai      4860
Mercedes    13119
Skoda        6267
Toyota       6738
VW          15157
audi        10668
bmw         10781
Name: model, dtype: int64

In [237]:
car_df =car_df.dropna()

#Encoding categorical variables
#dummy_df = pd.get_dummies(car_df['make'], drop_first=True, prefix='make')
#car_df = pd.concat([car_df, dummy_df], axis=1)

car_df.head(5)

Unnamed: 0,make,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,audi,A1,2017,12500,Manual,15735,Petrol,150,55.4,1.4
1,audi,A6,2016,16500,Automatic,36203,Diesel,20,64.2,2.0
2,audi,A1,2016,11000,Manual,29946,Petrol,30,55.4,1.4
3,audi,A4,2017,16800,Automatic,25952,Diesel,145,67.3,2.0
4,audi,A3,2019,17300,Manual,1998,Petrol,145,49.6,1.0


In [238]:
from scipy import stats

z_scores = np.abs(stats.zscore(car_df.select_dtypes(include=[np.number])))

# Define a threshold
threshold = 2

# Identify outliers
outliers = (z_scores > threshold).any(axis=1)

# Remove outliers
car_df_no_outliers = car_df[~outliers]

print(f"Original data shape: {car_df.shape}")
print(f"Data shape after outlier removal: {car_df_no_outliers.shape}")

car_df = car_df_no_outliers
car_df.shape

Original data shape: (85555, 10)
Data shape after outlier removal: (72951, 10)


(72951, 10)

In [239]:
cols_to_encode = ['make', 'model', 'fuelType','transmission']
df_dummies = pd.get_dummies(car_df[cols_to_encode],drop_first=True)
car_df_encoded = pd.concat([car_df, df_dummies], axis=1)
car_df_encoded.head(5)
#used dummy encoding to encode categorical variables

Unnamed: 0,make,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize,...,model_180,model_200,model_220,fuelType_Electric,fuelType_Hybrid,fuelType_Other,fuelType_Petrol,transmission_Manual,transmission_Other,transmission_Semi-Auto
0,audi,A1,2017,12500,Manual,15735,Petrol,150,55.4,1.4,...,False,False,False,False,False,False,True,True,False,False
1,audi,A6,2016,16500,Automatic,36203,Diesel,20,64.2,2.0,...,False,False,False,False,False,False,False,False,False,False
2,audi,A1,2016,11000,Manual,29946,Petrol,30,55.4,1.4,...,False,False,False,False,False,False,True,True,False,False
3,audi,A4,2017,16800,Automatic,25952,Diesel,145,67.3,2.0,...,False,False,False,False,False,False,False,False,False,False
4,audi,A3,2019,17300,Manual,1998,Petrol,145,49.6,1.0,...,False,False,False,False,False,False,True,True,False,False


In [240]:
car_df_encoded['car_age'] = 2024 - car_df_encoded['year']

car_df_encoded = car_df_encoded.drop(columns=['make', 'model', 'year', 'fuelType', 'transmission'])

car_df_encoded.head(5)

Unnamed: 0,price,mileage,tax,mpg,engineSize,make_Hyundai,make_Mercedes,make_Skoda,make_Toyota,make_VW,...,model_200,model_220,fuelType_Electric,fuelType_Hybrid,fuelType_Other,fuelType_Petrol,transmission_Manual,transmission_Other,transmission_Semi-Auto,car_age
0,12500,15735,150,55.4,1.4,False,False,False,False,False,...,False,False,False,False,False,True,True,False,False,7
1,16500,36203,20,64.2,2.0,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,8
2,11000,29946,30,55.4,1.4,False,False,False,False,False,...,False,False,False,False,False,True,True,False,False,8
3,16800,25952,145,67.3,2.0,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,7
4,17300,1998,145,49.6,1.0,False,False,False,False,False,...,False,False,False,False,False,True,True,False,False,5


In [241]:
#creating target variable
X= car_df_encoded.drop('price', axis=1)
y = car_df_encoded['price']

scaler = StandardScaler()
X = scaler.fit_transform(X)

#splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

#creating KNN model
k = 3
knn = KNeighborsRegressor(n_neighbors=k)
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'Root Mean Squared Error: {rmse}')

Root Mean Squared Error: 1662.2890494675141


In [242]:
#original rmse is quite high
#using minmax to improve performance
from sklearn.preprocessing import MinMaxScaler

scale_cols = ['tax', 'mileage', 'engineSize', 'mpg']
scale = MinMaxScaler()
scale.fit(car_df_encoded[scale_cols])

In [243]:
normalized = scale.fit_transform(car_df_encoded[scale_cols])
normalized

array([[0.625     , 0.23980735, 0.22222222, 0.45439469],
       [0.08333333, 0.55176723, 0.55555556, 0.60033167],
       [0.125     , 0.45640213, 0.22222222, 0.45439469],
       ...,
       [0.60416667, 0.38879151, 0.55555556, 0.47263682],
       [0.52083333, 0.63783512, 0.55555556, 0.51243781],
       [0.52083333, 0.88398287, 0.55555556, 0.51243781]])

In [244]:
i=0
for col in scale_cols:
    car_df_encoded[col] = normalized[:,i]
    i+=1

In [245]:
car_df_encoded.head(5)

Unnamed: 0,price,mileage,tax,mpg,engineSize,make_Hyundai,make_Mercedes,make_Skoda,make_Toyota,make_VW,...,model_200,model_220,fuelType_Electric,fuelType_Hybrid,fuelType_Other,fuelType_Petrol,transmission_Manual,transmission_Other,transmission_Semi-Auto,car_age
0,12500,0.239807,0.625,0.454395,0.222222,False,False,False,False,False,...,False,False,False,False,False,True,True,False,False,7
1,16500,0.551767,0.083333,0.600332,0.555556,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,8
2,11000,0.456402,0.125,0.454395,0.222222,False,False,False,False,False,...,False,False,False,False,False,True,True,False,False,8
3,16800,0.395528,0.604167,0.651741,0.555556,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,7
4,17300,0.030437,0.604167,0.358209,0.0,False,False,False,False,False,...,False,False,False,False,False,True,True,False,False,5


In [246]:
X, y = car_df_encoded.drop('price', axis=1), car_df_encoded['price']

X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.75)

new_model = KNeighborsRegressor(n_neighbors=3, weights='distance', metric='euclidean')
new_model.fit(X_train, y_train)

In [247]:
#after minmax and using a new model with Euclidean distance the efficiency has skyrocketed
#using R^2
new_model.score(X_test, y_test)

0.944442189801134

In [248]:
#Using R^2
new_model.score(X_train, y_train)

0.9993651031383161

In [249]:
y_pred = new_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'Root Mean Squared Error: {rmse}')

Root Mean Squared Error: 1664.4145505945537


In [250]:
#used cross validation to improve the rmse
from sklearn.model_selection import cross_val_score

# Perform cross-validation
cv_scores = cross_val_score(knn, X, y, cv=10, scoring='neg_mean_squared_error')
rmse_scores = np.sqrt(-cv_scores)

print(f'Cross-validated RMSE: {rmse_scores.mean()} ± {rmse_scores.std()}')

Cross-validated RMSE: 2527.908287810419 ± 836.6553230266518


In [251]:
y_pred_test = new_model.predict(X_test)
y_pred_train = new_model.predict(X_train)

y_test = pd.DataFrame({'Y Test':y_test,'Y Pred':y_pred_test})
y_train = pd.DataFrame({'Y Train':y_train,'Y Pred':y_pred_train})

In [252]:
#This df shows the comparisons between the target prediction and the actual value through testing
y_test.head(10)

Unnamed: 0,Y Test,Y Pred
58483,8000,8000.0
61708,23490,22893.732719
78553,9701,9245.566005
54443,13740,14759.572472
1404,20435,18169.315042
32490,9995,10061.304024
57745,14498,15568.043123
44496,17998,19994.537722
61288,21690,23169.233367
80337,10860,11406.507348
