In [187]:
import numpy as np 
import pandas as pd
import statistics
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [188]:
df = pd.read_csv("car_price_dataset.csv")
print(df.head())

        Brand   Model  Year  Engine_Size Fuel_Type    Transmission  Mileage  \
0         Kia     Rio  2020          4.2    Diesel          Manual   289944   
1   Chevrolet  Malibu  2012          2.0    Hybrid       Automatic     5356   
2    Mercedes     GLA  2020          4.2    Diesel       Automatic   231440   
3        Audi      Q5  2023          2.0  Electric          Manual   160971   
4  Volkswagen    Golf  2003          2.6    Hybrid  Semi-Automatic   286618   

   Doors  Owner_Count  Price  
0      3            5   8501  
1      2            3  12092  
2      4            2  11171  
3      2            1  11780  
4      3            3   2867  


In [189]:
# encoder
label_encoders = {}
cat_columns = ["Brand", "Model", "Fuel_Type", "Transmission"]

for col in cat_columns:
    lb_en = LabelEncoder()
    df[col] = lb_en.fit_transform(df[col])
    label_encoders[col] = lb_en

In [190]:
X = df.drop(["Price"], axis=1)
y = df["Price"]

In [191]:
# normalizer
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [192]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [193]:
# main functions i need: 
#  - calculate distance ( euclidean and manhattan)
#  - get the nearest neighbours function
#  - predict function

In [194]:
class KNN_Classifier:
    def __init__(self, distance_metric: str='euclidean', ):
        self.distance_metric = distance_metric

    def get_distance(self, training_dp, test_dp): #training_dp = [0,1,3]
        if self.distance_metric == "euclidean":
            # just mag of vec between points
            distance: float= 0

            for i in range(len(training_dp)):
                distance += (training_dp[i] - test_dp[i])**2
            euclidean_distance = np.sqrt(distance)
            return euclidean_distance

        elif self.distance_metric == "manhattan":
            distance: float= 0
            for i in range(len(training_dp)-1):
                distance += np.abs(training_dp[i] - test_dp[i])
            manhattan_distance = distance
            return manhattan_distance
        # add that last one later
    
    def get_nearest_neighbours(self, X_train,y_train, test_data,k: int=3 ):
        distance_list = []
        for i in range(len(X_train)-1):
            distance = self.get_distance(X_train[i], test_data)
            distance_list.append([y_train[i], distance])
        distance_list.sort(key=lambda x: x[1])
        return distance_list[:k]

    def predict(self,X_train,y_train, test_data,k: int=3,d_type:int = 0  ):
        neighbours = self.get_nearest_neighbours(X_train,y_train, test_data, k) # results, distance->mode of col 1
        # get the mode of the neighbours list
        if d_type == 0:#classify
            results = [x[0] for x in neighbours]
            prediction = statistics.mode(results)
            return prediction
        elif d_type == 1:#regress
            results = [x[0] for x in neighbours]
            prediction = np.mean(np.array(results))
            return prediction
    
    def predict_all(self,X_train,y_train, test_data,k: int=3,d_type:int = 0):
        predictions = []
        for i in range(len(test_data)):
            prediction = self.predict(X_train,y_train, test_data[i],k,d_type)
            predictions.append(prediction)
        return np.array(predictions)

In [195]:
# testing it 
knn = KNN_Classifier(distance_metric="manhattan")
# X_test[0]
# print(np.array(X_train).shape)
# print(np.array(y_train).shape)
# print(X_test.shape)
pred = knn.predict_all(np.array(X_train), np.array(y_train),test_data=X_test,d_type=1)
print(pred.shape)
from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(y_test, pred))
print("RMSE:", rmse)


(2000,)
RMSE: 948.3909176131491
