In [4]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import joblib
import psycopg2
import warnings


In [5]:
warnings.filterwarnings("ignore", category=UserWarning)

conn = psycopg2.connect(
    host="localhost",
    port="5432",
    dbname="autocar",
    user="admin",
    password="admin"
)

query = "SELECT * FROM car;"
df = pd.read_sql_query(query, conn)

conn.close()

df

Unnamed: 0,id,mileage,model,gear,offerType,price,horsePower,year,engineSize,doors,seats,previousOwner,color,averageReviewScore,brandId,fuelTypeId,categoryId,subCategoryId
0,1,41579,Focus,Automatic,used,53382,86,2012,1.1,2,4,1,Silver,0.0,1,1,1,1
1,2,159472,Malibu,Automatic,used,64322,151,2012,3.8,4,5,2,Blue,0.0,2,2,2,2
2,3,95165,Focus,Automatic,used,62432,344,2008,1.5,4,2,2,Silver,0.0,1,2,3,3
3,4,0,Accord,Manual,new,79039,189,2024,4.5,4,5,0,White,0.0,3,3,3,4
4,5,0,CR-V,Automatic,new,49719,195,2025,1.7,4,5,0,Gray,0.0,3,1,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,396,91904,Niro,Manual,used,77437,220,2023,2.2,2,5,3,Gray,0.0,4,4,1,2
396,397,0,Prius,Automatic,new,33643,112,2024,1.5,2,5,0,Black,0.0,9,2,2,3
397,398,34242,GLC,Automatic,used,57845,259,2005,4.2,4,4,3,White,0.0,6,3,2,3
398,399,0,RAV4,Manual,new,63929,281,2024,4.1,2,7,0,Gray,0.0,9,3,2,4


In [16]:
# 1: Loading and preparing data

# Extract IDs
car_ids = df["id"].values

# Drop columns not needed for modeling
df_features = df.drop(columns=["id"], errors='ignore')

# Update categorical and numeric columns
categorical_cols = [
    "model", "gear", "color", "offerType"
    # Note: brandId, fuelTypeId, categoryId, subCategoryId are numeric (but act categorical)
]

numeric_cols = [
    "mileage", "price", "horsePower", "year", "engineSize",
    "doors", "seats", "previousOwner", "averageReviewScore",
    "brandId", "fuelTypeId", "categoryId", "subCategoryId"
]


In [17]:
# 2: Creating and preprocessing pipelines

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_cols),
    ("cat", categorical_transformer, categorical_cols)
])

In [18]:
# 3: Transforming features

X = preprocessor.fit_transform(df_features)

In [19]:
df_features

Unnamed: 0,mileage,model,gear,offerType,price,horsePower,year,engineSize,doors,seats,previousOwner,color,averageReviewScore,brandId,fuelTypeId,categoryId,subCategoryId
0,41579,Focus,Automatic,used,53382,86,2012,1.1,2,4,1,Silver,0.0,1,1,1,1
1,159472,Malibu,Automatic,used,64322,151,2012,3.8,4,5,2,Blue,0.0,2,2,2,2
2,95165,Focus,Automatic,used,62432,344,2008,1.5,4,2,2,Silver,0.0,1,2,3,3
3,0,Accord,Manual,new,79039,189,2024,4.5,4,5,0,White,0.0,3,3,3,4
4,0,CR-V,Automatic,new,49719,195,2025,1.7,4,5,0,Gray,0.0,3,1,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,91904,Niro,Manual,used,77437,220,2023,2.2,2,5,3,Gray,0.0,4,4,1,2
396,0,Prius,Automatic,new,33643,112,2024,1.5,2,5,0,Black,0.0,9,2,2,3
397,34242,GLC,Automatic,used,57845,259,2005,4.2,4,4,3,White,0.0,6,3,2,3
398,0,RAV4,Manual,new,63929,281,2024,4.1,2,7,0,Gray,0.0,9,3,2,4


In [20]:
# 4: Training KNN model

knn = NearestNeighbors(metric="cosine", algorithm="brute")
knn.fit(X)

In [21]:
# 5: Creating recommendation function


index_to_id = {i: car_ids[i] for i in range(len(car_ids))}
id_to_index = {car_ids[i]: i for i in range(len(car_ids))}

def recommend_similar(car_id: int, n: int = 3) -> list:
    if car_id not in id_to_index:
        raise ValueError(f"Car ID {car_id} not found in the dataset.")
    
    idx = id_to_index[car_id]
    query_vector = X[idx].reshape(1, -1)
    distances, indices = knn.kneighbors(query_vector, n_neighbors=n+1) 

    similar_ids = []
    for i in indices[0]:
        if i != idx:
            similar_ids.append(int(index_to_id[i]))
        if len(similar_ids) == n:
            break

    return similar_ids

In [23]:
# 6: Testing

df[df["id"].isin(recommend_similar(1))]
 

Unnamed: 0,id,mileage,model,gear,offerType,price,horsePower,year,engineSize,doors,seats,previousOwner,color,averageReviewScore,brandId,fuelTypeId,categoryId,subCategoryId
10,11,37967,Focus,Manual,used,76545,154,2007,2.1,4,4,3,Silver,0.0,1,3,2,2
20,21,74328,C-Class,Automatic,used,63257,149,2008,2.1,2,5,1,Blue,0.0,6,1,3,3
262,263,123628,Sportage,Automatic,used,64355,183,2016,1.0,4,5,3,Silver,0.0,4,1,1,2


In [24]:
# 7: Save model and pipeline

joblib.dump(knn, "car_knn_model.joblib")
joblib.dump(preprocessor, "car_preprocessor.joblib")
joblib.dump(car_ids, "car_ids.joblib")

['car_ids.joblib']