In [30]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import joblib

In [31]:
# Read the raw car listings; a comma is already pandas' default delimiter.
df = pd.read_csv("cars_dataset.csv")
df

Unnamed: 0,id,milage,brand,model,fuel,gear,offerType,price,horse power,year,engine size,doors,seats,previous owner,color,category,image url
0,1,25699,Mercedes-Benz,V 250,Diesel,Automatic,Employee's car,63336,190.0,2020,2.5,4,4,2,Silver,Luxury,https://example.com/image.jpg
1,2,33572,Volvo,V90 Cross Country,Diesel,Automatic,Used,31995,190.0,2018,4.8,4,4,2,White,Pickup,https://example.com/image.jpg
2,3,12500,Mazda,3,Gasoline,Automatic,Used,26990,179.0,2019,3.9,4,4,3,Black,Hatchback,https://example.com/image.jpg
3,4,29750,BMW,320,Diesel,Automatic,Used,38900,190.0,2019,3.4,4,4,1,Gray,Luxury,https://example.com/image.jpg
4,5,99121,Peugeot,206,Gasoline,Manual,Used,3999,60.0,2011,1.6,4,4,3,Black,Sports Car,https://example.com/image.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,396,98000,Skoda,Rapid/Spaceback,Gasoline,Automatic,Used,8990,90.0,2017,2.2,2,4,0,Blue,Sports Car,https://example.com/image.jpg
396,397,58000,Chevrolet,Spark,Gasoline,Manual,Used,3600,68.0,2013,2.4,4,4,3,Black,Sedan,https://example.com/image.jpg
397,398,10572,Opel,Corsa,Gasoline,Manual,Employee's car,13490,75.0,2020,3.3,4,2,0,Red,Hatchback,https://example.com/image.jpg
398,399,25699,Mercedes-Benz,V 250,Diesel,Automatic,Employee's car,63336,190.0,2020,2.5,4,4,2,Silver,Luxury,https://example.com/image.jpg


In [36]:
# 1: Loading and preparing data

# Keep the listing ids aside, in row order, so that row positions can be
# translated back to ids once the "id" column has been removed.
car_ids = df["id"].values

# Feature columns, grouped by the kind of preprocessing they receive.
numeric_cols = [
    "milage", "price", "horse power", "year",
    "engine size", "doors", "seats", "previous owner",
]
categorical_cols = ["brand", "model", "gear", "fuel", "color", "category"]

# Columns that are not used as features. errors="ignore" makes the drop
# tolerate any of them being absent from the frame.
non_feature_cols = ["id", "image url", "offerType"]
df_features = df.drop(columns=non_feature_cols, errors="ignore")


In [37]:
# 2: Creating and preprocessing pipelines

# Numeric columns: fill missing values with the column mean, then standardise.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode into a dense array. Categories not seen during fit are
# ignored at transform time instead of raising.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(sparse_output=False, handle_unknown="ignore")),
])

# Route each column group to its own transformer. Columns named in neither
# list are left out of the result (ColumnTransformer's default remainder).
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_cols),
    ("cat", categorical_transformer, categorical_cols),
])

In [38]:
# 3: Transforming features

# Fit the preprocessor on the feature frame and encode that same frame.
# recommend_similar later reads X[idx] with idx taken from the position of an
# id in car_ids, so row i of X is treated as the car at row i of df.
X = preprocessor.fit_transform(df_features)

In [40]:
# Display the feature frame (df without "id", "image url" and "offerType").
df_features

Unnamed: 0,milage,brand,model,fuel,gear,price,horse power,year,engine size,doors,seats,previous owner,color,category
0,25699,Mercedes-Benz,V 250,Diesel,Automatic,63336,190.0,2020,2.5,4,4,2,Silver,Luxury
1,33572,Volvo,V90 Cross Country,Diesel,Automatic,31995,190.0,2018,4.8,4,4,2,White,Pickup
2,12500,Mazda,3,Gasoline,Automatic,26990,179.0,2019,3.9,4,4,3,Black,Hatchback
3,29750,BMW,320,Diesel,Automatic,38900,190.0,2019,3.4,4,4,1,Gray,Luxury
4,99121,Peugeot,206,Gasoline,Manual,3999,60.0,2011,1.6,4,4,3,Black,Sports Car
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,98000,Skoda,Rapid/Spaceback,Gasoline,Automatic,8990,90.0,2017,2.2,2,4,0,Blue,Sports Car
396,58000,Chevrolet,Spark,Gasoline,Manual,3600,68.0,2013,2.4,4,4,3,Black,Sedan
397,10572,Opel,Corsa,Gasoline,Manual,13490,75.0,2020,3.3,4,2,0,Red,Hatchback
398,25699,Mercedes-Benz,V 250,Diesel,Automatic,63336,190.0,2020,2.5,4,4,2,Silver,Luxury


In [41]:
# 4: Training KNN model

# Brute-force neighbour search using cosine distance on the encoded matrix.
# fit() returns the estimator itself, so construction and fitting are chained.
knn = NearestNeighbors(algorithm="brute", metric="cosine").fit(X)
knn

In [49]:
# 5: Creating recommendation function


# Two-way lookup between a row position and the car id stored at that position.
index_to_id = dict(enumerate(car_ids))
id_to_index = {listed_id: position for position, listed_id in enumerate(car_ids)}

def recommend_similar(car_id: int, n: int = 3) -> list:
    """Return the ids of up to ``n`` cars most similar to ``car_id``.

    Looks up the row of ``car_id`` in the module-level feature matrix ``X``,
    queries the fitted ``knn`` model with it and maps the neighbour row
    positions back to car ids through ``index_to_id``. The queried car itself
    is never part of the result.

    Args:
        car_id: Id of the reference car; must be a key of ``id_to_index``.
        n: Maximum number of recommendations. ``0`` yields an empty list.
            If fewer than ``n`` other rows exist, all of them are returned.

    Returns:
        Car ids as plain ``int`` values, in the order the model returned them.

    Raises:
        ValueError: If ``car_id`` is not in the dataset or ``n`` is negative.
    """
    if car_id not in id_to_index:
        raise ValueError(f"Car ID {car_id} not found in the dataset.")
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}.")
    if n == 0:
        return []

    idx = id_to_index[car_id]
    query_vector = X[idx].reshape(1, -1)

    # One extra neighbour is requested because the query row is part of the
    # fitted data; the request is capped at the number of rows because
    # kneighbors rejects n_neighbors larger than that.
    n_neighbors = min(n + 1, X.shape[0])
    indices = knn.kneighbors(
        query_vector, n_neighbors=n_neighbors, return_distance=False
    )

    # Filter by position instead of dropping the first hit: with duplicate
    # rows the query row is not guaranteed to come back first (or at all).
    similar_ids = [int(index_to_id[i]) for i in indices[0] if i != idx]
    return similar_ids[:n]

In [55]:
# 6: Testing

# Fetch recommendations for car 7 and show their full listing rows.
# The rows are selected by membership, so they appear in df order.
recommended_ids = recommend_similar(7, n=3)
is_recommended = df["id"].isin(recommended_ids)
df[is_recommended]
 

Unnamed: 0,id,milage,brand,model,fuel,gear,offerType,price,horse power,year,engine size,doors,seats,previous owner,color,category,image url
148,149,53473,Opel,Adam,Gasoline,Manual,Used,7800,69.0,2015,1.2,2,2,1,White,Hatchback,https://example.com/image.jpg
155,156,90362,Skoda,Citigo,Gasoline,Manual,Used,6350,75.0,2016,2.0,2,2,3,Blue,Sedan,https://example.com/image.jpg
386,387,90362,Skoda,Citigo,Gasoline,Manual,Used,6350,75.0,2016,2.0,2,2,3,Blue,Sedan,https://example.com/image.jpg


In [56]:
# 7: Save model and pipeline

# Objects to persist, each paired with its target file, in write order.
artifacts = [
    (knn, "car_knn_model.joblib"),
    (preprocessor, "car_preprocessor.joblib"),
    (car_ids, "car_ids.joblib"),
]
written = [joblib.dump(artifact, filename) for artifact, filename in artifacts]

# Echo the return value of the final dump as the cell output.
written[-1]

['car_ids.joblib']