## DATA ACQUISTION
----

In [39]:
import requests
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

In [None]:
carlist_uri = 'https://www.carlist.my/new-cars-for-sale/malaysia?page_size=50&page_number={page_number}'

In [None]:
attrs_list = ["data-listing-id", "data-title", "data-display-title", 
              "data-url", "data-installment", "data-image-src", "data-compare-image", 
              "data-make", "data-model", "data-year", "data-mileage", "data-transmission", 
              "data-ad-type", "data-variant", "data-seller-id", "data-profile-id", "data-listing-trusted", 
              "data-dealer-isverified", "data-view-store", "data-country-code", "data-vehicle-type"]

In [None]:
df_carlist = pd.DataFrame(columns = attrs_list)
#carlist_uri = 'https://www.carlist.my/new-cars-for-sale/malaysia?page_size=5&page_number={page_number}'
max_page_number = 114
min_page_number = 1
i = min_page_number
for i in range(max_page_number):
  carlist_url = carlist_uri.format(page_number=i)
  print(carlist_url)
  page = requests.get(carlist_url,headers={'User-Agent': 'Mozilla/5.0'})
  if page.status_code == 200:
    soup = BeautifulSoup(page.text, 'html.parser')
    article = soup.find_all('article',{"class": "listing"})
    for a in article:
      data_dict = dict()
      data_value = []
      for attr in attrs_list:
        data_value.append(a[attr])
      listing_price = a.find('div',{"class":"listing__price"}).text
      #print('Retrieving {i} page - listing id : {listing_id}'.format(i=i,listing_id=a['data-listing-id']))
      data_dict = dict(zip(attrs_list, data_value))
      data_dict['listing_price'] = listing_price
      df_carlist = df_carlist.append(data_dict, ignore_index=True)

pd.DataFrame.to_csv(df_carlist,'df_carlist.csv')

In [None]:
df_carlist.head()

Unnamed: 0,data-listing-id,data-title,data-display-title,data-url,data-installment,data-image-src,data-compare-image,data-make,data-model,data-year,...,data-ad-type,data-variant,data-seller-id,data-profile-id,data-listing-trusted,data-dealer-isverified,data-view-store,data-country-code,data-vehicle-type,listing_price
0,11207652,2023 Honda Civic 1.5 E VTEC Sedan - Call Now t...,2023 Honda Civic 1.5 E VTEC Sedan,https://www.carlist.my/new-cars/2023-honda-civ...,"RM 1,694/month",https://img1.icarcdn.com/25670211/main-m_new-c...,https://img1.icarcdn.com/25670211/thumb-l_new-...,Honda,Civic,2023,...,New,E VTEC,0239a87f-185f-40d7-96c1-577a23382252,dea94362-6914-4d58-b318-193c0b84fea6,False,ʉʏʈʘ,False,my,car,"RM 130,700"
1,11219593,2022 TESLA Model 3 STANDARD RANGE,2022 TESLA Model 3 Standard Range Sedan,https://www.carlist.my/new-cars/2022-tesla-mod...,"RM 3,876/month",https://img1.icarcdn.com/39591211/main-m_new-c...,https://img1.icarcdn.com/39591211/thumb-l_new-...,TESLA,Model 3,2022,...,New,Standard Range,00a3e324-7194-42a5-8702-fdc9e816fd70,8fd728b8-6bb9-41c1-ac98-2aa31dbc83cc,True,ʉʏʈʘ,True,my,car,"RM 299,000"
2,11185947,HONDA PENANG (BM) - 2023 Honda City 1.5 V i-VT...,2023 Honda City 1.5 V i-VTEC Sedan,https://www.carlist.my/new-cars/honda-penang-b...,"RM 1,157/month",https://img1.icarcdn.com/74958111/main-m_new-c...,https://img1.icarcdn.com/74958111/thumb-l_new-...,Honda,City,2023,...,New,V i-VTEC,c99ef1ac-a828-e611-94c3-06e4185eac05,cd9ef1ac-a828-e611-94c3-06e4185eac05,True,ʉʏʈʘ,True,my,car,"RM 89,260"
3,9789463,2022 Perodua Myvi 1.5 AV Hatchback,2022 Perodua Myvi 1.5 AV Hatchback,https://www.carlist.my/new-cars/2022-perodua-m...,RM 777/month,https://img1.icarcdn.com/3649879/main-m_new-ca...,https://img1.icarcdn.com/3649879/thumb-l_new-c...,Perodua,Myvi,2022,...,New,AV,b4695981-437c-e411-94b6-06e4185eac05,b7695981-437c-e411-94b6-06e4185eac05,False,ʉʏʈʘ,False,my,car,"RM 59,900"
4,8637396,2022 Mitsubishi Triton 2.4 VGT Pickup Truck (A),2022 Mitsubishi Triton 2.4 VGT Pickup Truck,https://www.carlist.my/new-cars/2022-mitsubish...,"RM 1,426/month",https://img1.icarcdn.com/6937368/main-m_new-ca...,https://img1.icarcdn.com/6937368/thumb-l_new-c...,Mitsubishi,Triton,2022,...,New,VGT,33292c57-7c53-417e-96f8-c0bc34192ae3,ea48f048-e1d5-490f-b582-8cc0a77008fa,False,ʉʏʈʘ,False,my,car,"RM 109,990"


In [None]:
df_carlist.columns = df_carlist.columns.str.replace('-' , '_')
pd.DataFrame.to_csv(df_carlist,'df_carlist.csv')

In [2]:
#extract particular vehicle specifications & equipments
def extract_specs(ind,vehicle_id,detail_url):
  print('{ind} - Retrieving : {vehicle_id} at {detail_url}'.format(ind=ind,vehicle_id=vehicle_id,detail_url=detail_url))
  detail_url = detail_url
  page = requests.get(detail_url,headers={'User-Agent': 'Mozilla/5.0'})
  soup = BeautifulSoup(page.text, 'html.parser')
  tab_ids = ['tab-specifications','tab-equipments']
  specs_dict = dict() #using dict to store each specification keys & values
  for tab_id in tab_ids:
    spec = soup.find(id=tab_id)
    if spec != None:
      for s in spec.find_all('div',{'class':'u-border-bottom u-padding-ends-xs u-flex u-flex--justify-between'}):
        data_list = []
        for span in s.find_all('span',{'class':'u-width-1/2'}): #the true values is in the span
          data_list.append(span.text)
        specs_dict[data_list[0]] = data_list[1]
  specs_dict['vehicle_id']=vehicle_id
  return(specs_dict)

In [None]:
df_all_cars_specs = pd.DataFrame()

for ind in df_carlist.index:
  car_data = extract_specs(ind,df_carlist['data_listing_id'][ind],df_carlist['data_url'][ind])
  df_all_cars_specs = df_all_cars_specs.append(car_data, ignore_index=True)

In [None]:
df_all_cars_specs.columns

In [None]:
#rename df_all_cars_specs columns
#rename the columns
import re
keys = []
df_all_cars_specs02 = df_all_cars_specs
for k in df_all_cars_specs02.columns:
  keys.append(re.sub("\(.*?\)","",k).strip().replace(" ","_").lower())

df_all_cars_specs02.columns = keys
df_carlist.rename(columns={'data_listing_id':'vehicle_id'},inplace=True)

In [None]:
df_carlist_merged = pd.merge(df_carlist,df_all_cars_specs02,on='vehicle_id')
df_carlist_merged = df_carlist_merged.drop_duplicates()

rows = len(df_carlist_merged)

#drop the columns with too many null values (not significant to compute)
df_carlist_merged.drop(df_carlist_merged[df_carlist_merged.columns[df_carlist_merged.isna().sum()>rows/2]].columns, axis=1,inplace=True)
# pd.DataFrame.to_csv(df_carlist_merged,'df_carlist_merged.csv')

FileNotFoundError: ignored

In [None]:
df_carlist_merged.to_csv('df_carlist_merged.csv')

In [None]:
df_carlist_merged.isna().sum()

vehicle_id               0
data_title               0
data_display_title       0
data_url                 0
data_installment         0
                      ... 
rim_material           678
number_of_gears       1532
reverse_camera        2477
isofix                2762
fuel_tank             1577
Length: 66, dtype: int64

## Data Preprocessing
---
- Clean the data
- figure out which one are important and impute using KNN

In [3]:
df_carlist_merged = pd.read_csv('https://drive.google.com/uc?id=1hZqmEMmZDJRM1PNlRbV7yCQfkdWpicr4')

In [None]:
for w in ['RM',',','/month']:
  df_carlist_merged['data_installment'] = df_carlist_merged['data_installment'].str.replace(w, '')
  df_carlist_merged['listing_price'] = df_carlist_merged['listing_price'].str.replace(w, '')

df_carlist_merged['data_installment'] = pd.to_numeric(df_carlist_merged['data_installment'])
df_carlist_merged['listing_price'] = pd.to_numeric(df_carlist_merged['listing_price'])

In [None]:
df_carlist_merged.to_csv('df_carlist_cleaned.csv')

Data Visualization
- bar graph & pie chart
- remove outliers

In [3]:
#subset to get those important features
# df_carlist_merged = pd.read_csv('df_carlist_merged.csv')
df_carlist_cleaned = pd.read_csv('https://drive.google.com/uc?id=18HtoXfjwz5UWPsw84s9FJjxFKE6-gpJ1')

In [26]:
df_carlist02 = df_carlist_cleaned.copy()

df_carlist02 = df_carlist02[['vehicle_id','data_installment','listing_price','data_make','data_model','data_year', 
                'data_mileage','data_transmission','doors', 'seat_capacity', 'assembled',
                'engine_cc','peak_power', 'peak_torque', 'engine_type','fuel_type']]
df_carlist02.dtypes

vehicle_id             int64
data_installment       int64
listing_price          int64
data_make             object
data_model            object
data_year              int64
data_mileage           int64
data_transmission     object
doors                float64
seat_capacity        float64
assembled             object
engine_cc            float64
peak_power           float64
peak_torque          float64
engine_type           object
fuel_type             object
dtype: object

In [27]:
df_carlist02.isna().sum()

vehicle_id             0
data_installment       0
listing_price          0
data_make              0
data_model             0
data_year              0
data_mileage           0
data_transmission      0
doors                  1
seat_capacity          1
assembled            203
engine_cc            553
peak_power           340
peak_torque          345
engine_type          203
fuel_type              1
dtype: int64

In [28]:
df_carlist02['assembled'] = df_carlist02['assembled'].fillna(df_carlist02['assembled'].mode()[0])
df_carlist02['seat_capacity'] = df_carlist02['seat_capacity'].fillna(df_carlist02['seat_capacity'].mode()[0])
df_carlist02['doors'] = df_carlist02['doors'].fillna(df_carlist02['doors'].mode()[0])
df_carlist02['engine_cc'] = df_carlist02['engine_cc'].fillna(df_carlist02['engine_cc'].mode()[0])
df_carlist02['peak_power'] = df_carlist02['peak_power'].fillna(df_carlist02['peak_power'].mode()[0])
df_carlist02['peak_torque'] = df_carlist02['peak_torque'].fillna(df_carlist02['peak_torque'].mode()[0])
df_carlist02['engine_type'] = df_carlist02['engine_type'].fillna(df_carlist02['engine_type'].mode()[0])
df_carlist02['fuel_type'] = df_carlist02['fuel_type'].fillna(df_carlist02['fuel_type'].mode()[0])

In [29]:
df_carlist02.isna().sum()

vehicle_id           0
data_installment     0
listing_price        0
data_make            0
data_model           0
data_year            0
data_mileage         0
data_transmission    0
doors                0
seat_capacity        0
assembled            0
engine_cc            0
peak_power           0
peak_torque          0
engine_type          0
fuel_type            0
dtype: int64

### Remove outliers

In [60]:
# Statistical summary
df_carlist02.iloc[:,1:].describe()

Unnamed: 0,data_installment,listing_price,data_year,data_mileage,doors,seat_capacity,engine_cc,peak_power,peak_torque
count,5600.0,5600.0,5600.0,5600.0,5600.0,5600.0,5600.0,5600.0,5600.0
mean,1975.854286,152406.1,2021.979286,3.203393,4.459464,5.2475,1781.671786,164.213036,252.54625
std,1737.182048,133998.0,0.498542,88.338692,0.724485,1.517827,684.204159,73.344452,134.56929
min,303.0,23367.0,2012.0,0.0,2.0,2.0,659.0,53.0,78.0
25%,1009.25,77814.0,2022.0,0.0,4.0,5.0,1496.0,107.0,145.0
50%,1445.5,111494.0,2022.0,0.0,5.0,5.0,1498.0,150.0,213.0
75%,2169.25,167316.0,2022.0,0.0,5.0,5.0,1996.0,182.0,350.0
max,41356.0,3190000.0,2023.0,2500.0,5.0,28.0,12913.0,730.0,900.0


From the summary above, we can find that the `data_installment` and `listing_price` are having some anomalous data, as well as the `seat_capacity` (max=28). Some boxplots are plotted to check the data.

In [61]:
data = df_carlist02.data_installment
fig = px.box(data , x = "data_installment",points="outliers",title="Monthly Installment (RM/month)",width=600,height=300)
fig.show()

In [92]:
import plotly.graph_objs as go
import plotly.subplots as sp

fig = sp.make_subplots(rows=1, cols=2, shared_yaxes=True)

# Add the first boxplot graph to the first column of the subplot
fig.add_trace(go.Box(y=df_carlist02['data_installment'], 
                     name='Monthly Installment', boxpoints='outliers', jitter=0.3, pointpos=-1.8), row=1, col=1)

# Add the second boxplot graph to the second column of the subplot
fig.add_trace(go.Box(x=df_carlist02['data_make'],y=df_carlist02['data_installment'], 
                     name='Monthly Installment by Vehicle Make', boxpoints='outliers', jitter=0.3, pointpos=-1.8), row=1, col=2)

# Update the layout of the subplot
fig.update_layout(title='Installment')

# Show the subplot
fig.show()

In [94]:
#show the data_make with data_installment more than 3907
outliers = df_carlist02.iloc[np.where(df_carlist02['data_installment']>3907)].index

#Those vehicles are deleted
df_carlist02.drop(outliers,inplace=True)

In [98]:
fig = sp.make_subplots(rows=1, cols=2, shared_yaxes=False)

# Add the first boxplot graph to the first column of the subplot
fig.add_trace(go.Box(y=df_carlist02['data_installment'], 
                     name='Monthly Installment', boxpoints='outliers', jitter=0.3, pointpos=-1.8), row=1, col=1)

# Add the second boxplot graph to the second column of the subplot
fig.add_trace(go.Box(y=df_carlist02['listing_price'], 
                     name='Vehicle Price', boxpoints='outliers', jitter=0.3, pointpos=-1.8), row=1, col=2)

# Update the layout of the subplot
fig.update_layout(title='Monthly Installment and Listing Price')

# Show the subplot
fig.show()

In [124]:
fig = sp.make_subplots(rows=1, cols=2, shared_yaxes=False)

fig.add_trace(go.Scatter(x=df_carlist02['data_make'],
                         y=df_carlist02['seat_capacity'],mode = 'markers'))

# fig.add_trace(go.Box(y=df_carlist02['seat_capacity'], 
#                      name='Seat Capacity', boxpoints='outliers', jitter=0.3, pointpos=-1.8), row=1, col=2)

# Update the layout of the subplot
fig.update_layout(title='Vehicle Seat Capacity')

# Show the subplot
fig.show()

In [125]:
df_carlist02.describe()

Unnamed: 0,vehicle_id,data_installment,listing_price,data_year,data_mileage,doors,seat_capacity,engine_cc,peak_power,peak_torque
count,4916.0,4916.0,4916.0,4916.0,4916.0,4916.0,4916.0,4916.0,4916.0,4916.0
mean,9854395.0,1481.186534,114250.107811,2021.987592,3.638527,4.45891,5.26546,1780.575671,146.78926,231.140155
std,1104280.0,749.281293,57797.752513,0.514736,94.277101,0.742915,1.602552,703.166051,48.568831,115.197019
min,4274377.0,303.0,23367.0,2012.0,0.0,2.0,2.0,659.0,53.0,78.0
25%,9180608.0,972.0,75000.0,2022.0,0.0,4.0,5.0,1477.0,107.0,141.0
50%,10060760.0,1320.0,101800.0,2022.0,0.0,5.0,5.0,1498.0,150.0,200.0
75%,10717500.0,1833.25,141425.0,2022.0,0.0,5.0,5.0,1996.0,177.0,260.0
max,11222960.0,3907.0,301383.0,2023.0,2500.0,5.0,28.0,12913.0,462.0,660.0


# 4.0 Data Modeling
---
Purpose : recommend the most similar vehicles based on the user's input.


### 4.1 Convert all categorical variables into numbers by LabelEncoder.

In [198]:
df_carlist02.dtypes

vehicle_id             int64
data_installment       int64
listing_price          int64
data_make             object
data_model            object
data_year              int64
data_mileage           int64
data_transmission     object
doors                float64
seat_capacity        float64
assembled             object
engine_cc            float64
peak_power           float64
peak_torque          float64
engine_type           object
fuel_type             object
dtype: object

In [199]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

df_carlist03 = df_carlist02.copy()

for column in df_carlist03.iloc[:,1:].columns:
    if df_carlist02[column].dtype == 'object':
        le = preprocessing.LabelEncoder()
        le.fit(df_carlist03[column].unique())
        df_carlist03[column] = le.transform(df_carlist03[column])
        le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
        print(column,': mapping')
        print(pd.DataFrame.from_dict(le_name_mapping,orient='index'))
        print("\n")

data_make : mapping
                0
Audi            0
BMW             1
BYD             2
CAF DFSK        3
CAM             4
Chana           5
DFSK            6
Daihatsu        7
Fiat            8
Ford            9
Foton          10
GAC            11
Golden Dragon  12
Great Wall     13
Higer          14
Hino           15
Honda          16
Hyundai        17
Isuzu          18
JMC            19
Kia            20
Lexus          21
MG             22
MINI           23
Maxus          24
Mazda          25
Mercedes-Benz  26
Mitsubishi     27
Nissan         28
Perodua        29
Peugeot        30
Proton         31
Renault        32
Subaru         33
Suzuki         34
TESLA          35
Toyota         36
Volkswagen     37
Volvo          38


data_model : mapping
           0
2          0
2008       1
218i       2
3          3
3 Door     4
...      ...
XZU720   137
Xpander  138
Yaris    139
ZS       140
e        141

[142 rows x 1 columns]


data_transmission : mapping
           0
Automatic  0
M

Before training, mock a user's input dataset by using random library from python.
Selected features : "listing_price", "data_installment", "seat_capacity", "data_transmission", "engine_cc", "fuel_type".

The data to be mocked:
- salary (random numbers between 1500 and 10000)
- preferred monthly installment (random numbers between the min and max monthly_installment from dataset )
- seat capacity (random numbers between the min and median seat_capacity from dataset )
- data_transmission (random number between 0 and 1)
- engine_cc (random numbers between the min and median engine_cc from dataset )
- fuel_type (random numbers between 0 and 3)

In [203]:
import random

print(random.randint(0,9))

np.random.seed(6)
size = 10000

# The data to be mocked:

# salary (random numbers between 1500 and 10000)
# preferred monthly installment (random numbers between the min and max monthly_installment from dataset )
# seat capacity (random numbers between the min and median seat_capacity from dataset )
# data_transmission (random number between 0 and 1)
# engine_cc (random numbers between the min and median engine_cc from dataset )
# fuel_type (random numbers between 0 and 3)

salary = np.random.randint(low=1500,high=6000,size=size)

data_installment = np.random.randint(
    low = df_carlist03['data_installment'].min(),
    high = df_carlist03['data_installment'].median(),size=size)

seat_capacity = np.random.randint(
    low=2,
    high=10,size=size)

data_transmission = np.random.choice([0,0,1],size=size)

,
engine_cc= np.random.randint(
    low=df_carlist03['engine_cc'].min(),
    high=df_carlist03['engine_cc'].median(),size=size)

fuel_type = np.random.choice([0,1,2,3],size=size)

data = {
    'salary':salary,
    'data_installment':data_installment,
    #'preferred_data_model':preferred_data_model, 
    'seat_capacity':seat_capacity,
    'data_transmission':data_transmission,
    'engine_cc':engine_cc,
    'fuel_type':fuel_type
}

mock_user_input = pd.DataFrame(data)

2


In [204]:
mock_user_input.describe()

Unnamed: 0,salary,data_installment,seat_capacity,data_transmission,engine_cc,fuel_type
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,3764.5153,814.2563,5.4899,0.3253,1078.2301,1.4898
std,1291.128715,293.95686,2.288104,0.46851,241.64946,1.118491
min,1500.0,303.0,2.0,0.0,659.0,0.0
25%,2669.0,556.0,4.0,0.0,869.0,0.0
50%,3762.0,812.0,5.0,0.0,1076.0,1.0
75%,4884.25,1073.0,7.0,1.0,1287.0,2.0
max,5999.0,1319.0,9.0,1.0,1497.0,3.0


In [171]:
df_carlist03.dtypes

vehicle_id             int64
data_installment       int64
listing_price          int64
data_make              int32
data_model             int32
data_year              int64
data_mileage           int64
data_transmission      int32
doors                float64
seat_capacity        float64
assembled              int32
engine_cc            float64
peak_power           float64
peak_torque          float64
engine_type            int32
fuel_type              int32
dtype: object

Define a list for the selected features to be passed inside algorithm.

In [291]:
features = ["data_installment", "seat_capacity", "data_transmission", "engine_cc", "fuel_type"]

## 4.1 K-Nearest Neighbors (KNN)
This algorithm can be used to find similar vehicles based on the user's preferred monthly installment, salary, and seat capacity. The algorithm works by finding the K nearest vehicles to the user's input, and then recommending the most similar vehicles.

In [292]:
from sklearn.neighbors import NearestNeighbors

In [390]:
knn_carlist = df_carlist03.copy()

In [391]:
# Create an instance of the NearestNeighbors class
knn = NearestNeighbors(n_neighbors=3, algorithm='ball_tree')

In [392]:
# Fit the model to the data
knn.fit(knn_carlist[features])

In [393]:
user_input = mock_user_input.iloc[0:1,1:] #test
distances, indices = knn.kneighbors(user_input)

In [394]:
# Recommend the most similar vehicles
similar_vehicles = knn_carlist.iloc[indices[0], :]

In [395]:
print(user_input) #["data_installment", "seat_capacity", "data_transmission", "engine_cc", "fuel_type"]
similar_vehicles[["data_installment", "seat_capacity", "data_transmission", "engine_cc", "fuel_type"]]

   data_installment  seat_capacity  data_transmission  engine_cc  fuel_type
0              1042              6                  0       1469          1


Unnamed: 0,data_installment,seat_capacity,data_transmission,engine_cc,fuel_type
5180,1038,5.0,0,1477.0,3
1606,1037,5.0,0,1477.0,3
1725,1037,5.0,0,1477.0,3
4807,1027,5.0,0,1477.0,3
5106,1027,5.0,0,1477.0,3
4818,1027,5.0,0,1477.0,3
10,1018,5.0,0,1477.0,3
355,1018,5.0,0,1477.0,3
2433,1042,7.0,0,1496.0,3
348,1041,7.0,0,1496.0,3


#### KNN-based Recommendation Model - Evaluation
---

Euclidean distance is a similarity metric that can be used to evaluate the performance of a K-Nearest Neighbors (KNN) recommendation model. The Euclidean distance between two points is the square root of the sum of the squares of the differences between the corresponding coordinates. We will calculate the Euclidean distance between the recommended items and the user input by computing the square root of the sum of the squares of the differences between the corresponding features of the two items.

The Euclidean distance is a scalar value that ranges from 0 to infinity, where 0 indicates that the recommended items are exactly the same as the user input, and a high value indicates that the recommended items and user input are different. A small distance indicates that the recommended items are similar to the user input.

In [396]:
np.array(user_input).flatten()

array([1042,    6,    0, 1469,    1])

In [397]:
from scipy.spatial import distance

# Iterate over each row in the dataframe
for i, row in similar_vehicles.iterrows():
    # Extract the features of the recommended items and user input
    recommended_items =  np.array(row[features]).flatten()
    user_input = np.array(user_input).flatten()
    # Compute the Euclidean distance
    euclidean_distance = distance.euclidean(recommended_items, user_input)
    
    # Assign the Euclidean distance to the new column
    similar_vehicles.at[i, 'euclidean_distance'] = euclidean_distance



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [398]:
print(user_input)
similar_vehicles

[1042    6    0 1469    1]


Unnamed: 0,vehicle_id,data_installment,listing_price,data_make,data_model,data_year,data_mileage,data_transmission,doors,seat_capacity,assembled,engine_cc,peak_power,peak_torque,engine_type,fuel_type,euclidean_distance
5180,10063719,1038,80078,31,130,2022,1,0,5.0,5.0,0,1477.0,150.0,226.0,1,3,9.219544
1606,8194786,1037,80000,31,130,2022,1,0,5.0,5.0,0,1477.0,177.0,255.0,1,3,9.69536
1725,10802049,1037,80000,31,130,2022,0,0,5.0,5.0,0,1477.0,177.0,255.0,1,3,9.69536
4807,8776551,1027,79200,31,130,2022,0,0,5.0,5.0,0,1477.0,150.0,226.0,1,3,17.146428
5106,9901276,1027,79200,31,130,2022,0,0,5.0,5.0,0,1477.0,150.0,226.0,1,3,17.146428
4818,9539987,1027,79200,31,130,2022,0,0,5.0,5.0,0,1477.0,177.0,255.0,1,3,17.146428
10,10737711,1018,78505,31,130,2022,0,0,5.0,5.0,0,1477.0,150.0,226.0,1,3,25.39685
355,8607692,1018,78505,31,130,2022,0,0,5.0,5.0,0,1477.0,150.0,226.0,1,3,25.39685
2433,10677104,1042,80355,29,20,2022,1,0,5.0,7.0,0,1496.0,102.0,133.0,1,3,27.092434
348,10531856,1041,80300,29,20,2022,0,0,5.0,7.0,0,1496.0,102.0,133.0,1,3,27.110883



## 4.2 Cosine similiarity
---
https://www.kaggle.com/code/jarredpriester/unsupervised-recommendation-system-of-the-beatles/comments
https://www.kaggle.com/code/muhammadayman/recommendation-system-using-cosine-similarity?scriptVersionId=71265041
https://www.kaggle.com/code/kushbhatnagar/movie-recommendation-system/notebook

This algorithm can be used to find the similarity between vehicles based on the user's preferred monthly installment, salary, and seat capacity. The algorithm works by calculating the cosine similarity between the user's input and the vehicles in the dataset, and then recommending the most similar vehicles.

In [399]:
from sklearn.metrics.pairwise import cosine_similarity

In [428]:
cs_carlist = df_carlist03.copy()

In [429]:
cs_user_input = mock_user_input.iloc[0:1,1:] #test

In [430]:
# Calculate cosine similarity
#print(cs_user_input.ndim,cs_carlist[features].ndim)
similarities = cosine_similarity(cs_user_input,cs_carlist[features])

In [444]:
# Find the top 5 most similar vehicles
cs_recommended_items = similarities.flatten().argsort()[-10:][::-1]
print(cs_user_input)
cs_carlist.iloc[cs_recommended_items,:]

   data_installment  seat_capacity  data_transmission  engine_cc  fuel_type
0              1042              6                  0       1469          1


Unnamed: 0,vehicle_id,data_installment,listing_price,data_make,data_model,data_year,data_mileage,data_transmission,doors,seat_capacity,assembled,engine_cc,peak_power,peak_torque,engine_type,fuel_type
3648,8633210,1063,82000,16,37,2022,0,0,5.0,5.0,0,1498.0,121.0,145.0,1,3
1069,9559129,1063,82000,16,37,2023,0,0,5.0,5.0,0,1498.0,121.0,145.0,1,3
4586,9323661,1063,82000,16,37,2022,0,0,5.0,5.0,0,1498.0,121.0,145.0,1,3
1951,10562740,1063,82000,16,23,2022,0,0,5.0,7.0,0,1497.0,120.0,145.0,1,3
3823,9309442,1063,82000,16,23,2022,0,0,5.0,7.0,0,1497.0,120.0,145.0,1,3
607,7647698,1060,81800,36,139,2021,0,0,5.0,5.0,0,1496.0,107.0,140.0,1,3
5495,9831866,1060,81800,36,139,2022,0,0,5.0,5.0,0,1496.0,107.0,140.0,1,3
2750,10539555,1060,81800,36,126,2022,1,0,4.0,5.0,0,1496.0,107.0,140.0,1,3
4581,10446237,1060,81800,36,126,2022,1,0,4.0,5.0,0,1496.0,107.0,140.0,1,3
5496,9831779,1060,81800,36,139,2022,0,0,5.0,5.0,0,1496.0,107.0,140.0,1,3


## 4.3 Support Vector Machine (SVM)
----
Support Vector Machine (SVM) is a supervised learning algorithm that is typically used for classification problems. To build a vehicle recommendation system using SVM, usually we need to have a dataset that includes both the vehicles data and the user's interaction data, in order to train the model to predict which vehicles a user is likely to be interested in.

However, since we only have a set of vehicles data and a mocked user input, we need to approach the problem differently. One way to do this would be to use the vehicle features in the vehicle dataset to define a similarity measure between vehicles, and then use SVM to classify the vehicles based on their similarity to the mocked user input.

## 4.4 Information Retrieval (IR)
---
Information Retrieval (IR) is a technique used to retrieve relevant documents from a large collection of documents based on a user's query. To build a vehicle recommendation system using IR, we will treat the user input as a "query" and use the vehicle data as the "documents" and use a technique called "text-based retrieval" to rank the vehicles based on their relevance to the user input.

In [445]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the vehicle dataset
ir_carlist = df_carlist03.copy()

# Prepare the data for the vectorizer
ir_carlist['all_features'] = ir_carlist[features].apply(lambda x: ' '.join(x.astype(str)), axis=1)

TypeError: sequence item 0: expected str instance, int found

In [446]:
ir_carlist['all_features']

0        1694.0 5.0 0.0 1498.0 3.0
1        3876.0 5.0 0.0 1496.0 1.0
2        1157.0 5.0 0.0 1498.0 3.0
3         777.0 5.0 0.0 1496.0 3.0
4        1426.0 5.0 0.0 2442.0 0.0
                   ...            
5595     2511.0 6.0 0.0 2488.0 3.0
5596     1919.0 7.0 1.0 2982.0 3.0
5597    1750.0 14.0 1.0 2776.0 0.0
5598    1346.0 10.0 1.0 2771.0 0.0
5599     1102.0 5.0 1.0 1998.0 3.0
Name: all_features, Length: 4916, dtype: object

In [472]:
# Get user input
ir_user_input = mock_user_input.iloc[569:570,1:] #test
# ir_user_input = np.array(ir_user_input).flatten()

ir_user_input = ir_user_input.apply(lambda x: ' '.join(x.astype(str)), axis=1)
ir_user_input

569    1104 8 0 1042 1
dtype: object

In [465]:
# Create the vectorizer
vectorizer = TfidfVectorizer()

# Fit the vectorizer to the data
vectorizer.fit(ir_carlist['all_features'])

In [474]:
# Transform the user input and the vehicle data
user_input_vec = vectorizer.transform(ir_user_input)
vehicle_vecs = vectorizer.transform(ir_carlist['all_features'])

In [476]:
# Calculate the cosine similarity between the user input and the vehicle data
from sklearn.metrics.pairwise import cosine_similarity
similarities = cosine_similarity(user_input_vec, vehicle_vecs)

569    1104 8 0 1042 1
dtype: object


Unnamed: 0,vehicle_id,data_installment,listing_price,data_make,data_model,data_year,data_mileage,data_transmission,doors,seat_capacity,assembled,engine_cc,peak_power,peak_torque,engine_type,fuel_type,all_features
2433,10677104,1042,80355,29,20,2022,1,0,5.0,7.0,0,1496.0,102.0,133.0,1,3,1042.0 7.0 0.0 1496.0 3.0
1464,9903051,1042,80360,16,23,2022,0,0,5.0,7.0,0,1497.0,120.0,145.0,1,3,1042.0 7.0 0.0 1497.0 3.0
2242,10863107,1042,80360,16,23,2022,0,0,5.0,7.0,0,1497.0,120.0,145.0,1,3,1042.0 7.0 0.0 1497.0 3.0
1446,8836859,1042,80390,27,118,2022,0,1,4.0,5.0,1,2477.0,110.0,200.0,1,0,1042.0 5.0 1.0 2477.0 0.0
3925,9003811,1042,80390,27,118,2022,0,1,4.0,5.0,1,2477.0,110.0,200.0,1,0,1042.0 5.0 1.0 2477.0 0.0


In [None]:
# Find the top 5 most similar vehicles
top_5 = similarities.flatten().argsort()[-5:][::-1]
print(ir_user_input)
ir_carlist.iloc[top_5,:]

## 4.5 Agglomerative Clustering
----

AgglomerativeClustering is a type of hierarchical clustering algorithm that can be used to group similar data points together in a dataset. It starts by treating each data point as a separate cluster and then iteratively merges the closest clusters until all data points are in one cluster or a stopping condition is reached.

In the case of building a vehicle recommendation system, we used the AgglomerativeClustering algorithm to group similar vehicles together and then recommend vehicles from the same cluster to a user based on the user input. It can be a good approach if the user's preferences are not known or are not well defined, and the goal is to recommend vehicles that are similar to the vehicle features the user is looking for.

In [478]:
from sklearn.cluster import AgglomerativeClustering

# Load the vehicle dataset
ac_carlist = df_carlist03.copy()

In [479]:
# Get user input
ac_user_input = mock_user_input.iloc[356:357,1:] #test

In [496]:
# Find the most similar vehicle to the user input
ac_similarity = np.array([np.abs(ac_carlist[features].iloc[i] - ac_user_input).sum() for i in range(ac_carlist.shape[0])])

In [497]:
ac_similarity

array([[9.770e+02, 0.000e+00, 0.000e+00, 3.360e+02, 1.000e+00],
       [3.159e+03, 0.000e+00, 0.000e+00, 3.340e+02, 1.000e+00],
       [4.400e+02, 0.000e+00, 0.000e+00, 3.360e+02, 1.000e+00],
       ...,
       [1.033e+03, 9.000e+00, 1.000e+00, 1.614e+03, 2.000e+00],
       [6.290e+02, 5.000e+00, 1.000e+00, 1.609e+03, 2.000e+00],
       [3.850e+02, 0.000e+00, 1.000e+00, 8.360e+02, 1.000e+00]])

In [552]:
# Cluster the vehicles using Agglomerative Clustering
clustering = AgglomerativeClustering(n_clusters=5, linkage='ward')
clustering.fit(ac_carlist)

In [553]:
# Get the cluster label of the most similar vehicle
cluster = clustering.labels_[ac_similarity.argmin()]
cluster

1

In [554]:
# Get the indices of all vehicles in the same cluster
indices = np.where(clustering.labels_ == cluster)[0]

In [559]:
print(ac_user_input)
ac_recommended_items = ac_carlist.iloc[indices]
ac_recommended_items

     data_installment  seat_capacity  data_transmission  engine_cc  fuel_type
356               717              5                  0       1162          2


Unnamed: 0,vehicle_id,data_installment,listing_price,data_make,data_model,data_year,data_mileage,data_transmission,doors,seat_capacity,assembled,engine_cc,peak_power,peak_torque,engine_type,fuel_type
0,11207652,1694,130700,16,38,2023,0,0,4.0,5.0,0,1498.0,182.0,240.0,1,3
1,11219593,3876,299000,35,89,2022,0,0,4.0,5.0,2,1496.0,287.0,450.0,0,1
2,11185947,1157,89260,16,37,2023,0,0,4.0,5.0,0,1498.0,121.0,145.0,1,3
7,10778299,1097,84600,16,37,2022,0,0,5.0,5.0,0,1498.0,121.0,145.0,1,3
11,11188370,2917,224990,37,68,2023,0,0,5.0,5.0,0,1984.0,245.0,370.0,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4156,10797310,979,75500,29,18,2022,0,0,5.0,7.0,0,1495.0,103.0,136.0,1,3
4160,10790267,769,59300,31,79,2022,0,0,5.0,5.0,0,1597.0,109.0,150.0,1,3
4164,10789119,769,59300,31,79,2022,0,0,5.0,5.0,0,1597.0,109.0,150.0,1,3
4172,10776330,1272,98123,31,131,2022,0,0,5.0,5.0,0,1477.0,177.0,255.0,1,3


The mean_absolute_error function is used to calculate the MAE between the recommended items and the user input, and the mean_squared_error function is used to calculate the MSE between the two sets. The MSE is then passed to the square root function to get the RMSE.

In [560]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Iterate over each row in the dataframe
for i, row in ac_recommended_items.iterrows():
    # Extract the features of the recommended items and user input
    recommended_item =  np.array(row[features]).flatten()
    user_input = np.array(ac_user_input).flatten()
    mae = mean_absolute_error(recommended_item, user_input)
    rmse = np.sqrt(mean_squared_error(recommended_item, user_input))
    # Assign the Euclidean distance to the new column
    ac_recommended_items.at[i, 'mae'] = mae
    ac_recommended_items.at[i, 'rmse'] = rmse



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [561]:
ac_recommended_items[]

Unnamed: 0,vehicle_id,data_installment,listing_price,data_make,data_model,data_year,data_mileage,data_transmission,doors,seat_capacity,assembled,engine_cc,peak_power,peak_torque,engine_type,fuel_type,mae,rmse
0,11207652,1694,130700,16,38,2023,0,0,4.0,5.0,0,1498.0,182.0,240.0,1,3,262.8,462.044587
1,11219593,3876,299000,35,89,2022,0,0,4.0,5.0,2,1496.0,287.0,450.0,0,1,698.8,1420.622258
2,11185947,1157,89260,16,37,2023,0,0,4.0,5.0,0,1498.0,121.0,145.0,1,3,155.4,247.587156
7,10778299,1097,84600,16,37,2022,0,0,5.0,5.0,0,1498.0,121.0,145.0,1,3,143.4,226.846644
11,11188370,2917,224990,37,68,2023,0,0,5.0,5.0,0,1984.0,245.0,370.0,1,3,604.6,1050.303290
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4156,10797310,979,75500,29,18,2022,0,0,5.0,7.0,0,1495.0,103.0,136.0,1,3,119.6,189.493008
4160,10790267,769,59300,31,79,2022,0,0,5.0,5.0,0,1597.0,109.0,150.0,1,3,97.6,195.923454
4164,10789119,769,59300,31,79,2022,0,0,5.0,5.0,0,1597.0,109.0,150.0,1,3,97.6,195.923454
4172,10776330,1272,98123,31,131,2022,0,0,5.0,5.0,0,1477.0,177.0,255.0,1,3,174.2,285.394814


## 4.5 Neural Network

In [None]:
import numpy as np
from keras.layers import Input, Embedding, Flatten, Dot, Dense, Concatenate
from keras.models import Model

# Load the vehicle dataset
vehicles = pd.read_csv('vehicle_data.csv')

# Vectorize the data using one-hot encoding
vehicles = pd.get_dummies(vehicles, columns=['data_transmission', 'fuel_type'])

# Get the number of unique vehicles and features
n_vehicles = vehicles.shape[0]
n_features = vehicles.shape[1]

# Get user input
user_input = {'data_installment': 200, 'seat_capacity': 5, 'engine_cc': 2000,
              'data_transmission_Automatic': 1, 'fuel_type_Petrol': 1}

# Vectorize the user input
user_input = pd.DataFrame([user_input])

# Define the input layers for the vehicle and the user
vehicle_input = Input(shape=[1], name='Vehicle')
user_input = Input(shape=[n_features], name='User')

# Embed the input layers
vehicle_embedding = Embedding(n_vehicles+1, 10)(vehicle_input)
vehicle_vec = Flatten()(vehicle_embedding)

# Concatenate the input layers
conc = Concatenate()([vehicle_vec, user_input])

# Add fully connected layers
fc1 = Dense(128, activation='relu')(conc)
fc2 = Dense(32, activation='relu')(fc1)
out = Dense(1, activation='sigmoid')(fc2)

# Create the model
model = Model([vehicle_input, user_input], out)

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Fit the model to the data
model.fit([np.arange(n_vehicles), np.repeat(user_input, n_vehicles, axis=0)], np.ones(n_vehicles))