# Task 2: Recommendation Engine

## Setting up the Notebook

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pandas import Series
from sklearn.metrics.pairwise import cosine_similarity
%matplotlib inline

## Load the Data

In [2]:
car_resale_simplified_dataset = pd.read_csv('./data/preprocessed_recommendation_data.csv')

car_resale_simplified_dataset.head()


Unnamed: 0,listing_id,make,manufactured,reg_date,type_of_vehicle,transmission,power,engine_cap,no_of_owners,depreciation,coe,road_tax,dereg_value,mileage,omv,arf,price
0,1030324,bmw,2013.0,2013.0,luxury sedan,auto,135.0,1997.0,1.0,17700.0,77100.0,1210.0,47514.0,73000.0,45330.0,50462.0,71300.0
1,1026909,mercedes-benz,2016.0,2016.0,luxury sedan,auto,90.0,1595.0,1.0,15070.0,53694.0,740.0,44517.0,80000.0,27886.0,26041.0,95500.0
2,1019371,mercedes-benz,2019.0,2020.0,luxury sedan,auto,115.0,1497.0,1.0,16400.0,40690.0,684.0,80301.0,9800.0,46412.0,56977.0,197900.0
3,1031014,honda,2019.0,2019.0,mid-sized sedan,auto,92.0,1597.0,1.0,10450.0,26667.0,742.0,36453.0,40000.0,20072.0,20101.0,103200.0
4,1012998,volvo,2015.0,2015.0,hatchback,auto,90.0,1498.0,3.0,11020.0,56001.0,684.0,37311.0,77777.0,22809.0,18933.0,62500.0


## Let the users give some conditions on the features of cars

### Please select features you want to filter:  
'make', 'manufactured', 'reg_date', 'type_of_vehicle', 'transmission', 'power', 'engine_cap', 'no_of_owners', 'depreciation', 'coe', 'road_tax', 'dereg_value', 'mileage', 'omv', 'arf', 'price'

In [3]:
user_select_data = car_resale_simplified_dataset[(car_resale_simplified_dataset.price < 80000) & (car_resale_simplified_dataset.make == "bmw")]
user_select_data.head()

Unnamed: 0,listing_id,make,manufactured,reg_date,type_of_vehicle,transmission,power,engine_cap,no_of_owners,depreciation,coe,road_tax,dereg_value,mileage,omv,arf,price
0,1030324,bmw,2013.0,2013.0,luxury sedan,auto,135.0,1997.0,1.0,17700.0,77100.0,1210.0,47514.0,73000.0,45330.0,50462.0,71300.0
31,1023935,bmw,2012.0,2012.0,hatchback,auto,100.0,1598.0,2.0,27010.0,56501.0,742.0,17432.0,67000.0,27217.0,27217.0,35100.0
54,1024395,bmw,2013.0,2013.0,luxury sedan,auto,100.0,1598.0,2.0,15130.0,79223.0,742.0,31297.0,80000.0,31108.0,25552.0,48900.0
77,991635,bmw,2015.0,2016.0,luxury sedan,auto,100.0,1499.0,2.0,12710.0,44001.0,684.0,34770.0,116000.0,26979.0,24771.0,76800.0
144,1008086,bmw,2016.0,2016.0,hatchback,auto,85.0,1496.0,2.0,11800.0,49501.0,1082.0,29244.0,79000.0,23325.0,9655.0,67000.0


In [4]:
user_select_data = car_resale_simplified_dataset[(car_resale_simplified_dataset.price < 80000) & (car_resale_simplified_dataset.make == "bmw")]
user_item = user_select_data.sample(n=10)
user_item_index = user_item['listing_id'].tolist()
# user_item, user_item_index
non_zero_column_num = user_select_data.shape[1] - 1
print(non_zero_column_num)


16


### Users give their score

In [5]:
scores = [9, 6, 8, 8, 5, 6, 3, 3, 9, 8]

## Represent the features of cars

### Catogorical features and Numerical faetures

In [6]:
categorical_features = ['make', 'type_of_vehicle', 'transmission']
numerical_features = user_select_data.dtypes[user_select_data.dtypes!='object'].index
numerical_features = list(set(numerical_features) - set(['listing_id']))
categorical_features, numerical_features

(['make', 'type_of_vehicle', 'transmission'],
 ['engine_cap',
  'dereg_value',
  'omv',
  'manufactured',
  'road_tax',
  'arf',
  'power',
  'coe',
  'mileage',
  'depreciation',
  'reg_date',
  'price',
  'no_of_owners'])

### Normalize numerical features

In [7]:
user_select_data_backup = user_select_data.copy()
user_select_data[numerical_features] = user_select_data[numerical_features].apply(lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))
user_select_data[numerical_features]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0,engine_cap,dereg_value,omv,manufactured,road_tax,arf,power,coe,mileage,depreciation,reg_date,price,no_of_owners
0,0.151680,0.859359,0.293194,0.666667,0.073918,0.413040,0.270270,0.752767,0.236559,0.220542,0.666667,0.861338,0.0
31,0.030881,0.219303,0.054084,0.583333,0.008151,0.179004,0.081081,0.496851,0.204301,0.403378,0.583333,0.270799,0.2
54,0.030881,0.514309,0.105449,0.666667,0.008151,0.162240,0.081081,0.779143,0.274194,0.170071,0.666667,0.495922,0.2
77,0.000908,0.588204,0.050943,0.833333,0.000000,0.154377,0.081081,0.341554,0.467742,0.122545,0.916667,0.951060,0.2
144,0.000000,0.470627,0.002706,0.916667,0.055930,0.002185,0.000000,0.409884,0.268817,0.104674,0.916667,0.791191,0.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10066,0.000000,0.586417,0.050613,0.833333,0.055930,0.103683,0.000000,0.374675,0.222473,0.112333,0.916667,0.879282,0.2
10171,0.030881,0.691525,0.065926,0.750000,0.008151,0.120034,0.081081,0.688027,0.336022,0.151610,0.833333,0.800979,0.2
10260,0.151680,1.000000,0.350723,0.666667,0.073918,0.474477,0.513514,0.956517,0.363441,0.259623,0.666667,0.969005,0.4
10274,0.151680,0.694291,0.168207,0.666667,0.073918,0.229244,0.270270,0.797617,0.532258,0.243716,0.750000,0.845024,0.2


### Encode catogrical features

In [8]:
catogories_encoded = pd.get_dummies(user_select_data[categorical_features])
df  = user_select_data.join(catogories_encoded)
df_1 = df.drop(columns=categorical_features)
df_1

Unnamed: 0,listing_id,manufactured,reg_date,power,engine_cap,no_of_owners,depreciation,coe,road_tax,dereg_value,...,omv,arf,price,make_bmw,type_of_vehicle_hatchback,type_of_vehicle_luxury sedan,type_of_vehicle_mpv,type_of_vehicle_sports car,type_of_vehicle_suv,transmission_auto
0,1030324,0.666667,0.666667,0.270270,0.151680,0.0,0.220542,0.752767,0.073918,0.859359,...,0.293194,0.413040,0.861338,1,0,1,0,0,0,1
31,1023935,0.583333,0.583333,0.081081,0.030881,0.2,0.403378,0.496851,0.008151,0.219303,...,0.054084,0.179004,0.270799,1,1,0,0,0,0,1
54,1024395,0.666667,0.666667,0.081081,0.030881,0.2,0.170071,0.779143,0.008151,0.514309,...,0.105449,0.162240,0.495922,1,0,1,0,0,0,1
77,991635,0.833333,0.916667,0.081081,0.000908,0.2,0.122545,0.341554,0.000000,0.588204,...,0.050943,0.154377,0.951060,1,0,1,0,0,0,1
144,1008086,0.916667,0.916667,0.000000,0.000000,0.2,0.104674,0.409884,0.055930,0.470627,...,0.002706,0.002185,0.791191,1,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10066,1004167,0.833333,0.916667,0.000000,0.000000,0.2,0.112333,0.374675,0.055930,0.586417,...,0.050613,0.103683,0.879282,1,1,0,0,0,0,1
10171,1024637,0.750000,0.833333,0.081081,0.030881,0.2,0.151610,0.688027,0.008151,0.691525,...,0.065926,0.120034,0.800979,1,0,1,0,0,0,1
10260,1027989,0.666667,0.666667,0.513514,0.151680,0.4,0.259623,0.956517,0.073918,1.000000,...,0.350723,0.474477,0.969005,1,0,1,0,0,0,1
10274,1027018,0.666667,0.750000,0.270270,0.151680,0.2,0.243716,0.797617,0.073918,0.694291,...,0.168207,0.229244,0.845024,1,0,1,0,0,0,1


## Create User's Profile

In [9]:
def get_user_profile(scores, item_index, df, non_zero_column_num):
    scores = np.array(scores)
    # scores_norm = (scores - scores.mean()) / (scores.std()).T
    scores_norm = (scores - scores.mean()).T
    # flag = 0
    # for i in item_index:
    #     rated_item = df[df['listing_id'] == i]
    #     if flag == 0:
    #         rated_items = rated_item
    #     rated_items.concat(rated_item, axis=0)
    #     flag += 1
    # print(rated_items)
    rated_items = df.set_index('listing_id').loc[item_index].reset_index(inplace=False)
    # print(rated_items)
    # rated_items = df[df['listing_id'] == item_index]
    rated_items_1 = rated_items.drop(columns=['listing_id'])
    user_profile = scores_norm.dot(rated_items_1)
    user_profile = user_profile/non_zero_column_num
    return user_profile

user_profile = get_user_profile(scores, user_item_index, df_1, non_zero_column_num)
user_profile.shape[0]

20

## Cos sim

In [10]:

def calculate_cos_similar(user_profile, user_item_index, df_1):
    similarity_list = []
    user_profile = np.array(user_profile).reshape(1,user_profile.shape[0])
    unrated_items = df_1[~df_1['listing_id'].isin(user_item_index)]
    for index, row in unrated_items.iterrows():
        row_cleaned = np.array(row[1:]).reshape(1,user_profile.shape[1])
        simi = cosine_similarity(row_cleaned, user_profile)
        t = (simi[0][0], row['listing_id'])
        similarity_list.append(t)
    return similarity_list

similarity_list = calculate_cos_similar(user_profile, user_item_index, df_1)
sorted_list = sorted(similarity_list,key=lambda t:t[0], reverse=True)
sorted_list[0:5]


[(0.5432842867873974, 965968.0),
 (0.5153350615521864, 1018007.0),
 (0.5071499562808945, 925694.0),
 (0.49268381393303823, 1001894.0),
 (0.446035218980605, 1002837.0)]

## Get top-k recommendations

In [11]:
def get_top_recommendations(k, sorted_list):
    top_k = sorted_list[:k]
    id_list = []
    for x in top_k:
        id_list.append(x[1])
    recommend_items = user_select_data_backup.set_index('listing_id').loc[id_list].reset_index(inplace=False)
    return recommend_items


recommend_top_k = get_top_recommendations(10, sorted_list)
recommend_top_k

Unnamed: 0,listing_id,make,manufactured,reg_date,type_of_vehicle,transmission,power,engine_cap,no_of_owners,depreciation,coe,road_tax,dereg_value,mileage,omv,arf,price
0,965968.0,bmw,2007.0,2007.0,sports car,auto,270.0,4799.0,1.0,10000.0,52008.0,7800.0,29552.0,65000.0,98872.0,108760.0,62500.0
1,1018007.0,bmw,2005.0,2005.0,sports car,auto,190.0,2996.0,5.0,15290.0,60519.0,3573.0,25733.0,190000.0,72969.0,80266.0,71500.0
2,925694.0,bmw,2006.0,2006.0,sports car,auto,190.0,2996.0,2.0,11130.0,56835.0,3573.0,28853.0,135000.0,77343.0,85078.0,62200.0
3,1001894.0,bmw,2006.0,2007.0,sports car,auto,190.0,2996.0,3.0,12450.0,52660.0,3573.0,28263.0,103000.0,86083.0,94692.0,73500.0
4,1002837.0,bmw,2007.0,2008.0,sports car,auto,200.0,2996.0,2.0,9130.0,50168.0,3335.0,31448.0,113000.0,36681.0,40350.0,64700.0
5,1006761.0,bmw,2013.0,2013.0,sports car,auto,235.0,2979.0,4.0,22220.0,67010.0,2362.0,36612.0,104000.0,38956.0,46539.0,65800.0
6,1012269.0,bmw,2013.0,2013.0,sports car,auto,235.0,2979.0,2.0,22860.0,77989.0,2362.0,44842.0,69800.0,40293.0,48411.0,77900.0
7,1026526.0,bmw,2013.0,2013.0,sports car,auto,235.0,2979.0,2.0,23310.0,77600.0,2362.0,42179.0,115000.0,40861.0,49206.0,77000.0
8,1014982.0,bmw,2008.0,2008.0,sports car,auto,160.0,2996.0,4.0,8350.0,36888.0,3097.0,25579.0,140000.0,42381.0,42381.0,63700.0
9,972264.0,bmw,2008.0,2009.0,sports car,auto,160.0,2996.0,6.0,7160.0,35411.0,3097.0,27213.0,140000.0,37703.0,37703.0,60500.0
