In [1]:
import sys
sys.path.append("..")
import pandas as pd
import numpy as np
from src.transformers import *
from src.user_item_funcs import *
from src.item_item_funcs import *
from sklearn.pipeline import Pipeline
from sklearn.metrics.pairwise import cosine_similarity
%autosave 60

Autosaving every 60 seconds


### Loading data

In [2]:
train = pd.read_csv("../data/processed/train.csv", sep=",")
test = pd.read_csv("../data/processed/test.csv", sep=",")
print(train.shape)

(16728, 55)


## User-Item Similarity

### Get User Preferences

In [3]:
#Make UI here
# if you want to add more preferences. Go to user_item_funcs.py.
#For the numerical ones add the ranges and bins and add them to pref_dict_num dictionary with column name as key.
#For the categorical ones add the categories/ranges and it to pref_dict_cat with column name as key.
user_pref_dict = {'price':'$80,001 to $100,000', 'depreciation':'$11k to $12k /yr',
                  'type_of_vehicle':'luxury sedan'}

### Preparing User Row and Dataframe for similarities

In [4]:
user_pref_cols = get_user_pref_cols(user_pref_dict)

df_train_user_pref_cols = train.loc[:, ['listing_id']+user_pref_cols]


df_prepared_num = prepare_df_num(df_train_user_pref_cols, user_pref_dict, pref_dict_num)


user_row_dict = prepare_user_row_dict(df_prepared_num, user_pref_dict, pref_dict_num, pref_dict_cat)

df_prepared_normalized = normalize_df(df_prepared_num, user_pref_cols, pref_dict_num)
df_prepared_normalized.head(1)

Unnamed: 0,listing_id,price,depreciation,type_of_vehicle,price_range,depreciation_range
0,1030324,0.023712,0.017406,luxury sedan,"$70,001 to $80,000",$16k to $18k /yr


In [5]:
df_prepared = prepare_df_cat(df_prepared_num, user_pref_dict, pref_dict_cat)
df_prepared.head(1)
df_prepared_normalized = prepare_df_cat(df_prepared_normalized, user_pref_dict, pref_dict_cat)
df_prepared_normalized.head(1)

Unnamed: 0,listing_id,price,depreciation,price_range,depreciation_range,bus/mini bus,hatchback,luxury sedan,mid-sized sedan,mpv,others,sports car,stationwagon,suv,truck,van
0,1030324,0.023712,0.017406,"$70,001 to $80,000",$16k to $18k /yr,0,0,1,0,0,0,0,0,0,0,0


In [6]:
df_user_row, cols_to_be_dropped = get_user_row(user_row_dict, pref_dict_num, pref_dict_cat, df_prepared_normalized)
df_user_row

Unnamed: 0,listing_id,price,depreciation,price_range,depreciation_range,bus/mini bus,hatchback,luxury sedan,mid-sized sedan,mpv,others,sports car,stationwagon,suv,truck,van
0,0,0.030119,0.010221,0.0,0.0,0,0,1,0,0,0,0,0,0,0,0


#### Dropping range columns

In [7]:
df_prepared_final = df_prepared_normalized.drop(cols_to_be_dropped, axis=1)
df_user_row_final = df_user_row.drop(cols_to_be_dropped, axis=1)
df_prepared_final.head(1)

Unnamed: 0,listing_id,price,depreciation,bus/mini bus,hatchback,luxury sedan,mid-sized sedan,mpv,others,sports car,stationwagon,suv,truck,van
0,1030324,0.023712,0.017406,0,0,1,0,0,0,0,0,0,0,0


In [8]:
df_user_row_final.head(1)

Unnamed: 0,listing_id,price,depreciation,bus/mini bus,hatchback,luxury sedan,mid-sized sedan,mpv,others,sports car,stationwagon,suv,truck,van
0,0,0.030119,0.010221,0,0,1,0,0,0,0,0,0,0,0


### Finding similarity between custom user's item and other items

In [9]:
x_user = df_user_row_final['listing_id'].to_numpy()
y_user = df_user_row_final.iloc[:,1:].to_numpy()
print(x_user.shape,y_user.shape)
x_items = df_prepared_final['listing_id'].to_numpy()
y_items = df_prepared_final.iloc[:,1:].to_numpy()
print(x_items.shape, y_items.shape)
similarity_user_item = cosine_similarity(y_user, y_items)[0]

(1,) (1, 13)
(16728,) (16728, 13)


In [10]:
ind_ordered = np.argsort(similarity_user_item)[::-1]

### Top k recommendations as per User-Item similarity

In [11]:
df_top_user_item = get_top_recommendations_user_item(similarity_user_item, ind_ordered, x_items, y_items, 10, train)
print('user_pref_dict: ', user_pref_dict)
df_top_user_item.loc[:, ['listing_id', 'make', 'model', 'depreciation', 'price', 'type_of_vehicle']]

ind_ordered:  [ 6514  8879  2084 ... 16346  2527 14920]
topk_indices:  [11184  2010 16429  3680  9720 15582 12157  6959  1209 15798]
user_pref_dict:  {'price': '$80,001 to $100,000', 'depreciation': '$11k to $12k /yr', 'type_of_vehicle': 'luxury sedan'}


Unnamed: 0,listing_id,make,model,depreciation,price,type_of_vehicle
11184,1024578,mazda,2,11380.0,92200.0,luxury sedan
2010,1027661,audi,a4,13400.0,86700.0,luxury sedan
16429,1024801,jaguar,xe,12590.0,84500.0,luxury sedan
3680,1025881,mercedes-benz,e250,9480.0,86700.0,luxury sedan
9720,1016561,bmw,520i,10760.0,89000.0,luxury sedan
15582,1019433,toyota,camry,11650.0,86700.0,luxury sedan
12157,1021160,mazda,2,10760.0,94400.0,luxury sedan
6959,1025230,audi,a4,13830.0,91100.0,luxury sedan
1209,1027521,honda,accord,11630.0,92300.0,luxury sedan
15798,1014530,audi,a4,13430.0,94600.0,luxury sedan


### Choose a Car from the recommended list

In [12]:
#Suppose the user chooses listing_id=1023716 from the above list. 
#We apply Item-item similarity on the chosen car and all other cars.
#Make UI here

## Item-Item Similarity

### Prepare DataFrame for item-item similarity

In [13]:
df_recommend_ii = train.loc[:, ['listing_id','make', 'vehicle_age', 'type_of_vehicle', 'depreciation',
                   'dereg_value', 'mileage', 'price', 'engine_cap',  'fuel_type_diesel',
                   'fuel_type_petrol-electric', 'fuel_type_petrol', 'fuel_type_electric','transmission_auto',
                   'transmission_manual', 'brand_rank']]

cols_to_be_normalized = ['vehicle_age', 'depreciation', 'dereg_value', 'mileage', 'engine_cap', 'price']
# cols_to_be_normalized = ['vehicle_age', 'mileage', 'price']
df_normalized_ii = get_normalized_cols_item_item(df_recommend_ii, cols_to_be_normalized)
df_normalized_ii.head(2)

Unnamed: 0,vehicle_age,depreciation,dereg_value,mileage,engine_cap,price
0,0.090909,0.017406,0.072529,0.098586,0.089796,0.023712
1,0.079545,0.010372,0.005432,0.148707,0.155315,0.014289


In [14]:
df_recommend_ii.loc[:, ['vehicle_age', 'depreciation', 'dereg_value', 'mileage', 'engine_cap', 'price']] = df_normalized_ii
# df_recommend_ii.loc[:, ['vehicle_age', 'mileage', 'price']] = df_normalized_ii
df_recommend_ii.head(1)

Unnamed: 0,listing_id,make,vehicle_age,type_of_vehicle,depreciation,dereg_value,mileage,price,engine_cap,fuel_type_diesel,fuel_type_petrol-electric,fuel_type_petrol,fuel_type_electric,transmission_auto,transmission_manual,brand_rank
0,1030324,bmw,0.090909,luxury sedan,0.017406,0.072529,0.098586,0.023712,0.089796,1,0,0,0,1,0,3


### Convert Categorical Columns to One Hot Encoding

In [15]:
df_transformed_ii = pd.get_dummies(df_recommend_ii, columns = ['make', 'type_of_vehicle', 'brand_rank'])
df_transformed_ii.head(2)

Unnamed: 0,listing_id,vehicle_age,depreciation,dereg_value,mileage,price,engine_cap,fuel_type_diesel,fuel_type_petrol-electric,fuel_type_petrol,...,type_of_vehicle_stationwagon,type_of_vehicle_suv,type_of_vehicle_truck,type_of_vehicle_van,brand_rank_1,brand_rank_2,brand_rank_3,brand_rank_4,brand_rank_5,brand_rank_6
0,1030324,0.090909,0.017406,0.072529,0.098586,0.023712,0.089796,1,0,0,...,0,0,0,0,0,0,1,0,0,0
1,1021510,0.079545,0.010372,0.005432,0.148707,0.014289,0.155315,1,0,0,...,0,0,0,1,0,1,0,0,0,0


### Computing Item-Item similarities

In [16]:
x_items_ii = df_transformed_ii['listing_id'].to_numpy()
y_items_ii = df_transformed_ii.iloc[:,1:].to_numpy()
#Suppose the user chooses listing_id=1023716 from the above list.
listing_id_chosen = 1023716
x_chosen = listing_id_chosen
y_chosen = df_transformed_ii[df_transformed_ii['listing_id'] == 1023716].iloc[:,1:].to_numpy()

In [17]:
similarity_item_item = cosine_similarity(y_chosen, y_items_ii)[0]
indices_ii_ordered = np.argsort(similarity_item_item)[::-1]

In [18]:
print('similarity_item_item: ', similarity_item_item)
print('indices_ii_ordered: ', indices_ii_ordered)
print(similarity_item_item[indices_ii_ordered])

similarity_item_item:  [0.40132635 0.00260081 0.99856833 ... 0.2013496  0.40104318 0.20895719]
indices_ii_ordered:  [  709  9231  2812 ...  7621  9035 13218]
[1.00000000e+00 9.99998259e-01 9.99983688e-01 ... 9.96180225e-04
 9.81051501e-04 9.26393403e-04]


In [19]:
# k=11 as the result includes the given listing_id row too.
df_top_ii = get_top_recommendations_ii(similarity_item_item, indices_ii_ordered, x_items_ii, y_items_ii, 11, train)

In [20]:
train[train.listing_id == 1023716].loc[:, ['listing_id','make', 'model', 'vehicle_age', 'type_of_vehicle', 'depreciation',
                   'dereg_value', 'mileage', 'price', 'engine_cap',  'fuel_type','transmission', 'brand_rank']]

Unnamed: 0,listing_id,make,model,vehicle_age,type_of_vehicle,depreciation,dereg_value,mileage,price,engine_cap,fuel_type,transmission,brand_rank
709,1023716,mercedes-benz,c180,1.0,luxury sedan,16210.0,73334.0,8600.0,189200.0,1497.0,petrol-electric,auto,4


In [21]:
# Remove the given listing_id from recommendations 
df_top_ii.loc[:, ['listing_id','make', 'model', 'vehicle_age', 'type_of_vehicle', 'depreciation',
                   'dereg_value', 'mileage', 'price', 'engine_cap',  'fuel_type','transmission', 'brand_rank']]

Unnamed: 0,listing_id,make,model,vehicle_age,type_of_vehicle,depreciation,dereg_value,mileage,price,engine_cap,fuel_type,transmission,brand_rank
709,1023716,mercedes-benz,c180,1.0,luxury sedan,16210.0,73334.0,8600.0,189200.0,1497.0,petrol-electric,auto,4
9231,1001612,mercedes-benz,e180,1.0,luxury sedan,16990.0,72926.0,7409.0,200000.0,1497.0,petrol-electric,auto,4
2812,1010231,mercedes-benz,e180,1.0,luxury sedan,17120.0,80841.0,9000.0,205500.0,1497.0,petrol-electric,auto,4
15992,1023251,mercedes-benz,c180,2.0,luxury sedan,16130.0,69022.0,7513.0,188100.0,1497.0,petrol-electric,auto,4
2080,1027849,mercedes-benz,e180,1.0,luxury sedan,16840.0,81366.0,6100.0,203300.0,1497.0,petrol-electric,auto,4
15396,1031109,mercedes-benz,e180,1.0,luxury sedan,16660.0,81299.0,12000.0,201100.0,1497.0,petrol-electric,auto,4
10141,1020679,mercedes-benz,e180,1.0,luxury sedan,16740.0,81659.0,7870.0,203500.0,1497.0,petrol-electric,auto,4
4246,1030637,mercedes-benz,c200,2.0,luxury sedan,16900.0,68826.0,10526.0,183500.0,1497.0,petrol-electric,auto,4
1960,1026331,mercedes-benz,amg,2.0,luxury sedan,15880.0,67691.0,10590.0,181700.0,1497.0,petrol-electric,auto,4
10277,1008324,mercedes-benz,e180,2.0,luxury sedan,17140.0,78997.0,8000.0,206700.0,1497.0,petrol-electric,auto,4
