In [3]:
import sys

sys.path.append("..")

In [4]:
import pandas as pd
import numpy as np
from src.transformers import *
from sklearn.pipeline import Pipeline
from sklearn.metrics.pairwise import cosine_similarity
import faiss
import time


In [5]:
%autosave 60

Autosaving every 60 seconds


In [6]:
train = pd.read_csv("../data/processed/train.csv", sep=",")
test = pd.read_csv("../data/processed/test.csv", sep=",")
print(train.shape)
train.head(1)

(16728, 55)


Unnamed: 0,listing_id,title,make,model,description,manufactured,original_reg_date,reg_date,type_of_vehicle,category,...,vehicle_age,is_parf_car,parf,coe_rebate,dereg_value_computed,vehicle_age_bins,lifespan_restriction,features_count,accessories_count,brand_rank
0,1030324,bmw 3 series 320i gran turismo m-sport,bmw,320i,1 owner! 320i gt m-sports model! big brake kit...,2013.0,,2013-12-09 00:00:00,luxury sedan,"parf car, premium ad car, low mileage car",...,8.0,1,27754.1,16705.0,44459.1,0-10,1,6,7,3


In [7]:
price_ranges = ['Below $10,000', '$10,001 to $20,000', '$20,001 to $30,000', '$30,001 to $40,000', '$40,001 to $50,000',
               '$50,001 to $60,000', '$60,001 to $70,000', '$70,001 to $80,000', '$80,001 to $100,000', '$100,001 to $120,000',
               '$120,001 to $140,000', '$140,001 to $160,000', '$160,001 to $180,000', '$180,001 to $200,000', 'Above $200,000']
price_bins = [0, 10000, 20000, 30000, 40000, 50000, 60000, 70000, 80000, 100000, 120000, 140000, 160000,
                 180000, 200000, np.inf]

depreciation_ranges = ['Below $10k /yr', '$10k to $11k /yr', '$11k to $12k /yr', '$12k to $13k /yr', '$13k to $14k /yr',
                      '$14k to $16k /yr', '$16k to $18k /yr', '$18k to $20k /yr', '$20k to $25k /yr', 'Above $25k /yr']
depreciation_bins = [0, 10000, 11000, 12000, 13000, 14000, 16000, 18000, 20000, 25000, np.inf]

type_of_vehicle_ranges = ['sports car', 'luxury sedan', 'suv', 'hatchback','mid-sized sedan', 'stationwagon',
                          'mpv', 'bus/mini bus', 'truck','others', 'van']
pref_dict_num = {'price':{"ranges":price_ranges, 'bins':price_bins}, 'depreciation':{"ranges":depreciation_ranges,
                                                                                    'bins':depreciation_bins}}
pref_dict_cat = {'type_of_vehicle': type_of_vehicle_ranges}
# print(len(depreciation_ranges), len(depreciation_bins))

#### User Preferences

In [8]:
user_pref_dict = {'price':'$180,001 to $200,000', 'depreciation':'$11k to $12k /yr',
                  'type_of_vehicle':'luxury sedan'}
print(user_pref_dict)

{'price': '$180,001 to $200,000', 'depreciation': '$11k to $12k /yr', 'type_of_vehicle': 'luxury sedan'}


In [9]:
def get_user_pref_cols(user_pref_dict):
    cols = list(user_pref_dict.keys())
    return cols

# def prepare_df(user_pref_cols, df_train_user_pref_cols, user_pref_dict, pref_dict_num, pref_dict_cat):
def prepare_df_num(df_train_user_pref_cols, user_pref_dict, pref_dict_num):
    df = df_train_user_pref_cols.copy()
    for col, val_range in user_pref_dict.items():
        if col in pref_dict_num:
            pipe = Pipeline(
                [
                    (
                        col+'_range',
                        ColumnValuesToCategory(
                            col,
                            col+"_range",
                            pref_dict_num[col]['bins'],
                            pref_dict_num[col]['ranges'],
                        ),
                    )
                ]
            )

            df = pipe.fit_transform(df)
            
#         elif col in pref_dict_cat:
#             df = pd.get_dummies(df, columns = [col], prefix='', prefix_sep='')
    return df

def prepare_df_cat(df_prepared_num, user_pref_dict, pref_dict_cat):
    df = df_prepared_num.copy()
    for col, val_range in user_pref_dict.items():
        if col in pref_dict_cat:
#             print('col: ', col)
            df = pd.get_dummies(df, columns = [col], prefix='', prefix_sep='')
            
    return df

def normalize_df(df_prepared_num, user_pref_cols, pref_dict_num):
    df_prep = df_prepared_num.copy()
    cols = []
    for col in user_pref_cols:
        if col in pref_dict_num:
            cols.append(col)
    df = df_prep.loc[:, cols]
    df_normalized = (df - df.min()) / (df.max() - df.min())
    df_prep.loc[:, cols] = df_normalized
    return df_prep

def get_col_value(val_range, pref_dict_num, col):
    ranges = pref_dict_num[col]['ranges']
    bins = pref_dict_num[col]['bins']
    index_ranges = ranges.index(val_range)
#     print('val_range: ', val_range)
#     print('ranges: ', ranges)
#     print('index_ranges: ', index_ranges)
    if index_ranges == len(bins)-2:
        value = 2*bins[index_ranges]-bins[index_ranges-1]
    else:
        value = (bins[index_ranges] + bins[index_ranges+1]) / 2
    return value

def prepare_user_row_dict(df_prepared_num, user_pref_dict, pref_dict_num, pref_dict_cat):
    row_dict ={}
    row_dict['listing_id'] = 0
    for col, val_range in user_pref_dict.items():
        if col in pref_dict_num:
            col_max = df_prepared_num.loc[:, col].max()
            col_min = df_prepared_num.loc[:, col].min()
            col_val = get_col_value(val_range, pref_dict_num, col)
            col_val_normalized = (col_val - col_min) / (col_max - col_min)
            row_dict[col] = [col_val_normalized]
        elif col in pref_dict_cat:
            row_dict[col] = [val_range]
            
#     print('row_dict: ', row_dict)
    return row_dict
    

In [10]:
user_pref_cols = get_user_pref_cols(user_pref_dict)
print('user_pref_cols: ', user_pref_cols)

df_train_user_pref_cols = train.loc[:, ['listing_id']+user_pref_cols]

# df_prepared = prepare_df(user_pref_cols, df_train_user_pref_cols, user_pref_dict, pref_dict_num, pref_dict_cat)
df_prepared_num = prepare_df_num(df_train_user_pref_cols, user_pref_dict, pref_dict_num)
print('df_prepared_num.shape: ', df_prepared_num.shape)
# df_prepared.head(1)

# user_row = prepare_user_row(df_prepared, user_pref_dict, pref_dict_num, pref_dict_cat)
user_row_dict = prepare_user_row_dict(df_prepared_num, user_pref_dict, pref_dict_num, pref_dict_cat)

# df_prepared_num = df_prepared_num.set_index(['listing_id'])
df_prepared_normalized = normalize_df(df_prepared_num, user_pref_cols, pref_dict_num)
df_prepared_normalized.head(1)

user_pref_cols:  ['price', 'depreciation', 'type_of_vehicle']
df_prepared_num.shape:  (16728, 6)


Unnamed: 0,listing_id,price,depreciation,type_of_vehicle,price_range,depreciation_range
0,1030324,0.023712,0.017406,luxury sedan,"$70,001 to $80,000",$16k to $18k /yr


In [11]:
df_prepared = prepare_df_cat(df_prepared_num, user_pref_dict, pref_dict_cat)
df_prepared.head(1)
df_prepared_normalized = prepare_df_cat(df_prepared_normalized, user_pref_dict, pref_dict_cat)
df_prepared_normalized.head(1)

Unnamed: 0,listing_id,price,depreciation,price_range,depreciation_range,bus/mini bus,hatchback,luxury sedan,mid-sized sedan,mpv,others,sports car,stationwagon,suv,truck,van
0,1030324,0.023712,0.017406,"$70,001 to $80,000",$16k to $18k /yr,0,0,1,0,0,0,0,0,0,0,0


### Create User row

In [12]:
print(user_row_dict)
user_row ={}
for col, val in user_row_dict.items():
    if col in pref_dict_num:
        user_row[col] = val[0]
    elif col in pref_dict_cat:
        for v in val:
            user_row[v] = 1
print(user_row)
df_user_row = pd.DataFrame(columns = df_prepared_normalized.columns)
print(df_user_row.dtypes)
for new_col in df_user_row.columns:
    if new_col not in user_row:
        user_row[new_col] = 0

df_user_row = df_user_row.append(user_row,ignore_index=True)
        
df_user_row


{'listing_id': 0, 'price': [0.06438459429824561], 'depreciation': [0.010220991273915613], 'type_of_vehicle': ['luxury sedan']}
{'price': 0.06438459429824561, 'depreciation': 0.010220991273915613, 'luxury sedan': 1}
listing_id            object
price                 object
depreciation          object
price_range           object
depreciation_range    object
bus/mini bus          object
hatchback             object
luxury sedan          object
mid-sized sedan       object
mpv                   object
others                object
sports car            object
stationwagon          object
suv                   object
truck                 object
van                   object
dtype: object


Unnamed: 0,listing_id,price,depreciation,price_range,depreciation_range,bus/mini bus,hatchback,luxury sedan,mid-sized sedan,mpv,others,sports car,stationwagon,suv,truck,van
0,0.0,0.064385,0.010221,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
convert_dtype_dict = {}
cols_to_be_dropped = []
for col in df_user_row.columns:
    convert_dtype_dict[col] = str(df_prepared_normalized[col].dtype)
    if col in pref_dict_num:
        cols_to_be_dropped.append(col+'_range')
# convert_dtype_dict['listing_id'] = 'int32'
print(convert_dtype_dict)
df_user_row = df_user_row.astype(convert_dtype_dict)
df_user_row.dtypes

{'listing_id': 'int64', 'price': 'float64', 'depreciation': 'float64', 'price_range': 'category', 'depreciation_range': 'category', 'bus/mini bus': 'uint8', 'hatchback': 'uint8', 'luxury sedan': 'uint8', 'mid-sized sedan': 'uint8', 'mpv': 'uint8', 'others': 'uint8', 'sports car': 'uint8', 'stationwagon': 'uint8', 'suv': 'uint8', 'truck': 'uint8', 'van': 'uint8'}


listing_id               int64
price                  float64
depreciation           float64
price_range           category
depreciation_range    category
bus/mini bus             uint8
hatchback                uint8
luxury sedan             uint8
mid-sized sedan          uint8
mpv                      uint8
others                   uint8
sports car               uint8
stationwagon             uint8
suv                      uint8
truck                    uint8
van                      uint8
dtype: object

In [14]:
df_user_row

Unnamed: 0,listing_id,price,depreciation,price_range,depreciation_range,bus/mini bus,hatchback,luxury sedan,mid-sized sedan,mpv,others,sports car,stationwagon,suv,truck,van
0,0,0.064385,0.010221,0.0,0.0,0,0,1,0,0,0,0,0,0,0,0


In [15]:
df_prepared_normalized.head(1)

Unnamed: 0,listing_id,price,depreciation,price_range,depreciation_range,bus/mini bus,hatchback,luxury sedan,mid-sized sedan,mpv,others,sports car,stationwagon,suv,truck,van
0,1030324,0.023712,0.017406,"$70,001 to $80,000",$16k to $18k /yr,0,0,1,0,0,0,0,0,0,0,0


#### Removing range columns

In [16]:
df_prepared_final = df_prepared_normalized.drop(cols_to_be_dropped, axis=1)
df_user_row_final = df_user_row.drop(cols_to_be_dropped, axis=1)
print(df_prepared_final.shape)
# df_prepared_final = df_prepared_final.set_index('listing_id')
df_prepared_final.head(10)

(16728, 14)


Unnamed: 0,listing_id,price,depreciation,bus/mini bus,hatchback,luxury sedan,mid-sized sedan,mpv,others,sports car,stationwagon,suv,truck,van
0,1030324,0.023712,0.017406,0,0,1,0,0,0,0,0,0,0,0
1,1021510,0.014289,0.010372,0,0,0,0,0,0,0,0,0,0,1
2,1026909,0.032004,0.014358,0,0,1,0,0,0,0,0,0,0,0
3,1019371,0.067092,0.015899,0,0,1,0,0,0,0,0,0,0,0
4,1031014,0.034642,0.009004,0,0,0,1,0,0,0,0,0,0,0
5,1027957,0.093099,0.023575,0,0,0,0,0,0,1,0,0,0,0
6,1012998,0.020696,0.009665,0,1,0,0,0,0,0,0,0,0,0
7,1011676,0.125137,0.025698,0,0,0,0,0,0,0,0,1,0,0
8,991798,0.022204,0.004172,0,0,0,1,0,0,0,0,0,0,0
9,985245,0.029365,0.011542,0,0,1,0,0,0,0,0,0,0,0


In [17]:
print(df_user_row_final.shape)
# df_user_row_final = df_user_row_final.set_index('listing_id')
df_user_row_final.head(1)

(1, 14)


Unnamed: 0,listing_id,price,depreciation,bus/mini bus,hatchback,luxury sedan,mid-sized sedan,mpv,others,sports car,stationwagon,suv,truck,van
0,0,0.064385,0.010221,0,0,1,0,0,0,0,0,0,0,0


## Finding similarity between custom user's item and other items

In [18]:
x_user = df_user_row_final['listing_id'].to_numpy()
y_user = df_user_row_final.iloc[:,1:].to_numpy()
print(x_user.shape,y_user.shape)
x_items = df_prepared_final['listing_id'].to_numpy()
y_items = df_prepared_final.iloc[:,1:].to_numpy()
print(x_items.shape, y_items.shape)
similarity_user_item = cosine_similarity(y_user, y_items)[0]


(1,) (1, 13)
(16728,) (16728, 13)


In [61]:
print(similarity_user_item.shape)
print(similarity_user_item)
ind_ordered = np.argsort(similarity_user_item)[::-1]
print(ind_ordered)

similarity_user_item_ordered = similarity_user_item[ind_ordered]
print(similarity_user_item_ordered.shape, ind_ordered.shape)
print('similarity_user_item_ordered: ', similarity_user_item_ordered)

(16728,)
[0.99915064 0.00102364 0.99946977 ... 0.00157627 0.00180285 0.01431582]
[ 3581  1364 15361 ... 16346  2527 14920]
(16728,) (16728,)
similarity_user_item_ordered:  [9.99991416e-01 9.99991310e-01 9.99988891e-01 ... 1.31910009e-04
 1.01029235e-04 5.99413198e-05]


In [77]:
def get_top_recommendations_user_item(similarity_user_item, ind_ordered, x_items, y_items, k, df_prepared_final):
    similarity_user_item_ordered = similarity_user_item[ind_ordered]
    x_items_ordered = x_items[ind_ordered]
    y_items_ordered = y_items[ind_ordered]
    similarity_user_item_ordered = similarity_user_item[ind_ordered]
    x_top_k = x_items_ordered[:k]
    count = 0
    per_num = 100
#     print('np.percentile(similarity_user_item, per_num): ', np.percentile(similarity_user_item, per_num))
    while count < k:
        per_num = per_num - 1
        percentile = np.percentile(similarity_user_item, per_num)
        count = (similarity_user_item_ordered > percentile).sum()
    
    print('count: ', count)
    print('ind_ordered: ', ind_ordered)
#     print('similarity_user_item_ordered[:count]: ', similarity_user_item_ordered[:count])
    
    sample_similarity_user_item_ordered = similarity_user_item_ordered[:count]
    sample_indices = ind_ordered[:count]
    sample_prob = sample_similarity_user_item_ordered / np.sum(sample_similarity_user_item_ordered)
#     print('sample_prob: ', sample_prob)
    topk_indices = np.random.choice(sample_indices, size=k, replace=False, p=sample_prob)
    print('topk_indices: ', topk_indices)
    df_result = df_prepared_final.iloc[topk_indices,:]
#     df_result = df_prepared_final.iloc[ind_ordered[:k],:]
    
    return df_result

In [86]:
df_top = get_top_recommendations_user_item(similarity_user_item, ind_ordered, x_items, y_items, 10, train)
# df_top
print('user_pref_dict: ', user_pref_dict)

count:  168
ind_ordered:  [ 3581  1364 15361 ... 16346  2527 14920]
topk_indices:  [  166 15808  1177 16617 16231  6179 14571  2266  1090  8289]
user_pref_dict:  {'price': '$180,001 to $200,000', 'depreciation': '$11k to $12k /yr', 'type_of_vehicle': 'luxury sedan'}


In [88]:
df_top.loc[:, ['listing_id', 'make', 'model', 'depreciation', 'price', 'type_of_vehicle']]

Unnamed: 0,listing_id,make,model,depreciation,price,type_of_vehicle
166,974922,mercedes-benz,amg,16440.0,192300.0,luxury sedan
15808,1022049,mercedes-benz,e200,18630.0,189000.0,luxury sedan
1177,966499,mercedes-benz,amg,15700.0,186800.0,luxury sedan
16617,1025067,mercedes-benz,cla180,15070.0,174700.0,luxury sedan
16231,986041,bmw,520d,18180.0,191300.0,luxury sedan
6179,1026590,bmw,318i,15740.0,190100.0,luxury sedan
14571,1020341,mercedes-benz,amg,15860.0,186900.0,luxury sedan
2266,1023930,bmw,420i,16720.0,179100.0,luxury sedan
1090,1028532,mercedes-benz,a200,13850.0,172700.0,luxury sedan
8289,1027429,bmw,318i,16160.0,194500.0,luxury sedan


In [30]:
# print(cosine_similarity(y_user.reshape(1,-1), y_items[3581].reshape(1,-1)))
# print(cosine_similarity(y_user.reshape(1,-1), y_items[1364].reshape(1,-1)))
# print(cosine_similarity(y_user.reshape(1,-1), y_items[15361].reshape(1,-1)))
# print(cosine_similarity(y_user.reshape(1,-1), y_items[875].reshape(1,-1)))
print(similarity_user_item_ordered[:25])
print('user_pref_dict: ', user_pref_dict)
percentile = np.percentile(similarity_user_item_ordered, 99)
print('percentile: ', percentile)
(similarity_user_item_ordered > percentile).sum()

[0.99999142 0.99999131 0.99998889 0.9999886  0.99998844 0.99998798
 0.99998783 0.99998761 0.99998732 0.99998699 0.99998674 0.99998673
 0.99998655 0.99998624 0.99998611 0.99998562 0.99998546 0.99998513
 0.99998498 0.99998435 0.99998431 0.99998421 0.99998417 0.99998356
 0.99998338]
user_pref_dict:  {'price': '$180,001 to $200,000', 'depreciation': '$11k to $12k /yr', 'type_of_vehicle': 'luxury sedan'}
percentile:  0.999962818855026


168

In [22]:
import seaborn as sns

a = pd.Series(similarity_user_item_ordered) > 0.9
a.sum()


3045