In [111]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from google.cloud import bigquery
from scipy.sparse import csr_matrix
from scipy.sparse import save_npz
from fuzzywuzzy import process
import implicit
import pickle
import os
dir_path = os.path.dirname(os.getcwd())
import sys  
sys.path.append(os.path.join(dir_path, "src"))
from utils import create_map

In [113]:
# Load the product catalogue; later cells read its 'itemId' and 'itemName' columns
# to translate between product ids and display names.
products = pd.read_csv(os.path.join(dir_path, 'data', 'raw', 'product.csv'))

In [114]:
# Load the normalized interaction table and preview it. The columns used further
# down are 'visitorId', 'itemId' and 'normalized_session_duration'.
# NOTE(review): the 'Unnamed: 0' column in the preview looks like a saved index -
# consider index_col=0 when reading, after confirming nothing depends on it.
norm_df = pd.read_csv(os.path.join(dir_path, 'data', 'normalized_data.csv'))
norm_df.head()

Unnamed: 0,id,visitorId,itemId,normalized_session_duration
0,0,50743654948546081-2,GGOEGAAX0031,0.120099
1,1,5342238128015433624-1,GGOEGAAX0031,0.122196
2,2,966739175376367480-2,GGOEGAAX0031,0.428319
3,3,8321419770612068321-1,GGOEGAAX0031,0.000883
4,4,4132865605127470100-1,GGOEGAAX0031,0.526131


In [115]:
# create_map is imported from src/utils and is not visible in this notebook.
# Presumably it returns a sparse interaction matrix plus id <-> index lookups - TODO confirm.
# NOTE: user_mapper and item_mapper are rebound by later cells in this notebook,
# so the values returned here are overwritten once those cells run.
sparse_X, user_mapper, item_mapper, userIdx_id, itemIdx_id =  create_map(norm_df)


In [130]:
def get_top_items(n, df=norm_df):
    """Return the ids and names of the ``n`` most viewed items in ``df``.

    Items are ranked by the number of non-null 'visitorId' entries recorded
    for each 'itemId', most viewed first.

    Parameters
    ----------
    n : int
        Maximum number of items to return. If ``df`` holds fewer distinct
        items, all of them are returned; ``n <= 0`` yields two empty lists.
    df : pandas.DataFrame
        Interaction rows with 'itemId' and 'visitorId' columns.

    Returns
    -------
    (list, list)
        Item ids and the matching product names, in the same order.

    Raises
    ------
    KeyError
        If a top item id has no entry in the global ``products`` catalogue.
    """
    views_per_item = df.groupby('itemId').count()
    ranked = views_per_item.sort_values('visitorId', ascending=False)

    # Slicing (instead of indexing positions 0..n-1) caps the result at the
    # number of distinct items rather than raising IndexError.
    top_items_id = list(ranked.index[:max(n, 0)])

    ## Map product ID's with product names
    itemId_name = dict(zip(products['itemId'], products['itemName']))
    top_items_name = [itemId_name[item_id] for item_id in top_items_id]

    return top_items_id, top_items_name  # Return the top item Ids and Item names from df

In [131]:
# Report the ten most viewed products, first by name and then by id.
top_items_id, top_items_name = get_top_items(10, norm_df)
# A single print with sep="\n" emits the same text as three consecutive prints
# (names, a blank-line spacer, ids).
print(
    "Top 10 most viewed items are :\n {}".format(top_items_name),
    "\n",
    "Top 10 most viewed item Ids are :\n {}".format(top_items_id),
    sep="\n",
)

Top 10 most viewed items are :
 ["Android Women's Short Sleeve Hero Tee Black", 'Softsided Travel Pouch Set', 'Waterpoof Gear Bag', "Google Men's  Zip Hoodie", 'Keyboard DOT Sticker', 'Sport Bag', 'Google Laptop and Cell Phone Stickers', 'Android Toddler Short Sleeve T-shirt Pewter', 'Google Sunglasses', 'Gel Roller Pen']


Top 10 most viewed item Ids are :
 ['GGOEGAAX0104', 'GGOEGBRJ037299', 'GGOEGBRA037499', 'GGOEGAAX0358', 'GGOEGFKA022299', 'GGOEGBRJ037399', 'GGOEGFKQ020399', 'GGOEGAAX0105', 'GGOEGAAX0037', 'GGOEGDHQ015399']


### Check how sparse the data is

In [132]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

In [150]:
## Resolve a (possibly misspelled) product name to the closest catalogue name
def item_finder(itemName):
    """Return the name in ``products['itemName']`` that best matches ``itemName``.

    Matching is delegated to ``process.extractOne``; only the matched name is
    returned, the rest of the result (the score) is dropped.
    """
    candidates = products["itemName"].tolist()
    best_name, *_ = process.extractOne(itemName, candidates)
    return best_name

## Catalogue lookups: product id -> product name, and the inverse
itemId_name = {
    item_id: name
    for item_id, name in zip(products['itemId'], products['itemName'])
}
itemName_Id = {
    name: item_id
    for name, item_id in zip(products['itemName'], products['itemId'])
}

## Number of distinct items seen in the interaction data
I = norm_df['itemId'].nunique()
## Matrix position -> item id (positions follow the sorted unique ids)
item_name = dict(zip(range(I), np.unique(norm_df["itemId"])))
## Item id -> matrix position
item_mapper = dict(zip(np.unique(norm_df["itemId"]), range(I)))


## Given an item's index position, get its product name
def get_itemName(item_idx):
    """Translate an item index into a product name.

    The index is first mapped to an item id through ``item_name`` and that id
    is then looked up in ``itemId_name``. A missing key at either step raises
    KeyError.
    """
    return itemId_name[item_name[item_idx]]

## Given a (possibly misspelled) product name, get the item's index position
def get_item_index(itemName):
    """Translate a product name into its item index.

    The name is normalised with ``item_finder``, converted to an item id via
    ``itemName_Id`` and finally to an index via ``item_mapper``. A missing key
    in either dictionary raises KeyError.
    """
    matched_name = item_finder(itemName)
    return item_mapper[itemName_Id[matched_name]]

In [133]:
# Number of distinct visitors and items in the interaction data
V = norm_df['visitorId'].nunique()
I = norm_df['itemId'].nunique()

# id -> matrix position (positions follow the sorted unique ids)
user_mapper = dict(zip(np.unique(norm_df["visitorId"]), range(V)))
item_mapper = dict(zip(np.unique(norm_df["itemId"]), range(I)))

# matrix position -> id (inverse lookups)
user_name = dict(zip(range(V), np.unique(norm_df["visitorId"])))
item_name = dict(zip(range(I), np.unique(norm_df["itemId"])))

# Row / column position of every interaction row, in norm_df order
user_index = list(map(user_mapper.__getitem__, norm_df['visitorId']))
item_index = list(map(item_mapper.__getitem__, norm_df['itemId']))

In [8]:
# Visitor x item matrix of shape (V, I): entry (u, i) carries the normalized
# session duration for the visitor at position u and the item at position i.
X = csr_matrix((norm_df["normalized_session_duration"], (user_index, item_index)), shape=(V, I))

In [124]:
# Expect (V, I): one row per distinct visitor, one column per distinct item.
X.shape

(58531, 374)

In [125]:
# Share of cells in X that hold a non-zero value. The non-zero count and the
# dimensions are both read from X, so the ratio stays correct even if sparse_X
# has a different shape.
# NOTE(review): this ratio is the fill rate (density) of the matrix; the usual
# definition of sparsity is 1 - density. The variable name and the printed label
# are kept unchanged here - rename both if no other code relies on them.
n_rows, n_cols = X.shape
sparsity = X.count_nonzero() / (n_rows * n_cols)
print(f"Matrix sparsity: {round(sparsity*100,2)}%")

Matrix sparsity: 0.58%


In [173]:
# Pairwise cosine similarity between the rows of sparse_X, kept as a sparse matrix.
# The printed row/column ids run up to 373, which matches the item count, so the
# rows of sparse_X are presumably items - confirm against create_map.
item_sparse = cosine_similarity(sparse_X, dense_output=False)
print('pairwise sparse output:\n {}\n'.format(item_sparse))

pairwise sparse output:
   (0, 0)	1.0
  (1, 209)	0.0006046087822716545
  (1, 319)	0.006068482219230545
  (1, 307)	0.002199612909538996
  (1, 16)	0.0013616264934660685
  (1, 302)	0.002731591322448519
  (1, 295)	1.5696283344448033e-07
  (1, 72)	0.003503473616293597
  (1, 62)	0.0002785655404699201
  (1, 61)	0.00037063778760618236
  (1, 13)	0.0017013727587772245
  (1, 2)	0.0003940870077534431
  (1, 337)	0.009129177844462957
  (1, 122)	2.735950480824398e-07
  (1, 81)	0.0033021130167198093
  (1, 322)	0.00044890538976895554
  (1, 244)	0.00014480295320994874
  (1, 241)	0.00018516579327811664
  (1, 143)	0.001066094081889249
  (1, 136)	0.0006415852794069728
  (1, 134)	0.0005920906089437864
  (1, 128)	0.0006909158887276666
  (1, 114)	0.0003983841588610124
  (1, 12)	0.01303422171206371
  (1, 368)	0.041500481141509914
  :	:
  (373, 19)	0.0001838549548328389
  (373, 17)	0.00019399878766881997
  (373, 369)	0.05271340954951151
  (373, 350)	0.0030730793458416854
  (373, 243)	0.0021799711448686125
  (37

In [174]:
# For every row of the item-item similarity matrix keep the indices of its five
# highest-scoring entries, stored as plain lists keyed by row position.
# NOTE: top_n_idx_sparse is defined in a later cell of this notebook; on a fresh
# kernel that cell has to be executed before this one.
item_item_similar = top_n_idx_sparse(item_sparse, 5)
item_item_similar_dict = {}
for idx, val in enumerate(item_item_similar):
    item_item_similar_dict[idx] = val.tolist()
item_item_similar_dict

{0: [0],
 1: [373, 15, 341, 14, 1],
 2: [295, 254, 229, 255, 2],
 3: [260, 258, 257, 26, 3],
 4: [363, 260, 19, 4, 26],
 5: [26, 29, 266, 262, 5],
 6: [5, 12, 268, 269, 6],
 7: [296, 298, 8, 7, 297],
 8: [296, 35, 297, 7, 8],
 9: [310, 311, 18, 309, 9],
 10: [10],
 11: [173, 367, 18, 11, 308],
 12: [315, 17, 251, 301, 12],
 13: [302, 355, 368, 337, 13],
 14: [224, 369, 1, 345, 14],
 15: [350, 1, 20, 351, 15],
 16: [18, 308, 311, 309, 16],
 17: [12, 19, 301, 250, 17],
 18: [311, 16, 9, 11, 18],
 19: [17, 239, 260, 19, 4],
 20: [15, 351, 340, 350, 20],
 21: [251, 301, 30, 21, 353],
 22: [72, 24, 208, 23, 22],
 23: [86, 25, 24, 22, 23],
 24: [72, 22, 23, 25, 24],
 25: [88, 23, 24, 25, 72],
 26: [259, 29, 4, 3, 26],
 27: [21, 353, 301, 27, 30],
 28: [27, 254, 228, 28, 226],
 29: [259, 262, 266, 265, 29],
 30: [256, 21, 27, 353, 30],
 31: [56, 32, 89, 44, 31],
 32: [44, 90, 50, 207, 32],
 33: [110, 39, 34, 42, 33],
 34: [148, 49, 92, 33, 34],
 35: [36, 113, 107, 47, 35],
 36: [47, 125, 210,

In [176]:
# Print the product names of the items ranked most similar to item index 1.
for i in item_item_similar_dict[1]:
    print(get_itemName(i))

Gunmetal Roller Ball Pen
Waterproof Gear Bag
Gel Roller Pen
Colored Pencil Set
Google Men's 100% Cotton Short Sleeve Hero Tee Navy


In [127]:
# print(sparse_X.toarray())

In [15]:
# similarities = cosine_similarity(X)
# print('pairwise dense output:\n {}\n'.format(similarities))

In [129]:
## User-user cosine similarity between the rows of X (one row per visitor).
## dense_output=False asks cosine_similarity to return a sparse result instead of
## a dense array.
similarities_sparse = cosine_similarity(X,dense_output=False)
print('pairwise sparse output:\n {}\n'.format(similarities_sparse))


pairwise sparse output:
   (1, 58529)	0.415030114156755
  (1, 56189)	0.6519914609260814
  (1, 55766)	0.6160791148857382
  (1, 54771)	0.6519914609260814
  (1, 54684)	0.506923270828922
  (1, 54294)	0.6519914609260814
  (1, 54209)	3.5989928588288325e-05
  (1, 54208)	0.304780841879642
  (1, 54118)	0.6519914609260814
  (1, 53542)	0.6519914609260814
  (1, 53278)	0.6519914609260814
  (1, 53260)	0.6519914609260814
  (1, 52838)	0.6519914609260814
  (1, 51741)	0.6519914578346309
  (1, 51197)	0.6519914609260814
  (1, 49127)	0.6519914609260814
  (1, 48543)	0.6519914609260814
  (1, 47935)	0.29171684024190303
  (1, 47585)	0.23280924426100413
  (1, 46779)	0.6519914609260814
  (1, 44903)	0.09344294710759565
  (1, 44812)	0.1870928010096155
  (1, 43890)	0.6519914609260814
  (1, 43037)	0.29420671447110947
  (1, 41239)	0.5637033288760351
  :	:
  (58530, 1799)	1.0
  (58530, 1665)	0.9999999648908703
  (58530, 1575)	0.9916110702462212
  (58530, 1511)	0.3660745382951204
  (58530, 1422)	1.0
  (58530, 1401)	1.0

In [None]:
# Persist the user-user similarity matrix under <dir_path>/data.
# NOTE(review): per the SciPy docs save_npz appends '.npz' to a file name that lacks
# it, so the file on disk should be 'user_user_similarity.npz' - confirm when loading.
save_npz(os.path.join(dir_path, 'data', 'user_user_similarity'), similarities_sparse)

In [177]:
def top_n_idx_sparse(matrix, n):
    '''Return, for each row of a sparse matrix, the column indices of its top n stored values.

    Parameters
    ----------
    matrix : sparse matrix exposing ``indptr``, ``indices`` and ``data``
        The row-wise reading assumes CSR layout.
    n : int
        Maximum number of indices to keep per row. Rows with fewer than ``n``
        stored entries return all of them; ``n <= 0`` returns an empty array
        for every row.

    Returns
    -------
    list of numpy.ndarray
        One array of column indices per row. The indices inside an array are
        not sorted by value (they come from ``np.argpartition``).
    '''
    top_n_idx = []
    for le, ri in zip(matrix.indptr[:-1], matrix.indptr[1:]):
        n_row_pick = min(n, ri - le)
        if n_row_pick <= 0:
            # Nothing to pick. This guard is required: with a pick count of 0,
            # `-0` as kth and `[-0:]` as a slice would select the whole row.
            top_n_idx.append(matrix.indices[le:le])
            continue
        # Positions (relative to the row start) of the n_row_pick largest values
        row_top = np.argpartition(matrix.data[le:ri], -n_row_pick)[-n_row_pick:]
        top_n_idx.append(matrix.indices[le + row_top])
    return top_n_idx

In [178]:
# For every visitor row keep the positions of its ten highest-scoring entries in
# the user-user similarity matrix, stored as plain lists keyed by row position.
user_user_similar = top_n_idx_sparse(similarities_sparse, 10)
user_user_similar_dict = {}
for idx, val in enumerate(user_user_similar):
    user_user_similar_dict[idx] = val.tolist()
user_user_similar_dict

{0: [],
 1: [36110, 39515, 12250, 13926, 49676, 26448, 30423, 1681, 56479, 1],
 2: [15991, 38665, 50043, 38897, 17999, 51334, 38537, 5491, 17466, 2],
 3: [7803, 16684, 30563, 26752, 30562, 28298, 6365, 6080, 25211, 3],
 4: [28477, 32621, 37893, 32962, 33168, 8868, 34001, 36369, 7907, 4],
 5: [28477, 32621, 37893, 32962, 33168, 8868, 34001, 36369, 7907, 4],
 6: [25377, 27006, 27623, 26009, 24964, 25376, 25198, 25188, 25139, 6],
 7: [34318, 12151, 13712, 37362, 37506, 12792, 12757, 39032, 12230, 7],
 8: [4129, 47821, 28316, 30970, 31708, 32294, 57978, 35072, 46576, 8],
 9: [26860, 27273, 27253, 26705, 27479, 27295, 27321, 27335, 27372, 9],
 10: [11539, 13463, 57490, 39631, 2758, 13228, 55624, 22286, 24659, 10],
 11: [27350, 29203, 26937, 26045, 25935, 25218, 40462, 29178, 1132, 11],
 12: [17299, 37378, 13770, 14387, 39301, 14011, 39379, 39427, 12, 42759],
 13: [27133, 27789, 25573, 26288, 26245, 8700, 8710, 26025, 25574, 55964],
 14: [34452, 57571, 44887, 44674, 36923, 38290, 20294, 8818

In [179]:
# Translate matrix positions back into the visitor ids found in the data, both
# for the dictionary keys and for every entry in each similar-user list.
similar_users_final = {}
for user, similar_users in user_user_similar_dict.items():
    idx = user_name[user]
    values = [user_name[value] for value in similar_users]
    similar_users_final[idx] = values
similar_users_final

{'0000010278554503158-1': [],
 '0000020424342248747-1': ['6132901374101424778-1',
  '6707561055105273161-1',
  '2098669102660886301-2',
  '2384713621516170241-3',
  '849338347058364945-1',
  '4519112442186338032-1',
  '5197987748105757784-1',
  '0292514705436162768-1',
  '9670527874451274496-1',
  '0000020424342248747-1'],
 '000005103959234087-1': ['2729410989075560896-1',
  '6565341438621206544-1',
  '855598268859013226-1',
  '6605859878848616970-3',
  '3080862967628224448-4',
  '8764646379190942972-3',
  '654513487704580777-1',
  '096830421763570474-1',
  '2981307355529857789-1',
  '000005103959234087-1'],
 '0000168159078983594-1': ['1360590166363734644-1',
  '2843908057360952517-2',
  '5224940413608540493-2',
  '4573974500543961874-1',
  '5224940413608540493-1',
  '4834997823438927679-1',
  '1113357042631084599-4',
  '1065272532565323194-1',
  '4303041016027387041-3',
  '0000168159078983594-1'],
 '0000174067426171406-1': ['4868029080481437587-1',
  '5571074961202476553-2',
  '644268

In [185]:
## Items liked by a particular user
def liked_items(visitorId, dataframe = norm_df):
    """Return the product names of every item the given user interacted with.

    Parameters
    ----------
    visitorId : int
        Position of the user in the interaction matrix (a key of the global
        ``user_name``), not the raw visitor id string.
    dataframe : pandas.DataFrame
        Interaction rows with 'visitorId' and 'itemId' columns.

    Returns
    -------
    list
        One product name per matching row, in row order (duplicates kept).

    Raises
    ------
    KeyError
        If the position is not in ``user_name`` or an item id is missing from
        ``itemId_name``.
    """
    user_id = user_name[visitorId]
    # Build the mask from `dataframe` itself, so a frame other than norm_df is
    # filtered by its own rows rather than by norm_df's.
    likes = dataframe.loc[dataframe['visitorId'] == user_id, 'itemId'].values.tolist()
    return [itemId_name[item_id] for item_id in likes]
# Example: product names for the visitor at matrix position 1.
liked_items(1, norm_df)

["Google Men's 100% Cotton Short Sleeve Hero Tee White",
 'Waterpoof Gear Bag',
 "Google Men's Performance 1/4 Zip Pullover Heather/Black"]

In [226]:
def recommend_product(q):
    """Print product recommendations for the user at matrix position ``q``.

    Strategy, in order:

    1. Unknown user (``q`` outside ``0..V-1``): print the ten most viewed
       products.
    2. Known user: collect the products liked by the similar users listed in
       ``user_user_similar_dict[q]`` and recommend those the user has not
       liked yet.
    3. If step 2 yields nothing: for each product the user liked, recommend
       the items listed for it in ``item_item_similar_dict`` that the user has
       not liked yet.

    Parameters
    ----------
    q : int
        User index number (row position in the interaction matrix).

    Returns
    -------
    None
        Results are printed, not returned.
    """
    # Cold start: positions outside the known range have no history to use.
    if q < 0 or q >= V:
        top_items_id, top_items_name = get_top_items(10, norm_df)
        print("Recommend top selling products : \n{}".format(top_items_name))
        return

    q_likes = list(liked_items(q, norm_df))
    print("Items liked by user {} are : \n {}".format(q, q_likes))

    # Union of what the similar users liked, in first-seen order.
    other_likes = []
    for neighbour in user_user_similar_dict[q]:
        if neighbour == q:
            # The similarity list may contain the user itself.
            continue
        for product in liked_items(neighbour, norm_df):
            if product not in other_likes:
                other_likes.append(product)
    print("\n")
    print("Items liked by similar users \n{}".format(other_likes))

    recom = [item for item in other_likes if item not in q_likes]

    if recom:
        print("\n")
        print("Product recommendation for you \n{}".format(recom))
        print("\n")
        return

    # Fallback: item-item similarity, looked up for each liked product's own
    # index, skipping products already liked and already recommended.
    for liked_name in q_likes:
        item_indx = get_item_index(liked_name)
        for similar_idx in item_item_similar_dict[item_indx]:
            candidate = get_itemName(similar_idx)
            if candidate not in q_likes and candidate not in recom:
                recom.append(candidate)

    print("\n")
    print("New Product recommendation for you \n{}".format(recom))

In [227]:
# lo = ['Waterpoof Gear Bag', "Google Men's 100% Cotton Short Sleeve Hero Tee White", 'Waterproof Gear Bag']
# item_finder("Waterprof Gear Bag")

In [228]:
# 900000 is far beyond the number of known visitors (X has 58531 rows), so this
# call exercises the top-selling-products fallback.
recommend_product(900000)

Recommend top selling products : 
["Android Women's Short Sleeve Hero Tee Black", 'Softsided Travel Pouch Set', 'Waterpoof Gear Bag', "Google Men's  Zip Hoodie", 'Keyboard DOT Sticker', 'Sport Bag', 'Google Laptop and Cell Phone Stickers', 'Android Toddler Short Sleeve T-shirt Pewter', 'Google Sunglasses', 'Gel Roller Pen']
