In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from scipy.sparse import save_npz
from fuzzywuzzy import process
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
import scipy
import pickle
import os
dir_path = os.path.dirname(os.getcwd())
# import sys  
# sys.path.append(os.path.join(dir_path, "src"))
# from utils import create_map



In [4]:
# Product catalogue with itemId, itemName and itemCategory columns.
products_csv_path = os.path.join(dir_path, 'data', 'raw', 'product.csv')
products = pd.read_csv(products_csv_path)
products.head()

Unnamed: 0,itemId,itemName,itemCategory
0,GGOEGODR017799,Recycled Mouse Pad,Home/Electronics/Electronics Accessories/
1,GGOEGFKA022299,Keyboard DOT Sticker,Home/Office/
2,GGOEGAAX0358,Google Men's Zip Hoodie,Home/Apparel/
3,GGOEGFKQ020399,Google Laptop and Cell Phone Stickers,Home/Fun/
4,GGOEGDWC020199,Engraved Ceramic Google Mug,Mugs/


In [5]:
# Interaction table with visitorId, itemId and normalized_session_duration columns.
normalized_csv_path = os.path.join(dir_path, 'data', 'normalized_data.csv')
norm_df = pd.read_csv(normalized_csv_path)
norm_df.head()

Unnamed: 0,id,visitorId,itemId,normalized_session_duration
0,0,50743654948546081-2,GGOEGAAX0031,0.120099
1,1,5342238128015433624-1,GGOEGAAX0031,0.122196
2,2,966739175376367480-2,GGOEGAAX0031,0.428319
3,3,8321419770612068321-1,GGOEGAAX0031,0.000883
4,4,4132865605127470100-1,GGOEGAAX0031,0.526131


In [6]:
# (data_sparse != sparse_X).nnz==0 
# Gather the figures first, then print the summary in one place.
n_rows, n_cols = norm_df.shape
n_visitors = norm_df['visitorId'].nunique()
n_items = norm_df['itemId'].nunique()

print("------- DATA SUMMARY -------\n")
print("Number of rows in processed data : {}".format(n_rows))
print("Number of columns in processed data : {}\n".format(n_cols))
print("Number of Customers : {}".format(n_visitors))
print("Total number of products : {}\n".format(n_items))

------- DATA SUMMARY -------

Number of rows in processed data : 137514
Number of columns in processed data : 4

Number of Customers : 58531
Total number of products : 374



In [7]:
# Sorted unique ids define the matrix axes: an id's position in the sorted
# array is its row (visitor) or column (item) index.
sorted_visitor_ids = np.sort(norm_df['visitorId'].unique())
sorted_item_ids = np.sort(norm_df['itemId'].unique())

V = norm_df['visitorId'].nunique()  # number of distinct visitors
I = norm_df['itemId'].nunique()     # number of distinct items

# id -> index
visitor_mapper = dict(zip(sorted_visitor_ids, range(V)))
item_mapper = dict(zip(sorted_item_ids, range(I)))

# index -> id (inverse of the mappers above)
visitor_name = dict(zip(range(V), sorted_visitor_ids))
item_name = dict(zip(range(I), sorted_item_ids))

# Row index (visitor) and column index (item) of every interaction row
visitor_index = [visitor_mapper[visitor_id] for visitor_id in norm_df['visitorId']]
item_index = [item_mapper[item_id] for item_id in norm_df['itemId']]

# Product id -> product name, taken from the catalogue
itemId_name = dict(zip(products['itemId'], products['itemName']))
# Product name -> product id; if two ids share a name, the later catalogue row wins
itemName_Id = dict(zip(products['itemName'], products['itemId']))

In [8]:
## Resolve a (possibly misspelled) product name to the closest catalogue name
def item_finder(itemName):
    """Return the catalogue product name that best fuzzy-matches `itemName`.

    Candidates are all values of ``products["itemName"]``; the best one is
    picked with fuzzywuzzy's ``process.extractOne`` and only the matched name
    (first element of its result) is returned.
    """
    return process.extractOne(itemName, products["itemName"].tolist())[0]

## With item index get product name
def get_itemName(item_idx):
    """Return the product name for the item at matrix index `item_idx`.

    The index is first translated to an item id through ``item_name`` and the
    id is then translated to a name through ``itemId_name``. A missing key in
    either dictionary raises ``KeyError``.
    """
    return itemId_name[item_name[item_idx]]

## With (possibly misspelled) item name get item index value
def get_item_index(itemName):
    """Return the matrix index of the product whose name best matches `itemName`.

    Lookup chain: fuzzy-matched catalogue name (``item_finder``) -> item id
    (``itemName_Id``) -> matrix index (``item_mapper``). A missing key in either
    dictionary raises ``KeyError``.
    """
    return item_mapper[itemName_Id[item_finder(itemName)]]

Since our goal is to use cosine similarity to measure how close visitors are to each other, we need to transform our dataset from a dense to a sparse representation. To achieve that, each visitor is represented by a single row of the matrix, and the columns hold that visitor's normalized session duration for each item.

In [9]:
def create_map(norm_df, item_item = False):
    """Build the sparse interaction matrix from an interaction table.

    Row/column positions are derived from the frame that is passed in, so the
    function does not depend on index lists computed elsewhere and works for
    any frame with the expected columns. Ids are ordered ascending, which is
    the same ordering as ``np.sort(column.unique())``.

    Parameters
    ----------
    norm_df : pandas.DataFrame
        Must contain ``visitorId``, ``itemId`` and
        ``normalized_session_duration`` columns.
    item_item : bool, default False
        If falsy, return a visitor x item matrix; if truthy, return the
        transposed layout (item x visitor).

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix whose entries are the normalized session durations. Repeated
        (visitor, item) pairs are summed by the ``csr_matrix`` constructor.
    """
    # np.unique returns the ids sorted ascending and, for every row of the
    # frame, the position of its id inside that sorted array.
    visitor_ids, visitor_pos = np.unique(norm_df["visitorId"].to_numpy(), return_inverse=True)
    item_ids, item_pos = np.unique(norm_df["itemId"].to_numpy(), return_inverse=True)
    durations = norm_df["normalized_session_duration"].to_numpy()

    if item_item:
        rows, cols = item_pos, visitor_pos
        shape = (len(item_ids), len(visitor_ids))
    else:
        rows, cols = visitor_pos, item_pos
        shape = (len(visitor_ids), len(item_ids))
    return csr_matrix((durations, (rows, cols)), shape=shape)

In [10]:
# Visitor x item matrix (visitors as rows)
sparse_X = create_map(norm_df, item_item=False)

### Calculating the distance among Visitors

In [11]:
# Cosine similarity between every pair of rows of sparse_X, i.e. between every pair of visitors.
# dense_output=False keeps the result as a SciPy sparse matrix instead of a dense ndarray.
similarities_sparse = cosine_similarity(sparse_X, dense_output=False)
print('pairwise sparse output:\n {}\n'.format(similarities_sparse))

pairwise sparse output:
   (1, 58529)	0.415030114156755
  (1, 56189)	0.6519914609260814
  (1, 55766)	0.6160791148857382
  (1, 54771)	0.6519914609260814
  (1, 54684)	0.506923270828922
  (1, 54294)	0.6519914609260814
  (1, 54209)	3.5989928588288325e-05
  (1, 54208)	0.304780841879642
  (1, 54118)	0.6519914609260814
  (1, 53542)	0.6519914609260814
  (1, 53278)	0.6519914609260814
  (1, 53260)	0.6519914609260814
  (1, 52838)	0.6519914609260814
  (1, 51741)	0.6519914578346309
  (1, 51197)	0.6519914609260814
  (1, 49127)	0.6519914609260814
  (1, 48543)	0.6519914609260814
  (1, 47935)	0.29171684024190303
  (1, 47585)	0.23280924426100413
  (1, 46779)	0.6519914609260814
  (1, 44903)	0.09344294710759565
  (1, 44812)	0.1870928010096155
  (1, 43890)	0.6519914609260814
  (1, 43037)	0.29420671447110947
  (1, 41239)	0.5637033288760351
  :	:
  (58530, 1799)	1.0
  (58530, 1665)	0.9999999648908703
  (58530, 1575)	0.9916110702462212
  (58530, 1511)	0.3660745382951204
  (58530, 1422)	1.0
  (58530, 1401)	1.0

Here we use dense_output=False to get the output as a SciPy sparse matrix. We take this step to make sure that the matrix fits in memory; otherwise the output would be a NumPy ndarray, which isn't as efficient for storing large, sparse data.

The shape of our similarities_sparse is (# of visitorID, # of visitorID) and the values are the similarity scores computed for each Visitor against every other Visitor in the dataset.

In [12]:
similarities_sparse.shape

(58531, 58531)

Next, for every visitor we need to get the top K most similar visitors so that we can look at which items they liked and make suggestions - that's where the actual collaborative filtering happens.

The function top_n_idx_sparse below takes a scipy.sparse CSR matrix as input and returns, for each row, the column indices of the K largest stored values (in no particular order); that is how we get the most similar visitors for each visitor in our dataset.

In [13]:
###  Return index of top n values in each row of a sparse matrix
def top_n_idx_sparse(matrix, n):
    """Return, for every row, the column indices of its `n` largest stored values.

    Parameters
    ----------
    matrix : scipy.sparse.csr_matrix
        Read through its ``indptr``, ``indices`` and ``data`` arrays, so it
        must be in CSR format. Only explicitly stored entries are considered.
    n : int
        Maximum number of indices to return per row. Rows with fewer stored
        entries return all of them; ``n <= 0`` returns an empty array per row.

    Returns
    -------
    list of numpy.ndarray
        One array of column indices per row. The indices inside an array are
        NOT ordered by value (``np.argpartition`` only separates the top group).
    """
    top_n_idx = []
    for le, ri in zip(matrix.indptr[:-1], matrix.indptr[1:]):
        n_row_pick = min(n, ri - le)
        if n_row_pick <= 0:
            # Nothing to pick (empty row or n <= 0). Handled explicitly because
            # a `[-0:]` slice would otherwise select the whole row.
            top_n_idx.append(matrix.indices[le:le].copy())
            continue
        row_values = matrix.data[le:ri]
        # Positions (within this row's slice) of the n_row_pick largest values
        picked = np.argpartition(row_values, -n_row_pick)[-n_row_pick:]
        top_n_idx.append(matrix.indices[le + picked])
    return top_n_idx

In [14]:
# For each visitor (row), the indices of up to 5 visitors with the highest similarity.
# A visitor's own index can appear in its list (see the output below), so
# consumers have to skip it themselves.
visitor_visitor_similar = top_n_idx_sparse(similarities_sparse, 5)
visitor_visitor_similar

[array([], dtype=int32),
 array([26448, 30423,  1681, 56479,     1], dtype=int32),
 array([38537, 17999,  5491, 51334,     2], dtype=int32),
 array([ 6365, 30562,  6080, 28298,     3], dtype=int32),
 array([37893, 34001, 32962, 33168,     4], dtype=int32),
 array([37893, 34001, 32962, 33168,     4], dtype=int32),
 array([25198, 24964, 25188, 25376,     6], dtype=int32),
 array([12151, 13712, 37506, 37362,     7], dtype=int32),
 array([32294, 57978, 35072, 46576,     8], dtype=int32),
 array([27321, 27479, 27335, 27295,     9], dtype=int32),
 array([ 2758, 55624, 22286, 24659,    10], dtype=int32),
 array([25935, 29178, 40462,  1132,    11], dtype=int32),
 array([13770, 14387, 39301,    12, 42759], dtype=int32),
 array([25573,  8710, 26288, 26245, 55964], dtype=int32),
 array([43422, 44674, 36923, 38290,    14], dtype=int32),
 array([47555, 47573, 53557, 23175,    15], dtype=int32),
 array([16222,   263,  1644, 18273,    16], dtype=int32),
 array([36331, 20803,  9665, 16695,    17], dty

In [15]:
### Top 5 similar visitors to the visitor, keyed by visitor index
visitor_visitor_similar_dict = {
    visitor_idx: neighbours.tolist()
    for visitor_idx, neighbours in enumerate(visitor_visitor_similar)
}
visitor_visitor_similar_dict

{0: [],
 1: [26448, 30423, 1681, 56479, 1],
 2: [38537, 17999, 5491, 51334, 2],
 3: [6365, 30562, 6080, 28298, 3],
 4: [37893, 34001, 32962, 33168, 4],
 5: [37893, 34001, 32962, 33168, 4],
 6: [25198, 24964, 25188, 25376, 6],
 7: [12151, 13712, 37506, 37362, 7],
 8: [32294, 57978, 35072, 46576, 8],
 9: [27321, 27479, 27335, 27295, 9],
 10: [2758, 55624, 22286, 24659, 10],
 11: [25935, 29178, 40462, 1132, 11],
 12: [13770, 14387, 39301, 12, 42759],
 13: [25573, 8710, 26288, 26245, 55964],
 14: [43422, 44674, 36923, 38290, 14],
 15: [47555, 47573, 53557, 23175, 15],
 16: [16222, 263, 1644, 18273, 16],
 17: [36331, 20803, 9665, 16695, 17],
 18: [41658, 42035, 42318, 42236, 18],
 19: [28033, 8362, 26247, 26254, 19],
 20: [23804, 23726, 23794, 4259, 20],
 21: [26304, 26276, 26303, 26312, 58523],
 22: [26610, 51187, 26703, 22, 5290],
 23: [43910, 43667, 43527, 19874, 58514],
 24: [31860, 2125, 8667, 24, 2852],
 25: [44909, 34213, 25, 9722, 4052],
 26: [],
 27: [32580, 58525, 28209, 27, 53156

In [14]:
## Items liked by a particular visitor
def liked_items(visitorId, dataframe = norm_df):
    """Return the product names of every item the given visitor interacted with.

    Parameters
    ----------
    visitorId : int
        Visitor index (a key of ``visitor_name``), not the raw visitor id string.
    dataframe : pandas.DataFrame
        Interaction table with ``visitorId`` and ``itemId`` columns. Defaults
        to ``norm_df`` as it was bound when this function was defined.

    Returns
    -------
    list of str
        One product name per matching row, in row order (repeats are kept).

    Raises
    ------
    KeyError
        If the index is not in ``visitor_name`` or an item id is not in
        ``itemId_name``.
    """
    user_id = visitor_name[visitorId]
    # Build the mask from the frame that is being filtered (not from the global
    # norm_df), so the function is also correct for any other frame.
    item_ids = dataframe.loc[dataframe['visitorId'] == user_id, 'itemId']
    return [itemId_name[item_id] for item_id in item_ids]
liked_items(16, norm_df)

['YouTube Twill Cap',
 'Waterproof Gear Bag',
 "Google Men's 100% Cotton Short Sleeve Hero Tee Black",
 'Google Spiral Leather Journal']

In [15]:
def get_top_items(n, df=norm_df):
    """Return the ids and names of the `n` most viewed items in `df`.

    Items are ranked by the number of rows with a non-null ``visitorId`` per
    ``itemId``, highest first.

    Parameters
    ----------
    n : int
        Number of items wanted. If fewer distinct items exist, all of them are
        returned instead of raising ``IndexError``; ``n <= 0`` yields empty lists.
    df : pandas.DataFrame
        Interaction table with ``itemId`` and ``visitorId`` columns. Defaults
        to ``norm_df`` as it was bound when this function was defined.

    Returns
    -------
    (list, list of str)
        Item ids and the matching product names, in ranking order.

    Raises
    ------
    KeyError
        If a top item id has no entry in ``itemId_name``.
    """
    view_counts = df.groupby('itemId').count()
    ranked = view_counts.sort_values('visitorId', ascending=False)

    # Slicing clamps to the number of available items; max() keeps a negative
    # n from being interpreted as "all but the last |n|".
    top_items_id = list(ranked.index[:max(n, 0)])
    top_items_name = [itemId_name[item_id] for item_id in top_items_id]
    return top_items_id, top_items_name

In [16]:
# Popularity baseline: the 10 most viewed items. These module-level names are
# reused by the later cell that compares them with the recommendations.
top_items_id, top_items_name = get_top_items(10, norm_df)
print("Top 10 most viewed items are :\n {}".format(top_items_name))
print("\n")
print("Top 10 most viewed item Ids are :\n {}".format(top_items_id))

Top 10 most viewed items are :
 ["Android Women's Short Sleeve Hero Tee Black", 'Softsided Travel Pouch Set', 'Waterpoof Gear Bag', "Google Men's  Zip Hoodie", 'Keyboard DOT Sticker', 'Sport Bag', 'Google Laptop and Cell Phone Stickers', 'Android Toddler Short Sleeve T-shirt Pewter', 'Google Sunglasses', 'Gel Roller Pen']


Top 10 most viewed item Ids are :
 ['GGOEGAAX0104', 'GGOEGBRJ037299', 'GGOEGBRA037499', 'GGOEGAAX0358', 'GGOEGFKA022299', 'GGOEGBRJ037399', 'GGOEGFKQ020399', 'GGOEGAAX0105', 'GGOEGAAX0037', 'GGOEGDHQ015399']


In [17]:
## Let's find similar items now: build the item x visitor matrix (items as rows)
item_sparse_X = create_map(norm_df, item_item = True)
item_sparse_X

<374x58531 sparse matrix of type '<class 'numpy.float64'>'
	with 137514 stored elements in Compressed Sparse Row format>

In [18]:
# Cosine similarity between every pair of rows of item_sparse_X, i.e. between every pair of items.
item_sparse = cosine_similarity(item_sparse_X, dense_output=False)
print('pairwise sparse output:\n {}\n'.format(item_sparse))

pairwise sparse output:
   (0, 0)	1.0
  (1, 209)	0.0006046087822716545
  (1, 319)	0.006068482219230545
  (1, 307)	0.002199612909538996
  (1, 16)	0.0013616264934660685
  (1, 302)	0.002731591322448519
  (1, 295)	1.569628334444803e-07
  (1, 72)	0.003503473616293597
  (1, 62)	0.0002785655404699201
  (1, 61)	0.0003706377876061823
  (1, 13)	0.0017013727587772243
  (1, 2)	0.0003940870077534431
  (1, 337)	0.009129177844462957
  (1, 122)	2.7359504808243976e-07
  (1, 81)	0.0033021130167198097
  (1, 322)	0.0004489053897689555
  (1, 244)	0.00014480295320994874
  (1, 241)	0.00018516579327811664
  (1, 143)	0.001066094081889249
  (1, 136)	0.0006415852794069728
  (1, 134)	0.0005920906089437864
  (1, 128)	0.0006909158887276665
  (1, 114)	0.00039838415886101223
  (1, 12)	0.01303422171206371
  (1, 368)	0.041500481141509914
  :	:
  (373, 19)	0.00018385495483283889
  (373, 17)	0.00019399878766881995
  (373, 369)	0.052713409549511515
  (373, 350)	0.0030730793458416854
  (373, 243)	0.0021799711448686125
  (3

In [19]:
### Top 5 similar items to the item, keyed by item index
item_item_similar = top_n_idx_sparse(item_sparse, 5)
item_item_similar_dict = {
    item_idx: neighbours.tolist()
    for item_idx, neighbours in enumerate(item_item_similar)
}
item_item_similar_dict

{0: [0],
 1: [373, 15, 341, 14, 1],
 2: [295, 254, 229, 255, 2],
 3: [260, 258, 257, 26, 3],
 4: [363, 260, 19, 4, 26],
 5: [26, 29, 266, 262, 5],
 6: [5, 12, 268, 269, 6],
 7: [296, 298, 8, 7, 297],
 8: [296, 35, 297, 7, 8],
 9: [310, 311, 18, 309, 9],
 10: [10],
 11: [173, 367, 18, 11, 308],
 12: [315, 17, 251, 301, 12],
 13: [302, 355, 368, 337, 13],
 14: [224, 369, 1, 345, 14],
 15: [350, 1, 20, 351, 15],
 16: [18, 308, 311, 309, 16],
 17: [12, 19, 301, 250, 17],
 18: [311, 16, 9, 11, 18],
 19: [17, 239, 260, 19, 4],
 20: [15, 351, 340, 350, 20],
 21: [251, 301, 30, 21, 353],
 22: [72, 24, 208, 23, 22],
 23: [86, 25, 24, 22, 23],
 24: [72, 22, 23, 25, 24],
 25: [88, 23, 24, 25, 72],
 26: [259, 29, 4, 3, 26],
 27: [21, 353, 301, 27, 30],
 28: [27, 254, 228, 28, 226],
 29: [259, 262, 266, 265, 29],
 30: [256, 21, 27, 353, 30],
 31: [56, 32, 89, 44, 31],
 32: [44, 90, 50, 207, 32],
 33: [110, 39, 34, 42, 33],
 34: [148, 49, 92, 33, 34],
 35: [36, 113, 107, 47, 35],
 36: [47, 125, 210,

In [20]:
def recommend_product(q): ## q is Visitor index number
    """Print and return product-name recommendations for the visitor at index `q`.

    Strategy:
      1. Unknown visitor (index outside ``0 .. V-1``): fall back to the 10 most
         viewed items.
      2. Known visitor: collect the items liked by the visitors listed in
         ``visitor_visitor_similar_dict[q]`` (skipping `q` itself) and recommend
         those `q` has not liked yet.
      3. If step 2 yields nothing: for every item `q` liked, recommend its
         neighbours from ``item_item_similar_dict`` that `q` has not liked yet.

    Parameters
    ----------
    q : int
        Visitor index (row of the visitor x item matrix).

    Returns
    -------
    list of str
        Recommended product names without repeats. A list is returned on every
        path, including the unknown-visitor fallback.
    """
    if not 0 <= q < V:
        _, top_items_name = get_top_items(10, norm_df)
        print("Recommend top selling products : \n{}".format(top_items_name))
        # Return the fallback so callers can always iterate over the result.
        return top_items_name

    q_likes = list(liked_items(q, norm_df))
    print("Items liked by visitor {} are : \n {}".format(q, q_likes))

    # Union (in first-seen order) of the items liked by the similar visitors
    other_likes = []
    for neighbour in visitor_visitor_similar_dict[q]:
        if neighbour == q:
            # the similarity list can contain the visitor itself
            continue
        for name in liked_items(neighbour, norm_df):
            if name not in other_likes:
                other_likes.append(name)
    print("\n")
    print("Items liked by similar visitors \n{}".format(other_likes))

    recom = [name for name in other_likes if name not in q_likes]

    if len(recom) == 0:
        # Similar visitors brought nothing new: fall back to item-item similarity.
        for q_like in q_likes:
            item_indx = get_item_index(q_like)
            for ele in item_item_similar_dict[item_indx]:
                candidate = get_itemName(ele)
                # skip items already liked and items already recommended
                if candidate not in q_likes and candidate not in recom:
                    recom.append(candidate)

        print("\n")
        print("New Product recommendation for you \n{}".format(recom))
    else:
        print("\n")
        print("Product recommendation for you \n{}".format(recom))
        print("\n")
    return recom

In [21]:
# Example: recommendations for the visitor at index 585
recommendations = recommend_product(585)

Items liked by visitor 585 are : 
 ['Google Luggage Tag', 'Waterpoof Gear Bag']


Items liked by similar visitors 
['Google Luggage Tag', 'Blue Metallic Textured Spiral Notebook Set']


Product recommendation for you 
['Blue Metallic Textured Spiral Notebook Set']




In [22]:
### Check whether every item recommended by the cosine similarity method also appears among the top selling items
check = not any(item not in top_items_name for item in recommendations)

if check:
    print("The list {} contains all elements of the list {}".format(top_items_name, recommendations))
else:
    print("No, top_items_name doesn't have all elements of the recommendations.")

No, top_items_name doesn't have all elements of the recommendations.


In [153]:
# ## Save the similarities_sparse matrics
# # save_npz(os.path.join(dir_path, 'data', 'user_user_similarity'), similarities_sparse)

# visitor_visitor_similar = np.load((os.path.join(dir_path, 'data','user_user_similarity.npz')))
# print(user_user_similar)