In [32]:
import gzip
import math
import random
from collections import defaultdict
from itertools import product, combinations
from typing import Callable

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.spatial import distance
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# These functions read in the data. The data files must be in a `data` folder inside the directory you are
# working in.
# Reference used for k-NN: https://stackabuse.com/k-nearest-neighbors-algorithm-in-python-and-scikit-learn/


def parse(path):
    """Yield one evaluated record per line of a gzip-compressed file.

    Parameters
    ----------
    path : str
        Location of the gzip file to read.

    Yields
    ------
    object
        The result of evaluating each line as a Python expression. Callers in
        this notebook treat every record as a dict.
    """
    # The context manager closes the handle once the generator is exhausted
    # or closed, so the file is not left open.
    with gzip.open(path, 'rb') as g:
        for l in g:
            # SECURITY NOTE(review): eval() executes whatever expression the
            # line contains. Only point this at data files you trust; consider
            # ast.literal_eval or json.loads if the file format allows it.
            yield eval(l)

def getData(name, attributes):
    """Load one review dataset, keeping only the requested fields.

    Reads ``data/reviews_<name>_5.json.gz`` through ``parse`` and returns a
    dict keyed by consecutive integers starting at 0 (in file order). Each
    value is a dict holding only those keys of the record that appear in
    ``attributes``.
    """
    source = 'data/reviews_%s_5.json.gz' % name
    return {
        row_number: {field: record[field] for field in record.keys() if field in attributes}
        for row_number, record in enumerate(parse(source))
    }

In [3]:
# Dataset names that can be passed to getData; pick one of these three.
names = [
    'Digital_Music',
    'Kindle_Store',
    'Video_Games',
]

# Every attribute a datapoint can carry. Most are not useful here, so when reading a data file you specify
# which ones to keep. For our purposes only reviewerID, asin (product ID) and overall are needed; the full
# list is kept for reference.
attributes = [
    'reviewerID',
    'asin',
    'reviewerName',
    'helpful',
    'reviewText',
    'overall',
    'summary',
    'unixReviewTime',
    'reviewTime',
]

In [5]:
# Example: load the Digital Music reviews, keeping only reviewer, product and rating.
data = getData(
    name='Digital_Music',
    attributes=['reviewerID', 'asin', 'overall'],
)

In [6]:
# Preview the first five records. `data` is a dictionary with integer keys (0, 1, 2, ...) whose values are
# datapoints; each datapoint is itself a dictionary mapping an attribute name to that attribute's value.
for i in range(5):
    print(data[i])

{'reviewerID': 'A3EBHHCZO6V2A4', 'asin': '5555991584', 'overall': 5.0}
{'reviewerID': 'AZPWAXJG9OJXV', 'asin': '5555991584', 'overall': 5.0}
{'reviewerID': 'A38IRL0X2T4DPF', 'asin': '5555991584', 'overall': 5.0}
{'reviewerID': 'A22IK3I6U76GX0', 'asin': '5555991584', 'overall': 5.0}
{'reviewerID': 'A1AISPOIIHTHXX', 'asin': '5555991584', 'overall': 4.0}


In [30]:
def split_data(data, test_ratio=0.1, seed=None):
    """Split reviews into train and test sets by reviewer.

    All reviews of a given reviewer end up on the same side of the split, so
    the two sets never share a reviewer.

    Parameters
    ----------
    data : dict
        Maps a key to a review dict that contains a ``'reviewerID'`` entry.
    test_ratio : float
        Fraction of distinct reviewers (not reviews) assigned to the test set;
        the count is truncated with ``int``.
    seed : optional
        When given, a private ``random.Random(seed)`` draws the sample so the
        split is repeatable; when ``None`` the global ``random`` state is used.

    Returns
    -------
    tuple of (dict, dict)
        ``(train_data, test_data)``, each a subset of ``data`` with the
        original keys.
    """
    reviewers = {review['reviewerID'] for review in data.values()}

    # random.sample() needs a sequence (sets are rejected from Python 3.11 on).
    # Sorting also fixes the population order, so a seeded draw is repeatable.
    # Assumes reviewer IDs are mutually orderable (they are strings here).
    rng = random if seed is None else random.Random(seed)
    n_test = int(len(reviewers) * test_ratio)
    test_reviewers = set(rng.sample(sorted(reviewers), n_test))

    train_data = {}
    test_data = {}
    for key, review in data.items():
        bucket = test_data if review['reviewerID'] in test_reviewers else train_data
        bucket[key] = review

    print('There are %s train reviews' % len(train_data))
    print('There are %s test reviews' % len(test_data))

    return train_data, test_data

In [35]:
# Seed the global RNG first so the random draw inside split_data starts from a known state on every run.
random.seed(42)
train_data, test_data = split_data(data)

There are 58114 train reviews
There are 6592 test reviews


In [40]:
# One DataFrame per split: every review becomes a row, indexed by its key in the source dict.
ds_train = pd.DataFrame.from_dict(train_data, orient='index')
ds_test = pd.DataFrame.from_dict(test_data, orient='index')

# Summarise the training frame. (This cell previously referenced `ds`, a name no cell defines.)
print('Number of colums in Dataframe : ', len(ds_train.columns))
print('Number of rows in Dataframe : ', len(ds_train.index))
ds_train.head()

Number of colums in Dataframe :  3
Number of rows in Dataframe :  58114


Unnamed: 0,reviewerID,asin,overall
0,A3EBHHCZO6V2A4,5555991584,5.0
1,AZPWAXJG9OJXV,5555991584,5.0
2,A38IRL0X2T4DPF,5555991584,5.0
3,A22IK3I6U76GX0,5555991584,5.0
4,A1AISPOIIHTHXX,5555991584,4.0


In [43]:
# Pull the underlying value arrays out of both frames.
X_train, X_test = (frame.iloc[:, :].values for frame in (ds_train, ds_test))

# Show the first six rows of each split.
print(X_train[:6], X_test[:6], sep='\n')

[['A3EBHHCZO6V2A4' '5555991584' 5.0]
 ['AZPWAXJG9OJXV' '5555991584' 5.0]
 ['A38IRL0X2T4DPF' '5555991584' 5.0]
 ['A22IK3I6U76GX0' '5555991584' 5.0]
 ['A1AISPOIIHTHXX' '5555991584' 4.0]
 ['A2P49WD75WHAG5' '5555991584' 5.0]]
[['A3UBAZKS727Z0E' '5555991584' 5.0]
 ['A8MWQY1R5YE3M' 'B00000016T' 5.0]
 ['A38NQGQW63IQJJ' 'B00000016T' 5.0]
 ['A1LDAIGOQ6N4DW' 'B00000016T' 5.0]
 ['A5KJVGJ43ZDC9' 'B00000016T' 5.0]
 ['A22N03OBDDVSEB' 'B00000016T' 5.0]]


In [51]:
def cosine_similarity(p, q):
    """Cosine of the angle between two equally ordered numeric vectors.

    Parameters
    ----------
    p, q : iterable of numbers
        Paired element by element with ``zip``, so extra trailing elements of
        the longer one are ignored in the dot product.

    Returns
    -------
    number
        ``dot(p, q) / (|p| * |q|)``, or 0 when either magnitude is zero
        (including empty input), where the cosine is undefined.
    """
    dot = sum(pi * qi for pi, qi in zip(p, q))
    mag_p = math.sqrt(sum(pi ** 2 for pi in p))
    mag_q = math.sqrt(sum(qi ** 2 for qi in q))

    norm_product = mag_p * mag_q
    # Mirror pearson_correlation's convention: an undefined score counts as
    # "no similarity" instead of raising ZeroDivisionError.
    if norm_product == 0:
        return 0
    return dot / norm_product

def euclidean_similarity(p, q):
    """Map the Euclidean distance between ``p`` and ``q`` to a similarity.

    The distance is taken over element pairs produced by ``zip``; the result
    is ``1 / (1 + distance)``, so identical vectors score 1.0.
    """
    squared_gaps = [(a - b) ** 2 for a, b in zip(p, q)]
    return 1 / (1 + math.sqrt(sum(squared_gaps)))

def pearson_correlation(p, q):
    """Pearson correlation coefficient of two equally long numeric vectors.

    Parameters
    ----------
    p, q : sequence of numbers
        The paired observations.

    Returns
    -------
    number
        The correlation ``r``. Returns 0 when it is undefined: empty input,
        or a vector with zero variance.
    """
    if len(p) == 0:
        # No observations: treat as "no correlation" instead of dividing by zero.
        return 0

    # The pure-Python code below does not scale well to large datasets, so
    # long vectors are delegated to scipy.spatial.distance.correlation().
    if len(p) > 99:
        r = 1 - distance.correlation(p, q)
        # NOTE(review): the SciPy result may come back as NaN when a vector
        # has zero variance; fold that into the same 0 the short path returns.
        return 0 if math.isnan(r) else r

    p_mean = sum(p) / len(p)
    p_deviations = [(pi - p_mean) for pi in p]

    q_mean = sum(q) / len(q)
    q_deviations = [(qi - q_mean) for qi in q]

    # Loop names deliberately avoid `pd`, which is the pandas alias in this notebook.
    cov = sum(dp * dq for dp, dq in zip(p_deviations, q_deviations))

    sds_product = math.sqrt(sum(dp ** 2 for dp in p_deviations) * sum(dq ** 2 for dq in q_deviations))

    if sds_product != 0:
        r = cov / sds_product
    else:
        r = 0
    return r

In [117]:
def calc_similarity(user2product, target, other_user, sim_measure, threshold=0):
    """Weighted similarity between two users over the products both have rated.

    Background: https://towardsdatascience.com/introduction-to-recommender-systems-6c66cf15ada

    Parameters
    ----------
    user2product : mapping
        ``user2product[user][product]`` is that user's rating of the product.
    target, other_user : hashable
        Keys of ``user2product`` to compare.
    sim_measure : callable
        Called as ``sim_measure(target_ratings, other_user_ratings)`` with two
        equally long lists aligned by product.
    threshold : int
        The users must share strictly more than this many products, otherwise
        the similarity is 0.

    Returns
    -------
    number
        ``sim_measure(...)`` scaled by the fraction of the target's products
        that are shared. The weight depends on ``target`` only, so the result
        is not symmetric in the two users.
    """
    target_items = user2product[target]
    other_items = user2product[other_user]

    # Walk the shared products in one fixed order (the target's) and look both
    # ratings up by key, so position i of each vector refers to the same
    # product. Filtering each dict separately would pair ratings by dict
    # insertion order, which can differ between users.
    shared = [item for item in target_items if item in other_items]

    if len(shared) <= threshold:
        return 0

    target_ratings = [target_items[item] for item in shared]
    other_user_ratings = [other_items[item] for item in shared]

    weight = len(shared) / len(target_items)
    similarity = weight * sim_measure(target_ratings, other_user_ratings)

    return similarity

In [44]:
# Nested lookup reviewerID -> asin -> overall rating, built from the training rows.
# Each row of X_train is unpacked as (reviewerID, asin, overall); if the same (reviewer, product)
# pair occurs more than once, the later row overwrites the earlier rating.
user2product = defaultdict(dict)
for reviewerID, asin, overall in X_train:
    user2product[reviewerID][asin] = overall

In [46]:
# Quick sanity check: how many users there are, and how many ratings the first one has.
print("no. of users:", len(user2product))
print("no of reviews in user 1:", len(list(user2product.values())[0]))

no. of users: 4987
no of reviews in user 1: 91


In [47]:
# Inverse lookup of user2product: asin -> reviewerID -> overall rating, built from the same training rows.
# NOTE(review): despite its name, the inner dict is keyed by reviewer, not by product.
item2product = defaultdict(dict)
for reviewerID, asin, overall in X_train:
    item2product[asin][reviewerID] = overall
    
print("no. of items:", len(item2product))

no. of items: 3568


In [124]:
# Calculate the pairwise similarities for a sample of users (NB: this can take a minute)
measure2function = {"euclidean" : euclidean_similarity, "cosine": cosine_similarity, "pearson": pearson_correlation}

# Only the first N_SAMPLE_USERS users are compared, to keep the pure-Python double loop affordable.
N_SAMPLE_USERS = 40
sample_users = list(user2product.keys())[:N_SAMPLE_USERS]

# similarities[measure][id1][id2] holds the score with id1 as the target user. Each unordered pair is
# visited once, so only the (earlier, later) direction is stored.
similarities = dict()
for measure, function in measure2function.items():
    similarities[measure] = {user: dict() for user in sample_users}
    for id1, id2 in combinations(sample_users, 2):
        similarities[measure][id1][id2] = calc_similarity(user2product, id1, id2, function)

In [126]:
class kNN(object):
    """k-Nearest Neighbour lookup over a nested rating mapping.

    Parameters
    ----------
    x2y : dict
        Nested mapping, e.g. ``x2y[user][product] -> rating``.
    sim_measure : callable
        Takes two aligned rating lists and returns a similarity score.
    k : int
        Number of neighbours returned by ``find_similarities``.
    """

    def __init__(self, x2y: dict, sim_measure: Callable[[list, list], float], k = 10):
        self.x2y = x2y
        self.k = k
        self.sim_measure = sim_measure

    def find_similarities(self, an_id):
        """Return the ``k`` entries of ``x2y`` most similar to ``an_id``.

        Every key of ``self.x2y`` is scored against ``an_id`` with
        ``calc_similarity`` and ``self.sim_measure``. ``an_id`` itself is
        scored too, so it can appear in the result.

        Returns
        -------
        list of (id, similarity)
            Sorted by descending similarity and truncated to ``self.k`` items.
        """
        # Snapshot the keys first: if x2y is a defaultdict, looking up an
        # unknown an_id inside calc_similarity would insert it mid-iteration.
        candidates = list(self.x2y.keys())
        scores = {
            other_id: calc_similarity(self.x2y, an_id, other_id, self.sim_measure)
            for other_id in candidates
        }
        return sorted(scores.items(), key=lambda pair: pair[1], reverse=True)[:self.k]

In [122]:
def find_similarities(target: str, x2y: defaultdict, measure, top_n: int = 50) -> list:
    """Rank every key of ``x2y`` by its similarity to ``target``.

    Parameters
    ----------
    target : str
        Key of ``x2y`` to compare all others against.
    x2y : defaultdict
        Nested mapping, e.g. ``x2y[user][product] -> rating``.
    measure : callable
        Similarity function handed on to ``calc_similarity``.
    top_n : int
        Maximum number of results to return (default 50).

    Returns
    -------
    list of (id, similarity)
        Sorted by descending similarity. ``target`` itself is scored as well,
        so it can appear in the list.
    """
    # Iterate over the mapping that was passed in (not a notebook global), and
    # snapshot its keys so a defaultdict lookup cannot resize it mid-loop.
    scores = {
        other_id: calc_similarity(x2y, target, other_id, measure)
        for other_id in list(x2y.keys())
    }
    return sorted(scores.items(), key=lambda pair: pair[1], reverse=True)[:top_n]


# Example: the users most similar to one reviewer under cosine similarity.
find_similarities(
    target='A3EBHHCZO6V2A4',
    x2y=user2product,
    measure=cosine_similarity,
)

[('A3EBHHCZO6V2A4', 1.0),
 ('A3W4D8XOGLWUN5', 0.3898344204051092),
 ('A2KW2KWKABNYNO', 0.11413379669939978),
 ('A3E2FGR7OTA351', 0.07293721044789159),
 ('A1IIUCG9TJFUWS', 0.07257356561920973),
 ('A3NZQ0KTNYMVGL', 0.05532293677181463),
 ('A5HUEE1HB4LDF', 0.05405878626922568),
 ('ATO9DDJUGNWVD', 0.05343566363558953),
 ('A19DU0YV4K2MRY', 0.05238805435415342),
 ('AXTO23M29DQX0', 0.052085885694695855),
 ('A27YS4JDS7J3IC', 0.05166326946027823),
 ('A1NC5YN34N5VRX', 0.051033473626585724),
 ('A1BVI1R4GJMABC', 0.04639591260918303),
 ('A2VOI85H5S0OH6', 0.04395604395604396),
 ('ACY9QYNDFLVBI', 0.04368719712851072),
 ('A3E0962TERBW7F', 0.043376296235997876),
 ('A28IAZZI0SNRMW', 0.042378978884816045),
 ('A24N1BAS3CU27H', 0.04203625124338991),
 ('A280KHZO2L7GKA', 0.04203625124338991),
 ('AR65NXO4GIA9H', 0.041534557473541456),
 ('A2L7Z3FH4MSWYW', 0.041534557473541456),
 ('A1LVZI3QBCW9A0', 0.041534557473541456),
 ('ADN5YE0HOKE6O', 0.04123834724043112),
 ('A1QEWOSV05RYEO', 0.04119549418956575),
 ('A2NZ5