In [39]:
import gzip
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
from operator import itemgetter
from scipy.spatial import distance
from sklearn.metrics import mean_absolute_error
from collections import defaultdict
from itertools import product, combinations
from operator import itemgetter
import random

In [40]:
# Helper functions for reading the data. The data files must be located in a `data` folder inside the
# directory you are working in.
# Reference used for k-NN: https://stackabuse.com/k-nearest-neighbors-algorithm-in-python-and-scikit-learn/


def parse(path):
    """Yield one record per line of a gzip-compressed file.

    Parameters
    ----------
    path : str
        Location of the ``.gz`` file. Every line must be a Python literal
        expression (the review files store one dict literal per line).

    Yields
    ------
    object
        The evaluated line (a dict for the review files).
    """
    # SECURITY: eval() executes whatever expression a line contains, so only
    # feed this function files from a trusted source. If the records are plain
    # literals, ast.literal_eval is the safe drop-in replacement.
    # The context manager closes the file even when the consumer stops
    # iterating early (the generator is closed on garbage collection).
    with gzip.open(path, 'rb') as g:
        for l in g:
            yield eval(l)

def getData(name, attributes):
    """Read ``data/reviews_<name>_5.json.gz`` and keep only the requested attributes.

    Parameters
    ----------
    name : str
        Dataset name that is substituted into the file path.
    attributes : container of str
        Attribute names to keep for every record.

    Returns
    -------
    dict
        Maps a running integer (0, 1, 2, ...) to the filtered record dict.
    """
    path = 'data/reviews_%s_5.json.gz' % name
    records = {}
    for index, review in enumerate(parse(path)):
        # Keep the record's own key order, dropping everything not asked for.
        records[index] = {key: value for key, value in review.items() if key in attributes}
    return records

In [3]:
# The three dataset names that can be passed to getData().
names = ['Digital_Music', 'Kindle_Store', 'Video_Games']

# All attributes a datapoint can have. Many of them are not useful here, so when reading a data file you
# specify which attributes to keep. For our purposes only reviewerID, asin (product ID) and overall are
# needed; the full list is kept for reference.
attributes = ['reviewerID', 'asin', 'reviewerName', 'helpful', 'reviewText', 'overall', 'summary', 
              'unixReviewTime', 'reviewTime']

In [4]:
# Example: load the Digital_Music reviews, keeping only the reviewer ID, product ID and rating.
data = getData('Digital_Music', ['reviewerID', 'asin', 'overall'])

In [5]:
# Preview of the loaded data: a dictionary with integer keys (0, 1, 2, ...) whose values are datapoints.
# Each datapoint is itself a dictionary mapping an attribute name to that attribute's value.
for i in range(5):
    print(data[i])

{'reviewerID': 'A3EBHHCZO6V2A4', 'asin': '5555991584', 'overall': 5.0}
{'reviewerID': 'AZPWAXJG9OJXV', 'asin': '5555991584', 'overall': 5.0}
{'reviewerID': 'A38IRL0X2T4DPF', 'asin': '5555991584', 'overall': 5.0}
{'reviewerID': 'A22IK3I6U76GX0', 'asin': '5555991584', 'overall': 5.0}
{'reviewerID': 'A1AISPOIIHTHXX', 'asin': '5555991584', 'overall': 4.0}


In [5]:
def split_data(data, test_ratio=0.1, seed=None):
    """Split the reviews into train and test sets by reviewer.

    All reviews of one reviewer end up in the same set, so no reviewer appears
    in both the train and the test data.

    Parameters
    ----------
    data : dict
        Maps an index to a review dict that has a ``'reviewerID'`` entry.
    test_ratio : float
        Fraction of the distinct reviewers assigned to the test set
        (rounded down).
    seed : optional
        When given, the reviewer sample is drawn from a dedicated
        ``random.Random(seed)`` so the split is reproducible. When ``None``
        the module-level ``random`` state is used, as before.

    Returns
    -------
    (dict, dict)
        ``(train_data, test_data)``, each a subset of ``data`` with the
        original keys.
    """
    reviewers = {review['reviewerID'] for review in data.values()}

    # random.sample() no longer accepts a set (deprecated in Python 3.9,
    # TypeError from 3.11). Sorting also fixes the population order, which for
    # a set of strings would otherwise vary between interpreter runs.
    population = sorted(reviewers, key=str)
    rng = random.Random(seed) if seed is not None else random
    test_reviewers = set(rng.sample(population, int(len(population) * test_ratio)))
    train_reviewers = reviewers - test_reviewers

    train_data = {k: v for k, v in data.items() if v['reviewerID'] in train_reviewers}
    test_data = {k: v for k, v in data.items() if v['reviewerID'] in test_reviewers}

    tr = len(train_data)
    te = len(test_data)
    print('There are %s train reviews' %tr)
    print('There are %s test reviews' %te)

    return train_data, test_data

In [6]:
# Split the loaded reviews by reviewer, using the default test_ratio of split_data().
train_data, test_data = split_data(data)

There are 58328 train reviews
There are 6378 test reviews


In [7]:
# Turn the {index: review} dictionaries into DataFrames: one row per review, one column per attribute.
ds_train = pd.DataFrame.from_dict(train_data, orient='index')
ds_test = pd.DataFrame.from_dict(test_data, orient='index')

# Report the shape of the training frame and show its first rows.
print('Number of colums in training Dataframe : ', len(ds_train.columns))
print('Number of rows in training Dataframe : ', len(ds_train.index))
ds_train.head()

Number of colums in training Dataframe :  3
Number of rows in training Dataframe :  58328


Unnamed: 0,reviewerID,asin,overall
0,A3EBHHCZO6V2A4,5555991584,5.0
1,AZPWAXJG9OJXV,5555991584,5.0
2,A38IRL0X2T4DPF,5555991584,5.0
3,A22IK3I6U76GX0,5555991584,5.0
4,A1AISPOIIHTHXX,5555991584,4.0


In [8]:
# Plain arrays of the review rows; the cells below unpack each row as (reviewerID, asin, overall).
X_train = ds_train.iloc[:,:].values
X_test = ds_test.iloc[:,:].values

# Show the first six rows of each array.
print(X_train[:6])
print(X_test[:6])

[['A3EBHHCZO6V2A4' '5555991584' 5.0]
 ['AZPWAXJG9OJXV' '5555991584' 5.0]
 ['A38IRL0X2T4DPF' '5555991584' 5.0]
 ['A22IK3I6U76GX0' '5555991584' 5.0]
 ['A1AISPOIIHTHXX' '5555991584' 4.0]
 ['A2P49WD75WHAG5' '5555991584' 5.0]]
[['A14BTJRH9VNLJJ' '5555991584' 5.0]
 ['A3CCYAQRHUTPIQ' '5555991584' 5.0]
 ['A3UBAZKS727Z0E' '5555991584' 5.0]
 ['A1L8IB1K8ETJW9' 'B0000000ZW' 3.0]
 ['A3BLMCU4V5J2DX' 'B0000000ZW' 5.0]
 ['AEFRJ6ZBKGZV2' 'B00000016T' 5.0]]


In [9]:
def cosine_similarity(p,q):
    """Cosine of the angle between two equally long rating vectors.

    Parameters
    ----------
    p, q : sequence of numbers
        Paired values; extra elements of the longer sequence are ignored by
        the dot product because of ``zip``.

    Returns
    -------
    float or int
        ``dot(p, q) / (|p| * |q|)``. Returns 0 when either vector has zero
        magnitude (including empty input) instead of dividing by zero, which
        matches how ``pearson_correlation`` treats its degenerate case.
    """
    d = sum(pi * qi for pi, qi in zip(p, q))
    mag_p = math.sqrt(sum(pi**2 for pi in p))
    mag_q = math.sqrt(sum(qi**2 for qi in q))
    norm = mag_p * mag_q
    if norm == 0:
        # The angle is undefined for a zero vector; treat it as "no similarity".
        return 0
    return d / norm

def euclidean_similarity(p, q):
    """Similarity derived from the Euclidean distance: ``1 / (1 + distance)``.

    Identical vectors score 1; the score shrinks towards 0 as the vectors
    move apart. Elements are paired with ``zip``.
    """
    squared_gaps = [(a - b) ** 2 for a, b in zip(p, q)]
    gap = math.sqrt(sum(squared_gaps))
    return 1 / (1 + gap)

def pearson_correlation(p,q):
    """Pearson correlation coefficient of two equally long rating vectors.

    Parameters
    ----------
    p, q : sequence of numbers

    Returns
    -------
    float or int
        The correlation coefficient. Returns 0 whenever the coefficient is
        undefined: empty input, or a vector without variance. This holds for
        both the pure-Python path and the SciPy path.
    """
    if len(p) == 0 or len(q) == 0:
        return 0

    # The pure-Python code below does not scale well to long vectors, so those
    # are delegated to scipy.spatial.distance.correlation().
    if len(p) > 99:
        # SciPy divides by the product of the standard deviations; a constant
        # vector makes that 0 and the result nan. Short-circuit so both paths
        # agree on returning 0 for that case.
        if min(p) == max(p) or min(q) == max(q):
            return 0
        r = 1 - distance.correlation(p,q)
        return 0 if math.isnan(r) else r

    p_mean = sum(p) / len(p)
    p_deviations = [(pi - p_mean) for pi in p]

    q_mean = sum(q) / len(q)
    q_deviations = [(qi - q_mean) for qi in q]

    # Loop variables are named p_dev/q_dev rather than `pd` so they do not
    # read like the pandas alias used elsewhere in the notebook.
    cov = sum(p_dev * q_dev for p_dev, q_dev in zip(p_deviations, q_deviations))

    sds_product = math.sqrt(sum(p_dev**2 for p_dev in p_deviations)
                            * sum(q_dev**2 for q_dev in q_deviations))

    if sds_product != 0:
        r = cov / sds_product
    else:
        r = 0
    return r

In [10]:
def calc_similarity(user2product, target, other_user, sim_measure, threshold=0):
    """Weighted similarity between ``target`` and ``other_user`` over their shared keys.

    Background: https://towardsdatascience.com/introduction-to-recommender-systems-6c66cf15ada

    Parameters
    ----------
    user2product : dict
        Maps an id to ``{rated key: rating}``.
    target, other_user :
        The two ids to compare (looked up in ``user2product``).
    sim_measure : callable
        ``sim_measure(target_ratings, other_ratings)`` returning a number.
    threshold : int
        The similarity is 0 unless more than ``threshold`` keys are shared.

    Returns
    -------
    number
        ``sim_measure(...)`` multiplied by the fraction of the target's
        ratings that are shared with the other id, or 0 when too few keys are
        shared.
    """
    target_ratings_by_key = user2product[target]
    other_ratings_by_key = user2product[other_user]

    # Shared keys in the target's own order, so the result is deterministic.
    shared = [key for key in target_ratings_by_key if key in other_ratings_by_key]

    if not shared or len(shared) <= threshold:
        return 0

    # Both rating lists are built from the same key sequence so that position i
    # refers to the same key in both. Iterating each dict separately pairs
    # ratings of different keys whenever the two dicts were filled in a
    # different order.
    target_ratings = [target_ratings_by_key[key] for key in shared]
    other_user_ratings = [other_ratings_by_key[key] for key in shared]

    weight = len(shared)/len(target_ratings_by_key)
    similarity = weight*sim_measure(target_ratings, other_user_ratings)

    return similarity

In [11]:
# Nested lookup reviewerID -> {asin: rating} built from the training rows.
user2item = defaultdict(dict)
for reviewerID, asin, overall in X_train:
    user2item[reviewerID][asin] = overall

# The same structure for the test rows.
user2item_test = defaultdict(dict)
for reviewerID, asin, overall in X_test:
    user2item_test[reviewerID][asin] = overall


In [18]:
# Sanity check: number of distinct training reviewers, and how many items the first stored reviewer rated.
print("no. of users:", len(user2item))
print("no of reviews in user 1:", len(list(user2item.items())[0][1]))

no. of users: 4987
no of reviews in user 1: 16


In [12]:
# Inverted lookup asin -> {reviewerID: rating} built from the training rows.
item2user = defaultdict(dict)
for reviewerID, asin, overall in X_train:
    item2user[asin][reviewerID] = overall
    
print("no. of items:", len(item2user))

# The same inverted structure for the test rows.
item2user_test = defaultdict(dict)
for reviewerID, asin, overall in X_test:
    item2user_test[asin][reviewerID] = overall

print("no. of items:", len(item2user_test))

no. of items: 3568
no. of items: 2401


In [20]:
# Pairwise similarities between the first 40 training users, once per similarity measure
# (NB: this can take a minute).
measure2function = {"euclidean" : euclidean_similarity, "cosine": cosine_similarity, "pearson": pearson_correlation}

# The user sample is the same for every measure, so it is taken once.
sample_users = list(user2item.keys())[:40]

similarities = {}
for measure, function in measure2function.items():
    per_user = {a: dict() for a in sample_users}
    # Each unordered pair is visited once and stored under its first user only.
    for id1, id2 in combinations(sample_users, 2):
        per_user[id1][id2] = calc_similarity(user2item, id1, id2, function)
    similarities[measure] = per_user
        
#print(similarities["cosine"].items())

In [21]:
def rankids(useritemdict, thresh):
    """Rank the items rated by a set of neighbours, split by mean star rating.

    Parameters
    ----------
    useritemdict : dict
        Maps a ``(user id, similarity)`` tuple to that user's
        ``{item id: stars}`` dictionary.
    thresh : number
        Items whose plain mean star rating is above this value go to the first
        list, all others to the second.

    Returns
    -------
    (list, list)
        Two lists of ``(item id, score)`` tuples, each ordered from highest to
        lowest score, where score is the mean of ``stars * similarity`` over
        the neighbours that rated the item.
    """
    ratings_per_item = {}
    for neighbour, item_ratings in useritemdict.items():
        for item_id, stars in item_ratings.items():
            # neighbour[1] is the similarity of this neighbour to the target.
            ratings_per_item.setdefault(item_id, []).append((stars, neighbour[1]))

    above, below = [], []
    for item_id, scored in ratings_per_item.items():
        mean_stars = np.mean([stars for stars, _ in scored])
        weighted = np.mean([stars * similarity for stars, similarity in scored])
        bucket = above if mean_stars > thresh else below
        bucket.append((item_id, weighted))

    # Ascending stable sort followed by a reversal: among equal scores the item
    # that was inserted last comes first.
    by_score = itemgetter(1)
    above = sorted(above, key=by_score)[::-1]
    below = sorted(below, key=by_score)[::-1]
    return (above, below)

In [26]:
def recommend(userid, sim_measure, user2item, k, n, like_threshold=2.49):
    """Recommend up to ``n`` items based on the ``k`` users most similar to ``userid``.

    Parameters
    ----------
    userid :
        The user to recommend for.
    sim_measure : callable
        Similarity function handed to ``calc_similarity``.
    user2item : dict
        Maps a user id to ``{item id: rating}``.
    k : int
        Number of most similar users whose ratings are considered.
    n : int
        Maximum number of recommendations returned.
    like_threshold : number
        Mean star rating above which an item is ranked in the preferred
        group (passed to ``rankids``).

    Returns
    -------
    list
        At most ``n`` ``(item id, score)`` tuples: first the items whose mean
        rating among the neighbours is above ``like_threshold`` (best score
        first), then the remaining items.

    NOTE(review): items the user has already rated are not filtered out here.
    """
    # Build the candidate list before any similarity is computed:
    # calc_similarity indexes user2item[userid], which inserts a key when
    # user2item is a defaultdict and userid is unknown, and that must not
    # happen while the dictionary is being iterated.
    reviewers = [user for user in user2item.keys() if user != userid]
    similarities = [(other_user, calc_similarity(user2item, userid, other_user, sim_measure))
                    for other_user in reviewers]
    k_similarities = sorted(similarities, key = lambda x: x[1], reverse=True)[:k]

    # Key: (neighbour id, similarity); value: that neighbour's ratings.
    neighbour_ratings = {user_tup: user2item[user_tup[0]] for user_tup in k_similarities}

    toplist, botlist = rankids(neighbour_ratings, like_threshold)
    return (toplist + botlist)[:n]

In [32]:
# Example call (k=50 neighbours, top n=10 items):
# recommend('AZPWAXJG9OJXV', cosine_similarity, user2item, 50, 10)

In [None]:
class kNN(object):
    """k-nearest-neighbour collaborative filter over a nested rating dictionary.

    ``x2y`` maps an outer id to ``{inner id: rating}``; with ``item2user`` the
    outer ids are items and the inner ids are users. Similarities and
    neighbourhoods are computed between outer ids, and ratings are predicted
    for an inner id (``an_id``) on outer ids.
    """

    def __init__(self, x2y: dict, sim_measure, name=None, k = 10):
        """
        Parameters
        ----------
        x2y : dict
            Rating dictionary, for example the item2user matrix.
        sim_measure : callable
            Similarity function, for example cosine similarity.
        name : str, optional
            When given, previously saved similarities and neighbourhoods are
            loaded from ``data/<name>_*.pkl``.
        k : int
            Neighbourhood size.
        """
        self.x2y = x2y
        self.k = k
        self.sim_measure = sim_measure
        self.similarities = None   # filled by find_all_similarities / load_state
        self.neighborhood = None   # filled by get_neighborhoods / load_state
        # Spelled `quite` (meaning "quiet"); the name is kept because it is a
        # public attribute. When True the progress messages are suppressed.
        self.quite = False
        self.target_vector = None
        if name:
            self.load_state(name)

    def find_id_similarities(self, an_id, limit=40):
        """ For user-to-user part: the k ids most similar to ``an_id``.

        Only the first ``limit`` keys of ``x2y`` are considered (``None`` for
        all); ``an_id`` itself is skipped so it is never its own neighbour.
        Returns a list of ``(id, similarity)`` tuples, best first.
        """
        candidates = [other for other in list(self.x2y.keys())[:limit] if other != an_id]
        sims = {other: calc_similarity(self.x2y, an_id, other, self.sim_measure)
                for other in candidates}
        return sorted(sims.items(), key=itemgetter(1), reverse=True)[:self.k]

    def find_all_similarities(self, reset):
        """ For item-to-item part: fill ``self.similarities`` for every pair of keys.

        Nothing is recomputed when ``reset`` is false and similarities are
        already available (computed earlier or loaded from disk).
        """
        if not reset and self.similarities is not None:
            return
        if not self.quite: print("Making a similarity matrix...")
        keys = list(self.x2y.keys())
        # Every key gets an entry, so later lookups never miss.
        sims = {key: dict() for key in keys}
        for id1, id2 in combinations(keys, 2):
            # calc_similarity weights by the share of the *first* id's ratings,
            # so it is not symmetric: both directions are stored. Storing only
            # id1 -> id2 leaves later keys with truncated neighbourhoods.
            sims[id1][id2] = calc_similarity(self.x2y, id1, id2, self.sim_measure)
            sims[id2][id1] = calc_similarity(self.x2y, id2, id1, self.sim_measure)
        self.similarities = sims

    def get_neighborhoods(self, reset):
        """ For every x, keep its k most similar other ids as ``{id: similarity}``.

        Nothing is recomputed when ``reset`` is false and neighbourhoods are
        already available.
        """
        if not reset and self.neighborhood is not None:
            return
        if not self.quite: print(f"Setting up the {self.k} neighbourhoods...")
        self.neighborhood = dict()
        for x in self.similarities.keys():
            self.neighborhood[x] = dict(sorted(self.similarities[x].items(),
                                               key = itemgetter(1),
                                               reverse = True)[:self.k])

    def built_target(self, an_id):
        """ Makes a target vector from everything the user didn't rate.

        The check is made against the ratings in ``x2y``; the neighbourhood
        dictionaries hold neighbour ids, never ``an_id``, so testing them
        would select every key.
        """
        self.target_vector = [ID for ID in self.neighborhood
                              if an_id not in self.x2y.get(ID, {})]

    def getPredictionsForItems(self, an_id, x):
        """ Predict the rating of ``an_id`` for ``x`` from the neighbourhood of x.

        Returns the similarity-weighted mean of the ratings ``an_id`` gave to
        the neighbours of ``x``, rounded to two decimals, or 0 when no
        neighbour was rated, ``x`` has no neighbourhood, or the similarities
        sum to zero.
        """
        weighted_scores = list()
        similarities = list()
        # .get() avoids a KeyError for unknown ids and avoids inserting empty
        # entries when x2y is a defaultdict.
        for neighbour, sim in self.neighborhood.get(x, {}).items():
            ratings = self.x2y.get(neighbour, {})
            if an_id in ratings:
                weighted_scores.append(sim * ratings[an_id])
                similarities.append(sim)
        total_similarity = sum(similarities)
        if not total_similarity:
            return 0
        return float(f"{sum(weighted_scores) / total_similarity:.2f}")

    def recommend_item(self, an_id, n = 10, test=False, reset = False):
        """ Gets the similarities, neighborhoods and the target vector.
        Then gets predictions according to the vector. Finally recommends the top
        predictions.

        With ``test=True`` the caller-provided ``self.target_vector`` is used
        as is. ``n=0`` (or ``None``) returns all predictions. The result maps
        an id to its predicted rating, highest prediction first.
        """
        self.find_all_similarities(reset)
        self.get_neighborhoods(reset)
        if not test: self.built_target(an_id)
        if not self.quite: print("Recommending...")
        recommendations = dict()
        for x in self.target_vector:
            recommendations[x] = self.getPredictionsForItems(an_id, x)
        srt = sorted(recommendations.items(), key=itemgetter(1), reverse=True)
        if n:
            return dict(srt[:n])
        return dict(srt)

    def save_state(self, name):
        """ To save the state to prevent recomputing each time"""
        # The file name spelling ("neighboorhood") is kept so that existing
        # saved states remain loadable.
        with open(f'data/{name}_similarities.pkl', 'wb') as file_stream:
            pickle.dump(self.similarities, file_stream)
        with open(f'data/{name}_neighboorhood.pkl', 'wb') as file_stream2:
            pickle.dump(self.neighborhood, file_stream2)

    def load_state(self, name):
        """ To load a previously saved state instead of recomputing it"""
        # SECURITY: pickle.load can execute arbitrary code; only load files
        # that were written by save_state on a trusted machine.
        with open(f'data/{name}_similarities.pkl', 'rb') as file_stream:
            self.similarities = pickle.load(file_stream)
        with open(f'data/{name}_neighboorhood.pkl', 'rb') as file_stream2:
            self.neighborhood = pickle.load(file_stream2)

    def evaluate(self, x2y, ratio = 0.8):
        """ Mean of the per-key mean squared errors on held-out ratings.

        For every key of ``x2y`` the first ``ratio`` share of its ratings is
        written into ``self.x2y`` and the rest is held out and predicted.
        Keys without held-out ratings are skipped. Returns ``nan`` when no
        key could be evaluated.

        NOTE(review): this overwrites/inserts entries of ``self.x2y`` and
        recomputes all similarities. If ``x2y`` is keyed differently from
        ``self.x2y`` (e.g. users vs. items) both kinds of key end up in the
        same similarity matrix; confirm that this is intended.
        """
        # Snapshot first: x2y may be the very dictionary that is modified below.
        x2y_list = [(key, list(group.items())) for key, group in x2y.items()]
        train_lengths = list()
        for key, group in x2y_list:
            # Split on the number of ratings of this key (the length of the
            # (key, group) tuple itself is always 2).
            train_lengths.append(int(len(group)*ratio))
            self.x2y[key] = dict(group[:train_lengths[-1]])

        self.find_all_similarities(True)
        self.get_neighborhoods(True)
        was_quiet = self.quite
        self.quite = True
        costs = 0
        evaluated = 0
        try:
            for m, (outer_key, group) in enumerate(x2y_list):
                self.target_vector = dict(group[train_lengths[m]:])
                recommendations = self.recommend_item(outer_key, test=True, n=0)
                cost = self.our_MSE(recommendations)
                if cost != -1:
                    costs += cost
                    evaluated += 1
        finally:
            self.quite = was_quiet
        if not evaluated:
            return float('nan')
        return costs/evaluated

    def our_MSE(self, recommendations):
        """ Mean squared error between ``self.target_vector`` (id -> true rating)
        and ``recommendations`` (id -> predicted rating); -1 when the target
        vector is empty."""
        if len(self.target_vector) == 0:
            return -1
        costs = sum((rating - recommendations[key])**2
                    for key, rating in self.target_vector.items())
        return costs/len(self.target_vector)

        


In [55]:
# Initialise an item-based model (item2user, Pearson correlation), compute all pairwise similarities and
# the k-neighbourhoods, then pickle both under the name 'item2user' so later cells can load them.
aKNN = kNN(item2user, pearson_correlation)
aKNN.find_all_similarities(reset=True)
aKNN.get_neighborhoods(reset=True)
aKNN.save_state('item2user')

Making a similarity matrix...
Setting up the 10 neighbourhoods...


In [61]:
# Load the state saved under 'item2user' instead of recomputing it, then recommend for one reviewer.
bKNN = kNN(item2user, pearson_correlation, name = 'item2user')
recs = bKNN.recommend_item('A22IK3I6U76GX0')
# The predictions can be all zeroes if this reviewer happens to be in the test set; in that case pick
# another reviewer ID.

# for ID, rate in recs.items():
#     print(f"Item {ID} is estimated to be rated: {rate}")
#print("Total evaluation:", aKNN.evaluate(user2item_test))
# NOTE(review): bKNN is built on item2user (keyed by item) but is evaluated with user2item_test
# (keyed by reviewer); confirm that mixing the two key types is intended.
bKNN.evaluate(user2item_test)

Making a similarity matrix...
Setting up the 10 neighbourhoods...
Recommending...
Making a similarity matrix...
Setting up the 10 neighbourhoods...
Given: 5.0
Predicted: 0
Given: 5.0
Predicted: 0
Given: 5.0
Predicted: 0
Given: 5.0
Predicted: 0
Given: 4.0
Predicted: 0
Given: 5.0
Predicted: 0
Given: 5.0
Predicted: 0
Given: 5.0
Predicted: 0
Given: 5.0
Predicted: 0
Given: 5.0
Predicted: 0
Given: 4.0
Predicted: 0
Given: 5.0
Predicted: 0
Given: 5.0
Predicted: 0
Given: 4.0
Predicted: 0
Given: 5.0
Predicted: 0
Given: 4.0
Predicted: 0
Given: 5.0
Predicted: 0
Given: 5.0
Predicted: 0
Given: 5.0
Predicted: 0
Given: 5.0
Predicted: 0
Given: 4.0
Predicted: 0
Given: 5.0
Predicted: 0
Given: 5.0
Predicted: 0
Given: 5.0
Predicted: 0
Given: 5.0
Predicted: 0
Given: 5.0
Predicted: 0
Given: 5.0
Predicted: 0
Given: 5.0
Predicted: 0
Given: 5.0
Predicted: 0
Given: 3.0
Predicted: 0
Given: 5.0
Predicted: 0
Given: 5.0
Predicted: 0
Given: 5.0
Predicted: 0
Given: 5.0
Predicted: 0
Given: 5.0
Predicted: 0
Given: 4.0
P

In [None]:
def recommend(userid, sim_measure, user2item, k, n, like_threshold=2.49):
    """Recommend up to ``n`` items based on the ``k`` users most similar to ``userid``.

    This cell re-defines the ``recommend`` function from an earlier cell; the
    two definitions are meant to stay identical.

    Parameters
    ----------
    userid :
        The user to recommend for.
    sim_measure : callable
        Similarity function handed to ``calc_similarity``.
    user2item : dict
        Maps a user id to ``{item id: rating}``.
    k : int
        Number of most similar users whose ratings are considered.
    n : int
        Maximum number of recommendations returned.
    like_threshold : number
        Mean star rating above which an item is ranked in the preferred
        group (passed to ``rankids``).

    Returns
    -------
    list
        At most ``n`` ``(item id, score)`` tuples: first the items whose mean
        rating among the neighbours is above ``like_threshold`` (best score
        first), then the remaining items.

    NOTE(review): items the user has already rated are not filtered out here.
    """
    # Build the candidate list before any similarity is computed:
    # calc_similarity indexes user2item[userid], which inserts a key when
    # user2item is a defaultdict and userid is unknown, and that must not
    # happen while the dictionary is being iterated.
    reviewers = [user for user in user2item.keys() if user != userid]
    similarities = [(other_user, calc_similarity(user2item, userid, other_user, sim_measure))
                    for other_user in reviewers]
    k_similarities = sorted(similarities, key = lambda x: x[1], reverse=True)[:k]

    # Key: (neighbour id, similarity); value: that neighbour's ratings.
    neighbour_ratings = {user_tup: user2item[user_tup[0]] for user_tup in k_similarities}

    toplist, botlist = rankids(neighbour_ratings, like_threshold)
    return (toplist + botlist)[:n]
    

def evaluate2(x2y,k,n,measure, testdata, ratio):
    """Mean hit rate of ``recommend`` over the users in ``testdata``.

    For every test user the recommendations are computed from ``x2y`` and the
    hit rate is the fraction of recommended items that the user actually
    rated in ``testdata``.

    Parameters
    ----------
    x2y : dict
        Training ratings, user id -> ``{item id: rating}``.
    k, n : int
        Neighbourhood size and number of recommendations, passed to
        ``recommend``.
    measure : callable
        Similarity measure, passed to ``recommend``.
    testdata : dict
        Test ratings, user id -> ``{item id: rating}``.
    ratio :
        Not used by this function; kept so existing calls keep working.

    Returns
    -------
    float
        Mean of the per-user hit rates; ``nan`` when ``testdata`` is empty.
    """
    accuracies = []
    for user, group in testdata.items():
        reckeys = recommend(user, measure, x2y, k, n)
        if not reckeys:
            # Nothing was recommended, so nothing can be a hit.
            accuracies.append(0.0)
            continue
        hits = sum(1 for item, score in reckeys if item in group)
        accuracies.append(hits / len(reckeys))
    if not accuracies:
        return float('nan')
    return np.mean(accuracies)
    
    
# Hit rate of the user-based recommender (k=10 neighbours, n=10 recommendations, cosine similarity)
# on the held-out test users.
print(evaluate2(user2item,10,10,cosine_similarity,user2item_test,0.9)) 

In [None]:
def optimiser(k_list, n_list, sim_measures, ratio, test_data):
    """Grid search: score every combination of k, n and similarity measure.

    Collects ``(label, score)`` tuples in ``scores``.

    NOTE(review): no ``return`` is visible in this part of the notebook, so as
    shown the collected scores never leave the function; the cell looks
    unfinished.
    """
    scores = []
    for k in k_list:
        for n in n_list:
            for measure in sim_measures:
                # NOTE(review): neither a module-level `evaluate` function nor a variable `x2y` is defined
                # in the visible notebook; the argument list matches evaluate2(x2y, k, n, measure,
                # testdata, ratio), so presumably that function and a training dict are meant. Confirm.
                score = evaluate(x2y, k, n, measure, test_data, ratio)
                # NOTE(review): str.join() takes a single iterable of strings; called with several
                # positional arguments (some of them not strings) it raises TypeError.
                string = ''.join('k=',k,', n=',n,', measure=',measure,', ratio=',ratio)
                scores.append((string, score))