In [1]:
# These are the functions you need to read in the data. You need to have the datafiles in a data folder that is in
# the directory you are working in
#LINK IM USING FOR K-NN https://stackabuse.com/k-nearest-neighbors-algorithm-in-python-and-scikit-learn/
import gzip

def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield eval(l)

def getData(name, attributes):
    i = 0
    dic = {}
    path = 'data/reviews_%s_5.json.gz' % name
    for line in parse(path):
        filtered = {}
        for k in line.keys():
            if k in attributes:
                filtered[k] = line[k]
            else:
                continue
        
        dic[i] = filtered
        i += 1
    return dic

In [2]:
# These are one of the three names you can choose to read the dataset from
names = ['Digital_Music', 'Kindle_Store', 'Video_Games']

# These are all the possible attributes that our datapoints can have. Lots of them are not that useful for us. So during
# the reading of the datafile you have to specify which of these attributes you want to include. For our purposes,
# only reviewerID, asin (product ID) and overall will be helpful. But I included them all, just in case.
attributes = ['reviewerID', 'asin', 'reviewerName', 'helpful', 'reviewText', 'overall', 'summary', 
              'unixReviewTime', 'reviewTime']

In [9]:
# You read in the data for example like this:
data = getData('Digital_Music', ['reviewerID', 'asin', 'reviewerName', 'helpful', 'reviewText', 'overall', 'summary', 
              'unixReviewTime', 'reviewTime'])

In [4]:
# This is what our data will look like. You have a dictionary with integers as keys [0,1,2,3,4...], and as values one 
# datapoint. Each datapoint in itself is a dictionary, with as keys the attribute, and as value the value of that attribute.
for i in range(5):
    print(data[i])

{'reviewerID': 'A3EBHHCZO6V2A4', 'asin': '5555991584', 'reviewerName': 'Amaranth "music fan"', 'helpful': [3, 3], 'reviewText': 'It\'s hard to believe "Memory of Trees" came out 11 years ago;it has held up well over the passage of time.It\'s Enya\'s last great album before the New Age/pop of "Amarantine" and "Day without rain." Back in 1995,Enya still had her creative spark,her own voice.I agree with the reviewer who said that this is her saddest album;it is melancholy,bittersweet,from the opening title song."Memory of Trees" is elegaic&majestic.;"Pax Deorum" sounds like it is from a Requiem Mass,it is a dark threnody.Unlike the reviewer who said that this has a "disconcerting" blend of spirituality&sensuality;,I don\'t find it disconcerting at all."Anywhere is" is a hopeful song,looking to possibilities."Hope has a place" is about love,but it is up to the listener to decide if it is romantic,platonic,etc.I\'ve always had a soft spot for this song."On my way home" is a triumphant endin

In [13]:
data[]

64706

In [40]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import math
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from collections import defaultdict

ds = pd.DataFrame.from_dict(data, orient='index')

print('Number of colums in Dataframe : ', len(ds.columns))
print('Number of rows in Dataframe : ', len(ds.index))
ds.head()

Number of colums in Dataframe :  9
Number of rows in Dataframe :  64706


Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A3EBHHCZO6V2A4,5555991584,"Amaranth ""music fan""","[3, 3]","It's hard to believe ""Memory of Trees"" came ou...",5.0,Enya's last great album,1158019200,"09 12, 2006"
1,AZPWAXJG9OJXV,5555991584,bethtexas,"[0, 0]","A clasically-styled and introverted album, Mem...",5.0,Enya at her most elegant,991526400,"06 3, 2001"
2,A38IRL0X2T4DPF,5555991584,bob turnley,"[2, 2]",I never thought Enya would reach the sublime h...,5.0,The best so far,1058140800,"07 14, 2003"
3,A22IK3I6U76GX0,5555991584,Calle,"[1, 1]",This is the third review of an irish album I w...,5.0,Ireland produces good music.,957312000,"05 3, 2000"
4,A1AISPOIIHTHXX,5555991584,"Cloud ""...""","[1, 1]","Enya, despite being a successful recording art...",4.0,4.5; music to dream to,1200528000,"01 17, 2008"


In [13]:
X = ds.iloc[:,:].values

#print(X[:6])
X_train, X_test = train_test_split(X, test_size=0.20)
#print(X_test[:6])

In [37]:
def cosine_similarity(p,q):
    d = sum(pi * qi for pi,qi in zip(p, q))
    mag_p = math.sqrt(sum([pi**2 for pi in p]))
    mag_q = math.sqrt(sum([qi**2 for qi in q]))
    sim = d / ( mag_p * mag_q)
    return sim

def calculate_similarity(ratings, id1, id2, measure, threshold = 0):
    # get the list of shared rated items
    shared = sorted(set(ratings[id1].keys()).intersection(set(ratings[id2].keys())))

    # ignore comparisons with too few overlapping ratings (default is 0)
    if len(shared) <= threshold:
        return 0
    
    sel_ratings = [np.asarray([v for (k,v) in ratings[i].items() if k in shared]) for i in [id1, id2]]
    
    # compute similarity
    sim = measure(*sel_ratings)
    return sim

def splitdata(data, per):
    training_data = data[:int(len(data)*per/100)]
    test_data = data[int(len(data)*per/100):]
    return (training_data, test_data)

In [18]:
ds_cleaned = ds.iloc[:,np.r_[:2, 5]]
ds_cleaned.head()

Unnamed: 0,reviewerID,asin,overall
0,A3EBHHCZO6V2A4,5555991584,5.0
1,AZPWAXJG9OJXV,5555991584,5.0
2,A38IRL0X2T4DPF,5555991584,5.0
3,A22IK3I6U76GX0,5555991584,5.0
4,A1AISPOIIHTHXX,5555991584,4.0


In [20]:
cleaned_val = ds_cleaned.values

user2product = defaultdict(dict)
for reviewerID, asin, overall in cleaned_val:
    user2product[reviewerID][asin] = overall

In [None]:
print("no. of users:", len(user2product))
print("no of reviews in user 1:", len(list(user2product.items())[0][1]))
print(list(user2product.items())[0])new_list= (2,1,5,6,7,1,1,1,1,1,1)
print(len(new_list))

splitdata(new_list,80)

In [None]:
print("no. of users:", len(user2product))
print("no of reviews in user 1:", len(list(user2product.items())[0][1]))

In [28]:
print(list(user2product.items())[0])new_list= (2,1,5,6,7,1,1,1,1,1,1)
print(len(new_list))

splitdata(new_list,80)

no. of users: 5541
no of reviews in user 1: 91
('A3EBHHCZO6V2A4', {'5555991584': 5.0, 'B000002NJS': 5.0, 'B000002P5W': 5.0, 'B00000DHZJ': 5.0, 'B00005QZH2': 5.0, 'B00005YW4H': 2.0, 'B0009WFF0Y': 2.0, 'B000S50QYC': 5.0, 'B000V3L0ZK': 2.0, 'B000VS8UJY': 5.0, 'B000W0YVX0': 5.0, 'B000WL7UQ4': 2.0, 'B00137GGZC': 3.0, 'B00137KS2O': 3.0, 'B00137RGS8': 3.0, 'B0013D8BK4': 3.0, 'B0017OLUUC': 2.0, 'B001A7DDTC': 3.0, 'B001AAAT6O': 5.0, 'B001AAAT7I': 2.0, 'B001AAE9N8': 3.0, 'B001AAHQLA': 2.0, 'B001AUEMFS': 3.0, 'B001BNK58W': 2.0, 'B001DU4PHA': 3.0, 'B001FX5V8M': 3.0, 'B001FX9QS8': 1.0, 'B001GLHBIG': 4.0, 'B001IXQU3O': 3.0, 'B001IXSU8W': 5.0, 'B001IXUUNU': 5.0, 'B001JODA58': 3.0, 'B001K3JF7K': 2.0, 'B001L2BIHK': 2.0, 'B001NBO0KG': 2.0, 'B001Q6ERVA': 5.0, 'B001W3T44W': 3.0, 'B001X3EQLW': 2.0, 'B001XJTB8E': 3.0, 'B0024RI70M': 2.0, 'B00299CER2': 5.0, 'B002BPH1F4': 2.0, 'B002BPKWH8': 1.0, 'B002I53BL0': 5.0, 'B002K0OGOO': 3.0, 'B002O1QKGQ': 5.0, 'B002POXFQA': 5.0, 'B002X063LA': 5.0, 'B003DEE470': 1.0, 'B

((2, 1, 5, 6, 7, 1, 1, 1), (1, 1, 1))

In [63]:
def euclidean_similarity(p, q):
    dist = math.sqrt(sum((pi-qi)**2 for pi,qi in zip(p, q)))
    sim = 1 / (1+dist)
    return sim    

def pearson_correlation(p,q):
    # this code does not scale well to large datasets. In the following, we rely on 
    # scipy.spatial.distance.correlation() to compute long vectors
    if len(p) > 99:
        return 1 - distance.correlation(p,q)        
    
    p_mean = sum(p) / len(p)
    p_deviations = [(pi-p_mean) for pi in p]
    
    q_mean = sum(q) / len(q)
    q_deviations = [(qi-q_mean) for qi in q]
    
    cov = sum(pd * qd for pd,qd in zip(p_deviations, q_deviations))
        
    sds_product = math.sqrt(sum((pd)**2 for pd in p_deviations) * sum((qd)**2 for qd in q_deviations))
    
    if sds_product != 0:
        r = cov / sds_product
    else:
        r = 0
    return r

for id1, id2, in combinations(list(user2product.keys())[:40], 2):
    similarities[id1] = dict()
    similarities[id1][id2] = calculate_similarity(user2product, id1, id2, cosine_similarity)
for a in list(similarities.items())[:40]:
    print(a)

('A3EBHHCZO6V2A4', {'A31KXTOQNTWUVM': 0})
('AZPWAXJG9OJXV', {'A31KXTOQNTWUVM': 0})
('A38IRL0X2T4DPF', {'A31KXTOQNTWUVM': 0})
('A22IK3I6U76GX0', {'A31KXTOQNTWUVM': 0})
('A1AISPOIIHTHXX', {'A31KXTOQNTWUVM': 0})
('A2P49WD75WHAG5', {'A31KXTOQNTWUVM': 0.9938837346736188})
('A3O90G1D7I5EGG', {'A31KXTOQNTWUVM': 0})
('A3EJYJC25OJVKK', {'A31KXTOQNTWUVM': 0})
('A1DA8VOH9NR6C7', {'A31KXTOQNTWUVM': 0})
('A33TRNCQK4IUO7', {'A31KXTOQNTWUVM': 0})
('AWY3EPKEOUV1W', {'A31KXTOQNTWUVM': 0})
('A1SCJWCMQ3W3KK', {'A31KXTOQNTWUVM': 0})
('A14BTJRH9VNLJJ', {'A31KXTOQNTWUVM': 0.9999999999999999})
('A2AOZQ3WTNVVOK', {'A31KXTOQNTWUVM': 1.0})
('A1BXA3SM3AJOKL', {'A31KXTOQNTWUVM': 0})
('A3CCYAQRHUTPIQ', {'A31KXTOQNTWUVM': 0})
('AHUT55E980RDR', {'A31KXTOQNTWUVM': 0})
('A24N1BAS3CU27H', {'A31KXTOQNTWUVM': 1.0})
('A19YHEBK099R7U', {'A31KXTOQNTWUVM': 0})
('A16KCH578FG4B4', {'A31KXTOQNTWUVM': 0})
('A3UBAZKS727Z0E', {'A31KXTOQNTWUVM': 0})
('A23H9KY3Q1YJ2N', {'A31KXTOQNTWUVM': 0})
('AYZCMVSSL4X4B', {'A31KXTOQNTWUVM': 0})


In [69]:
from itertools import combinations

# Calculate the similarities (NB: this can take a minute)
measure2function = {"euclidean" : euclidean_similarity, "cosine": cosine_similarity, "correlation": pearson_correlation}

similarities = {"euclidean" : dict(), "cosine": dict(), "correlation": dict()}
for measure, function in measure2function.items():
    similarities[measure] = dict([(a,dict()) for a in list(user2product.keys())[:40]])
    for id1, id2, in combinations(list(user2product.keys())[:40], 2):
        similarities[measure][id1][id2] = calculate_similarity(user2product, id1, id2, function)
        
print(similarities["correlation"].items())

dict_items([('A3EBHHCZO6V2A4', {'AZPWAXJG9OJXV': 0, 'A38IRL0X2T4DPF': 0, 'A22IK3I6U76GX0': 0, 'A1AISPOIIHTHXX': 0, 'A2P49WD75WHAG5': 0, 'A3O90G1D7I5EGG': 0, 'A3EJYJC25OJVKK': 0, 'A1DA8VOH9NR6C7': 0, 'A33TRNCQK4IUO7': 0, 'AWY3EPKEOUV1W': 0, 'A1SCJWCMQ3W3KK': 0, 'A14BTJRH9VNLJJ': 0, 'A2AOZQ3WTNVVOK': 0, 'A1BXA3SM3AJOKL': 0, 'A3CCYAQRHUTPIQ': 0, 'AHUT55E980RDR': 1.0, 'A24N1BAS3CU27H': 0, 'A19YHEBK099R7U': 0, 'A16KCH578FG4B4': 0, 'A3UBAZKS727Z0E': 0, 'A23H9KY3Q1YJ2N': 0, 'AYZCMVSSL4X4B': 0, 'A1LQ1A8DNOTUKV': 0, 'A1W3ZAKFIDGM13': 0, 'A200C7YQJ45LRR': 0, 'A3RNC9BGR4J1ZF': 0, 'A268MFPSNVICJS': 0, 'A1L8IB1K8ETJW9': 0, 'ARFL4OAG9CI1H': 0, 'A1X93ES4DITTWK': 0, 'A1JXEJFPV9EUVS': 0, 'A141XAG09QATRB': 0, 'A3QZA3MXC75V4E': 0, 'A28N19JIB5161D': 0, 'A3BLMCU4V5J2DX': 0, 'AAQMHLS8BN9YJ': 0, 'AE9GUE3HHOX3U': 0, 'A4BPK6TYITJS': 0, 'A31KXTOQNTWUVM': 0}), ('AZPWAXJG9OJXV', {'A38IRL0X2T4DPF': 0, 'A22IK3I6U76GX0': 0, 'A1AISPOIIHTHXX': 0, 'A2P49WD75WHAG5': 0, 'A3O90G1D7I5EGG': 0, 'A3EJYJC25OJVKK': 0, 'A1DA8VOH

In [15]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=5)
classifier.fit(X_train, y_train)

NameError: name 'y_train' is not defined