In [1]:
import pandas as pd
import gzip
import json
import numpy as np

def parse(path):
  """Yield one decoded JSON object per non-blank line of a gzip file.

  Args:
    path: Path to a gzip-compressed file holding one JSON document per line.

  Yields:
    The object produced by ``json.loads`` for each non-blank line, in file
    order.

  Raises:
    OSError: If the file cannot be opened or is not valid gzip data.
    json.JSONDecodeError: If a non-blank line is not valid JSON.
  """
  # The context manager closes the handle once the generator is exhausted
  # (or closed early) instead of leaking it.
  with gzip.open(path, 'rb') as g:
    for l in g:
      # A blank line (e.g. a trailing newline) is not a record; skip it
      # rather than letting json.loads fail on it.
      if not l.strip():
        continue
      yield json.loads(l)

def getDF(path):
  """Load every record produced by ``parse(path)`` into a DataFrame.

  Records are keyed by their 0-based position in the file, so the resulting
  frame has one row per record and an integer index in file order.
  """
  records = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(records, orient='index')

# Load the whole review file into a single DataFrame (one row per JSON record).
df = getDF('Digital_Music.json.gz')

In [2]:
df

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,5.0,True,"12 22, 2013",A1ZCPG3D3HGRSS,0001388703,{'Format:': ' Audio CD'},mark l. massey,This is a great cd full of worship favorites!!...,Great worship cd,1387670400,,
1,5.0,True,"09 11, 2013",AC2PL52NKPL29,0001388703,{'Format:': ' Audio CD'},Norma Mushen,"So creative! Love his music - the words, the ...",Gotta listen to this!,1378857600,,
2,5.0,True,"03 2, 2013",A1SUZXBDZSDQ3A,0001388703,{'Format:': ' Audio CD'},Herbert W. Shurley,"Keith Green, gone far to early in his carreer,...",Great approach still gets the message out,1362182400,,
3,5.0,True,"12 2, 2012",A3A0W7FZXM0IZW,0001388703,{'Format:': ' Audio CD'},Mary M Raybell,Keith Green had his special comedy style of Ch...,Great A must have,1354406400,,
4,5.0,False,"01 7, 2012",A12R54MKO17TW0,0001388703,{'Format:': ' Audio CD'},J. Bynum,Keith Green / So you wanna go back to Egypt......,A great one from Keith with a guest appearance...,1325894400,6,
...,...,...,...,...,...,...,...,...,...,...,...,...
1584077,5.0,True,"03 3, 2018",AR3KABMPL5L0O,B01HJ91P94,{'Format:': ' MP3 Music'},Shar Solis,Casting Crowns....you do it so well! Awesome s...,Awesome song!,1520035200,,
1584078,4.0,True,"01 24, 2018",A2N53GHW73INDH,B01HJ91P94,{'Format:': ' MP3 Music'},PEANUT,This band has produced many inspiring Christia...,This song in particular speaks about what it's...,1516752000,,
1584079,5.0,True,"04 21, 2017",ABNKLDCCVJKW1,B01HJ91P94,{'Format:': ' MP3 Music'},Alex,Awesome band and awesome song. This is my next...,Five Stars,1492732800,,
1584080,5.0,True,"02 23, 2018",AMWSDABZWFRAT,B01HJ91IVY,{'Format:': ' MP3 Music'},sara cabuag,Excellent,Five Stars,1519344000,,


In [3]:
# Keep only the fields the recommender needs: the rating, the user and the item.
# Selecting them by name (rather than dropping every other column) does not
# fail when an optional field such as 'vote' or 'image' is absent from the
# loaded records, and does not let unexpected extra fields leak through.
df_clean = df[['overall', 'reviewerID', 'asin']].copy()

In [4]:
df_clean

Unnamed: 0,overall,reviewerID,asin
0,5.0,A1ZCPG3D3HGRSS,0001388703
1,5.0,AC2PL52NKPL29,0001388703
2,5.0,A1SUZXBDZSDQ3A,0001388703
3,5.0,A3A0W7FZXM0IZW,0001388703
4,5.0,A12R54MKO17TW0,0001388703
...,...,...,...
1584077,5.0,AR3KABMPL5L0O,B01HJ91P94
1584078,4.0,A2N53GHW73INDH,B01HJ91P94
1584079,5.0,ABNKLDCCVJKW1,B01HJ91P94
1584080,5.0,AMWSDABZWFRAT,B01HJ91IVY


In [5]:
# Pre-filter by review counts before building the pivot table, to ease memory
# constraints. Both thresholds are inclusive (n or more reviews are kept).
user_thresh = 5
item_thresh = 3

# Keep users with at least user_thresh reviews ...
reviews_per_user = df_clean.groupby('reviewerID')['reviewerID'].transform('size')
pre_pivot = df_clean.loc[reviews_per_user >= user_thresh]

# ... then, among the remaining rows, items with at least item_thresh reviews.
reviews_per_item = pre_pivot.groupby('asin')['asin'].transform('size')
pre_pivot2 = pre_pivot.loc[reviews_per_item >= item_thresh]

In [6]:
# User x item rating matrix: one row per reviewerID, one column per asin.
# reset_index() turns reviewerID back into an ordinary first column.
pivot = (
    pre_pivot2
    .pivot_table(values='overall', index='reviewerID', columns='asin')
    .reset_index()
)

In [7]:
pivot

asin,reviewerID,0001377647,0001388703,0001526146,0006920055,0006935257,0760135886,1189182785,278472414X,3426958910,...,B01HI9B8T2,B01HIH0LI8,B01HIQU3AU,B01HIUVMF6,B01HIW5RV4,B01HIY8QVU,B01HIY9CVI,B01HJ91HEC,B01HJ91LIY,B01HJ91MTW
0,A0072041HVZ3465DXUOR,,,,,,,,,,...,,,,,,,,,,
1,A0081575F2F9XQSSIYA3,,,,,,,,,,...,,,,,,,,,,
2,A0108129TLIKAX34M8AA,,,,,,,,,,...,,,,,,,,,,
3,A01241534EPLP5O3KOP5,,,,,,,,,,...,,,,,,,,,,
4,A0234545X30ULJHGZUA3,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43331,AZZHILYMITLGM,,,,,,,,,,...,,,,,,,,,,
43332,AZZI8LIBZYYIK,,,,,,,,,,...,,,,,,,,,,
43333,AZZQ6G2EQ08O6,,,,,,,,,,...,,,,,,,,,,
43334,AZZRPY6IJC7SP,,,,,,,,,,...,,,,,,,,,,


In [8]:
# Filter again now that the pivot table exists: once sparse items have been
# removed, some users are left with fewer than user_thresh reviews.
# count() also counts the reviewerID column (never null), so a strict '>'
# keeps exactly the users with user_thresh or more ratings.
non_null_per_row = pivot.count(axis='columns')
pivot = pivot.loc[non_null_per_row > user_thresh]


In [9]:
#This is the final pivot table
pivot


asin,reviewerID,0001377647,0001388703,0001526146,0006920055,0006935257,0760135886,1189182785,278472414X,3426958910,...,B01HI9B8T2,B01HIH0LI8,B01HIQU3AU,B01HIUVMF6,B01HIW5RV4,B01HIY8QVU,B01HIY9CVI,B01HJ91HEC,B01HJ91LIY,B01HJ91MTW
0,A0072041HVZ3465DXUOR,,,,,,,,,,...,,,,,,,,,,
1,A0081575F2F9XQSSIYA3,,,,,,,,,,...,,,,,,,,,,
9,A0638585LHS5R1XDIOGY,,,,,,,,,,...,,,,,,,,,,
11,A0723371S65BNSU0AYV8,,,,,,,,,,...,,,,,,,,,,
14,A1006TXWG76H0N,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43326,AZYL2RTHUWR0P,,,,,,,,,,...,,,,,,,,,,
43327,AZYOVGJLQ03ML,,,,,,,,,,...,,,,,,,,,,
43328,AZYPOLLSDVG4K,,,,,,,,,,...,,,,,,,,,,
43331,AZZHILYMITLGM,,,,,,,,,,...,,,,,,,,,,


In [10]:
# Histogram of reviews per user, printed as "<number of reviews>: <number of users>".
reviewers = pivot['reviewerID'].values
rev_count_list = pivot.count(axis = 'columns').sort_values()
rev_values  = rev_count_list.values
# np.unique returns the sorted distinct values and how often each occurs in a
# single pass, instead of rescanning rev_values once per distinct value.
rev_unique, rev_unique_counts = np.unique(rev_values, return_counts=True)
for non_null_cells, n_users in zip(rev_unique, rev_unique_counts):
    # Each row's count includes the reviewerID cell, hence the "- 1".
    print(str(non_null_cells - 1) + ": " + str(n_users))

5: 5687
6: 3456
7: 2285
8: 1658
9: 1179
10: 924
11: 724
12: 592
13: 441
14: 409
15: 301
16: 256
17: 235
18: 225
19: 188
20: 178
21: 153
22: 143
23: 103
24: 100
25: 80
26: 75
27: 73
28: 75
29: 56
30: 49
31: 53
32: 43
33: 44
34: 42
35: 33
36: 28
37: 29
38: 27
39: 25
40: 23
41: 24
42: 20
43: 18
44: 13
45: 10
46: 19
47: 19
48: 16
49: 8
50: 11
51: 9
52: 10
53: 13
54: 4
55: 10
56: 9
57: 6
58: 9
59: 11
60: 5
61: 5
62: 6
63: 9
64: 4
65: 4
66: 6
67: 6
68: 2
69: 3
70: 6
71: 5
72: 6
73: 3
74: 5
75: 1
76: 5
77: 4
78: 3
79: 2
80: 1
81: 3
82: 2
83: 5
84: 5
86: 1
87: 2
88: 2
90: 3
91: 2
92: 2
93: 1
94: 2
95: 1
96: 1
97: 1
98: 2
100: 2
101: 1
102: 3
103: 3
104: 1
105: 3
106: 1
107: 1
111: 1
113: 1
114: 1
115: 1
117: 1
118: 3
120: 2
121: 1
122: 2
123: 1
124: 2
127: 1
128: 2
129: 1
132: 1
133: 1
134: 1
141: 1
143: 1
146: 2
148: 1
153: 1
169: 1
181: 1
187: 1
199: 1
201: 1
203: 1
227: 1
235: 1
238: 1
375: 1
765: 1


In [11]:
c_list = pivot.count().sort_values()
values  = c_list.values
# Distinct per-column counts and how many columns share each, in one pass.
unique, unique_counts = np.unique(values, return_counts=True)
#Displaying how many items (columns) have a given number of reviews (ignore the last one)
#pivot.count() also counts the reviewerID column, which is not an item.
#We may want to play with datasets and see if any others are more dense
for n_reviews, n_items in zip(unique, unique_counts):
    print(str(n_reviews) + ": " + str(n_items))

0: 585
1: 2650
2: 5986
3: 7256
4: 4014
5: 2555
6: 1793
7: 1268
8: 1026
9: 832
10: 685
11: 509
12: 443
13: 355
14: 339
15: 252
16: 268
17: 203
18: 183
19: 138
20: 123
21: 125
22: 143
23: 95
24: 98
25: 81
26: 77
27: 62
28: 55
29: 62
30: 41
31: 46
32: 51
33: 36
34: 32
35: 39
36: 27
37: 36
38: 31
39: 26
40: 23
41: 18
42: 35
43: 10
44: 11
45: 18
46: 16
47: 14
48: 7
49: 14
50: 12
51: 14
52: 6
53: 12
54: 7
55: 9
56: 7
57: 7
58: 5
59: 7
60: 9
61: 9
62: 4
63: 5
64: 3
65: 6
66: 5
67: 7
68: 2
69: 3
70: 3
71: 8
72: 7
73: 3
74: 4
75: 2
76: 3
77: 6
78: 1
79: 5
80: 4
81: 2
82: 1
83: 3
84: 1
85: 2
86: 2
87: 3
88: 2
89: 3
90: 3
91: 2
92: 2
93: 3
94: 2
95: 4
96: 2
98: 2
99: 4
100: 1
101: 2
102: 2
103: 1
104: 1
107: 2
108: 1
109: 1
110: 1
112: 2
113: 1
114: 2
116: 2
117: 1
118: 2
120: 1
121: 1
125: 1
126: 1
128: 1
131: 1
133: 1
134: 1
135: 1
136: 1
140: 1
147: 1
152: 1
154: 1
155: 2
157: 1
158: 1
161: 1
162: 1
164: 1
165: 1
166: 1
168: 1
171: 1
172: 1
175: 1
176: 1
182: 1
186: 2
192: 1
209: 1
211: 1
234:

In [12]:
# Simple starting point: a seeded random 80/20 split of users (rows).
# A more sophisticated splitting scheme will probably be needed later.
train = pivot.sample(frac=0.8, random_state=200)
# Everything that was not sampled into train becomes the test set.
test = pivot.loc[~pivot.index.isin(train.index)]

In [13]:
test

asin,reviewerID,0001377647,0001388703,0001526146,0006920055,0006935257,0760135886,1189182785,278472414X,3426958910,...,B01HI9B8T2,B01HIH0LI8,B01HIQU3AU,B01HIUVMF6,B01HIW5RV4,B01HIY8QVU,B01HIY9CVI,B01HJ91HEC,B01HJ91LIY,B01HJ91MTW
11,A0723371S65BNSU0AYV8,,,,,,,,,,...,,,,,,,,,,
14,A1006TXWG76H0N,,,,,,,,,,...,,,,,,,,,,
15,A1008539FMIMQDV67RVY,,,,,,,,,,...,,,,,,,,,,
35,A102H3QWRBVJ8E,,,,,,,,,,...,,,,,,,,,,
37,A102NRMHKF9NK0,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43281,AZTXN1XYU68WI,,,,,,,,,,...,,,,,,,,,,
43287,AZU93WHAJTE01,,,,,,,,,,...,,,,,,,,,,
43293,AZV2BKOD0ER8T,,,,,,,,,,...,,,,,,,,,,
43317,AZX4W3CS3CB23,,,,,,,,,,...,,,,,,,,,,


In [14]:
train

asin,reviewerID,0001377647,0001388703,0001526146,0006920055,0006935257,0760135886,1189182785,278472414X,3426958910,...,B01HI9B8T2,B01HIH0LI8,B01HIQU3AU,B01HIUVMF6,B01HIW5RV4,B01HIY8QVU,B01HIY9CVI,B01HJ91HEC,B01HJ91LIY,B01HJ91MTW
27387,A3E4AYPJHJOSWM,,,,,,,,,,...,,,,,,,,,,
30687,A3OEIQ11HYTHM6,,,,,,,,,,...,,,,,,,,,,
2136,A16R56VGF6JJTE,,,,,,,,,,...,,,,,,,,,,
40917,AS7VXGR6UE98V,,,,,,,,,,...,,,,,,,,,,
36683,AEQXDRNCSLT8K,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17714,A2JRR7VXFO0CWW,,,,,,,,,,...,,,,,,,,,,
10407,A1WH0CHN6DUUPG,,,,,,,,,,...,,,,,,,,,,
41442,ATYFXRA2PHD6I,,,,,,,,,,...,,,,,,,,,,
23045,A30PZ24XZWOZ5D,,,,,,,,,,...,,,,,,,,,,


In [15]:
#train.to_csv('train_data.csv')

In [16]:
#test.to_csv('rtest_Data.csv')

In [17]:
#fill df with 0s
# Every missing (user, item) rating in train becomes 0 in df_train; train itself
# is left unchanged.
# NOTE(review): after this, an unrated item is indistinguishable from a literal
# 0 rating, so the dropna(subset=[item]) calls in the prediction helpers have
# nothing to drop when they are given df_train.
df_train = train.fillna(0)

In [18]:
def sub_mean(df):
    """Return a copy of ``df`` with every row mean-centred.

    For each row, the mean of its numeric cells (NaN ignored) is subtracted
    from those cells. Non-numeric columns (e.g. ``reviewerID``) are passed
    through untouched, and NaN cells stay NaN.

    Args:
        df: DataFrame with one row per user.

    Returns:
        A new DataFrame with the same index and columns as ``df``; the input
        frame is not modified.
    """
    numeric_cols = df.select_dtypes(include='number').columns
    row_means = df[numeric_cols].mean(axis=1, skipna=True)

    centered = df.copy()
    # axis=0 aligns the per-row means with the row index. A plain
    # ``frame - series`` would align the Series with the *column* labels
    # instead and produce NaN everywhere.
    centered[numeric_cols] = df[numeric_cols].sub(row_means, axis=0)
    return centered

In [19]:
import numpy as np
from numpy.linalg import norm
import math


#user = "A383QK6QKCVPYW"
def cosine_sim(df, user):
    """Add a ``cosine_sim`` column: similarity of every row to ``user``'s row.

    The rating vector of a row is made of every column except the identifier
    and bookkeeping columns listed in ``non_rating`` below, so the first item
    column is included and a ``cosine_sim`` column left over from an earlier
    call is not mistaken for a rating.

    Args:
        df: DataFrame with a ``reviewerID`` column and one numeric column per
            item. It is modified in place.
        user: The ``reviewerID`` value to compare every row against.

    Returns:
        The same ``df`` object, with ``cosine_sim`` added or overwritten. The
        similarity is 0 wherever it is undefined (a zero-norm vector or a NaN
        result).

    Raises:
        ValueError: If ``user`` does not match exactly one row of ``df``.

    Note:
        The rating columns are converted to one dense float array, which costs
        roughly one extra copy of the rating matrix in memory.
    """
    # Columns that are identifiers or derived values, not item ratings.
    # "Unnamed: 0" is what pandas names a saved index column when a CSV is
    # read back; it is skipped in case the frame went through a CSV round trip.
    non_rating = {"reviewerID", "cosine_sim", "Unnamed: 0"}
    rating_cols = [col for col in df.columns if col not in non_rating]

    is_user = (df["reviewerID"] == user).to_numpy()
    n_matches = int(is_user.sum())
    if n_matches != 1:
        raise ValueError(
            "expected exactly one row for reviewerID %r, found %d"
            % (user, n_matches)
        )

    ratings = df[rating_cols].to_numpy(dtype=float)
    user_row = ratings[is_user][0]

    # All dot products and norms at once instead of one Python iteration per row.
    dot_prod = ratings @ user_row
    norm_mul = np.linalg.norm(ratings, axis=1) * np.linalg.norm(user_row)

    cosine_sim_val = np.zeros(len(df))
    has_norm = norm_mul != 0
    cosine_sim_val[has_norm] = dot_prod[has_norm] / norm_mul[has_norm]
    # Rows containing NaN ratings produce NaN; treat them as "no similarity".
    cosine_sim_val[np.isnan(cosine_sim_val)] = 0

    df["cosine_sim"] = cosine_sim_val
    return df
    

In [20]:
# k = 2
# item = "1526146"
# user = "A383QK6QKCVPYW"

def predict_item_rating_avg(df, k, item, user):
    """Predict ``user``'s rating of ``item`` as the plain mean over neighbours.

    Neighbours are the ``k`` rows with the largest ``cosine_sim`` among the
    users, other than ``user``, that have a non-NaN rating for ``item``.

    Args:
        df: DataFrame with ``reviewerID``, ``cosine_sim`` and ``item`` columns.
            It is not modified.
        k: Maximum number of neighbours to average over.
        item: Column label of the item to predict.
        user: ``reviewerID`` of the target user, excluded from the neighbours.

    Returns:
        The mean of the neighbours' ratings for ``item``; NaN if there are no
        eligible neighbours.
    """
    # dropna returns a new frame; the result must be kept, otherwise users who
    # never rated the item can still be selected as neighbours.
    candidates = df.dropna(subset = [item])
    candidates = candidates[candidates.reviewerID != user]
    neighbours = candidates.nlargest(k, "cosine_sim")
    return neighbours[item].mean(axis = 0)
    

In [21]:
# k = 2
# item = "1526146"
# user = "A383QK6QKCVPYW"

def predict_item_rating_sim(df, k, item, user):
    """Predict ``user``'s rating of ``item`` as a similarity-weighted mean.

    Neighbours are the ``k`` rows with the largest ``cosine_sim`` among the
    users, other than ``user``, that have a non-NaN rating for ``item``. Each
    neighbour's rating is weighted by its ``cosine_sim``.

    Args:
        df: DataFrame with ``reviewerID``, ``cosine_sim`` and ``item`` columns.
            It is not modified.
        k: Maximum number of neighbours to use.
        item: Column label of the item to predict.
        user: ``reviewerID`` of the target user, excluded from the neighbours.

    Returns:
        ``sum(sim * rating) / sum(sim)`` over the neighbours, or NaN when the
        similarities sum to zero (including when there are no neighbours).
    """
    # dropna returns a new frame; the result must be kept, otherwise a
    # neighbour with a NaN rating turns the whole weighted sum into NaN.
    candidates = df.dropna(subset = [item])
    candidates = candidates[candidates.reviewerID != user]
    neighbours = candidates.nlargest(k, "cosine_sim")

    weights = neighbours["cosine_sim"]
    weight_sum = weights.sum()
    if weight_sum == 0:
        # No usable weight: the weighted mean is undefined, so do not divide.
        return float("nan")
    return weights.dot(neighbours[item]) / weight_sum
    

In [22]:
def predict(df, user, item, k=2):
    """Print two rating predictions for ``user`` on ``item``.

    Calls ``cosine_sim(df, user)`` and then prints, in this order, the results
    of ``predict_item_rating_avg`` and ``predict_item_rating_sim``.

    Args:
        df: User x item rating frame passed to ``cosine_sim``.
        user: ``reviewerID`` of the user to predict for.
        item: Column label of the item to predict.
        k: Number of nearest neighbours passed to both predictors. Defaults to
            2, the value that used to be hard-coded.

    Returns:
        None; the two predictions are printed.
    """
    # Mean-centring through sub_mean is currently disabled:
    #sum_mean_df = sub_mean(df)
    cos_sim_df = cosine_sim(df, user)
    print(predict_item_rating_avg(cos_sim_df, k, item, user))
    print(predict_item_rating_sim(cos_sim_df, k, item, user))
    

In [24]:
# Print both estimates (plain mean, then similarity-weighted mean) of this
# user's rating for this item, using the zero-filled training frame.
predict(df_train, "A3E4AYPJHJOSWM", "0001377647")

0.0
0.0
