In [1]:
import pandas as pd
import gzip
import json
import numpy as np
import math
from random import randint



In [2]:
def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield json.loads(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

df = getDF('Digital_Music.json.gz')

In [3]:
df_clean = df.drop(columns=['reviewTime', 'reviewerName', 'reviewText', 'summary', 'unixReviewTime', 'style', 'image', 'vote', 'verified'])

In [4]:
#Filtering the data by number of reviews before making pivot table to ease memory constraints
#Here threshold is inclusive (n or more reviews will be kept)
user_thresh = 5
item_thresh = 3
pre_pivot = df_clean[df_clean.groupby(['reviewerID'])['reviewerID'].transform('size') >= user_thresh]
pre_pivot2 = pre_pivot[pre_pivot.groupby(['asin'])['asin'].transform('size') >= item_thresh]

In [5]:
#pivot = pd.pivot_table(pre_pivot2, values = 'overall', index='reviewerID', columns = 'asin').reset_index()

In [6]:
#Filtering again after pivot table is made
#After some columns are reviewed, there are still users with < threshold reviews
#pivot = pivot.loc[pivot.count(axis = 'columns') > user_thresh]

In [7]:
pre_pivot2

Unnamed: 0,overall,reviewerID,asin
4,5.0,A12R54MKO17TW0,0001388703
12,5.0,A4V08BR7LZ6D9,0001388703
13,5.0,AJO3UG6FR5C7R,0001388703
16,5.0,A27P44I54RUMDC,0001388703
36,5.0,A3INJI4T4U2JJS,0001526146
...,...,...,...
1584052,5.0,A3M73VJWQ7X900,B01HJ91MTW
1584053,5.0,A2SR3DWJR1PYR6,B01HJ91MTW
1584054,5.0,A24V7X30NIMOIY,B01HJ91MTW
1584056,5.0,A1LW10GYP2EYM1,B01HJ91MTW


In [8]:
full_dict = {}

In [9]:
for index, row in pre_pivot2.iterrows():
    if row['reviewerID'] in full_dict: 
        full_dict[row['reviewerID']].append(([row['asin'], row['overall']]))
    else:
        full_dict[row['reviewerID']] = ([[row['asin'], row['overall']]])

In [10]:
full_dict['AJO3UG6FR5C7R']

[['0001388703', 5.0], ['B000V64UXQ', 5.0], ['B001BIDEY4', 5.0]]

In [11]:
len(full_dict)

43336

In [12]:
item_dict = {}

In [13]:
for index, row in pre_pivot2.iterrows():
    if row['asin'] in item_dict: 
        item_dict[row['asin']].append(([row['reviewerID'], row['overall']]))
    else:
        item_dict[row['asin']] = ([[row['reviewerID'], row['overall']]])

In [26]:
len(item_dict)

33083

In [14]:
item_dict['0001388703']

[['A12R54MKO17TW0', 5.0],
 ['A4V08BR7LZ6D9', 5.0],
 ['AJO3UG6FR5C7R', 5.0],
 ['A27P44I54RUMDC', 5.0]]

In [15]:
list(item_dict.items())[:10]

[('0001388703',
  [['A12R54MKO17TW0', 5.0],
   ['A4V08BR7LZ6D9', 5.0],
   ['AJO3UG6FR5C7R', 5.0],
   ['A27P44I54RUMDC', 5.0]]),
 ('0001526146',
  [['A3INJI4T4U2JJS', 5.0], ['A27P44I54RUMDC', 3.0], ['A2H5Z1ZNWQMEO0', 3.0]]),
 ('0001377647',
  [['A1UNS26W21A9C2', 5.0],
   ['A21OOZLUPQ54TP', 5.0],
   ['A3FVAWZNKW9GX', 5.0],
   ['A1U2J1QFGBMEGI', 5.0]]),
 ('0006935257',
  [['A2QWA7KNDKZCOD', 5.0], ['A141HP4LYPWMSR', 5.0], ['A1U2J1QFGBMEGI', 5.0]]),
 ('0006920055',
  [['A27SJD1VM73SMM', 5.0],
   ['A1BVU7F2T8EUKS', 4.0],
   ['A3TTAAR9L631DB', 4.0],
   ['A16TZRNJAH1V0B', 5.0],
   ['A1U1V9U9IS8F5B', 5.0],
   ['A33KPK0QBSSDWS', 5.0]]),
 ('3426958910',
  [['A2TYZ821XXK2YZ', 5.0],
   ['A3OFSREZADFUDY', 5.0],
   ['A2VAMODP8M77NG', 5.0],
   ['AAKSLZ9IDTEH0', 4.0],
   ['A14W6D2RAIZ3ZN', 5.0],
   ['A3HUD6U7RWX8E8', 5.0]]),
 ('5550312085',
  [['A2I5AKN7CDTWFM', 5.0],
   ['AM12SZNS452IL', 5.0],
   ['A1P097GD6WMEBI', 5.0],
   ['A3S2EH4H28PLFG', 2.0],
   ['A3LEN0P07MGJE2', 5.0],
   ['A3A93F0J96AEL0', 5.0

In [16]:
with open('items.json', 'w') as fp:
    json.dump(item_dict, fp)

In [17]:
pruned_dict = {}
for key in full_dict:
    if len(full_dict[key]) >= 5:
        pruned_dict[key] = full_dict[key]
len(pruned_dict)

23958

In [18]:
list(pruned_dict.items())[:10]

[('A12R54MKO17TW0',
  [['0001388703', 5.0],
   ['B000VZJS84', 5.0],
   ['B000WLNUN6', 5.0],
   ['B001232RGE', 5.0],
   ['B00123KDR4', 5.0],
   ['B000VZJS84', 5.0],
   ['B000WLNUN6', 5.0],
   ['B001232RGE', 5.0],
   ['B00123KDR4', 5.0],
   ['B0019M5J6S', 5.0],
   ['B001BHWEAA', 3.0],
   ['B001FSB0C8', 5.0],
   ['B001MD0GNK', 5.0],
   ['B002R4K6AG', 3.0],
   ['B00382MONS', 5.0],
   ['B0045QJ7UO', 4.0],
   ['B01929H4VM', 5.0],
   ['B0007ZWQNC', 5.0],
   ['B000E1B00O', 5.0],
   ['B000GWCIOS', 5.0],
   ['B000MRU1B4', 5.0],
   ['B0019M3TWE', 5.0],
   ['B001PJ5SX4', 4.0],
   ['B0048P1XXQ', 5.0]]),
 ('A27P44I54RUMDC',
  [['0001388703', 5.0],
   ['0001526146', 3.0],
   ['B000E1B00O', 5.0],
   ['B004QMR6KA', 4.0],
   ['B00973AF0G', 5.0],
   ['5552256646', 4.0]]),
 ('A3INJI4T4U2JJS',
  [['0001526146', 5.0],
   ['B00136J7ZE', 5.0],
   ['B00136J7ZE', 5.0],
   ['B009CVYYFU', 5.0],
   ['B00IJUC4WK', 4.0],
   ['B00B8IONO2', 5.0]]),
 ('A3FVAWZNKW9GX',
  [['0001377647', 5.0],
   ['B000QVXDR0', 5.0],
   

In [19]:
test_dict = {}
for key in pruned_dict:
    take = math.floor(len(pruned_dict[key])/5)
    for i in range(take):
        rand = randint(0, len(pruned_dict[key]) - 1)
        if key in test_dict: 
            test_dict[key].append(pruned_dict[key].pop(rand))
        else:
            test_dict[key] = ([pruned_dict[key].pop(rand)])
        

In [20]:
train_dict = pruned_dict

In [21]:
len(test_dict)

23958

In [22]:
list(train_dict.items())[:10]

[('A12R54MKO17TW0',
  [['0001388703', 5.0],
   ['B000VZJS84', 5.0],
   ['B000WLNUN6', 5.0],
   ['B001232RGE', 5.0],
   ['B00123KDR4', 5.0],
   ['B000VZJS84', 5.0],
   ['B000WLNUN6', 5.0],
   ['B001232RGE', 5.0],
   ['B00123KDR4', 5.0],
   ['B0019M5J6S', 5.0],
   ['B001BHWEAA', 3.0],
   ['B002R4K6AG', 3.0],
   ['B00382MONS', 5.0],
   ['B0045QJ7UO', 4.0],
   ['B01929H4VM', 5.0],
   ['B0007ZWQNC', 5.0],
   ['B000E1B00O', 5.0],
   ['B000GWCIOS', 5.0],
   ['B000MRU1B4', 5.0],
   ['B0048P1XXQ', 5.0]]),
 ('A27P44I54RUMDC',
  [['0001526146', 3.0],
   ['B000E1B00O', 5.0],
   ['B004QMR6KA', 4.0],
   ['B00973AF0G', 5.0],
   ['5552256646', 4.0]]),
 ('A3INJI4T4U2JJS',
  [['B00136J7ZE', 5.0],
   ['B00136J7ZE', 5.0],
   ['B009CVYYFU', 5.0],
   ['B00IJUC4WK', 4.0],
   ['B00B8IONO2', 5.0]]),
 ('A3FVAWZNKW9GX',
  [['0001377647', 5.0],
   ['B000QVXDR0', 5.0],
   ['B000T1EJ0W', 5.0],
   ['B000TDWSYO', 5.0],
   ['B000VZO4TW', 5.0],
   ['B000W15BBK', 5.0],
   ['B0010WLQXW', 5.0],
   ['B0011Z77TA', 5.0],
   

In [23]:
list(test_dict.items())[:10]

[('A12R54MKO17TW0',
  [['B001PJ5SX4', 4.0],
   ['B0019M3TWE', 5.0],
   ['B001MD0GNK', 5.0],
   ['B001FSB0C8', 5.0]]),
 ('A27P44I54RUMDC', [['0001388703', 5.0]]),
 ('A3INJI4T4U2JJS', [['0001526146', 5.0]]),
 ('A3FVAWZNKW9GX',
  [['B000S56380', 5.0],
   ['B001386SW2', 5.0],
   ['B0010WLQXW', 5.0],
   ['B001BHHTR8', 5.0],
   ['B0011Z77TA', 5.0],
   ['B000W15BBK', 5.0]]),
 ('A27SJD1VM73SMM', [['B00JG9PEI4', 5.0], ['0006920055', 5.0]]),
 ('A1BVU7F2T8EUKS', [['B004Z4ZN4A', 3.0]]),
 ('A16TZRNJAH1V0B', [['B008723K96', 5.0]]),
 ('A1U1V9U9IS8F5B', [['B00136NNN6', 5.0]]),
 ('A33KPK0QBSSDWS', [['B000TE0MYG', 5.0]]),
 ('A2TYZ821XXK2YZ',
  [['3426958910', 5.0], ['B01DQ6ON70', 5.0], ['B01AUA1MC4', 5.0]])]

In [24]:
with open('test.json', 'w') as fp:
    json.dump(test_dict, fp)

In [25]:
with open('train.json', 'w') as fp:
    json.dump(train_dict, fp)