In [20]:
import pandas as pd
import gzip
import json
import numpy as np
import math
from random import randint



In [21]:
def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield json.loads(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

df = getDF('Digital_Music.json.gz')

In [22]:
df_clean = df.drop(columns=['reviewTime', 'reviewerName', 'reviewText', 'summary', 'unixReviewTime', 'style', 'image', 'vote', 'verified'])

In [23]:
#Filtering the data by number of reviews before making pivot table to ease memory constraints
#Here threshold is inclusive (n or more reviews will be kept)
user_thresh = 5
item_thresh = 3
pre_pivot = df_clean[df_clean.groupby(['reviewerID'])['reviewerID'].transform('size') >= user_thresh]
pre_pivot2 = pre_pivot[pre_pivot.groupby(['asin'])['asin'].transform('size') >= item_thresh]

In [24]:
full_dict = {}

In [25]:
for index, row in pre_pivot2.iterrows():
    if row['reviewerID'] in full_dict: 
        full_dict[row['reviewerID']].append(([row['asin'], row['overall']]))
    else:
        full_dict[row['reviewerID']] = ([[row['asin'], row['overall']]])

In [26]:
full_dict['AJO3UG6FR5C7R']

[['0001388703', 5.0], ['B000V64UXQ', 5.0], ['B001BIDEY4', 5.0]]

In [27]:
len(full_dict)

43336

In [28]:
pruned_dict = {}
for key in full_dict:
    if len(full_dict[key]) >= 5:
        pruned_dict[key] = full_dict[key]
len(pruned_dict)

23958

In [29]:
list(pruned_dict.items())[:10]

[('A12R54MKO17TW0',
  [['0001388703', 5.0],
   ['B000VZJS84', 5.0],
   ['B000WLNUN6', 5.0],
   ['B001232RGE', 5.0],
   ['B00123KDR4', 5.0],
   ['B000VZJS84', 5.0],
   ['B000WLNUN6', 5.0],
   ['B001232RGE', 5.0],
   ['B00123KDR4', 5.0],
   ['B0019M5J6S', 5.0],
   ['B001BHWEAA', 3.0],
   ['B001FSB0C8', 5.0],
   ['B001MD0GNK', 5.0],
   ['B002R4K6AG', 3.0],
   ['B00382MONS', 5.0],
   ['B0045QJ7UO', 4.0],
   ['B01929H4VM', 5.0],
   ['B0007ZWQNC', 5.0],
   ['B000E1B00O', 5.0],
   ['B000GWCIOS', 5.0],
   ['B000MRU1B4', 5.0],
   ['B0019M3TWE', 5.0],
   ['B001PJ5SX4', 4.0],
   ['B0048P1XXQ', 5.0]]),
 ('A27P44I54RUMDC',
  [['0001388703', 5.0],
   ['0001526146', 3.0],
   ['B000E1B00O', 5.0],
   ['B004QMR6KA', 4.0],
   ['B00973AF0G', 5.0],
   ['5552256646', 4.0]]),
 ('A3INJI4T4U2JJS',
  [['0001526146', 5.0],
   ['B00136J7ZE', 5.0],
   ['B00136J7ZE', 5.0],
   ['B009CVYYFU', 5.0],
   ['B00IJUC4WK', 4.0],
   ['B00B8IONO2', 5.0]]),
 ('A3FVAWZNKW9GX',
  [['0001377647', 5.0],
   ['B000QVXDR0', 5.0],
   

In [30]:
test_dict = {}
for key in pruned_dict:
    take = math.floor(len(pruned_dict[key])/5)
    for i in range(take):
        rand = randint(0, len(pruned_dict[key]) - 1)
        if key in test_dict: 
            test_dict[key].append(pruned_dict[key].pop(rand))
        else:
            test_dict[key] = ([pruned_dict[key].pop(rand)])
        

In [31]:
train_dict = pruned_dict

In [32]:
len(test_dict)

23958

In [33]:
list(train_dict.items())[:10]

[('A12R54MKO17TW0',
  [['0001388703', 5.0],
   ['B000VZJS84', 5.0],
   ['B000WLNUN6', 5.0],
   ['B001232RGE', 5.0],
   ['B000VZJS84', 5.0],
   ['B000WLNUN6', 5.0],
   ['B001232RGE', 5.0],
   ['B00123KDR4', 5.0],
   ['B001BHWEAA', 3.0],
   ['B001MD0GNK', 5.0],
   ['B002R4K6AG', 3.0],
   ['B00382MONS', 5.0],
   ['B0045QJ7UO', 4.0],
   ['B0007ZWQNC', 5.0],
   ['B000E1B00O', 5.0],
   ['B000GWCIOS', 5.0],
   ['B000MRU1B4', 5.0],
   ['B0019M3TWE', 5.0],
   ['B001PJ5SX4', 4.0],
   ['B0048P1XXQ', 5.0]]),
 ('A27P44I54RUMDC',
  [['0001388703', 5.0],
   ['0001526146', 3.0],
   ['B004QMR6KA', 4.0],
   ['B00973AF0G', 5.0],
   ['5552256646', 4.0]]),
 ('A3INJI4T4U2JJS',
  [['0001526146', 5.0],
   ['B00136J7ZE', 5.0],
   ['B00136J7ZE', 5.0],
   ['B009CVYYFU', 5.0],
   ['B00B8IONO2', 5.0]]),
 ('A3FVAWZNKW9GX',
  [['0001377647', 5.0],
   ['B000QVXDR0', 5.0],
   ['B000S56380', 5.0],
   ['B000T1EJ0W', 5.0],
   ['B000TDWSYO', 5.0],
   ['B000VZO4TW', 5.0],
   ['B000W15BBK', 5.0],
   ['B0010WLQXW', 5.0],
   

In [34]:
list(test_dict.items())[:10]

[('A12R54MKO17TW0',
  [['B001FSB0C8', 5.0],
   ['B00123KDR4', 5.0],
   ['B0019M5J6S', 5.0],
   ['B01929H4VM', 5.0]]),
 ('A27P44I54RUMDC', [['B000E1B00O', 5.0]]),
 ('A3INJI4T4U2JJS', [['B00IJUC4WK', 4.0]]),
 ('A3FVAWZNKW9GX',
  [['B001NTUDEU', 5.0],
   ['B0011Z77TA', 5.0],
   ['B00122OMC2', 5.0],
   ['B000VZO4TW', 5.0],
   ['B0013HYLT0', 5.0],
   ['B001386SW2', 5.0]]),
 ('A27SJD1VM73SMM', [['B00P7N0Y6K', 5.0], ['B00PWNBIWY', 5.0]]),
 ('A1BVU7F2T8EUKS', [['B0001PICSG', 5.0]]),
 ('A16TZRNJAH1V0B', [['B00920DU6A', 5.0]]),
 ('A1U1V9U9IS8F5B', [['B00136NNN6', 5.0]]),
 ('A33KPK0QBSSDWS', [['B001GYWIMC', 4.0]]),
 ('A2TYZ821XXK2YZ',
  [['B019EPSNT6', 5.0], ['B00FGD8OCM', 5.0], ['B00MYPUDMY', 4.0]])]

In [35]:
import json
  
with open('new_test_data.txt', 'w') as convert_file:
     convert_file.write(json.dumps(test_dict))

In [36]:
import json
  
with open('new_train_data.txt', 'w') as convert_file:
     convert_file.write(json.dumps(train_dict))