In [1]:
import pandas as pd
import gzip
import json
import numpy as np
import math
from random import randint



In [2]:
def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield json.loads(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

df = getDF('Digital_Music.json.gz')

In [3]:
df_clean = df.drop(columns=['reviewTime', 'reviewerName', 'reviewText', 'summary', 'unixReviewTime', 'style', 'image', 'vote', 'verified'])

In [4]:
#Filtering the data by number of reviews before making pivot table to ease memory constraints
#Here threshold is inclusive (n or more reviews will be kept)
user_thresh = 5
item_thresh = 3
pre_pivot = df_clean[df_clean.groupby(['reviewerID'])['reviewerID'].transform('size') >= user_thresh]
pre_pivot2 = pre_pivot[pre_pivot.groupby(['asin'])['asin'].transform('size') >= item_thresh]

In [5]:
pivot = pd.pivot_table(pre_pivot2, values = 'overall', index='reviewerID', columns = 'asin').reset_index()

In [6]:
#Filtering again after pivot table is made
#After some columns are reviewed, there are still users with < threshold reviews
pivot = pivot.loc[pivot.count(axis = 'columns') > user_thresh]

In [7]:
pivot

asin,reviewerID,0001377647,0001388703,0001526146,0006920055,0006935257,0760135886,1189182785,278472414X,3426958910,...,B01HI9B8T2,B01HIH0LI8,B01HIQU3AU,B01HIUVMF6,B01HIW5RV4,B01HIY8QVU,B01HIY9CVI,B01HJ91HEC,B01HJ91LIY,B01HJ91MTW
0,A0072041HVZ3465DXUOR,,,,,,,,,,...,,,,,,,,,,
1,A0081575F2F9XQSSIYA3,,,,,,,,,,...,,,,,,,,,,
9,A0638585LHS5R1XDIOGY,,,,,,,,,,...,,,,,,,,,,
11,A0723371S65BNSU0AYV8,,,,,,,,,,...,,,,,,,,,,
14,A1006TXWG76H0N,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43326,AZYL2RTHUWR0P,,,,,,,,,,...,,,,,,,,,,
43327,AZYOVGJLQ03ML,,,,,,,,,,...,,,,,,,,,,
43328,AZYPOLLSDVG4K,,,,,,,,,,...,,,,,,,,,,
43331,AZZHILYMITLGM,,,,,,,,,,...,,,,,,,,,,


In [8]:
full_dict = {}

In [9]:
for index, row in pre_pivot2.iterrows():
    if row['reviewerID'] in full_dict: 
        full_dict[row['reviewerID']].append(([row['asin'], row['overall']]))
    else:
        full_dict[row['reviewerID']] = ([[row['asin'], row['overall']]])

In [10]:
full_dict['AJO3UG6FR5C7R']

[['0001388703', 5.0], ['B000V64UXQ', 5.0], ['B001BIDEY4', 5.0]]

In [11]:
len(full_dict)

43336

In [12]:
pruned_dict = {}
for key in full_dict:
    if len(full_dict[key]) >= 5:
        pruned_dict[key] = full_dict[key]
len(pruned_dict)

23958

In [13]:
list(pruned_dict.items())[:10]

[('A12R54MKO17TW0',
  [['0001388703', 5.0],
   ['B000VZJS84', 5.0],
   ['B000WLNUN6', 5.0],
   ['B001232RGE', 5.0],
   ['B00123KDR4', 5.0],
   ['B000VZJS84', 5.0],
   ['B000WLNUN6', 5.0],
   ['B001232RGE', 5.0],
   ['B00123KDR4', 5.0],
   ['B0019M5J6S', 5.0],
   ['B001BHWEAA', 3.0],
   ['B001FSB0C8', 5.0],
   ['B001MD0GNK', 5.0],
   ['B002R4K6AG', 3.0],
   ['B00382MONS', 5.0],
   ['B0045QJ7UO', 4.0],
   ['B01929H4VM', 5.0],
   ['B0007ZWQNC', 5.0],
   ['B000E1B00O', 5.0],
   ['B000GWCIOS', 5.0],
   ['B000MRU1B4', 5.0],
   ['B0019M3TWE', 5.0],
   ['B001PJ5SX4', 4.0],
   ['B0048P1XXQ', 5.0]]),
 ('A27P44I54RUMDC',
  [['0001388703', 5.0],
   ['0001526146', 3.0],
   ['B000E1B00O', 5.0],
   ['B004QMR6KA', 4.0],
   ['B00973AF0G', 5.0],
   ['5552256646', 4.0]]),
 ('A3INJI4T4U2JJS',
  [['0001526146', 5.0],
   ['B00136J7ZE', 5.0],
   ['B00136J7ZE', 5.0],
   ['B009CVYYFU', 5.0],
   ['B00IJUC4WK', 4.0],
   ['B00B8IONO2', 5.0]]),
 ('A3FVAWZNKW9GX',
  [['0001377647', 5.0],
   ['B000QVXDR0', 5.0],
   

In [14]:
test_dict = {}
for key in pruned_dict:
    take = math.floor(len(pruned_dict[key])/5)
    for i in range(take):
        rand = randint(0, len(pruned_dict[key]) - 1)
        if key in test_dict: 
            test_dict[key].append(pruned_dict[key].pop(rand))
        else:
            test_dict[key] = ([pruned_dict[key].pop(rand)])
        

In [15]:
train_dict = pruned_dict

In [16]:
len(test_dict)

23958

In [17]:
list(train_dict.items())[:10]

[('A12R54MKO17TW0',
  [['0001388703', 5.0],
   ['B000VZJS84', 5.0],
   ['B001232RGE', 5.0],
   ['B00123KDR4', 5.0],
   ['B000VZJS84', 5.0],
   ['B000WLNUN6', 5.0],
   ['B001232RGE', 5.0],
   ['B00123KDR4', 5.0],
   ['B0019M5J6S', 5.0],
   ['B001BHWEAA', 3.0],
   ['B001FSB0C8', 5.0],
   ['B001MD0GNK', 5.0],
   ['B002R4K6AG', 3.0],
   ['B00382MONS', 5.0],
   ['B0045QJ7UO', 4.0],
   ['B0007ZWQNC', 5.0],
   ['B000E1B00O', 5.0],
   ['B000GWCIOS', 5.0],
   ['B0019M3TWE', 5.0],
   ['B0048P1XXQ', 5.0]]),
 ('A27P44I54RUMDC',
  [['0001388703', 5.0],
   ['B000E1B00O', 5.0],
   ['B004QMR6KA', 4.0],
   ['B00973AF0G', 5.0],
   ['5552256646', 4.0]]),
 ('A3INJI4T4U2JJS',
  [['0001526146', 5.0],
   ['B00136J7ZE', 5.0],
   ['B00136J7ZE', 5.0],
   ['B00IJUC4WK', 4.0],
   ['B00B8IONO2', 5.0]]),
 ('A3FVAWZNKW9GX',
  [['0001377647', 5.0],
   ['B000QVXDR0', 5.0],
   ['B000S56380', 5.0],
   ['B000T1EJ0W', 5.0],
   ['B000TDWSYO', 5.0],
   ['B000VZO4TW', 5.0],
   ['B000W15BBK', 5.0],
   ['B0010WLQXW', 5.0],
   

In [18]:
list(test_dict.items())[:10]

[('A12R54MKO17TW0',
  [['B01929H4VM', 5.0],
   ['B000WLNUN6', 5.0],
   ['B001PJ5SX4', 4.0],
   ['B000MRU1B4', 5.0]]),
 ('A27P44I54RUMDC', [['0001526146', 3.0]]),
 ('A3INJI4T4U2JJS', [['B009CVYYFU', 5.0]]),
 ('A3FVAWZNKW9GX',
  [['B00122OMC2', 5.0],
   ['B001NTUDEU', 5.0],
   ['B004ZKL9A6', 5.0],
   ['B0013HYLT0', 5.0],
   ['B00L1SLE5K', 5.0],
   ['B00138JAD6', 5.0]]),
 ('A27SJD1VM73SMM', [['B00G7HG10I', 5.0], ['B00DPJ0JYA', 5.0]]),
 ('A1BVU7F2T8EUKS', [['0760135886', 4.0]]),
 ('A16TZRNJAH1V0B', [['B00EWS7JW8', 5.0]]),
 ('A1U1V9U9IS8F5B', [['B00136NNN6', 5.0]]),
 ('A33KPK0QBSSDWS', [['B000SZIR0C', 5.0]]),
 ('A2TYZ821XXK2YZ',
  [['B012BPYS7M', 5.0], ['B00FGD8OCM', 5.0], ['B019EPSNT6', 5.0]])]

In [19]:
#Removing all elements in the test set from the pivot table, essentially making it the training set
count = 0
for key in test_dict:
    count += 1
    for i in range(len(test_dict[key])):
        if count % 5000 == 0:
            print(pivot.loc[(pivot['reviewerID'] == key), test_dict[key][i][0]])
        pivot.loc[(pivot['reviewerID'] == key), test_dict[key][i][0]] = np.nan
        if count % 5000 == 0:
            print(pivot.loc[(pivot['reviewerID'] == key), test_dict[key][i][0]])

8075    5.0
Name: B001VSX37W, dtype: float64
8075   NaN
Name: B001VSX37W, dtype: float64
8075    5.0
Name: B003A935L4, dtype: float64
8075   NaN
Name: B003A935L4, dtype: float64
8075    5.0
Name: B001226FHM, dtype: float64
8075   NaN
Name: B001226FHM, dtype: float64
8075    5.0
Name: B000Y054B6, dtype: float64
8075   NaN
Name: B000Y054B6, dtype: float64
Series([], Name: B00136Q306, dtype: float64)
Series([], Name: B00136Q306, dtype: float64)
19167    5.0
Name: B004DCYS5O, dtype: float64
19167   NaN
Name: B004DCYS5O, dtype: float64
19167    5.0
Name: B001NTI8DI, dtype: float64
19167   NaN
Name: B001NTI8DI, dtype: float64
19167    5.0
Name: B001NU6GB8, dtype: float64
19167   NaN
Name: B001NU6GB8, dtype: float64
19167    5.0
Name: B014S7GN92, dtype: float64
19167   NaN
Name: B014S7GN92, dtype: float64
19167    5.0
Name: B001O03DGS, dtype: float64
19167   NaN
Name: B001O03DGS, dtype: float64
19167    5.0
Name: B001NTZFP2, dtype: float64
19167   NaN
Name: B001NTZFP2, dtype: float64
19167   

In [20]:
#You can now use 'test' for your test set in dictionary form, and 'pivot' for your train set in pivot table form