In [1]:
import ast
import gzip
import json
import random
from collections import defaultdict

import numpy as np
import scipy
import scipy.optimize

## Goal:
Given a (user, music, format (optional)) tuple, predict the rating that the user will give to the music.

In [86]:
# useful fields:
# |name       | possible value  | analysis
# "overall":    1 - 5 (int)
# "verified":   True / False      (Don't know meaning yet)
# "reviewerID": "A1SJL3JBBILJ66"
# "asin":       "B0018CGCR4"      (music ID)
# "format":     " MP3 Music"      86.44%
#               " Audio CD"       6.37%
#               "" (undeclared)   6.95%
#               " Vinyl"          .2%
#               (others)          <.04%
# "reviewText": "THANK YOU"       .09% of users don't provide reviewText, indicated as ""
# "summary":    "Five Stars"      .002% of users don't provide a summary, indicated as ""
# "image":      0 (int)           .107% of users provide images;
#                                 the value is the number of images provided in the review
# "vote":       0 (int)           4.48% of reviews are voted on by others

In [7]:
%%time
print('Analyzing original file...')

# The raw dump is expected to hold one JSON array of review objects on its
# first line. It is parsed as JSON instead of rewriting true/false and calling
# eval(): the string rewrite could alter review text that happens to contain the
# same character sequence, it breaks on `null`, and eval() would execute whatever
# the file contains.
# (Compressed variant: gzip.open("./data.json.zip", 'rt', encoding="utf8"))
with open("./data.json", 'rt', encoding="utf8") as raw_file:
    data = json.loads(raw_file.readline())

# One Python-literal dict (str(d)) is written per line; the split cell reads
# pdata.json back line by line in that format, so it is deliberately kept.
# utf8 is set explicitly because pdata.json is re-opened with encoding="utf8";
# relying on the platform default could garble or reject non-ASCII review text.
with open("pdata.json", 'w', encoding="utf8") as parsed_data:
    for d in data:
        # unused fields
        d.pop('reviewTime', None)
        d.pop('reviewerName', None)
        d.pop('unixReviewTime', None)

        # overall: store the rating as an int
        d['overall'] = int(d['overall'])

        # style: flatten {'Format:': ...} into a plain 'format' field.
        # A style dict without a 'Format:' entry is treated as undeclared ("")
        # instead of raising KeyError.
        if 'style' in d:
            d['format'] = d.pop('style').get('Format:', "")
        else:
            d['format'] = ""

        # vote: missing means nobody voted on the review
        d.setdefault('vote', 0)

        # image: keep only the number of attached images
        if 'image' in d:
            d['image'] = len(d['image'])
        else:
            d['image'] = 0

        # text fields: missing text is represented by the empty string
        d.setdefault('reviewText', "")
        d.setdefault('summary', "")

        parsed_data.write(str(d)+'\n')

print('finished')

Analyzing original file
Finished parsing orinal file
CPU times: user 9.3 s, sys: 1.82 s, total: 11.1 s
Wall time: 11.2 s


In [18]:
%%time
print('Constructing train/valid dataset & test dataset...')

random.seed(0)

TRAIN_SIZE = 150000  # number of records kept in the train/valid set
MIN_COUNT = 4        # a record is moved to test only if its user and its item
                     # each still have at least this many train records afterwards

# Load the parsed reviews (one Python-literal dict per line) and count how
# often each user (us) and each item (ms) occurs.
# ast.literal_eval parses the dict literal without executing arbitrary code,
# unlike eval().
us = defaultdict(int)
ms = defaultdict(int)
train = []
with open("./pdata.json", 'rt', encoding="utf8") as f:
    for l in f:
        l = ast.literal_eval(l)
        train.append(l)
        us[l['reviewerID']] += 1
        ms[l['asin']] += 1

# Randomly move records from train to test until train has TRAIN_SIZE records.
# `misses` counts consecutive rejected draws; it only triggers the exhaustive
# check below and does not change the sequence of random draws, so the split
# for a given seed is the same as without the guard.
test = []
misses = 0
while len(train) > TRAIN_SIZE:
    i = random.randint(0, len(train) - 1)
    temp = train[i]
    if us[temp['reviewerID']] > MIN_COUNT and ms[temp['asin']] > MIN_COUNT:
        us[temp['reviewerID']] -= 1
        ms[temp['asin']] -= 1
        test.append(temp)
        del train[i]
        misses = 0
        continue

    misses += 1
    if misses >= len(train):
        # Many rejections in a row: verify that a movable record still exists.
        # Without this check the loop would never terminate once every
        # remaining record is pinned by the MIN_COUNT constraint.
        if not any(us[r['reviewerID']] > MIN_COUNT and ms[r['asin']] > MIN_COUNT
                   for r in train):
            print('warning: no more records can be moved to the test set; '
                  'train/valid set stays larger than %d' % TRAIN_SIZE)
            break
        misses = 0

# write to file (utf8 explicitly, matching how pdata.json is read above)
with open("train.json", 'w', encoding="utf8") as train_f:
    for d in train:
        train_f.write(str(d)+'\n')

with open("test.json", 'w', encoding="utf8") as test_f:
    for d in test:
        test_f.write(str(d)+'\n')

print('finished\n'
      'size of train/valid set: %d\t size of test set: %d' % (len(train), len(test)))

Constructing train/valid dataset & test dataset...
finished
size of train/valid set: 150000	 size of test set: 19781
CPU times: user 6.08 s, sys: 203 ms, total: 6.28 s
Wall time: 6.32 s
