In [34]:
from collections import defaultdict
import numpy as np
import random
import nltk
import string
import scipy
import scipy.optimize
import math
from sklearn import linear_model
from sklearn import metrics

# Goal:
Given (user, music, format(optional)) tuple, predict the rating that the user will give to the music.

# Part 0: Util Functions

In [2]:
# useful fields:
# |name       | possible value  | analysis
# "overall":    1 - 5 (int)
# "verified":   True / False      (Don't know meaning yet)
# "reviewerID": "A1SJL3JBBILJ66"
# "asin": "     B0018CGCR4"       (music ID)
# "format":     " MP3 Music"      86.44%
#               " Audio CD"       6.37%
#               "" (undeclared)   6.95%
#               " Vinyl"          .2%
#               (others)          <.04%
# "reviewText": "THANK YOU"       .09% users doesn't provide reviewText, indcicate as ""
# "summary":    "Five Stars"      .002% users doesn't provide summary, indcicate as ""
# "image":      0 (int)           .107% users provide image
#                                 indicate number of images provided in the review
# "vote":       0 (int)           4.48% reviewers are voted by others

In [3]:

# constrain: ensure each user/data appear at least 4 times in the training set
print('Building training/validation sets... ', end='')

f = open("./train.json", 'rt', encoding="utf8")

train = [eval(l) for l in f]
valid = train[140000:]
train = train[:140000]

f = open("./test.json", 'rt', encoding="utf8")
test = [eval(l) for l in f]

print('Finish\n'
      'Size of train      set: %d\n'
      'Size of validation set: %d\n'
      'Size of test       set: %d\n' % (len(train), len(valid), len(test)))

Building training/validation sets... Finish
Size of train      set: 140000
Size of validation set: 10000
Size of test       set: 19781



# Part 1.0: Naivest Baseline, Constant
Find the constant $\theta$ that mimize the MSE of the train set as the prediction.

In [4]:
X_t = np.ones((len(train), 1))
y_t = [d['overall'] for d in train]
X_v = np.ones((len(valid), 1))
y_v = [d['overall'] for d in valid]

mod = linear_model.LinearRegression()
mod.fit(X_t,y_t)

print(metrics.mean_squared_error(mod.predict(X_v), y_v))

0.49839595183673463


# Part 1.1 : Naive Solution 1, Global Average
Predict Value = $\theta_0$ + $\theta_1$ * (average rating that the user gives out) + $\theta_2$ * (average rating that the music receives)

Use linear regressor to optimize $\theta_0$, $\theta_1$ and $\theta_2$

In [5]:
# TODO
train[0]

{'overall': 5,
 'verified': True,
 'reviewerID': 'A1SJL3JBBILJ66',
 'asin': 'B0018CGCR4',
 'reviewText': 'THANK YOU',
 'summary': 'Five Stars',
 'format': ' MP3 Music',
 'vote': 0,
 'image': 0}

In [6]:
ratingsPerUser = defaultdict(list)
ratingsPerItem = defaultdict(list)
users = []
items = []
for d in train:
    r = int(d['overall'])
    ratingsPerUser[d['reviewerID']].append(r)
    ratingsPerItem[d['asin']].append(r)
    users.append(d['reviewerID'])
    items.append(d['asin'])

In [7]:
avgRatingsPerUser = defaultdict(int)
avgRatingsPerItem = defaultdict(int)
for u in ratingsPerUser:
    avgRatingsPerUser[u] = np.mean(ratingsPerUser[u])
for item in ratingsPerItem:
    avgRatingsPerItem[item] = np.mean(ratingsPerItem[item])

trainRatings = [int(d['overall']) for d in train]
globalAverage = sum(trainRatings) * 1.0 / len(trainRatings)

In [8]:
globalAverage

4.700542857142858

In [9]:
def feature(datum):
    feat = [1,avgRatingsPerUser[datum['reviewerID']],avgRatingsPerItem[datum['asin']]] 
    return feat
X_train = [feature(d) for d in train]
y_train = [int(d['overall']) for d in train]

In [10]:
# mod = linear_model.LogisticRegression(C=1.0)
mod = linear_model.LinearRegression()
_ = mod.fit(X_train,y_train)

In [11]:
X_valid = [feature(d) for d in valid]
y_valid = [int(d['overall']) for d in valid]
# valid_predictions = mod.predict(X_valid)
print(metrics.mean_squared_error(mod.predict(X_valid), y_valid))
# sum(valid_predictions == y_valid) /len(y2_test)

0.2983339575057852


# Part 1.2: Naive Solution 2, tf-idf
Use the similar approach in hw4:

Find tf-idf of reviews, then train a Ridge model to predict the rating. 

In [16]:
train[0]

{'overall': 5,
 'verified': True,
 'reviewerID': 'A1SJL3JBBILJ66',
 'asin': 'B0018CGCR4',
 'reviewText': 'THANK YOU',
 'summary': 'Five Stars',
 'format': ' MP3 Music',
 'vote': 0,
 'image': 0}

In [15]:
wordCount = defaultdict(int)
totalWords = 0

In [18]:
punct = string.punctuation

In [19]:
for d in train:
    t = d['reviewText']
    t = t.lower() # lowercase string
    t = [c for c in t if not (c in punct)] # non-punct characters
    t = ''.join(t) # convert back to string
    words = t.strip().split() # tokenizes
    for w in words:
        #w = stemmer.stem(w)
        totalWords += 1
        wordCount[w] += 1

In [23]:
counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()
len(counts)

76520

In [36]:
# Only choose most popular 3000 words to build the feature vector
N = 3000
words = [w[1] for w in counts[:N]]
wordId = dict(zip(words, range(len(words))))
wordSet = set(words)

In [37]:
wholeIdf = defaultdict(int)

In [38]:
for d in train:
    t = d['reviewText']
    t = t.lower() # lowercase string
    t = [c for c in t if not (c in punct)] # non-punct characters
    t = ''.join(t) # convert back to string
    words = t.strip().split() # tokenizes
    for w in words:
        if not (w in wordSet): continue
        wholeIdf[w] += 1

In [39]:
for key, value in wholeIdf.items():
    wholeIdf[key] = math.log(10000/value, 10)

In [40]:
def featureTfIdf(datum):
    feat = [0]*len(wordSet)
    t = datum['reviewText']
    t = t.lower() # lowercase string
    t = [c for c in t if not (c in punct)] # non-punct characters
    t = ''.join(t) # convert back to string
    words = t.strip().split() # tokenizes
    tf = defaultdict(int)
    for w in words:
        if not (w in wordSet): continue
        tf[w] += 1
    for w in words:
        if not (w in wordSet): continue
        feat[wordId[w]] = tf[w]*wholeIdf[w]
    feat.append(1)
    return feat

In [41]:
X = [featureTfIdf(d) for d in train]
y = [d['overall'] for d in train]

In [42]:
clf = linear_model.Ridge(1.0, fit_intercept=False) # MSE + 1.0 l2
clf.fit(X, y)
theta = clf.coef_
predictions = clf.predict(X)

In [43]:
mean_squared_error(y, predictions)

NameError: name 'mean_squared_error' is not defined

# Part 2: Solution
Sim users give high rating.


In [13]:
# TODO

# Part 3: Performance

In [14]:
#TODO

We can't prove that user will rate a music he'she never listened as the way he/she will rate a listened music.

For example, a user will only rate musics he/she like, so he/she rated every music 5 stars.

It is possible that our model will predict that the user will give high ratings to all musics.