In [5]:
from collections import defaultdict
import scipy
import scipy.optimize
import numpy as np
import random

## Goal:
Given (user, music, format(optional)) tuple, predict the rating that the user will give to the music.

In [86]:
# useful fields:
# |name       | possible value  | analysis
# "overall":    1 - 5 (int)
# "verified":   True / False      (Don't know meaning yet)
# "reviewerID": "A1SJL3JBBILJ66"
# "asin": "     B0018CGCR4"       (music ID)
# "format":     " MP3 Music"      86.44%
#               " Audio CD"       6.37%
#               "" (undeclared)   6.95%
#               " Vinyl"          .2%
#               (others)          <.04%
# "reviewText": "THANK YOU"       .09% users doesn't provide reviewText, indcicate as ""
# "summary":    "Five Stars"      .002% users doesn't provide summary, indcicate as ""
# "image":      0 (int)           .107% users provide image
#                                 indicate number of images provided in the review
# "vote":       0 (int)           4.48% reviewers are voted by others

In [87]:
%%time
print('Reading training file')

f = open("./train.json", 'rt', encoding="utf8")

data = [f.readline()]
data = data.replace(', "verified": true, "', ', "verified": True, "')
data = data.replace(', "verified": false, "', ', "verified": False, "')
data = eval(data)

parsed_data = open("pdata.json", 'w')
for d in data:
    # unused fields
    d.pop('reviewTime', None)
    d.pop('reviewerName', None)
    d.pop('unixReviewTime', None)
    
    # overall
    d['overall'] = int(d['overall'])
    
    # style
    if 'style' in d:
        d['format'] = d['style']['Format:']
        d.pop('style', None)
    else:
        d['format'] = ""
    
    # vote
    if not 'vote' in d:
        _ += 1
        d['vote'] = 0

    # image
    if 'image' in d:
        d['image'] = len(d['image']) 
    else:
        _1 +=1
        d['image'] = 0
        
    if not 'reviewText' in d:
        d['reviewText'] = ""        

    if not 'summary' in d:
        d['summary'] = ""
    parsed_data.write(str(d))

parsed_data.close()
print('Finished parsing orinal file')

1
1
1
1
3
2
1
2
1
1
1
1
1
2
1
1
1
1
1
1
1
6
1
2
1
1
1
2
1
1
1
1
2
6
2
1
1
1
1
2
1
1
1
2
1
1
1
1
1
1
1
1
1
1
1
3
1
1
2
1
1
1
1
1
1
4
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
3
1
1
1
1
1
2
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
2
1
1
1
1
1
1
1
1
1
1
1
1
1
2
1
1
1
1
1
5
1
2
1
1
1
1
1
1
1
1
1
1
1
1
1
3
1
1
6
2
1
1
1
1
1
1
1
2
1
1
1
1
1
3
10
1
1
1
1
1
1
1
1
1


In [88]:
print('constructing train/valid dataset & test dataset ')

# constraint when we building the dataset:
# ensure each user/data appear at least 4 times
f = open("./pdata.json", 'rt', encoding="utf8")

us = defaultdict(int)
ms = defaultdict(int)
data = []
for l in f:
    data.append(l)
    us[l['reviewerID']] += 1
    ms[l['asin']] += 1

constructing train/valid dataset & test dataset 


TypeError: string indices must be integers

In [4]:
# we can't prove that user will rate a music he'she never listened as the way he/she will rate a listened music.
# For example, a user will only rate musics he/she like, so he/she rated every music 5 stars.
# It is possible that our model will predict that the user will give a high rating to all musics.

NameError: name 'true' is not defined

In [168]:


_i = 0

for line in f:
    u, b, r = line.strip().split(',')
    us.add(u)
    bs.add(b)
    r = int(r)
    if _i < 190000:
        train.append([u, b, r])
        train_u_b[u].add(b)
        train_b_u[b].add(u)
    elif _i < 200000:
        valid.append([u, b, r])
        valid_u_b[u].add(b)
    else:
        break
    _i += 1

# add the neg samples to valid set
for u, b, r in valid:
    while True:
        _ = random.sample(bs, 1)[0]
        if not (_ in train_u_b[u] and _ in valid_u_b[u]):
            valid_neg.append([u, _])
            break
v_size = len(valid_neg) + len(valid_neg)

# Problem 1: base solution

In [169]:
# problem 1: base solution
bookCount = defaultdict(int)
totalRead = 0

for u, b, _ in train:
    bookCount[b] += 1
    totalRead += 1

mostPopular = [(bookCount[x], x) for x in bookCount]
mostPopular.sort()
mostPopular.reverse()

In [170]:
base_read_set = set()
count = 0
for ic, i in mostPopular:
    if count < totalRead / 2:
        base_read_set.add(i)
    count += ic

base_read_pred = 0
for u, b, _ in valid:
    if b in base_read_set:
        base_read_pred += 1
for u, b in valid_neg:
    if not (b in base_read_set):
        base_read_pred += 1
print("Problem 1: Accuracy is: " + str(base_read_pred / v_size))

Problem 1: Accuracy is: 0.6454


# Problem 2: change threshold of base solution

In [171]:
better_read_set = set()
count = 0
for ic, i in mostPopular:
    if count < totalRead / 1.65:
        better_read_set.add(i)
    count += ic

better_read_pred = 0
for u, b, _ in valid:
    if b in better_read_set:
        better_read_pred += 1
for u, b in valid_neg:
    if not (b in better_read_set):
        better_read_pred += 1
print("Problem 2: With threshold of 1.65, Accuracy is: " + str(better_read_pred / v_size))

Problem 2: With threshold of 1.65, Accuracy is: 0.64965


# Problem3: 

In [172]:
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    return numer / denom

In [173]:
Jacc_threshold = 0.0277
Jacc_errs = 0

def Jacc_read(u, b, Jacc_threshold):
    s1 = train_u_b[u]
    for uu in train_b_u[b]:
        sim = Jaccard(s1, train_u_b[uu])
        if 1 > sim > Jacc_threshold:
            if b in train_u_b[uu]:
                return True
    return False
    
for u, b, _ in valid:
    if not Jacc_read(u, b, Jacc_threshold):
        Jacc_errs += 1

for u, b in valid_neg:
    if Jacc_read(u, b, Jacc_threshold):
        Jacc_errs += 1

print("Problem 3: With threshold of 0.0277, accuracy is: " + str(1- Jacc_errs / v_size))

Problem 3: With threshold of 0.0277, accuracy is: 0.642


# Problem4:

In [175]:
# Idea: based on the popularity of all similar users on a book, 
# determine if the user have read it or not.

def Jacc_pop_read1(u, b, t1=0.022, t2=1.27):
    s1 = train_u_b[u]
    sims = []
    uus = set()
    for bb in list(train_u_b[u]):
        uus = uus.union(train_b_u[bb])
    for uu in list(uus):
        s2 = train_u_b[uu]
        sim = Jaccard(s1, s2)
        if 1 > sim > t1:
            sims.append([sim, s2])
    _bookf = defaultdict(int)
    _f_sum = 0
    for _, uu in sims:
        for bb in uu:
            _bookf[bb] += _ 
            _f_sum += _
    _bookf = [(_bookf[_], _) for _ in _bookf]
    _bookf.sort()
    _bookf.reverse()
    
    _f_cnt = 0
    for f, bb in _bookf:
        if b == bb:
            return True
        _f_cnt += f
        if _f_cnt >= _f_sum / t2:
            return False

In [176]:
Jacc_pop_errs = 0
for u, b, _ in valid:
    if not Jacc_pop_read1(u, b):
        Jacc_pop_errs += 1
for u, b in valid_neg:
    if Jacc_pop_read1(u, b):
        Jacc_pop_errs += 1
print("Problem 4: With Jaccard threshold of 0.022, and popularity threshold of 1.27, \n"
      "Accuracy is: " + str(1 - Jacc_pop_errs / v_size))

Problem 4: With Jaccard threshold of 0.022, and popularity threshold of 1.27, 
Accuracy is: 0.6702


# Problem5:
## My kaggle username is Xincheng Shen

In [178]:
# generate file
predictions = open("predictions_Read.csv", 'w')
for l in open("./assignment1/pairs_Read.txt"):
    if l.startswith("userID"):
        # header
        predictions.write(l)
        continue
    u, b = l.strip().split('-')
    predictions.write( u + '-' + b + (",1\n" if Jacc_pop_read1(u, b) else ",0\n"))

predictions.close()

# Problem 6:

In [179]:
path = "./assignment1/train_Category.json.gz"
f = gzip.open(path, 'rt', encoding="utf8")
train2, valid2 = [], []
_ = 0
while True:
    if _ < 190000:
        train2.append(eval(f.readline()))
    elif _ < 200000:
        valid2.append(eval(f.readline()))
    else:
        break
    _ += 1

In [180]:
import nltk
import scipy.optimize
import string
from nltk.stem.porter import *

In [181]:
### Ignore capitalization and remove punctuation
### With stemming

wordCount = defaultdict(int)
punctuation = set(string.punctuation)
stemmer = PorterStemmer()
for d in train2:
    r = ''.join([c for c in d['review_text'].lower() if not c in punctuation])
    for w in r.split():
        w = stemmer.stem(w)
        wordCount[w] += 1

In [206]:
### Just take the most popular words...

counts = [(wordCount[w], w) for w in wordCount]
counts.sort()
counts.reverse()

words = [x[1] for x in counts[:1000]]
words_d = {words[i]:i for i in range(1000)}

print("Problem6: top frequent words are:")
for _ in range(10):
    print(counts[_])

Problem6: top frequent words are:
(1421439, 'the')
(858941, 'and')
(754130, 'a')
(716892, 'to')
(699884, 'i')
(622798, 'of')
(482010, 'it')
(420605, 'is')
(408740, 'in')
(370663, 'thi')


# Problem 7:

In [207]:
from sklearn import linear_model

In [208]:
def p2_extract_nD_features(dataset, dim):
    X, y, yid =[], [], []
    for d in dataset:
        _ = np.zeros(dim)
        r = ''.join([c for c in d['review_text'].lower() if not c in punctuation])
        for w in r.split():
            if w in words_d:
                _[words_d[w]] += 1
        X.append(_)
        if 'genreID' in d:
            y.append(d['genreID'])
        yid.append(d['user_id'] + '-' + d['review_id'] + ',')
    return np.array(X), np.array(y), yid

In [209]:
X, y, _ = p2_extract_nD_features(train2, 1000)
vX, vy, _ = p2_extract_nD_features(valid2, 1000)

In [210]:
mod = linear_model.LogisticRegression(C=0.001)
mod.fit(X, y)



LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [211]:
pred = mod.predict(vX)
_num_err = 0
for _ in range(len(vy)):
    if pred[_] != vy[_]:
        _num_err += 1
print('Problem 7: accuracy is: ' + str(1 - _num_err / len(vy)))

Problem 7: accuracy is: 0.5504


# Problem 8:

In [212]:
path = "./assignment1/test_Category.json.gz"
f = gzip.open(path, 'rt', encoding="utf8")
test2 = [eval(l)for l in f]

In [213]:
p8_dim = 8192
p8_C = 0.05
p8_iter = 500

words = [x[1] for x in counts[:p8_dim]]
words_d = {words[i]:i for i in range(p8_dim)}

X, y, _ = p2_extract_nD_features(train2, p8_dim)
vX, vy, _ = p2_extract_nD_features(valid2, p8_dim)
tX, ty, tid = p2_extract_nD_features(test2, p8_dim)

mod = linear_model.LogisticRegression(C=p8_C, max_iter=p8_iter)
mod.fit(X, y)



LogisticRegression(C=0.05, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [215]:
pred = mod.predict(vX)
_num_err = 0
for _ in range(len(vy)):
    if pred[_] != vy[_]:
        _num_err += 1
print('Problem 8: after tuning, accuracy is: ' + str(1 - _num_err / len(vy)))

Problem 8: after tuning, accuracy is: 0.6856


In [216]:
# generate file
pred = mod.predict(tX)

predictions = open("predictions_Rate.csv", 'w')
predictions.write('userID-reviewID,prediction\n')
for _ in range(len(pred)):
    predictions.write( tid[_] + str(pred[_])+'\n')

predictions.close()

## Again, my kaggle name is Xincheng Shen!