In [158]:
"""Hanqing Zhao A14062115"""
import numpy as np
import urllib
import scipy.optimize
import random
from sklearn import linear_model
from sklearn.metrics import accuracy_score
import gzip
from collections import defaultdict

In [2]:
import warnings
# Silence every warning from here on: no category or module filter is given,
# so the "ignore" action applies to all of them.
warnings.filterwarnings("ignore")

In [3]:
def assertFloat(x):
    """Check that ``x`` can be converted to a float.

    ``float(x)`` itself raises (``TypeError`` / ``ValueError``) when the
    value cannot be converted; the assertion then confirms the converted
    value is a plain ``float``. Returns ``None``.
    """
    converted = float(x)
    assert type(converted) is float

def assertFloatList(items, N):
    """Check that ``items`` holds exactly ``N`` float-convertible values.

    Raises ``AssertionError`` when the length differs from ``N``; a value
    that ``float()`` cannot convert raises from ``float()`` itself.
    Returns ``None``.
    """
    assert len(items) == N
    # float() raises on the first value it cannot convert.
    assert all(type(float(value)) is float for value in items)

In [5]:
# Open the ARFF data file in text mode; the handle `f` is read by the parsing code that follows.
f = open("data/5year.arff", 'r')

In [6]:
# Read and parse the data
# Skip the ARFF header: everything up to and including the '@data' line.
for l in f:
    if '@data' in l:
        break
else:
    # Without this guard a file lacking the marker would loop forever,
    # because readline() keeps returning '' once EOF is reached.
    f.close()
    raise ValueError("no '@data' section found in the ARFF file")

dataset = []
for l in f:
    if '?' in l: # Missing entry
        continue
    if not l.strip(): # Blank line: nothing to parse
        continue
    l = l.split(',')
    # Prepend a constant 1 so the first feature acts as an offset term.
    values = [1] + [float(x) for x in l]
    values[-1] = values[-1] > 0 # Convert to bool
    dataset.append(values)

# The whole file has been consumed; release the handle.
f.close()

In [7]:
# Split every parsed row into its feature vector (all entries but the last,
# including the leading constant 1) and its boolean label (the last entry).
X = [d[:-1] for d in dataset]
y = [d[-1] for d in dataset]

In [8]:
# Results are collected here under keys such as 'Q1' and written to a file at the end.
answers = {} # Your answers

In [43]:
def accuracy(predictions, y):
    """Return the fraction of entries in ``predictions`` that equal ``y``.

    Both inputs are converted to NumPy arrays first, so plain Python lists
    work as well as arrays. (Comparing two lists with ``==`` produces a
    single bool, which cannot be summed element-wise.)

    Args:
        predictions: sequence of predicted labels.
        y: sequence of true labels, same shape as ``predictions``.

    Returns:
        The share of matching positions as a NumPy float.

    Raises:
        ValueError: if the two inputs do not have the same shape.
        ZeroDivisionError: if the inputs are empty.
    """
    predictions = np.asarray(predictions)
    y = np.asarray(y)
    if predictions.shape != y.shape:
        raise ValueError(
            "predictions and y must have the same shape, got %s and %s"
            % (predictions.shape, y.shape)
        )
    correct = predictions == y
    if correct.size == 0:
        raise ZeroDivisionError("accuracy is undefined for empty input")
    return correct.sum() / correct.size

In [44]:
def BER(predictions, y):
    """Balanced error rate of boolean ``predictions`` against labels ``y``.

    Computed as ``1 - 0.5 * (TPR + TNR)``, where TPR is the share of
    positive labels predicted positive and TNR the share of negative
    labels predicted negative. Both classes must occur in ``y`` for the
    two rates to be defined.
    """
    not_predicted = np.logical_not(predictions)
    not_labelled = np.logical_not(y)

    # Confusion-matrix counts.
    true_pos = sum(np.logical_and(predictions, y))
    false_pos = sum(np.logical_and(predictions, not_labelled))
    true_neg = sum(np.logical_and(not_predicted, not_labelled))
    false_neg = sum(np.logical_and(not_predicted, y))

    true_pos_rate = true_pos / (true_pos + false_neg)
    true_neg_rate = true_neg / (true_neg + false_pos)
    return 1 - 0.5*(true_pos_rate + true_neg_rate)
    

In [45]:
### Question 1

In [46]:
# Fit a logistic regression with C=1 on the full dataset.
mod = linear_model.LogisticRegression(C=1)
mod.fit(X,y)

# Predictions are made on the same data the model was fitted on.
pred = mod.predict(X)

# Accuracy and balanced error rate of those predictions.
acc1 = accuracy(pred, y)
ber1 = BER(pred, y)

In [48]:
# Q1: results of the model fitted without class weighting.
answers['Q1'] = [acc1, ber1] # Accuracy and balanced error rate

In [49]:
# Display the stored [accuracy, BER] pair for Q1.
answers['Q1']

[0.9656878917848895, 0.4766851431593464]

In [50]:
# Sanity check: Q1 holds exactly two float-convertible values.
assertFloatList(answers['Q1'], 2)

In [51]:
### Question 2

In [53]:
# Same setup as Question 1, but with class_weight='balanced'.
mod = linear_model.LogisticRegression(C=1, class_weight='balanced')
mod.fit(X,y)

# Predictions are again made on the data the model was fitted on.
pred = mod.predict(X)

# Accuracy and balanced error rate of the class-weighted model.
acc2 = accuracy(pred, y)
ber2 = BER(pred, y)

In [54]:
# Q2: [accuracy, balanced error rate] of the model fitted with class_weight='balanced'.
answers['Q2'] = [acc2, ber2]

In [55]:
# Sanity check: Q2 holds exactly two float-convertible values.
assertFloatList(answers['Q2'], 2)

In [56]:
### Question 3

In [57]:
# Seed the RNG so the shuffle order is repeatable, then shuffle the rows in place.
# NOTE(review): the shuffle mutates `dataset`, so running this cell a second time
# re-shuffles the already shuffled rows and generally gives a different order
# than a single run from a fresh kernel.
random.seed(3)
random.shuffle(dataset)

In [58]:
# Rebuild features and labels from the shuffled rows so X and y follow the new order.
X = [d[:-1] for d in dataset]
y = [d[-1] for d in dataset]

In [59]:
# Split by position: first half for training, next quarter for validation, last quarter for test.
# The label split reuses the boundaries computed from len(X), so both splits stay aligned.
Xtrain, Xvalid, Xtest = X[:len(X)//2], X[len(X)//2:(3*len(X))//4], X[(3*len(X))//4:]
ytrain, yvalid, ytest = y[:len(X)//2], y[len(X)//2:(3*len(X))//4], y[(3*len(X))//4:]

In [60]:
# Display the size of each split.
len(Xtrain), len(Xvalid), len(Xtest)

(1515, 758, 758)

In [63]:
# Fit the class-weighted model on the training split only, then measure the
# balanced error rate separately on the train, validation and test splits.
mod = linear_model.LogisticRegression(C=1, class_weight='balanced')
mod.fit(Xtrain,ytrain)

pred_train = mod.predict(Xtrain)
pred_valid = mod.predict(Xvalid)
pred_test = mod.predict(Xtest)

berTrain = BER(pred_train, ytrain)
berValid = BER(pred_valid, yvalid)
berTest = BER(pred_test, ytest)



In [64]:
# Q3: balanced error rates on the train, validation and test splits, in that order.
answers['Q3'] = [berTrain, berValid, berTest]

In [65]:
# Sanity check: Q3 holds exactly three float-convertible values.
assertFloatList(answers['Q3'], 3)

In [66]:
### Question 4

In [69]:
# Candidate values of C: the powers of ten from 10**-4 up to 10**4, in ascending order.
C_list = [10**exponent for exponent in range(-4, 5)]
C_list

[0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]

In [74]:
# Fit one class-weighted model per candidate C on the training split and
# record its balanced error rate on the validation split.
berList = []
for c in C_list:
    mod = linear_model.LogisticRegression(C=c, class_weight='balanced')
    mod.fit(Xtrain,ytrain)
    
    # Validation BER for this value of C.
    # Note: this rebinds `pred_valid` and `berValid`, names also used by the Question 3 cell.
    pred_valid = mod.predict(Xvalid)
    berValid = BER(pred_valid, yvalid)
    
    berList.append(berValid)

# Display the validation BERs, in the same order as C_list.
berList

[0.3281320669380371,
 0.31931252826775225,
 0.3281320669380371,
 0.3179556761646314,
 0.3159203980099503,
 0.3111714156490276,
 0.2955030044582283,
 0.29618143050978873,
 0.29618143050978873]

In [75]:
# Q4: validation BER for each value in C_list, in the same order.
answers['Q4'] = berList

In [76]:
# Sanity check: one float-convertible BER for each of the nine candidate values of C.
assertFloatList(answers['Q4'], 9)

In [73]:
### Question 5

In [77]:
# The smallest validation BER is the best; on ties the earliest entry of C_list wins.
bestC, ber5 = min(zip(C_list, berList), key=lambda pair: pair[1])

In [78]:
# Q5: the selected value of C and the validation BER it achieved.
answers['Q5'] = [bestC, ber5]

In [79]:
# Sanity check: Q5 holds exactly two float-convertible values.
assertFloatList(answers['Q5'], 2)

In [81]:
### Question 6

In [82]:
# Load the review records, one per line of the gzip file.
# NOTE(review): eval() executes whatever Python each line contains, so this is
# only acceptable for a trusted data file. If every line is a plain literal,
# ast.literal_eval would be the safer parser - confirm before switching.
dataset = []
# The context manager closes the file once all lines are read (or on error).
with gzip.open("young_adult_10000.json.gz") as f:
    for l in f:
        dataset.append(eval(l))

In [83]:
# The first 9000 records form the training split; everything after them is the test split.
dataTrain = dataset[:9000]
dataTest = dataset[9000:]

In [84]:
# Inspect the first training record to see which fields a review carries.
dataTrain[0]

{'user_id': '8842281e1d1347389f2ab93d60773d4d',
 'book_id': '2767052',
 'review_id': '248c011811e945eca861b5c31a549291',
 'rating': 5,
 'review_text': "I cracked and finally picked this up. Very enjoyable quick read - couldn't put it down - it was like crack. \n I'm a bit bothered by the lack of backstory of how Panem and the Hunger Games come about. It is just kind of explained away in a few paragraphs and we are left to accept this very strange world where teenagers are pitted into an arena each year to kill each other? I was expecting it because I've seen Battle Royale, but I would have appreciated knowing more of the backstory of how the world could have come into such a odd state. \n I suppose what makes a book like this interesting is thinking about the strategy of it all. The players are going to be statistically encouraged to band together because they will last longer that way, but by definition of course any partnership will be broken, and the drama of how that unfolds is alw

In [133]:
# Some data structures you might want
# All of them are built from the training split only.

usersPerItem = defaultdict(set) # Maps an item to the users who rated it
itemsPerUser = defaultdict(set) # Maps a user to the items that they rated
reviewsPerUser = defaultdict(list) # Maps a user to the full review records they wrote
reviewsPerItem = defaultdict(list) # Maps an item to the full review records written about it
ratingDict = {} # To retrieve a rating for a specific user/item pair

for d in dataTrain:
    user,item = d['user_id'], d['book_id']
    usersPerItem[item].add(user)
    itemsPerUser[user].add(item)
    reviewsPerUser[user].append(d)
    reviewsPerItem[item].append(d)
    # If the same user/item pair occurs more than once, the later rating overwrites the earlier one.
    ratingDict[(user,item)] = d['rating']


In [134]:
def Jaccard(s1, s2):
    """Jaccard similarity of two sets: ``|s1 & s2| / |s1 | s2|``.

    Args:
        s1: first set.
        s2: second set.

    Returns:
        A float in ``[0, 1]``. Two empty sets have an empty union; they are
        treated as having similarity 0.0 instead of dividing by zero.
    """
    denom = len(s1.union(s2))
    if denom == 0:
        # Both sets are empty: nothing is shared, so report no similarity.
        return 0.0
    numer = len(s1.intersection(s2))
    return numer/denom

In [135]:
def mostSimilar(i, N):
    """Return the ``N`` items most similar to item ``i``.

    Similarity is the Jaccard similarity between the set of users who rated
    ``i`` and the set of users who rated each other item in ``usersPerItem``.

    Args:
        i: id of the query item.
        N: number of results to return.

    Returns:
        A list of at most ``N`` ``(similarity, item_id)`` tuples, sorted in
        descending order (ties on similarity are broken by item id, also
        descending, because whole tuples are compared).
    """
    # .get() instead of indexing: usersPerItem is a defaultdict, and indexing it
    # with an unknown item would insert an empty user set as a side effect.
    users = usersPerItem.get(i, set())
    similarities = [
        (Jaccard(users, others), j)
        for j, others in usersPerItem.items()
        if j != i
    ]
    similarities.sort(reverse = True)
    return similarities[:N]

In [136]:
# Q6: the 10 items most similar to book '2767052', as (similarity, item id) pairs.
answers['Q6'] = mostSimilar('2767052', 10)

In [137]:
# Display the (similarity, item id) pairs stored for Q6.
answers['Q6']

[(0.4125, '6148028'),
 (0.3411764705882353, '7260188'),
 (0.1590909090909091, '256683'),
 (0.1375, '1162543'),
 (0.11494252873563218, '11735983'),
 (0.10989010989010989, '13335037'),
 (0.10810810810810811, '28187'),
 (0.10666666666666667, '428263'),
 (0.09876543209876543, '49041'),
 (0.09782608695652174, '41865')]

In [138]:
# Sanity check: ten results, each with a float-convertible similarity in first position.
assert len(answers['Q6']) == 10
assertFloatList([x[0] for x in answers['Q6']], 10)

In [140]:
### Question 7
# Mean training rating of every item, looked up through the users who rated it.
itemAverages = {}
for i in usersPerItem:
    rs = [ratingDict[(u,i)] for u in usersPerItem[i]]
    itemAverages[i] = sum(rs) / len(rs)

# Global mean of all training ratings, used as the fallback prediction.
ratingMean = sum([d['rating'] for d in dataTrain]) / len(dataTrain)

In [141]:
def predictRating(user,item):
    """Predict the rating ``user`` gives ``item`` from the user's other reviews.

    For every other item the user reviewed in the training data, the
    deviation of that rating from the item's average is weighted by the
    Jaccard similarity between the user sets of that item and ``item``.
    The weighted mean deviation is added to the average rating of ``item``.

    Args:
        user: id of the user.
        item: id of the item to predict a rating for.

    Returns:
        The predicted rating, or the global ``ratingMean`` when no reviewed
        item has a positive similarity to ``item``.
    """
    # .get() instead of indexing: usersPerItem and reviewsPerUser are
    # defaultdicts, so indexing them with an unseen user or item would insert
    # an empty entry and silently change the training indexes.
    item_users = usersPerItem.get(item, set())
    ratings = []
    similarities = []
    for d in reviewsPerUser.get(user, []):
        i2 = d['book_id']
        if i2 == item: continue
        ratings.append(d['rating'] - itemAverages[i2])
        # i2 comes from a training review, so its user set always exists.
        similarities.append(Jaccard(item_users, usersPerItem[i2]))
    totalSimilarity = sum(similarities)
    if totalSimilarity > 0:
        weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
        return itemAverages[item] + sum(weightedRatings) / totalSimilarity
    # No usable neighbour: fall back to the global mean rating.
    return ratingMean

In [142]:
def MSE(predictions, labels):
    """Mean squared error between paired ``predictions`` and ``labels``.

    The two sequences are paired with ``zip``, so the mean is taken over
    as many pairs as the shorter one provides.
    """
    squared_errors = []
    for predicted, actual in zip(predictions, labels):
        squared_errors.append((predicted - actual)**2)
    return sum(squared_errors) / len(squared_errors)

In [143]:
# Predict a rating for every test review and compare it with the true rating.
preds = [predictRating(d['user_id'], d['book_id']) for d in dataTest]
labels = [d['rating'] for d in dataTest]
mse7 = MSE(preds, labels)

In [144]:
# Q7: test-set mean squared error of predictRating.
answers['Q7'] = mse7

In [145]:
# Sanity check: Q7 is float-convertible.
assertFloat(answers['Q7'])

In [147]:
### Question 8

In [148]:
# Mean training rating given by every user, looked up through the items they rated.
userAverages = {}
for u in itemsPerUser:
    rs = [ratingDict[(u,i)] for i in itemsPerUser[u]]
    userAverages[u] = sum(rs) / len(rs)

In [149]:
def predictRating2(user,item):
    """Predict the rating ``user`` gives ``item`` from other users' reviews of it.

    User-based counterpart of ``predictRating``: for every other user who
    reviewed ``item`` in the training data, the deviation of that rating from
    the other user's average is weighted by the Jaccard similarity between the
    two users' item sets. The weighted mean deviation is added to the average
    rating of ``user``.

    Args:
        user: id of the user to predict for.
        item: id of the item.

    Returns:
        The predicted rating, or the global ``ratingMean`` when no other
        reviewer of ``item`` has a positive similarity to ``user``.
    """
    # .get() instead of indexing: itemsPerUser and reviewsPerItem are
    # defaultdicts, so indexing them with an unseen user or item would insert
    # an empty entry and silently change the training indexes.
    user_items = itemsPerUser.get(user, set())
    ratings = []
    similarities = []
    # Walk the reviews of this ITEM (reviewsPerItem); reviewsPerUser is keyed
    # by user id and never contains an item id.
    for d in reviewsPerItem.get(item, []):
        u2 = d['user_id']
        if u2 == user: continue
        # u2 is a user, so the baseline is that user's average rating.
        ratings.append(d['rating'] - userAverages[u2])
        similarities.append(Jaccard(user_items, itemsPerUser[u2]))
    totalSimilarity = sum(similarities)
    if totalSimilarity > 0:
        weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
        return userAverages[user] + sum(weightedRatings) / totalSimilarity
    # No usable neighbour: fall back to the global mean rating.
    return ratingMean

In [154]:
# Predict a rating for every test review with predictRating2 and compare it with the true rating.
preds2 = [predictRating2(d['user_id'], d['book_id']) for d in dataTest]
labels2 = [d['rating'] for d in dataTest]
mse8 = MSE(preds2, labels2)

In [155]:
# Q8: test-set mean squared error of predictRating2.
answers['Q8'] = mse8

In [156]:
# Sanity check: Q8 is float-convertible.
assertFloat(answers['Q8'])

In [157]:
# Write the collected answers as a single line of text.
# The context manager closes the file even if the write raises.
with open("answers_hw2.txt", 'w') as f:
    f.write(str(answers) + '\n')