In [56]:
import numpy
import urllib
import scipy.optimize
import random
from sklearn import linear_model
from sklearn.metrics import confusion_matrix
import gzip
from collections import defaultdict

In [57]:
import warnings
warnings.filterwarnings("ignore")

In [58]:
def assertFloat(x):
    assert type(float(x)) == float

def assertFloatList(items, N):
    assert len(items) == N
    assert [type(float(x)) for x in items] == [float]*N

In [59]:
f = open("5year.arff", 'r')

In [60]:
# Read and parse the data
while not '@data' in f.readline():
    pass

dataset = []
for l in f:
    if '?' in l: # Missing entry
        continue
    l = l.split(',')
    values = [1] + [float(x) for x in l]
    values[-1] = values[-1] > 0 # Convert to bool
    dataset.append(values)

In [61]:
X = [d[:-1] for d in dataset]
y = [d[-1] for d in dataset]

In [62]:
answers = {} # Your answers

In [63]:
def accuracy(predictions, y):
    correct = sum (p == actual for p, actual in zip(predictions, y))
    return correct / len(y)

In [64]:
def BER(predictions, y):
    TN, FP, FN, TP = confusion_matrix(y, predictions).ravel()
    return 0.5 * (FP/(TN+FP) + FN/(TP+FN))

In [65]:
### Question 1

In [66]:
mod = linear_model.LogisticRegression(C=1)
mod.fit(X,y)

pred = mod.predict(X)

In [67]:
acc1 = accuracy(pred,y)
ber1 = BER(pred,y)

In [68]:
answers['Q1'] = [acc1, ber1] # Accuracy and balanced error rate

In [69]:
assertFloatList(answers['Q1'], 2)

In [70]:
### Question 2

In [71]:
mod = linear_model.LogisticRegression(C=1, class_weight='balanced')
mod.fit(X,y)

pred = mod.predict(X)

In [72]:
acc2 = accuracy(pred,y)
ber2 = BER(pred,y)

In [73]:
answers['Q2'] = [acc2, ber2]

In [74]:
assertFloatList(answers['Q2'], 2)

In [75]:
### Question 3

In [76]:
random.seed(3)
random.shuffle(dataset)

In [77]:
X = [d[:-1] for d in dataset]
y = [d[-1] for d in dataset]

In [78]:
Xtrain, Xvalid, Xtest = X[:len(X)//2], X[len(X)//2:(3*len(X))//4], X[(3*len(X))//4:]
ytrain, yvalid, ytest = y[:len(X)//2], y[len(X)//2:(3*len(X))//4], y[(3*len(X))//4:]

In [79]:
len(Xtrain), len(Xvalid), len(Xtest)

(1515, 758, 758)

In [80]:
mod = linear_model.LogisticRegression(C=1, class_weight='balanced')
mod.fit(Xtrain,ytrain)

pred = mod.predict(Xtrain)
berTrain = BER(pred, ytrain)

In [81]:
mod.fit(Xvalid, yvalid)
pred = mod.predict(Xvalid)
berValid = BER(pred, yvalid)

In [82]:
mod.fit(Xtest, ytest)
pred = mod.predict(Xtest)
berTest = BER(pred, ytest)

In [83]:
answers['Q3'] = [berTrain, berValid, berTest]

In [84]:
assertFloatList(answers['Q3'], 3)

answers['Q3']

[0.29287226079549855, 0.31782645215481037, 0.21056751467710372]

In [85]:
### Question 4

In [86]:
C_values = [10 ** i for i in range (-4, 5)] # from 10^-4 to 10^4
berList = []
for C in C_values:
    model = linear_model.LogisticRegression(C=C, class_weight = 'balanced')
    model.fit(Xtrain, ytrain)
    pred = model.predict(Xvalid)
    berList.append(BER(pred, yvalid))

In [87]:
answers['Q4'] = berList
berList

[0.32677521483491634,
 0.31931252826775214,
 0.32948891904115785,
 0.3233830845771144,
 0.3159203980099502,
 0.3111714156490276,
 0.2955030044582283,
 0.29618143050978873,
 0.29618143050978873]

In [88]:
assertFloatList(answers['Q4'], 9)

In [89]:
### Question 5

In [90]:
ber5 = min(berList)
bestC = C_values[berList.index(ber5)]


In [91]:
answers['Q5'] = [bestC, ber5]

answers['Q5']

[100, 0.2955030044582283]

In [92]:
assertFloatList(answers['Q5'], 2)

In [93]:
### Question 6

In [94]:
f = gzip.open("young_adult_10000.json.gz")
dataset = []
for l in f:
    dataset.append(eval(l))

In [95]:
dataTrain = dataset[:9000]
dataTest = dataset[9000:]
dataTrain[0]

{'user_id': '8842281e1d1347389f2ab93d60773d4d',
 'book_id': '2767052',
 'review_id': '248c011811e945eca861b5c31a549291',
 'rating': 5,
 'review_text': "I cracked and finally picked this up. Very enjoyable quick read - couldn't put it down - it was like crack. \n I'm a bit bothered by the lack of backstory of how Panem and the Hunger Games come about. It is just kind of explained away in a few paragraphs and we are left to accept this very strange world where teenagers are pitted into an arena each year to kill each other? I was expecting it because I've seen Battle Royale, but I would have appreciated knowing more of the backstory of how the world could have come into such a odd state. \n I suppose what makes a book like this interesting is thinking about the strategy of it all. The players are going to be statistically encouraged to band together because they will last longer that way, but by definition of course any partnership will be broken, and the drama of how that unfolds is alw

In [96]:
# Some data structures you might want

usersPerItem = defaultdict(set) # Maps an item to the users who rated it
itemsPerUser = defaultdict(set) # Maps a user to the items that they rated
reviewsPerUser = defaultdict(list)
reviewsPerItem = defaultdict(list)
ratingDict = {} # To retrieve a rating for a specific user/item pair

for d in dataTrain:
    user = d["user_id"]
    item = d["book_id"]
    rating = d["rating"]

    usersPerItem[item].add(user)
    itemsPerUser[user].add(item)
    reviewsPerUser[user].append(d)
    reviewsPerItem[item].append(d)

    ratingDict[(user, item)] = rating

In [97]:
def Jaccard(s1, s2):
    intersection = len(s1 & s2)
    union = len(s1 | s2)
    return intersection / union if union else 0

In [98]:
def mostSimilar(i, N):
    simlilarity = []

    # set of users who rated item i
    target_users = usersPerItem[i]

    for item, users in usersPerItem.items():
        if item != i:
            sim = Jaccard(target_users, users)
            simlilarity.append((sim, item))

    most_simiar_items = sorted(simlilarity, reverse=True)[:N]
    return most_simiar_items

In [99]:
answers['Q6'] = mostSimilar('2767052', 10)

In [100]:
assert len(answers['Q6']) == 10
assertFloatList([x[0] for x in answers['Q6']], 10)

In [101]:
### Question 7

In [102]:
average_ratings = {item: sum([review['rating'] for review in reviews]) / len(reviews) for item, reviews in reviewsPerItem.items()}

def predict_rating(user, item):
    if item not in average_ratings:
        return sum(average_ratings.values()) / len(average_ratings)
    numerator, denominator = 0, 0

    # Loop over items the user has rated
    for j in itemsPerUser[user] - {item}:
        if j in average_ratings:  # Only consider items with an average rating
            sim = Jaccard(usersPerItem[item], usersPerItem[j])
            numerator += (ratingDict[(user, j)] - average_ratings[j]) * sim
            denominator += sim
    
    # the final predicted rating
    if denominator == 0:
        return average_ratings[item]  # If no similar items, use item's average rating
    else:
        return average_ratings[item] + numerator / denominator
    

In [103]:
def MSE(data):
    mse = 0
    for d in data:
        user, item, actual_rating = d["user_id"], d["book_id"], d["rating"]
        predicted_rating = predict_rating(user, item)
        mse += (predicted_rating - actual_rating) ** 2
    return mse / len(data)

In [104]:
mse7 = MSE(dataTest)
answers['Q7'] = mse7

In [105]:
assertFloat(answers['Q7'])

In [106]:
### Question 8

In [107]:
average_ratings_user = {user: sum([review['rating'] for review in reviews]) / len(reviews) for user, reviews in reviewsPerUser.items()}

def predict_rating(u, i):
    if u not in average_ratings_user:
        return sum(average_ratings_user.values()) / len(average_ratings_user)
    
    numerator, denominator = 0, 0

    for v in usersPerItem[item] - {user}:
        if v in average_ratings_user:  # Only consider items with an average rating
            sim = Jaccard(itemsPerUser[user], itemsPerUser[v])
            numerator += (ratingDict[(v, item)] - average_ratings_user[v]) * sim
            denominator += sim
    
    # final calculation
    if denominator == 0:
        return average_ratings_user[user]  # If no similar items, use item's average rating
    else:
        return average_ratings_user[user] + numerator / denominator
    

In [108]:
mse8 = MSE(dataTest)
answers['Q8'] = mse8

In [109]:
assertFloat(answers['Q8'])

In [110]:
f = open("answers_hw2.txt", 'w')
f.write(str(answers) + '\n')
f.close()