In [47]:
from collections import defaultdict
from sklearn import linear_model
import csv
import random
import array
import pandas as pd
import numpy as np
from zipfile import ZipFile

In [48]:
# read rating data
z = ZipFile('/Users/zezhilan/Desktop/fall quarter/cse 258/assignment 2/rating.csv.zip')
f = z.open('rating.csv')
rating_data = pd.read_csv(f)
f.close()
z.close()

In [49]:
# delete rows where user not gave their ratings
rating_data = rating_data.drop(rating_data[rating_data['rating'] == -1].index)
rating_data = rating_data.reset_index(drop = True)

In [50]:
rating_data

Unnamed: 0,user_id,anime_id,rating
0,1,8074,10
1,1,11617,10
2,1,11757,10
3,1,15451,10
4,2,11771,10
...,...,...,...
6337236,73515,16512,7
6337237,73515,17187,9
6337238,73515,22145,10
6337239,73516,790,9


In [51]:
user_id = rating_data['user_id'].to_list()
item_id = rating_data['anime_id'].to_list()
ratings = rating_data['rating'].to_list()

In [52]:
interactions = list(zip(user_id,item_id,ratings))

In [53]:
interactions[:5]

[(1, 8074, 10), (1, 11617, 10), (1, 11757, 10), (1, 15451, 10), (2, 11771, 10)]

In [54]:
random.shuffle(interactions)
len(interactions)

6337241

In [78]:
# build interactions
itemsPerUser = defaultdict(set)
usersPerItem = defaultdict(set)
ratingDict = {}
for u,i,r in interactions:
    itemsPerUser[u].add(i)
    usersPerItem[i].add(u)
    ratingDict[(u,i)] = r

In [79]:
ratingDict

{(5886, 30240): 7,
 (52391, 15809): 8,
 (44993, 20): 6,
 (29014, 856): 6,
 (10316, 13601): 9,
 (67661, 13161): 7,
 (24258, 12673): 7,
 (42455, 6747): 10,
 (20253, 11617): 7,
 (73414, 4872): 8,
 (65152, 23755): 9,
 (12169, 1535): 8,
 (38445, 8277): 7,
 (48066, 16668): 8,
 (46298, 11111): 10,
 (52423, 20767): 9,
 (73301, 2951): 10,
 (40257, 13333): 7,
 (51776, 3588): 8,
 (62130, 379): 8,
 (57811, 1709): 7,
 (19249, 3588): 7,
 (35011, 9936): 8,
 (11250, 2104): 9,
 (25966, 65): 9,
 (58802, 1): 8,
 (56760, 1535): 6,
 (23843, 996): 9,
 (53494, 3593): 7,
 (26753, 4063): 8,
 (3569, 1368): 7,
 (27706, 50): 7,
 (29080, 23283): 8,
 (20571, 28677): 7,
 (41714, 30458): 7,
 (60294, 343): 7,
 (6818, 12403): 9,
 (18091, 9917): 8,
 (29662, 4177): 9,
 (32947, 6707): 8,
 (46722, 9062): 10,
 (16572, 23283): 8,
 (62607, 28619): 7,
 (60991, 6479): 6,
 (62444, 10396): 8,
 (6483, 5150): 10,
 (5413, 11577): 9,
 (16342, 1562): 3,
 (47849, 10298): 6,
 (10419, 14833): 5,
 (65931, 194): 9,
 (65560, 21939): 9,
 (59

In [80]:
userAverages = {} # get average rating for a user
itemAverages = {} # get average rating for a anime

for u in itemsPerUser:
    rs = [ratingDict[(u,i)] for i in itemsPerUser[u]]
    userAverages[u] = sum(rs) / len(rs)
    
for i in usersPerItem:
    rs = [ratingDict[(u,i)] for u in usersPerItem[i]]
    itemAverages[i] = sum(rs) / len(rs)

In [81]:
# define Jaccard similarity function
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2)) 
    denom = len(s1.union(s2))
    return numer / denom

In [82]:
# define the function to calculate r(u, i)
def predictRating(user,item):
    ratings = []
    similarities = []
    for i2 in itemsPerUser[user]:
        if i2 == item: continue
        ratings.append(ratingDict[(user,i2)] - itemAverages[i2])
        similarities.append(Jaccard(usersPerItem[item],usersPerItem[i2]))
    if (sum(similarities) > 0):
        weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
        return itemAverages[item] + sum(weightedRatings) / sum(similarities)
    else:
        # User hasn't rated any similar items
        return itemAverages[item]

In [83]:
def MSE(predictions, labels):
    differences = [(x-y)**2 for x,y in zip(predictions,labels)]
    return sum(differences) / len(differences)

In [107]:
# convert data to pandas for random sample
interactions_pandas = pd.DataFrame(interactions)

In [108]:
# randomly sample 5000 rows data
interactions_pandas_sample = interactions_pandas.sample(n=5000,random_state=10) 

In [109]:
interactions_pandas_sample

Unnamed: 0,0,1,2
235760,16584,7054,9
3369321,32858,9471,7
2108322,22745,1818,6
3175772,271,10213,8
2886078,32568,7193,7
...,...,...,...
4200055,2951,4548,9
1210602,19459,3588,8
149377,24980,3901,8
3000062,70541,16918,8


In [110]:
X = interactions_pandas_sample.iloc[:,:2]
labels = interactions_pandas_sample.iloc[:,2]

In [111]:
X = X.values.tolist()
labels = labels.values.tolist()

In [112]:
# Prediction
simPredictions = [predictRating(d[0],d[1]) for d in X]

In [113]:
# MSE for 5000 sampled data
MSE(simPredictions, labels)

1.3560720085636675

# Check MSE for the situation that we only take overall average for prediction

In [114]:
all_average = sum(rating_data['rating'].tolist())/rating_data.shape[0]
all_average_predictions = [all_average]*5000
MSE(all_average_predictions, labels)

2.4594646020558573

# Check MSE for the situation we only take ITEM AVERAGE for prediction

In [115]:
# define the function to calculate r(u, i)
def predict_ITEM_AVERAGE(user,item):
    return itemAverages[item]

In [116]:
ITEM_AVERAGE_Predictions = [predict_ITEM_AVERAGE(d[0],d[1]) for d in x]
MSE(ITEM_AVERAGE_Predictions, labels)

2.027973685772231