In [1]:
import json
import time

In [2]:
# Parse the raw train/test splits straight from their file handles.
with open("training.json") as training_file:
    training = json.load(training_file)

with open("testing.json") as testing_file:
    testing = json.load(testing_file)

In [3]:
# Reshape each split from {user: [[item, rating], ...]} into a nested
# {user: {item: rating}} lookup. A user only gets an entry once at least one
# (item, rating) pair has been seen, and a repeated item keeps its last rating.
training_dict = {}
testing_dict = {}

for user, pairs in training.items():
    for item, rating in pairs:
        training_dict.setdefault(user, {})[item] = rating

for user, pairs in testing.items():
    for item, rating in pairs:
        testing_dict.setdefault(user, {})[item] = rating

# print(training_dict['A12R54MKO17TW0'])

In [4]:
def sub_row_mean(ratings_dict):
    """Return a mean-centred copy of a nested ``{user: {item: rating}}`` dict.

    For every user, the arithmetic mean of that user's ratings is subtracted
    from each of their ratings. The input mapping is not modified.

    Args:
        ratings_dict: Mapping of user -> mapping of item -> numeric rating.

    Returns:
        A new dict with the same users and items whose values are
        ``rating - mean(ratings of that user)``. A user with no ratings maps
        to an empty dict.
    """
    sub_row_mean_ratings_dict = {}
    for user, user_ratings in ratings_dict.items():
        if not user_ratings:
            # There is no mean to subtract for an empty row; keep the user
            # with an empty row rather than dividing by zero.
            sub_row_mean_ratings_dict[user] = {}
            continue

        avg = sum(user_ratings.values()) / len(user_ratings)
        sub_row_mean_ratings_dict[user] = {
            item: rating - avg for item, rating in user_ratings.items()
        }

    return sub_row_mean_ratings_dict
        
# training_dict2 = sub_row_mean(training_dict)
# print(training_dict2['A12R54MKO17TW0'])
# print(training_dict['A12R54MKO17TW0'])

In [5]:
import numpy as np
from numpy.linalg import norm
import math

def cosine_sim(ratings_dict, user):
    """Compute the cosine similarity between ``user`` and every user.

    Each user's ratings are treated as a sparse vector keyed by item; items a
    user has not rated contribute 0 to the dot product.

    Args:
        ratings_dict: Mapping of user -> mapping of item -> numeric rating.
        user: Key of the user to compare against everyone in ``ratings_dict``.
            A user with no entry in ``ratings_dict`` is treated as having no
            ratings, so every similarity comes out as 0.

    Returns:
        Dict mapping every key of ``ratings_dict`` (including ``user`` itself
        when present) to its cosine similarity with ``user``. The similarity
        is 0 when either vector has zero norm or the result is NaN.
    """
    # .get() so that a user without any stored ratings yields all-zero
    # similarities instead of a KeyError.
    user_ratings = ratings_dict.get(user, {})
    # The target user's norm does not depend on the other user, so compute it
    # once instead of on every loop iteration.
    user_norm = norm(np.array(list(user_ratings.values())))

    cosine_sim_val = {}
    for other_user, other_ratings in ratings_dict.items():
        dot_prod = sum(
            rating * other_ratings.get(item, 0)
            for item, rating in user_ratings.items()
        )
        other_user_norm = norm(np.array(list(other_ratings.values())))
        norm_mul = user_norm * other_user_norm

        if norm_mul == 0:
            # At least one of the two vectors is all zeros (or empty).
            cos_sim = 0
        else:
            cos_sim = dot_prod / norm_mul
        if math.isnan(cos_sim):
            cos_sim = 0

        cosine_sim_val[other_user] = cos_sim

    return cosine_sim_val

In [6]:
# k = 2
# item = "B0048P1XXQ"
# user = "A12R54MKO17TW0"
# cos_sims = cosine_sim(training_dict2, 'A12R54MKO17TW0')

def predict_item_rating_avg(ratings_dict, cos_similarities_dict, k, item, user):
    """Predict ``user``'s rating of ``item`` as a plain neighbour average.

    The neighbours are the (at most) ``k`` users with the highest similarity
    among those who have rated ``item``; ``user`` itself is never a neighbour.

    Args:
        ratings_dict: Mapping of user -> mapping of item -> rating. Must
            contain every key of ``cos_similarities_dict`` other than ``user``.
        cos_similarities_dict: Mapping of user -> similarity to ``user``.
            It is read only; ``user`` does not have to be present in it.
        k: Maximum number of neighbours to average over.
        item: Item whose rating is predicted.
        user: User the prediction is made for.

    Returns:
        The unweighted mean of the neighbours' ratings of ``item``, or 0 when
        no other user has rated ``item``.
    """
    # Build the candidate list without touching the caller's dict, skipping
    # the target user and everyone who has not rated the item.
    candidates = [
        (other_user, similarity)
        for other_user, similarity in cos_similarities_dict.items()
        if other_user != user and item in ratings_dict[other_user]
    ]
    list_cos_sim = sorted(candidates, key=lambda pair: pair[1], reverse=True)[:k]

    if not list_cos_sim:
        return 0

    avg_rating = 0
    for other_user, _ in list_cos_sim:
        avg_rating += ratings_dict[other_user][item]

    return avg_rating / len(list_cos_sim)

# training_dict["AAE2DUEMTR30I"][item] = 4.0
# print(predict_item_rating_avg(training_dict, cos_sims, k, item, user))
# print(training_dict["AAE2DUEMTR30I"])
# print(training_dict["AQ6J4B5WLGRJ5"])
    
    

In [7]:
# k = 2
# item = "B0048P1XXQ"
# user = "A12R54MKO17TW0"
# cos_sims = cosine_sim(training_dict2, 'A12R54MKO17TW0')

def predict_item_rating_sim(ratings_dict, cos_similarities_dict, k, item, user):
    """Predict ``user``'s rating of ``item`` as a similarity-weighted average.

    The neighbours are the (at most) ``k`` users with the highest similarity
    among those who have rated ``item``; ``user`` itself is never a neighbour.
    Each neighbour's rating is weighted by that neighbour's similarity.

    Args:
        ratings_dict: Mapping of user -> mapping of item -> rating. Must
            contain every key of ``cos_similarities_dict`` other than ``user``.
        cos_similarities_dict: Mapping of user -> similarity to ``user``.
            It is read only; ``user`` does not have to be present in it.
        k: Maximum number of neighbours to use.
        item: Item whose rating is predicted.
        user: User the prediction is made for.

    Returns:
        ``sum(rating * similarity) / sum(similarity)`` over the neighbours, or
        0 when there are no neighbours or the similarities sum to 0.
    """
    # Build the candidate list without touching the caller's dict, skipping
    # the target user and everyone who has not rated the item.
    candidates = [
        (other_user, similarity)
        for other_user, similarity in cos_similarities_dict.items()
        if other_user != user and item in ratings_dict[other_user]
    ]
    list_cos_sim = sorted(candidates, key=lambda pair: pair[1], reverse=True)[:k]

    rating = 0
    similarities = 0
    for other_user, similarity in list_cos_sim:
        rating += ratings_dict[other_user][item] * similarity
        similarities += similarity

    # NOTE(review): neighbours with negative similarity are not excluded, so
    # the denominator can be small or negative -- confirm this is intended.
    if similarities == 0:
        return 0
    return rating / similarities

# training_dict["AAE2DUEMTR30I"][item] = 5.0
# print(predict_item_rating_sim(training_dict, cos_sims, k, item, user))
# print(training_dict["AAE2DUEMTR30I"])
# print(training_dict["AQ6J4B5WLGRJ5"])
    
    

In [8]:
def predict(ratings_dict, user, items, num_neighbors=None):
    """Predict ``user``'s rating for each item with both neighbour schemes.

    Similarities are computed on the mean-centred ratings (``sub_row_mean``
    followed by ``cosine_sim``), while the predictions themselves combine the
    neighbours' raw ratings from ``ratings_dict``.

    Args:
        ratings_dict: Mapping of user -> mapping of item -> rating used both
            to find neighbours and to read their ratings.
        user: User the predictions are made for.
        items: Iterable of items to predict.
        num_neighbors: Maximum number of neighbours per prediction. When
            omitted, the notebook-level ``k`` is used.

    Returns:
        Dict mapping each item to a tuple
        ``(plain_average_prediction, similarity_weighted_prediction)``.
    """
    # Fall back to the notebook-level `k` so existing three-argument calls
    # keep working unchanged.
    neighbour_count = k if num_neighbors is None else num_neighbors

    sub_mean_dict = sub_row_mean(ratings_dict)
    cos_sim_dict = cosine_sim(sub_mean_dict, user)

    predicted_ratings_dict = {}
    for item in items:
        # Read neighbour ratings from the `ratings_dict` argument, not from a
        # notebook global. Each helper gets its own copy of the similarities
        # so neither call can affect the other.
        avg_rating_prediction = predict_item_rating_avg(
            ratings_dict, cos_sim_dict.copy(), neighbour_count, item, user
        )
        sim_rating_prediction = predict_item_rating_sim(
            ratings_dict, cos_sim_dict.copy(), neighbour_count, item, user
        )
        predicted_ratings_dict[item] = (avg_rating_prediction, sim_rating_prediction)

    return predicted_ratings_dict

In [20]:
### TEST ###
# Evaluate both prediction schemes on the held-out ratings and report
# (MAE, RMSE) for each, followed by the elapsed wall-clock time in seconds.

k = 5
# item = ["B0048P1XXQ"]
# user = "A12R54MKO17TW0"

# time.perf_counter() reads a clock. timeit.timeit() with no arguments instead
# benchmarks an empty statement, so differences of its results are not
# elapsed time.
start = time.perf_counter()

N = 0
MAE_avg = 0
RMSE_avg = 0
MAE_sim = 0
RMSE_sim = 0

for user, orig_vals in testing_dict.items():
    predicted_vals = predict(training_dict, user, list(orig_vals))

    for item, (avg_rat, sim_rat) in predicted_vals.items():
        corr_rat = orig_vals[item]
        MAE_avg += abs(avg_rat - corr_rat)
        MAE_sim += abs(sim_rat - corr_rat)
        # Accumulated as a sum of squared errors; the root is taken below.
        RMSE_avg += (avg_rat - corr_rat) ** 2
        RMSE_sim += (sim_rat - corr_rat) ** 2
        N += 1

if N == 0:
    # Fail with a clear message instead of a ZeroDivisionError below.
    raise ValueError("testing_dict contains no ratings to evaluate")

MAE_avg = MAE_avg / N
MAE_sim = MAE_sim / N
RMSE_avg = math.sqrt(RMSE_avg / N)
RMSE_sim = math.sqrt(RMSE_sim / N)

print((MAE_avg, RMSE_avg))
print((MAE_sim, RMSE_sim))

end = time.perf_counter()

print(end - start)

KeyboardInterrupt: 