In [1]:
import json

with open("training.json") as training:
    training = json.loads(training.read())
    
with open("testing.json") as testing:
    testing = json.loads(testing.read())

In [2]:
training_dict = {}
testing_dict = {}

for user in training.keys():
    for [item, rating] in training[user]:
        if user in training_dict:
            training_dict[user][item] = rating
        else:
            training_dict[user] = {item: rating}
            
for user in testing.keys():
    for [item, rating] in testing[user]:
        if user in testing_dict:
            testing_dict[user][item] = rating
        else:
            testing_dict[user] = {item: rating}

# print(training_dict['A12R54MKO17TW0'])

In [3]:
def sub_row_mean(ratings_dict):
    sub_row_mean_ratings_dict = {}
    for user in ratings_dict:
        list_item = list(ratings_dict[user].values())
        avg = sum(list_item) / len(list_item)
        for item in ratings_dict[user].keys():
            if user in sub_row_mean_ratings_dict:
                sub_row_mean_ratings_dict[user][item] = ratings_dict[user][item] - avg
            else:
                sub_row_mean_ratings_dict[user] = {item: ratings_dict[user][item] - avg}
    
    return sub_row_mean_ratings_dict
        
# training_dict2 = sub_row_mean(training_dict)
# print(training_dict2['A12R54MKO17TW0'])
# print(training_dict['A12R54MKO17TW0'])

In [4]:
import numpy as np
from numpy.linalg import norm
import math

def cosine_sim(ratings_dict, user):
    cosine_sim_val = {}

    for other_user in ratings_dict.keys():
        dot_prod = sum(ratings_dict[user][item]*ratings_dict[other_user].get(item, 0) for item in ratings_dict[user])
        user_norm = norm(np.array([rating for rating in ratings_dict[user].values()]))
        other_user_norm = norm(np.array([rating for rating in ratings_dict[other_user].values()]))
        norm_mul = user_norm * other_user_norm
        if norm_mul == 0:
            cos_sim = 0
        else:
            cos_sim = dot_prod / norm_mul
        if math.isnan(cos_sim):
            cos_sim = 0
        cosine_sim_val[other_user] = cos_sim
    
    return cosine_sim_val

In [5]:
# k = 2
# item = "B0048P1XXQ"
# user = "A12R54MKO17TW0"
# cos_sims = cosine_sim(training_dict2, 'A12R54MKO17TW0')

def predict_item_rating_avg(ratings_dict, cos_similarities_dict, k, item, user):
    # df.dropna(subset = [item])
    # df = df[df.reviewerID != user]
    # new_df =  df.nlargest(k, "cosine_sim")
    # return new_df[item].mean(axis = 0)
    cos_similarities_dict.pop(user)
    other_users = list(cos_similarities_dict.keys())
    for other_user in other_users:
        if item not in ratings_dict[other_user]:
            cos_similarities_dict.pop(other_user)
    
    list_cos_sim = sorted(cos_similarities_dict.items(), key=lambda x: x[1], reverse=True)[:k]
    #print(list_cos_sim)
    avg_rating = 0
    
    if len(list_cos_sim) != 0:
        for pair in list_cos_sim:
            avg_rating += ratings_dict[pair[0]][item]
        
        avg_rating = avg_rating / len(list_cos_sim)
    
    return avg_rating

# training_dict["AAE2DUEMTR30I"][item] = 4.0
# print(predict_item_rating_avg(training_dict, training_dict2, cos_sims, k, item, user))
# print(training_dict["AAE2DUEMTR30I"])
# print(training_dict["AQ6J4B5WLGRJ5"])
    
    

In [6]:
# k = 2
# item = "B0048P1XXQ"
# user = "A12R54MKO17TW0"
# cos_sims = cosine_sim(training_dict2, 'A12R54MKO17TW0')

def predict_item_rating_sim(ratings_dict, cos_similarities_dict, k, item, user):
    # df.dropna(subset = [item])
    # df = df[df.reviewerID != user]
    # new_df =  df.nlargest(k, "cosine_sim")
    # return (new_df["cosine_sim"].dot(new_df[item]))/(new_df["cosine_sim"].sum())
    if user in cos_similarities_dict:
        cos_similarities_dict.pop(user)
    other_users = list(cos_similarities_dict.keys())
    for other_user in other_users:
        if item not in ratings_dict[other_user]:
            cos_similarities_dict.pop(other_user)
    
    list_cos_sim = sorted(cos_similarities_dict.items(), key=lambda x: x[1], reverse=True)[:k]
    rating = 0
    similarities = 0
    if len(list_cos_sim) != 0:
        for pair in list_cos_sim:
            rating += ratings_dict[pair[0]][item] * cos_similarities_dict[pair[0]]
            similarities += cos_similarities_dict[pair[0]]
        
        if similarities == 0:
            rating = 0
        else:
            rating = rating / similarities
    
    return rating

# training_dict["AAE2DUEMTR30I"][item] = 5.0
# print(predict_item_rating_sim(training_dict, training_dict2, cos_sims, k, item, user))
# print(training_dict["AAE2DUEMTR30I"])
# print(training_dict["AQ6J4B5WLGRJ5"])
    
    

In [7]:
def predict(ratings_dict, user, items):
    predicted_ratings_dict = {}
    sub_mean_dict = sub_row_mean(ratings_dict)
    cos_sim_dict = cosine_sim(sub_mean_dict, user)
    for item in items:
        avg_rating_prediction = predict_item_rating_avg(training_dict, cos_sim_dict.copy(), k, item, user)
        sim_rating_prediction = predict_item_rating_sim(training_dict, cos_sim_dict.copy(), k, item, user)
        
        predicted_ratings_dict[item] = (avg_rating_prediction, sim_rating_prediction)
        
    return predicted_ratings_dict

In [12]:
all_items = set()
for user,list_ratings in training_dict.items():
    all_items.update(list(training_dict[user].keys()))

for user,list_ratings in testing_dict.items():
    all_items.update(list(testing_dict[user].keys()))
#print(len(all_items))

In [14]:
import timeit

k = 10
# item = ["B0048P1XXQ"]
user = "A12R54MKO17TW0"

start = timeit.timeit()


#for user, list_ratings in testing_dict.items():
predicted_vals = predict(training_dict, user, list(all_items))

orig_vals = training_dict[user]

for item,rating in orig_vals.items():
    predicted_vals.pop(item)

recommended_items_ratings_avg = sorted(predicted_vals.items(), key=lambda x: x[1][0], reverse=True)[:k]
recommended_items_ratings_sim = sorted(predicted_vals.items(), key=lambda x: x[1][1], reverse=True)[:k]
print((user, recommended_items_ratings_avg))
print((user, recommended_items_ratings_sim))
    
    





end = timeit.timeit()

print(end - start) #=> 0.0011411383748054504

('A12R54MKO17TW0', [('B001O3CXDO', (5.0, 0)), ('B00W4EWLEI', (5.0, 0)), ('B00E88YTZC', (5.0, 0)), ('B00NUUHDTI', (5.0, 0)), ('B0172VW7XM', (5.0, 0)), ('B005AJ8A6C', (5.0, 0)), ('B00L9JDF6M', (5.0, 0)), ('B01BAXC1HU', (5.0, 0)), ('B00KLLKSQY', (5.0, 0)), ('B006OYUTLG', (5.0, 0))])
('A12R54MKO17TW0', [('B001NCUWMU', (4.5, 9.579117156879974)), ('B000TENID8', (4.75, 6.921559163804875)), ('B00973CHWA', (4.25, 6.784869075755961)), ('B000MRU1L4', (4.0, 5.701100871358665)), ('B000V6759W', (5.0, 5.000000000000002)), ('B0015FW0MK', (5.0, 5.000000000000001)), ('B0035XJF0A', (3.888888888888889, 5.000000000000001)), ('B00A8U014C', (4.8, 5.000000000000001)), ('B00123KDCY', (4.25, 5.000000000000001)), ('B00MD63DCQ', (5.0, 5.000000000000001))])
0.00011635478585958481


In [17]:
print(testing_dict[user])

{'0001388703': 5.0, 'B000VZJS84': 5.0, 'B000WLNUN6': 5.0, 'B001232RGE': 5.0, 'B00123KDR4': 5.0, 'B001BHWEAA': 3.0, 'B001FSB0C8': 5.0, 'B002R4K6AG': 3.0, 'B00382MONS': 5.0, 'B01929H4VM': 5.0, 'B0007ZWQNC': 5.0, 'B000E1B00O': 5.0, 'B000GWCIOS': 5.0, 'B000MRU1B4': 5.0, 'B001PJ5SX4': 4.0, 'B0048P1XXQ': 5.0}


In [19]:


recommended_items_ratings_avg = recommended_items_avg
recommended_items_ratings_sim = recommended_items_avg

orig_reviewed_items = testing_dict[user].keys()
recommended_items_avg = []#recommended_items_ratings_avg.keys()
recommended_items_sim = []#recommended_items_ratings_sim.keys()

for item in recommended_items_ratings_avg:
    recommended_items_avg.append(item[0])
    
for item in recommended_items_ratings_sim:
    recommended_items_sim.append(item[0])

same_avg_items = 0
same_sim_items = 0

for item in recommended_items_avg:
    if item in orig_reviewed_items:
        same_avg_items += 1
        
for item in recommended_items_sim:
    if item in orig_reviewed_items:
        same_sim_items += 1
        
precision_avg = (same_avg_items/k) *100
precision_sim = (same_sim_items/k) * 100

print(precision_avg)
print(precision_sim)


recall_avg = (same_avg_items/len(orig_reviewed_items)) *100
recall_sim = (same_sim_items/len(orig_reviewed_items)) * 100


print(recall_avg)
print(recall_sim)
if precision_avg+recall_avg == 0:
    f_score_avg = 0
else:
    f_score_avg = (2*precision_avg*recall_avg)/(precision_avg+recall_avg)
    
if precision_sim+recall_sim == 0:
    f_score_sim = 0
else:
    f_score_sim = (2*precision_sim*recall_sim)/(precision_sim+recall_sim)

print(f_score_avg)
print(f_score_sim)




0.0
0.0
0.0
0.0
0
0
