# Rating Prediction, Baseline Model

In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict

%matplotlib inline
%config InlineBackend.figure_format="retina"

train_data = pd.read_csv("data/interactions_train.csv")
valid_data = pd.read_csv("data/interactions_validation.csv")

## Rating Prediction

In [2]:
train_data["user_id"] = train_data["user_id"].astype(str)
train_data["recipe_id"] = train_data["recipe_id"].astype(str)

In [3]:
# combine each user and item together
small_df = train_data[["user_id", "recipe_id", "rating"]]

rating_dict = {}
items_per_user = defaultdict(set)
users_per_item = defaultdict(set)

for index, row in small_df.iterrows():
    user = row["user_id"]
    item = row["recipe_id"]
    
    users_per_item[item].add(user)
    items_per_user[user].add(item)
    
    rating_dict[(user, item)] = row["rating"]

In [4]:
user_avgs = {}
item_avgs = {}

for u in items_per_user:
    ratings = [rating_dict[(u, i)] for i in items_per_user[u]]
    user_avgs[u] = sum(ratings) / len(ratings)
    
for i in users_per_item:
    ratings = [rating_dict[(u, i)] for u in users_per_item[i]]
    item_avgs[i] = sum(ratings) / len(ratings)

In [5]:
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    
    if denom == 0:
        return 0
    
    return numer / denom

def most_similar(i, num):
    sims = []
    users = users_per_item[i]
    
    for i2 in users_per_item:
        if i2 == i:
            continue
        sim = Jaccard(users, users_per_item[i2])
        sims.append((sim, i2))
    
    sims.sort(reverse=True)
    
    return sims[:num]

In [6]:
reviews_per_user = defaultdict(list)
reviews_per_item = defaultdict(list)

for index, row in small_df.iterrows():
    user = row["user_id"]
    item = row["recipe_id"]
    
    reviews_per_item[item].append(user)
    reviews_per_user[user].append(item)

rating_mean = train_data["rating"].sum() / len(train_data["rating"])

In [7]:
def predict_rating(user, item):
    ratings = []
    sims = []
    
    for i in reviews_per_user[user]:
        if i == item:
            continue
        ratings.append(rating_dict[(user, i)] - item_avgs[i])
        sims.append(Jaccard(users_per_item[item], users_per_item[i]))
        
    if (sum(sims) > 0):
        weighted_ratings = [(x * y) for x, y in zip(ratings, sims)]
        return item_avgs[item] + sum(weighted_ratings) / sum(sims)
    
    else:
        return rating_mean

## Baseline Model

In [8]:
from IPython.display import clear_output
y_pred = []

total = valid_data.shape[0]

for index, row in valid_data[["user_id", "recipe_id", "rating"]].iterrows():
    user = row["user_id"]
    item = row["recipe_id"]
    pred = predict_rating(user, item)
    y_pred.append(pred)
    if index % 10 == 0:
        clear_output(wait=True)
        print(f"Now at index: {index}, \n Progress: {round(index/total, 3)}")

Now at index: 7020, 
 Progress: 1.0


## Model Evaluation

In [9]:
y_true = valid_data["rating"]
from sklearn.metrics import mean_squared_error
mean_squared_error(y_true, y_pred)

1.8138061805801156