# Imports

In [1]:
import sys

import numpy as np
from sklearn.metrics import mean_squared_error
from numpy import dot
from numpy.linalg import norm

sys.path.append("..")

from constants import U1BASE_PATH, U1TEST_PATH
from benchmark.data_tramsformation import convert_raw_to_matrix

# Data

In [2]:
train_ratings = convert_raw_to_matrix(U1BASE_PATH)
test_ratings = convert_raw_to_matrix(U1TEST_PATH)

# Model

In [44]:
np.random.random(10) * 5

array([4.01523187, 3.28051825, 4.59433714, 3.69621832, 1.88188565,
       2.45600504, 1.22541041, 0.64369547, 3.15491882, 3.46615595])

In [46]:
train_ratings.shape[1]

1682

In [65]:
class Model:
    def fit(self, ratings):
        pass

    def predict(self, user_id):
        pass


class RandomModel(Model):
    def __init__(self, max_score = 5) -> None:
        self.max_score = max_score
    
    def fit(self, ratings):
        self.shape = ratings.shape[1]

    def predict(self, user_id):
        prediction = np.random.random(self.shape) * self.max_score

        return prediction
    

class AvgModel(Model):
    def get_avg_film_ratings(self, film_id, ratings):
        all_ratings = ratings[:, film_id][ratings[:, film_id] != 0]
        if len(all_ratings) == 0:
            avg_rating = 0
        else:
            avg_rating = np.average(all_ratings)

        return avg_rating
    

    def fit(self, ratings):
        self.avg = [self.get_avg_film_ratings(film_id, ratings) for film_id in range(ratings.shape[1])]
        self.avg = np.array(self.avg)

    def predict(self, user_id):
        return self.avg


class ColaborativeModel(Model):
    cos_sim = lambda a, b: dot(a, b) / (norm(a) * norm(b))

    def __init__(self, num_users: int = 30) -> None:
        self.num_users = num_users

    def fit(self, ratings):
        self.ratings = ratings

    def predict(self, user_id):
        user_ratings = self.ratings[user_id]
        similarities = np.array([ColaborativeModel.cos_sim(user_ratings, rating) for rating in self.ratings])
        closest_users = list(reversed(np.argsort(similarities)))
        closest_users = closest_users[1:self.num_users + 1]
        
        closest_similarities = similarities[closest_users]
        sum_closest_similarities = sum(closest_similarities)

        closest_ratingss = self.ratings[closest_users]

        prediction = np.sum((closest_ratingss * closest_similarities[:, None]), axis=0) / sum_closest_similarities

        return prediction

In [81]:
def count_rmse(true, prediction):
    true_non_null = true != 0

    user_non_null = true[true_non_null]
    fitted_prediction = prediction[true_non_null]

    rmse = mean_squared_error(user_non_null, fitted_prediction, squared=False)

    return rmse

In [89]:
random_model = RandomModel()
random_model.fit(train_ratings)
random_model.predict(0)

array([3.00387523, 1.66690773, 3.11551852, ..., 3.92394483, 2.68972337,
       2.24981659])

In [90]:
avg_model = AvgModel()
avg_model.fit(train_ratings)
avg_model.predict(0)

array([3.89295039, 3.18095238, 3.        , ..., 2.        , 3.        ,
       3.        ])

In [91]:
colab_model = ColaborativeModel(num_users=30)
colab_model.fit(train_ratings)

pred = colab_model.predict(0)

count_rmse(test_ratings[0], pred)

2.352399447092248

In [96]:
def count_rmse_on_dataset(model, data):
    sum_rmse = 0
    num_predictions = 0

    for id, sample in enumerate(data):
        if sum(sample) == 0:
            continue

        pred = model.predict(id)
        rmse = count_rmse(sample, pred)

        sum_rmse += rmse
        num_predictions += 1

    rmse = sum_rmse / num_predictions

    return rmse

In [97]:
count_rmse_on_dataset(colab_model, test_ratings)

2.5203890678021796

In [92]:
count_rmse_on_dataset(random_model, test_ratings)

2.121163882524208

In [93]:
count_rmse_on_dataset(avg_model, test_ratings)

1.0091275841214649