In [None]:
import numpy as np
import pandas as pd

# MAP@K

In [None]:
def mean_ap(predictions, ground_truth, k=10):
    """Compute mean average precision at ``k`` (MAP@k) over all users.

    Args:
        predictions: array-like of shape (num_users, recommendations_num);
            row ``i`` holds the ranked recommendations for user ``i``.
            Each row must not contain duplicate items.
        ground_truth: array-like of shape (num_users, relevant_num);
            row ``i`` holds the items that are relevant for user ``i``.
        k: only the first ``k`` recommendations of every user are scored.

    Returns:
        The mean over users of AP@k as a float. Each user's AP is the sum of
        precision values at the ranks where a relevant item appears, divided
        by the number of ground-truth columns. Returns 0.0 when there are no
        users.
    """
    # Accept plain nested lists as well as numpy arrays.
    predictions = np.asarray(predictions)
    ground_truth = np.asarray(ground_truth)

    assert len(predictions) == len(ground_truth)
    if len(predictions) == 0:
        # Nothing to average over; avoid dividing by zero below.
        return 0.0
    assert all(len(set(prediction)) == len(prediction) for prediction in predictions)

    top_k = predictions[:, :k]
    num_relevant = ground_truth.shape[-1]

    ap_sum = 0.0  # deliberately not called `map`, which would shadow the builtin
    for user_predictions, user_ground_truth in zip(top_k, ground_truth):
        relevant = set(user_ground_truth)
        hits = 0
        ap = 0.0
        for rank, item in enumerate(user_predictions, start=1):
            if item in relevant:
                hits += 1
                # Precision at this rank, accumulated only on hits.
                ap += hits / rank
        ap_sum += ap / num_relevant

    return ap_sum / len(ground_truth)

In [None]:
# Sanity check on a single user: the relevant items 1, 9 and 8 sit at ranks 3, 7 and 8.
mean_ap(
    predictions=np.array([[3, 2, 1, 6, 5, 4, 9, 8, 7]]),
    ground_truth=np.array([[9, 1, 8]]),
)

# RMSE

RMSE is calculated with respect to the known ratings of movies that are not in the train set.

In [None]:
def rmse(predicted_ratings: np.ndarray, gt_ratings: np.ndarray):
    """Return the root mean squared error between predicted and true ratings.

    Args:
        predicted_ratings: array-like of predicted ratings, any shape.
        gt_ratings: array-like of ground-truth ratings with the same number
            of elements as ``predicted_ratings``.

    Returns:
        The RMSE over all elements as a numpy float.
    """
    # Convert to flat float arrays: this accepts plain lists and keeps the
    # subtraction from wrapping around on unsigned or narrow integer dtypes.
    predicted = np.asarray(predicted_ratings, dtype=float).reshape(-1)
    actual = np.asarray(gt_ratings, dtype=float).reshape(-1)
    return np.sqrt(np.mean((predicted - actual) ** 2))

In [None]:
# Every prediction (1) differs from its target (2) by exactly 1.
rmse(predicted_ratings=np.ones(3), gt_ratings=np.full(3, 2.0))

In [None]:
# Every prediction (4) differs from its target (1) by exactly 3.
rmse(predicted_ratings=np.full(3, 4.0), gt_ratings=np.ones(3))

# Top 100 popular baseline

In [None]:
# Read the header-less, tab-separated ratings file and give its four
# positional columns descriptive names.
data = (
    pd.read_csv('../data/raw/u.data', sep='\t', header=None)
    .rename(columns={0: 'user_id', 1: 'film_id', 2: 'rating', 3: 'time'})
)

In [None]:
users = pd.read_csv('../data/raw/u.user', sep='|', header=None, index_col=0)

# Split users by position: the first 80% of rows go to train, the rest to
# validation. Slicing positionally (rather than by index label) keeps the two
# sets disjoint and exhaustive even if user ids are not exactly 1..N.
num_train_users = int(0.8 * len(users))
train_users = set(users.index[:num_train_users].to_list())
val_users = set(users.index[num_train_users:].to_list())

# A rating belongs to the split of the user who gave it.
train = data[data.user_id.isin(train_users)]
val = data[data.user_id.isin(val_users)]

In [None]:
# Users present in both splits; the displayed set should be empty.
set(train.user_id) & set(val.user_id)

In [None]:
# Popularity of a movie = sum of the ratings it received from train users.
# The column is selected before aggregating: `.sum('rating')` would pass the
# string as the positional `numeric_only` flag instead of choosing a column.
movies_popularity = train.groupby('film_id')[['rating']].sum()

# Keep the ids of the 100 movies with the highest rating sum, best first.
most_popular_movies = (
    movies_popularity.sort_values(by='rating', ascending=False).index[:100].to_numpy()
)
most_popular_movies

In [None]:
# Popularity baseline: every validation user is recommended the movies that are
# most popular among train users, each with a constant predicted rating of 5.
is_popular = val.film_id.isin(most_popular_movies)
actual_found_ratings = val.loc[is_popular, 'rating'].to_numpy()
predicted_rating = np.full(len(actual_found_ratings), 5.0)
popular_rmse = rmse(predicted_rating, actual_found_ratings)
print(f'RMSE for popular movies recommendation = {popular_rmse}')

In [None]:
# Ground truth per validation user: the 10 films that user rated highest.
users_gt_favorite = np.array([
    user_records.sort_values('rating', ascending=False).film_id.values[:10]
    for _, user_records in val.groupby('user_id')
])

# Every user receives the same ranked list of popular movies.
predicted_recommendation = np.tile(most_popular_movies, (len(users_gt_favorite), 1))

for level in [10, 20, 50, 100]:
    score = mean_ap(predicted_recommendation, users_gt_favorite, level)
    print(f'MAP @{level:3.0f} for popular movies recommendation = {score}')

# Random 100 baseline

In [None]:
# Number of distinct films that appear anywhere in the ratings data.
num_movies = data['film_id'].unique().shape[0]

In [None]:
# Random baseline: recommend 100 distinct random movies to every validation
# user and score them with the same metrics as the popularity baseline.
rng = np.random.default_rng(seed=42)  # fixed seed so a full re-run reproduces the numbers
movie_ids = data.film_id.unique()  # sample real film ids rather than positions 0..num_movies-1

actual_found_ratings = []
random_recommendations = []

for user_id, user_record in val.groupby('user_id'):
    rec = rng.choice(movie_ids, size=100, replace=False)
    # Compare against this user's own ratings only: the recommendations are
    # drawn per user, so other users' ratings of these movies are irrelevant.
    found_ratings = user_record[user_record.film_id.isin(rec)].rating.to_numpy()

    actual_found_ratings.extend(found_ratings)
    random_recommendations.append(rec)

random_recommendations = np.array(random_recommendations)
actual_found_ratings = np.array(actual_found_ratings)

# The baseline predicts a constant rating of 5 for every recommended movie.
predicted_ratings = np.full(len(actual_found_ratings), 5.0)
predicted_rating = predicted_ratings  # same values; name kept in case later cells read it

print(f'RMSE for random movies recommendation = {rmse(predicted_ratings, actual_found_ratings)}')
for level in [10, 20, 50, 100]:
    print(f'MAP @{level:3.0f} for random movies recommendation = {mean_ap(random_recommendations, users_gt_favorite, level)}')