In [1]:
import numpy as np
import pandas as pd

# MAP@K

In [2]:
def mean_ap(predictions, ground_truth, k=10):
    # search_results: np.array, (num_users, recommendations_num)
    # ground_truth: np.array, (num_users, recommendations_num)
    # mean of AP@k
    assert len(predictions) == len(ground_truth)
    assert all([len(set(prediction)) == len(prediction) for prediction in predictions])
    map = 0.0
    predictions = predictions[:, :k]
    for i in range(len(predictions)):
      ap = 0.0
      correct = 0
      relevant = set(ground_truth[i])

      for mj in range(predictions.shape[-1]):
        if predictions[i][mj] in relevant:
          correct += 1
          ap += correct / (mj + 1)
      map += ap / ground_truth.shape[-1]
    map /= ground_truth.shape[0]
    return map

In [3]:
mean_ap(np.array([[3, 2, 1, 6, 5, 4, 9, 8, 7]]), np.array([[9, 1, 8]]))

0.33134920634920634

# RMSE

Calculated wrt ratings on known movies which are not in the train set

In [4]:
def rmse(predicted_ratings: np.array, gt_ratings: np.array):
    return np.sqrt(np.mean((predicted_ratings.reshape(-1) - gt_ratings.reshape(-1)) ** 2))

In [5]:
rmse(np.ones(3), np.ones(3) * 2)

1.0

In [6]:
rmse(np.ones(3) * 4, np.ones(3))

3.0

# Top 100 popular baseline

In [7]:
data = pd.read_csv('../data/raw/u.data', sep='\t', header=None)
data = data.rename(columns={0: 'user_id', 1: 'film_id', 2: 'rating', 3: 'time'})

In [8]:
users = pd.read_csv('../data/raw/u.user', sep='|', header=None, index_col=0)
train_users = set(users.loc[:int(0.8 * len(users)), :].index.to_list())
val_users = set(users.loc[len(train_users) + 1:, :].index.to_list())

train = data[data.user_id.isin(train_users)]
val = data[data.user_id.isin(val_users)]

In [9]:
set(train.user_id).intersection(set(val.user_id))

set()

In [10]:
movies_popularity = train.drop(['user_id', 'time'], axis=1).groupby('film_id').sum('rating')
most_popular_movies = movies_popularity.sort_values(by='rating', ascending=False).index[:100]
most_popular_movies = np.array(most_popular_movies)
most_popular_movies

array([ 50, 100, 181, 258, 127, 286, 174,   1,  98, 288,  56, 300, 172,
         7, 294, 313, 121, 237, 117,  79, 204, 173, 318, 222,  64, 210,
       269, 168,  22,  69, 302,  12,  96, 423,   9, 183, 357, 191, 276,
       195,  15, 151, 216, 257,  89, 483, 176, 405, 234,  28, 202, 275,
       135,  25, 132, 194, 268, 197, 238,  97, 185, 328, 196, 186, 742,
       475, 748,  82, 603, 144,  11, 153,  70, 427, 496, 118, 655, 111,
       182, 265, 228,   8, 435, 333, 180, 282, 187, 179, 134, 272, 125,
       568, 211, 215, 515, 143, 508, 474, 175, 208], dtype=int64)

In [11]:
# To all users from the validation set recommend the most popular movies based on the users from train set
actual_found_ratings = val[val.film_id.isin(most_popular_movies)].rating.to_numpy()
predicted_rating = np.ones(len(actual_found_ratings)) * 5
print(f'RMSE for popular movies recommendation = {rmse(predicted_rating, actual_found_ratings)}')

RMSE for popular movies recommendation = 1.4808899951232035


In [12]:
users_gt_favorite = []
for user_id, user_records in val.groupby('user_id'):
    favorite_items = user_records.sort_values('rating', ascending=False).film_id.values
    users_gt_favorite.append(favorite_items[:10])

users_gt_favorite = np.array(users_gt_favorite)
predicted_recommendation = np.array([most_popular_movies for _ in range(len(users_gt_favorite))])

for level in [10, 20, 50, 100]:
    print(f'MAP @{level:3.0f} for popular movies recommendation = {mean_ap(predicted_recommendation, users_gt_favorite, level)}')

MAP @ 10 for popular movies recommendation = 0.05065738641135465
MAP @ 20 for popular movies recommendation = 0.06474643578856318
MAP @ 50 for popular movies recommendation = 0.07808367452858973
MAP @100 for popular movies recommendation = 0.08768122761177957


# Random 100 baseline

In [15]:
num_movies = len(data.film_id.unique())

In [18]:
actual_found_ratings = []
predicted_ratings = []
random_recommendations = []

for user_id, user_record in val.groupby('user_id'):
    rec = np.random.choice(range(num_movies), 100, replace=False)
    found_ratings = val[val.film_id.isin(rec)].rating.to_numpy()

    actual_found_ratings.extend(found_ratings)
    random_recommendations.append(rec)
    predicted_ratings.append(np.ones(len(found_ratings) * 5))

random_recommendations = np.array(random_recommendations)

actual_found_ratings = val[val.film_id.isin(most_popular_movies)].rating.to_numpy()
predicted_rating = np.ones(len(actual_found_ratings)) * 5
print(f'RMSE for popular movies recommendation = {rmse(predicted_rating, actual_found_ratings)}')
for level in [10, 20, 50, 100]:
    print(f'MAP @{level:3.0f} for popular movies recommendation = {mean_ap(random_recommendations, users_gt_favorite, level)}')

RMSE for popular movies recommendation = 1.4808899951232035
MAP @ 10 for popular movies recommendation = 0.001377551020408163
MAP @ 20 for popular movies recommendation = 0.0018662582969182807
MAP @ 50 for popular movies recommendation = 0.002517458910285687
MAP @100 for popular movies recommendation = 0.0030849650912511445
