In [None]:
import pandas as pd
from pandas import DataFrame
import numpy as np
from tqdm import tqdm

In [None]:
import sys
sys.path.append('../')

from src.utils import MovieEncoder, normalized_average_precision
from src.constants import MOVIE_PATH, RATINGS_PATH

In [None]:
# Load the raw ratings table from the project-configured CSV path.
ratings = pd.read_csv(RATINGS_PATH)
# Preview the first rows (head() defaults to 5).
ratings.head()

# Train/test split (each user's most recent rating is held out for testing)

In [None]:
# Rank every user's ratings from newest to oldest by timestamp, so rank 1 is
# the user's most recent rating. method='first' breaks timestamp ties by row
# order, which keeps ranks distinct within a user.
ratings['rank_latest'] = (
    ratings.groupby(['userId'])['timestamp']
           .rank(method='first', ascending=False)
)

# Leave-one-out split: the rank-1 row of each user goes to the test set,
# everything else to the training set. Only the columns still needed are
# kept (timestamp and the helper rank column are dropped).
kept_columns = ['userId', 'movieId', 'rating']
is_latest = ratings['rank_latest'] == 1

train_ratings = ratings.loc[~is_latest, kept_columns]
test_ratings = ratings.loc[is_latest, kept_columns]

# Dummy baseline

In [None]:
def calculate_top_movies(df: DataFrame) -> dict:
    """Rank movies by how many ratings they received.

    Args:
        df: Ratings table with at least the columns ``movieId`` and ``rating``.

    Returns:
        A dict mapping ``movieId`` to its number of non-null ratings, with
        entries inserted in descending order of that count (most-rated first).
    """
    # count() tallies non-null ratings per movie; it does not look at the
    # rating values themselves.
    ratings_per_movie = df.groupby('movieId')['rating'].count()
    ranked = ratings_per_movie.sort_values(ascending=False)
    # to_dict() keeps the sorted order, so iterating the result walks the ranking.
    return ranked.to_dict()

In [None]:
# Popularity ranking built from the training split only: movieId -> number of
# ratings, ordered from most-rated to least-rated.
top = calculate_top_movies(train_ratings)

In [None]:
# Project helper used below to turn a movie id into a printable title
# (presumably looked up from the movies CSV; see src.utils.MovieEncoder).
encoder = MovieEncoder(movie_csv_path=MOVIE_PATH)

# Show the 10 most-rated movies. `top` is already ordered most-rated first.
# The stop condition is checked BEFORE printing so that exactly 10 rows are
# shown (checking `idx >= 10` after the print emitted 11 rows).
for rank, (movieId, n_ratings) in enumerate(top.items(), start=1):
    if rank > 10:
        break
    # The value in `top` is a rating COUNT (see calculate_top_movies), not a
    # sum of ratings, so the label says "number of ratings".
    print(f"{encoder.to_title(movieId)} с количеством оценок {n_ratings}")

# Check Hit Ratio @ 10

In [None]:
# Every distinct movie id that appears anywhere in the ratings table.
all_movieIds = ratings['movieId'].unique()
# Movie ids in ranking order (most-rated first); iterating a dict yields its
# keys in insertion order.
top_movieIDs = list(top)

In [None]:
# Hit Ratio @ 10 for the popularity baseline.
#
# test_user_item_set holds (userId, target movieId) pairs, where the target is
# the movie the user actually rated in the held-out test split. The same
# top-10 list is recommended to every user; a recommendation counts as a hit
# when the user's target movie is in that list. The metric is the mean of
# the per-pair hit flags.

test_user_item_set = set(zip(test_ratings['userId'], test_ratings['movieId']))

# The top-10 list does not change between users, so build it once as a set
# instead of re-slicing and linearly scanning the list on every iteration.
top10_movie_ids = set(top_movieIDs[:10])

# 1 if the held-out movie is among the recommended top 10, else 0.
# The user id is not needed because the recommendation is identical for everyone.
hits = [
    int(movie_id in top10_movie_ids)
    for _, movie_id in tqdm(test_user_item_set)
]

print(f"The Hit Ratio @ 10 is {np.average(hits)}")