# __Content-based (CB) models comparison__

##### Initial setup

In [None]:
%load_ext watermark
%load_ext autoreload

In [None]:
%autoreload 2
%watermark -v -n -m -p numpy,pandas,recmetrics,matplotlib

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import recmetrics
import seaborn as sns
import sys
import os
import pickle
from scipy.stats import spearmanr
from pathlib import Path
sys.path.append('../../../../')
from src.settings import DATA_DIR, PROJECT_DIR, RESULT_DIR

# Absolute path of the parent of the current working directory, echoed together
# with DATA_DIR and RESULT_DIR from src.settings as a quick sanity check.
CURRENT_PATH = os.path.abspath(os.pardir)
print(CURRENT_PATH, DATA_DIR, RESULT_DIR, sep='\n')

In [None]:
import logging

logging.basicConfig(level=logging.INFO)

from src.data_processing.visualization.plot_utils import plot_line

In [None]:
# Folder that receives the figures saved by this notebook; it is created with
# any missing parents and left alone when it already exists.
REPORTS_CB_DIR = Path(PROJECT_DIR).joinpath('reports', 'figures', 'sec3_cb')
REPORTS_CB_DIR.mkdir(parents=True, exist_ok=True)
REPORTS_CB_DIR

##### Load data

In [None]:
# CSV inputs of the 1k-user comparison split, all under DATA_DIR/datasets/compare_split.
TEST_MOVIES_PATH = Path(DATA_DIR, 'datasets', 'compare_split', 'movies_test_1k_users.csv')
TEST_RATINGS_PATH = Path(DATA_DIR, 'datasets', 'compare_split', 'ratings_test_1k_users.csv')
TRAIN_RATINGS_PATH = Path(DATA_DIR, 'datasets', 'compare_split', 'ratings_train_1k_users.csv')

In [None]:
# Test-split ratings; the cells below read its `userId`, `movieId`, `rating`
# and `timestamp` columns.
test_ratings = pd.read_csv(TEST_RATINGS_PATH)

In [None]:
# Minimum rating for a test interaction to count as relevant ground truth.
RELEVANT_RATE = 3.5
# Backward-compatible alias: the constant was originally spelled with a typo.
RELEVANI_RATE = RELEVANT_RATE

# Keep only the relevant ratings, drop the timestamp column (not needed for the
# metrics) and renumber the rows from 0 after filtering.
relevant_ratings = (
    test_ratings
    .query(f'rating >={RELEVANT_RATE}')
    .drop(columns=['timestamp'])
    .reset_index(drop=True)
)

In [None]:
relevant_ratings.info()

### __Train models__

Tag model

In [None]:
from src.models.content_based.tag_model import TagModel

# Tag model fed with the training ratings and the comparison-split movie table.
# NOTE(review): rate_threshold repeats the 3.5 relevance cut-off applied to the
# test ratings — presumably intentional; confirm they are meant to stay in sync.
tag_model = TagModel(
    ratings_path=TRAIN_RATINGS_PATH,
    movies_path=TEST_MOVIES_PATH,
    rate_threshold=3.5,
)

In [None]:
tag_model.preprocess_data()

Text model

In [None]:
from src.models.content_based.text_model import TextModel

# Text model fed with the same training ratings and movie table as the tag model.
# NOTE(review): rate_threshold repeats the 3.5 relevance cut-off applied to the
# test ratings — presumably intentional; confirm they are meant to stay in sync.
text_model = TextModel(
    ratings_path=TRAIN_RATINGS_PATH,
    movies_path=TEST_MOVIES_PATH,
    rate_threshold=3.5,
)

In [None]:
text_model.preprocess_data()

### __Compare metrics__

In [None]:
TOP_N = 10

In [None]:
# One row per user: `actual` holds the de-duplicated ids of the movies that user
# rated as relevant in the test split. groupby does not modify its input, so no
# defensive copy of relevant_ratings is needed.
test_df_format = (
    relevant_ratings
    .groupby('userId')['movieId']
    .agg(actual=lambda movie_ids: list(set(movie_ids)))
    .reset_index()
)

In [None]:
test_df_format.head()

In [None]:
test_df_format.info()

In [None]:
def get_user_recommendation(model, user_id: int, top_n=10):
    """Return the movie ids a model recommends to one user.

    Args:
        model: Object exposing ``get_recommendations_for_user(user_id, top=...)``
            whose result iterates as ``(title, movie_id, cosine)`` triples.
        user_id: User to fetch recommendations for.
        top_n: Number of recommendations requested from the model and the
            maximum length of the returned list.

    Returns:
        At most ``top_n`` movie ids, in the order the model produced them.
    """
    ranked = model.get_recommendations_for_user(user_id, top=top_n)

    movie_ids = []
    for _title, movie_id, _cosine in ranked:
        movie_ids.append(movie_id)

    # Cap the result in case the model hands back more rows than requested.
    return movie_ids[:top_n]

In [None]:
from tqdm import tqdm

# Top-N tag-model recommendations for every test user, in the row order of
# test_df_format, stored next to the ground truth.
tag_recs = [
    get_user_recommendation(tag_model, user_id, top_n=TOP_N)
    for user_id in tqdm(
        test_df_format['userId'].values,
        desc='Getting recommendations',
        total=len(test_df_format),
    )
]

test_df_format['tag_pred'] = tag_recs

In [None]:
test_df_format.head()

In [None]:
from tqdm import tqdm

# Top-N text-model recommendations for every test user, in the row order of
# test_df_format, stored next to the ground truth.
text_recs = [
    get_user_recommendation(text_model, user_id, top_n=TOP_N)
    for user_id in tqdm(
        test_df_format['userId'].values,
        desc='Getting recommendations',
        total=len(test_df_format),
    )
]

test_df_format['text_pred'] = text_recs

In [None]:
test_df_format.head()

In [None]:
# Cache the per-user ground truth and both models' recommendations on disk.
# The list-valued columns end up as text in the CSV, which is why the cells
# that reload this file parse them back with ast.literal_eval.
test_df_format.to_csv(Path(DATA_DIR) / 'datasets' / 'tag_text_compare.csv', index=False)

In [None]:
# Ground truth and both models' recommendations as plain Python lists, one
# entry per row of test_df_format.
actual = test_df_format['actual'].tolist()
tag_predictions = test_df_format['tag_pred'].tolist()
text_predictions = test_df_format['text_pred'].tolist()

##### Load previously saved recommendations (alternative to recomputing them above)

In [None]:
test_df_format = pd.read_csv(Path(DATA_DIR) / 'datasets' / 'tag_text_compare.csv')

In [None]:
test_df_format.head()

In [None]:
import ast

def _parse_id_list(cell):
    """Convert one stored cell into a list of int movie ids.

    A cell read back from the CSV is the text form of a list (e.g. ``"[1, 2]"``)
    and is parsed with ``ast.literal_eval``. A cell that already holds a
    sequence is used as is, so this code also works on the in-memory frame
    instead of failing inside ``literal_eval``.
    """
    items = ast.literal_eval(cell) if isinstance(cell, str) else cell
    return [int(item) for item in items]


# Same parsing for all three list-valued columns, one entry per row of test_df_format.
actual = [_parse_id_list(cell) for cell in test_df_format['actual']]
tag_predictions = [_parse_id_list(cell) for cell in test_df_format['tag_pred']]
text_predictions = [_parse_id_list(cell) for cell in test_df_format['text_pred']]

#### __MAP@K (mean average precision at K)__

In [None]:
from ml_metrics import mapk

In [None]:
# MAP@K of the tag model for every cutoff K = 1..TOP_N.
tag_mapk = [mapk(actual, tag_predictions, k=cutoff) for cutoff in np.arange(1, TOP_N+1)]
tag_mapk

In [None]:
# MAP@K of the text model for every cutoff K = 1..TOP_N.
text_mapk = [mapk(actual, text_predictions, k=cutoff) for cutoff in np.arange(1, TOP_N+1)]
text_mapk

In [None]:
# One row per cutoff K = 1..TOP_N, one column per model.
mapk_df = pd.DataFrame(
    np.column_stack([tag_mapk, text_mapk]),
    index=range(1, TOP_N + 1),
    columns=['Tag model', 'Text model'],
)

ax = plot_line(mapk_df, title='Mean Average Precision at K (MAP@K) comparison', ylabel='MAP@K', xlabel='K',
               linewidth=3.0, palette='tab10')
# Tick every evaluated K; derived from TOP_N so the axis follows the data
# instead of a hard-coded 1..10.
plt.xticks(range(1, TOP_N + 1))
ax

In [None]:
# Write the MAP@K chart into the content-based report figures folder.
fig = ax.get_figure()
fig.savefig(str(REPORTS_CB_DIR / 'cb_map_at_k.png'), bbox_inches="tight")

#### __MAR@K (mean average recall at K)__

In [None]:
# MAR@K of the tag model for every cutoff K = 1..TOP_N. The upper bound comes
# from TOP_N (not a literal 11) so the result always has as many entries as
# the 1..TOP_N index used when the values are put into a DataFrame.
tag_mark = [recmetrics.mark(actual, tag_predictions, k=cutoff) for cutoff in np.arange(1, TOP_N + 1)]
tag_mark

In [None]:
# MAR@K of the text model for every cutoff K = 1..TOP_N. The upper bound comes
# from TOP_N (not a literal 11) so the result always has as many entries as
# the 1..TOP_N index used when the values are put into a DataFrame.
text_mark = [recmetrics.mark(actual, text_predictions, k=cutoff) for cutoff in np.arange(1, TOP_N + 1)]
text_mark

In [None]:
# One row per cutoff K = 1..TOP_N, one column per model.
mark_df = pd.DataFrame(
    np.column_stack([tag_mark, text_mark]),
    index=range(1, TOP_N + 1),
    columns=['Tag model', 'Text model'],
)

ax = plot_line(mark_df, title='Mean Average Recall at K (MAR@K) comparison', ylabel='MAR@K', xlabel='K',
               linewidth=3.0, palette='tab10')
# Tick every evaluated K; derived from TOP_N so the axis follows the data
# instead of a hard-coded 1..10.
plt.xticks(range(1, TOP_N + 1))
ax

In [None]:
# Write the MAR@K chart into the content-based report figures folder.
fig = ax.get_figure()
fig.savefig(str(REPORTS_CB_DIR / 'cb_mar_at_k.png'), bbox_inches="tight")

#### __Coverage__

In [None]:
test_movies = pd.read_csv(TEST_MOVIES_PATH)

In [None]:
# Number of leading recommendations per user that count towards coverage.
COVERAGE_TOP = 10

# Catalogue against which coverage is measured: every distinct test movie id.
all_movies = test_movies.movieId.unique().tolist()

# Coverage of each model, computed on the first COVERAGE_TOP recommendations
# of every user.
tag_coverage = recmetrics.prediction_coverage(
    [user_recs[:COVERAGE_TOP] for user_recs in tag_predictions],
    all_movies,
)
text_coverage = recmetrics.prediction_coverage(
    [user_recs[:COVERAGE_TOP] for user_recs in text_predictions],
    all_movies,
)

In [None]:
# Side-by-side bars with the coverage of both models.
fig = plt.figure(figsize=(6, 6))
with sns.axes_style("darkgrid"):
    ax = sns.barplot(
        x=['Tag model', 'Text model'],
        y=[tag_coverage, text_coverage],
        palette='tab10',
    )
    ax.set_ylabel('coverage [%]')
    ax.set_title(f'Test movies coverage in top {COVERAGE_TOP} recommendations', fontsize=12.0)

In [None]:
# Write the coverage chart into the content-based report figures folder; the
# file name records the COVERAGE_TOP cutoff it was computed for.
fig = ax.get_figure()
fig.savefig(str(REPORTS_CB_DIR / f'cb_coverage_for_{COVERAGE_TOP}_top.png'), bbox_inches="tight")