# __Hybrid - merged collaborative filtering (CF) and content-based (CB) models__

##### Initial setup

In [None]:
%load_ext watermark
%load_ext autoreload

In [None]:
%autoreload 2
%watermark -v -n -m -p numpy,pandas,recmetrics,matplotlib

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import recmetrics
import seaborn as sns
import sys
import os
import pickle
from scipy.stats import spearmanr
from pathlib import Path
sys.path.append('../../../../')
from src.settings import DATA_DIR, PROJECT_DIR, RESULT_DIR

# Absolute path of the parent of the current working directory.
CURRENT_PATH = os.path.abspath(os.pardir)

# Echo the resolved locations so a wrong environment is spotted immediately.
for _location in (CURRENT_PATH, DATA_DIR, RESULT_DIR):
    print(_location)

In [None]:
import logging

logging.basicConfig(level=logging.INFO)

from src.data_processing.visualization.plot_utils import plot_line

In [None]:
# Directory that receives the figures saved by this notebook; created on demand.
REPORTS_HYBRID_DIR = Path(PROJECT_DIR).joinpath('reports', 'figures', 'sec5_hybrid')
REPORTS_HYBRID_DIR.mkdir(parents=True, exist_ok=True)
REPORTS_HYBRID_DIR

##### Load data

In [None]:
# All three CSV files live in the same 'compare_split' dataset directory.
_COMPARE_SPLIT_DIR = Path(DATA_DIR).joinpath('datasets', 'compare_split')
TEST_MOVIES_PATH = _COMPARE_SPLIT_DIR / 'movies_test_1k_users.csv'
TEST_RATINGS_PATH = _COMPARE_SPLIT_DIR / 'ratings_test_1k_users.csv'
TRAIN_RATINGS_PATH = _COMPARE_SPLIT_DIR / 'ratings_train_1k_users.csv'

In [None]:
test_ratings = pd.read_csv(TEST_RATINGS_PATH)

In [None]:
test_ratings.head()

In [None]:
test_ratings.info()

In [None]:
# Minimum rating for a test rating to count as relevant ground truth.
RELEVANT_RATE = 3.5
# Backward-compatible alias for the original, misspelled constant name.
RELEVANI_RATE = RELEVANT_RATE

# Keep only the relevant test ratings, without the timestamp column and with
# the index renumbered from 0.
relevant_ratings = (
    test_ratings.query(f'rating >={RELEVANT_RATE}')
    .drop(columns=['timestamp'])
    .reset_index(drop=True)
)

In [None]:
relevant_ratings.info()

##### Functions

In [None]:
def get_cf_cb_user_recommendation(df: pd.DataFrame, cb_model, user_id: int, pred_rate_col: str, rate_threshold=3.5, top_n=10,
                                  cb_candidates=13623):
    """Return up to ``top_n`` movie ids recommended to ``user_id`` by the CF + CB hybrid.

    The rows of ``df`` that belong to ``user_id`` and whose ``pred_rate_col``
    value is at least ``rate_threshold`` form the candidate set. The ranking
    returned by ``cb_model.get_recommendations_for_user`` is then restricted to
    those candidates, keeping the order in which the model returned them.

    Args:
        df: Ratings table with ``userId`` and ``movieId`` columns plus the
            column named by ``pred_rate_col``.
        cb_model: Object exposing ``get_recommendations_for_user(user_id, top=...)``
            that yields ``(title, movie_id, cosine)`` triples.
        user_id: User to build recommendations for.
        pred_rate_col: Name of the column holding the predicted ratings.
        rate_threshold: Minimum predicted rating for a movie to be a candidate.
        top_n: Maximum number of movie ids to return.
        cb_candidates: Number of recommendations requested from ``cb_model``.
            The default keeps the previously hard-coded value (presumably the
            catalogue size - confirm against the dataset).

    Returns:
        A list of at most ``top_n`` movie ids; empty when the user has no
        candidate movie (``cb_model`` is not queried in that case).
    """
    is_candidate = (df.userId == user_id) & (df[pred_rate_col] >= rate_threshold)
    # A set makes the membership test below O(1) per content-based recommendation.
    candidate_ids = set(df.loc[is_candidate, 'movieId'].tolist())
    if not candidate_ids:
        return []

    ranked = cb_model.get_recommendations_for_user(user_id, top=cb_candidates)
    hybrid_ids = [movie_id for (_title, movie_id, _cosine) in ranked if movie_id in candidate_ids]
    return hybrid_ids[:top_n]

## __Load models__

In [None]:
# Sub-directories of the results root used for logs, saved models and checkpoints.
_result_root = Path(RESULT_DIR)
LOGS_DIR, MODEL_DIR, CHECKPOINT_DIR = (
    _result_root / folder for folder in ('logs', 'models', 'checkpoints')
)

#### __FunkSVD__

In [None]:
# Locations of the saved FunkSVD model and of its pickled test-set results.
FUNK_MODEL_PATH = Path(MODEL_DIR).joinpath(
    'funk_svd', 'compare_common_split', '2020-06-01_14-44_FunkSVD-explicit-test-data.pkl'
)
FUNK_LOGS_DIR = Path(RESULT_DIR).joinpath('logs', 'funk_svd', 'compare_common_split')
FUNK_RESULTS_PATH = Path(FUNK_LOGS_DIR).joinpath('2020-06-01_14-44_FunkSVD_test_data_spearman.pkl')

In [None]:
# Load the stored FunkSVD results (the cell below reads its 'true_rates' and
# 'pred_rates' entries).
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load result files produced by this project.
with open(FUNK_RESULTS_PATH, 'rb') as f:
    funk_results = pickle.load(f)

In [None]:
# True and predicted ratings stored by the FunkSVD evaluation run. The
# predictions are later attached to test_ratings as a column, so they must
# line up row-for-row with the test ratings file.
funk_true_rates, funk_pred_rates = funk_results['true_rates'], funk_results['pred_rates']

#### __Tag model__

In [None]:
from src.models.content_based.tag_model import TagModel

In [None]:
tag_model = TagModel(ratings_path=TRAIN_RATINGS_PATH, movies_path=TEST_MOVIES_PATH, rate_threshold=3.5)

In [None]:
tag_model.preprocess_data()

#### __Text model__

In [None]:
from src.models.content_based.text_model import TextModel

In [None]:
text_model = TextModel(ratings_path=TRAIN_RATINGS_PATH, movies_path=TEST_MOVIES_PATH, rate_threshold=3.5)

In [None]:
text_model.preprocess_data()

## __Compare metrics__

In [None]:
TOP_N = 100

In [None]:
# One row per user: 'actual' is the list of distinct relevant movie ids
# (the ground truth used by the ranking metrics).
test_df_format = (
    relevant_ratings.copy()
    .groupby('userId')['movieId']
    .agg(actual=(lambda movie_ids: list(set(movie_ids))))
    .reset_index()
)

In [None]:
test_df_format.info()

In [None]:
test_df_format.head()

In [None]:
# Attach the FunkSVD predictions as a new column and discard the timestamp.
test_ratings = (
    test_ratings
    .assign(funk_pred_rate=funk_pred_rates)
    .drop(columns=['timestamp'])
)
test_ratings.head()

#### __Funk with tag model__

In [None]:
from tqdm import tqdm

# Hybrid FunkSVD + tag-model recommendations, one list per user, in the row
# order of test_df_format.
funk_tag_recs = [
    get_cf_cb_user_recommendation(test_ratings, tag_model, user, pred_rate_col='funk_pred_rate', top_n=TOP_N)
    for user in tqdm(test_df_format.userId.values, desc='Getting recommendations', total=len(test_df_format.index))
]

test_df_format['funk_tag_pred'] = funk_tag_recs

In [None]:
test_df_format.head()

#### __Funk with text model__

In [None]:
from tqdm import tqdm

# Hybrid FunkSVD + text-model recommendations, one list per user, in the row
# order of test_df_format.
funk_text_recs = [
    get_cf_cb_user_recommendation(test_ratings, text_model, user, pred_rate_col='funk_pred_rate', top_n=TOP_N)
    for user in tqdm(test_df_format.userId.values, desc='Getting recommendations', total=len(test_df_format.index))
]

test_df_format['funk_text_pred'] = funk_text_recs

In [None]:
test_df_format.head()

In [None]:
# Save the comparison table (ground truth plus both hybrid prediction columns)
# without the index. The list-valued columns come back as text when this file
# is read again, which is why they are parsed with ast.literal_eval further down.
test_df_format.to_csv(Path(DATA_DIR) / 'datasets' / 'hybrid_compare.csv', index=False)

In [None]:
# Plain Python lists (one entry per user) in the form the metric functions take.
actual = test_df_format['actual'].tolist()
funk_tag_predictions = test_df_format['funk_tag_pred'].tolist()
funk_text_predictions = test_df_format['funk_text_pred'].tolist()

#### Load the comparison dataframe (if it was saved previously)

In [None]:
test_df_format = pd.read_csv(Path(DATA_DIR) / 'datasets' / 'hybrid_compare.csv')

In [None]:
test_df_format.info()

In [None]:
test_df_format.head()

In [None]:
import ast

# Each cell of these columns holds a list literal stored as text; parse it and
# coerce every element to int.
funk_tag_predictions = [
    [int(movie_id) for movie_id in ast.literal_eval(raw_list)]
    for raw_list in test_df_format.funk_tag_pred.values.tolist()
]

funk_text_predictions = [
    [int(movie_id) for movie_id in ast.literal_eval(raw_list)]
    for raw_list in test_df_format.funk_text_pred.values.tolist()
]

actual = [
    [int(movie_id) for movie_id in ast.literal_eval(raw_list)]
    for raw_list in test_df_format.actual.values.tolist()
]

In [None]:
funk_tag_predictions[0]

### __MAP@K mean average precision at K__

In [None]:
from ml_metrics import mapk

In [None]:
# MAP@K of the FunkSVD-Tag hybrid for every K from 1 to TOP_N.
funk_tag_mapk = [mapk(actual, funk_tag_predictions, k=K) for K in np.arange(1, TOP_N+1)]
funk_tag_mapk

In [None]:
# MAP@K of the FunkSVD-Text hybrid for every K from 1 to TOP_N.
funk_text_mapk = [mapk(actual, funk_text_predictions, k=K) for K in np.arange(1, TOP_N+1)]
funk_text_mapk

In [None]:
# One column per hybrid model, indexed by K = 1..TOP_N.
k_axis = range(1, TOP_N+1)
mapk_df = pd.DataFrame(
    {'Hybrid FunkSVD-Tag': funk_tag_mapk, 'Hybrid FunkSVD-Text': funk_text_mapk},
    index=k_axis,
)

ax = plot_line(
    mapk_df,
    title='Mean Average Precision at K (MAP@K) comparison',
    ylabel='MAP@K',
    xlabel='K',
    linewidth=2.0,
    palette='tab10',
)
plt.xticks(k_axis)
ax

In [None]:
# Write the MAP@K figure into the hybrid reports directory.
fig = ax.get_figure()
map_figure_path = str(REPORTS_HYBRID_DIR / 'hybrid_map_at_k.png')
fig.savefig(map_figure_path, bbox_inches="tight")

### __MAR@K mean average recall at K__

In [None]:
funk_tag_mark = []
for K in np.arange(1, 11):
    funk_tag_mark.extend([recmetrics.mark(actual, funk_tag_predictions, k=K)])
funk_tag_mark

In [None]:
funk_text_mark = []
for K in np.arange(1, 11):
    funk_text_mark.extend([recmetrics.mark(actual, funk_text_predictions, k=K)])
funk_text_mark

In [None]:
# Derive the K axis from the number of MAR@K values actually computed instead
# of assuming TOP_N of them, so the index always matches the data.
mark_k_values = range(1, len(funk_tag_mark) + 1)
mark_df = pd.DataFrame(np.column_stack([funk_tag_mark, funk_text_mark]), mark_k_values,
                       columns=['Hybrid FunkSVD-Tag', 'Hybrid FunkSVD-Text'])

ax = plot_line(mark_df, title='Mean Average Recall at K (MAR@K) comparison', ylabel='MAR@K', xlabel='K',
               linewidth=2.0, palette='tab10')
plt.xticks(mark_k_values)
ax

In [None]:
# Write the MAR@K figure into the hybrid reports directory.
fig = ax.get_figure()
mar_figure_path = str(REPORTS_HYBRID_DIR / 'hybrid_mar_at_k.png')
fig.savefig(mar_figure_path, bbox_inches="tight")

### __Coverage__

In [None]:
test_movies = pd.read_csv(TEST_MOVIES_PATH)

In [None]:
# Number of leading recommendations per user that count towards coverage.
COVERAGE_TOP = 10

# Catalogue against which coverage is measured: every distinct test movie id.
all_movies = test_movies.movieId.unique().tolist()

# Coverage of each hybrid, computed on the first COVERAGE_TOP items per user.
funk_tag_coverage = recmetrics.prediction_coverage(
    [user_recs[:COVERAGE_TOP] for user_recs in funk_tag_predictions], all_movies
)
funk_text_coverage = recmetrics.prediction_coverage(
    [user_recs[:COVERAGE_TOP] for user_recs in funk_text_predictions], all_movies
)

In [None]:
# Create the figure for the coverage bar chart.
fig = plt.figure(figsize=(6, 6))
# The "darkgrid" seaborn style applies only inside this with-block.
with sns.axes_style("darkgrid"):
    # One bar per hybrid model; bar height is the coverage value computed above.
    ax = sns.barplot(x=['Hybrid FunkSVD-Tag', 'Hybrid FunkSVD-Text'], 
                     y=[funk_tag_coverage, funk_text_coverage], palette='tab10')
    ax.set_title(f'Test movies coverage in top {COVERAGE_TOP} recommendations', fontsize=12.0)
    ax.set_ylabel('coverage [%]')

In [None]:
# Write the coverage figure into the hybrid reports directory; the file name
# records the COVERAGE_TOP value used.
fig = ax.get_figure()
coverage_figure_path = str(REPORTS_HYBRID_DIR / f'hybrid_coverage_for_{COVERAGE_TOP}_top.png')
fig.savefig(coverage_figure_path, bbox_inches="tight")