# __CF models comparison__

##### Initial setup

In [None]:
%load_ext watermark
%load_ext autoreload

In [None]:
%autoreload 2
%watermark -v -n -m -p numpy,pandas,recmetrics,matplotlib

In [None]:
import sys
import os
from pathlib import Path
sys.path.append('../../../../')
from src.settings import DATA_DIR, PROJECT_DIR, RESULT_DIR

# Absolute path of the parent of the current working directory.
CURRENT_PATH = os.path.abspath(os.pardir)
# Echo the resolved locations, one per line, as a quick configuration check.
print(CURRENT_PATH, DATA_DIR, RESULT_DIR, sep='\n')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import recmetrics
import seaborn as sns
import pickle

In [None]:
import logging

logging.basicConfig(level=logging.INFO)

from src.data_processing.visualization.plot_utils import plot_line

In [None]:
# Output directory for the figures saved by this notebook; created on demand.
REPORTS_CF_DIR = Path(PROJECT_DIR, 'reports', 'figures', 'sec4_cf')
REPORTS_CF_DIR.mkdir(parents=True, exist_ok=True)
REPORTS_CF_DIR

##### Load data

In [None]:
# All three CSVs live in the same "compare_split" dataset directory.
_COMPARE_SPLIT_DIR = Path(DATA_DIR, 'datasets', 'compare_split')

TEST_MOVIES_PATH = _COMPARE_SPLIT_DIR / 'movies_test_1k_users.csv'
TEST_RATINGS_PATH = _COMPARE_SPLIT_DIR / 'ratings_test_1k_users.csv'
TRAIN_RATINGS_PATH = _COMPARE_SPLIT_DIR / 'ratings_train_1k_users.csv'

In [None]:
# Load the test ratings CSV into a DataFrame.
test_ratings = pd.read_csv(TEST_RATINGS_PATH)

In [None]:
# Print a summary (columns, dtypes, non-null counts) of the test ratings.
test_ratings.info()

In [None]:
# Minimum rating for a test rating to count as "relevant" (a liked film).
RELEVANT_RATE = 3.5
# Backward-compatible alias for the original, misspelled constant name.
RELEVANI_RATE = RELEVANT_RATE

# Keep only the relevant test ratings, drop the timestamp column and
# renumber the rows from 0.
relevant_ratings = (
    test_ratings.query(f'rating >={RELEVANT_RATE}')
    .drop(columns=['timestamp'])
    .reset_index(drop=True)
)

In [None]:
# Print a summary (columns, dtypes, non-null counts) of the relevant ratings.
relevant_ratings.info()

##### Functions

In [None]:
def get_user_recommendation(df, user_id, sort_col, top_n=10):
    """Return one user's best-scored movie ids.

    Args:
        df: DataFrame with ``userId``, ``movieId`` and ``sort_col`` columns.
        user_id: Value matched against the ``userId`` column.
        sort_col: Name of the column holding the score to rank by.
        top_n: Maximum number of movie ids to return.

    Returns:
        List of ``movieId`` values for ``user_id``, ordered by ``sort_col``
        from highest to lowest and cut to at most ``top_n`` entries. The
        list is empty when the user has no rows in ``df``.
    """
    user_rows = df[df['userId'] == user_id]
    ranked = user_rows.sort_values(by=[sort_col], ascending=False)
    best_ids = ranked['movieId'].values[:top_n]
    return best_ids.tolist()

In [None]:
def get_knn_user_recommendation(model, user_id: int, top_n=10):
    """Return the movie ids the KNN model recommends for one user.

    Args:
        model: Object exposing ``get_recommendations_for_user(user_id, top)``
            that yields ``(title, movie_id, score)`` triples.
        user_id: User to get recommendations for.
        top_n: Number of recommendations requested from the model.

    Returns:
        List with the ``movie_id`` of every recommendation, in the order the
        model returned them.
    """
    recommendations = model.get_recommendations_for_user(user_id=user_id, top=top_n)
    movie_ids = []
    for _title, movie_id, _score in recommendations:
        movie_ids.append(movie_id)
    return movie_ids
    

### __Load models__

In [None]:
# Sub-directories of the results root used when loading models and logs.
_RESULTS_ROOT = Path(RESULT_DIR)

LOGS_DIR = _RESULTS_ROOT / 'logs'
MODEL_DIR = _RESULTS_ROOT / 'models'
CHECKPOINT_DIR = _RESULTS_ROOT / 'checkpoints'

#### __NeuMF (301 epochs)__

In [None]:
# Name of the NeuMF training run whose logged results are compared here.
MODEL_NAME = '2020-06-02_17-25_NeuFM_compare_split'
# Directory holding that run's logs.
NEUMF_RESULTS_PATH = Path(LOGS_DIR, 'neural_mf', MODEL_NAME)

In [None]:
# Load the pickled NeuMF evaluation results.
# NOTE: pickle executes arbitrary code on load - only use files produced by
# this project.
neumf_pickle_path = Path(NEUMF_RESULTS_PATH) / 'spearman.pkl'
with neumf_pickle_path.open('rb') as f:
    neumf_results = pickle.load(f)

In [None]:
# True ratings stored alongside the NeuMF predictions.
neumf_true_rates = neumf_results['true_rates']
# Flatten the predictions to 1-D and clamp them to [0.5, 5.0]
# (presumably the valid rating range - TODO confirm).
neumf_pred_rates = np.clip(neumf_results['pred_rates'].flatten(), 0.5, 5.0)

#### __FunkSVD (13 epochs)__

In [None]:
# Pickled FunkSVD model trained on the common compare split.
FUNK_MODEL_PATH = Path(
    MODEL_DIR, 'funk_svd', 'compare_common_split',
    '2020-06-01_14-44_FunkSVD-explicit-test-data.pkl',
)
# Log directory of the FunkSVD runs and the pickled test-data results in it.
FUNK_LOGS_DIR = Path(RESULT_DIR, 'logs', 'funk_svd', 'compare_common_split')
FUNK_RESULTS_PATH = FUNK_LOGS_DIR / '2020-06-01_14-44_FunkSVD_test_data_spearman.pkl'

In [None]:
# Load the pickled FunkSVD evaluation results.
# NOTE: pickle executes arbitrary code on load - only use files produced by
# this project.
with Path(FUNK_RESULTS_PATH).open('rb') as f:
    funk_results = pickle.load(f)

In [None]:
# True and predicted ratings stored in the FunkSVD results.
# NOTE(review): unlike the NeuMF predictions, these are used without
# flattening or clipping - confirm that is intended.
funk_true_rates = funk_results['true_rates']
funk_pred_rates = funk_results['pred_rates']

#### __KNN__

In [None]:
from src.models.collaborative_filtering.knn import KNNModel

In [None]:
# Build the KNN recommender from the training ratings and the test movies,
# using the cosine metric.
knn_model = KNNModel(
    ratings_path=TRAIN_RATINGS_PATH,
    movies_path=TEST_MOVIES_PATH,
    ratings_threshold=3.5,
    metric='cosine',
)

In [None]:
# Run the model's own preprocessing step (defined in KNNModel).
knn_model.preprocess_data()

In [None]:
# Spot check: show the recommendations for user 2.
knn_model.get_recommendations_for_user(user_id=2)

In [None]:
# Spot check: show the recommendations for a given movie title.
knn_model.get_recommendations_for_movie('The Shawshank Redemption')

In [None]:
# Spot check: show the recommendations for a second movie title.
knn_model.get_recommendations_for_movie('Iron Man')

## __Compare metrics__

In [None]:
# Number of recommendations collected per user for every model.
TOP_N = 100
# Largest cutoff K used when computing MAP@K and MAR@K.
METRIC_TOP_N = 10

Reshape the test dataframe of relevant items so that each user has a __list__ of the films they actually liked (rating >= 3.5)

In [None]:
# Collapse the relevant ratings to one row per user: `actual` holds the
# de-duplicated list of movieIds that user rated as relevant.
# NOTE: the list is built from a set, so the order of ids within it is arbitrary.
test_df_format = relevant_ratings.copy().groupby('userId')['movieId'].agg(actual=(lambda x: list(set(x)))).reset_index()

In [None]:
# Preview the first rows of the per-user dataframe.
test_df_format.head()

In [None]:
# Attach both models' predicted ratings as new columns of the test ratings.
test_ratings = test_ratings.assign(
    funk_pred_rate=funk_pred_rates,
    neumf_pred_rate=neumf_pred_rates,
)
test_ratings.head()

In [None]:
from tqdm import tqdm
from scipy.stats import spearmanr

def get_spearman_for_knn(df: pd.DataFrame, model, top: int = 13623):
    """Compute Spearman's rank correlation between true ratings and KNN scores.

    For every user in ``df`` the model is asked for its ``top``
    recommendations. Only recommendations for movies the user has a rating
    for in ``df`` are kept, and each kept score is paired with that rating.

    Args:
        df: Ratings with ``userId``, ``movieId`` and ``rating`` columns.
        model: Object exposing ``get_recommendations_for_user(user_id, top=...)``
            that yields ``(title, movie_id, score)`` triples.
        top: Number of recommendations requested per user. The default keeps
            the previously hard-coded value (presumably the catalogue size -
            TODO confirm).

    Returns:
        ``(correlation, p_value)`` from ``scipy.stats.spearmanr``, or
        ``(nan, nan)`` when fewer than two (rating, score) pairs were found.

    Note:
        The third element of each recommendation is used as the predicted
        score as-is. If the model returns a distance rather than a
        similarity, the sign of the correlation is inverted - verify against
        the model.
    """
    unique_users = np.unique(df['userId'].values)

    true_rates = []
    pred_scores = []
    for user_id in tqdm(unique_users, desc='Getting recommendations for users', total=len(unique_users)):
        user_df = df[df['userId'] == user_id]
        # One rating per movie gives O(1) lookups and keeps ratings and scores
        # aligned even if a (user, movie) pair appears more than once.
        rating_by_movie = dict(zip(user_df['movieId'].tolist(), user_df['rating'].tolist()))

        for _title, movie_id, score in model.get_recommendations_for_user(user_id, top=top):
            if movie_id in rating_by_movie:
                true_rates.append(rating_by_movie[movie_id])
                # Only the score is correlated; the movie id must not be
                # passed to spearmanr as a second variable.
                pred_scores.append(score)

    # A rank correlation is undefined for fewer than two observations.
    if len(true_rates) < 2:
        return float('nan'), float('nan')

    spearman, p_val = spearmanr(true_rates, pred_scores)

    return spearman, p_val

In [None]:
# Run the KNN Spearman evaluation on the test ratings.
spearman, p_val = get_spearman_for_knn(test_ratings, knn_model)

In [None]:
# Display the correlation returned by the evaluation above.
spearman

For each model, add the list of recommendations computed over all films in the test data

In [None]:
# Per-user FunkSVD recommendations: the TOP_N test films with the highest
# predicted rating, in the same user order as test_df_format.
funk_recs = [
    get_user_recommendation(test_ratings, user_id, sort_col='funk_pred_rate', top_n=TOP_N)
    for user_id in test_df_format.userId.values
]

test_df_format['funk_pred'] = funk_recs

In [None]:
# Per-user NeuMF recommendations: the TOP_N test films with the highest
# predicted rating, in the same user order as test_df_format.
neumf_recs = [
    get_user_recommendation(test_ratings, user_id, sort_col='neumf_pred_rate', top_n=TOP_N)
    for user_id in test_df_format.userId.values
]

test_df_format['neumf_pred'] = neumf_recs

In [None]:
from tqdm import tqdm

# Per-user KNN recommendations (TOP_N each), with a progress bar because the
# model is queried once per user.
user_progress = tqdm(
    test_df_format.userId.values,
    desc='Getting recommendations',
    total=len(test_df_format.index),
)
knn_recs = [
    get_knn_user_recommendation(knn_model, user_id, top_n=TOP_N)
    for user_id in user_progress
]

test_df_format['knn_pred'] = knn_recs

In [None]:
# Preview the dataframe with the recommendation columns added.
test_df_format.head()

In [None]:
# Save the comparison dataframe so the recommendations need not be recomputed.
# The list-valued columns are written as text and must be parsed on reload.
test_df_format.to_csv(Path(DATA_DIR) / 'datasets' / 'cf_models_compare.csv', index=False)

In [None]:
# Pull the per-user lists out of the dataframe as plain Python lists,
# one entry per user, for the metric functions below.
actual = test_df_format['actual'].values.tolist()
funk_predictions = test_df_format['funk_pred'].values.tolist()
neumf_predictions = test_df_format['neumf_pred'].values.tolist()
knn_predictions = test_df_format['knn_pred'].values.tolist()

#### Load the comparison dataframe if it was saved previously

In [None]:
# Reload the saved comparison dataframe; list columns come back as strings.
test_df_format = pd.read_csv(Path(DATA_DIR) / 'datasets' / 'cf_models_compare.csv')

In [None]:
import ast

def _parse_id_lists(serialized_lists):
    """Convert stringified lists (as read back from the CSV) into lists of ints."""
    return [
        [int(movie_id) for movie_id in ast.literal_eval(serialized)]
        for serialized in serialized_lists
    ]


# The CSV stores every list column as text, so each one is parsed back into
# a list of integer movie ids, one list per user.
actual = _parse_id_lists(test_df_format.actual.values.tolist())
funk_predictions = _parse_id_lists(test_df_format.funk_pred.values.tolist())
neumf_predictions = _parse_id_lists(test_df_format.neumf_pred.values.tolist())
knn_predictions = _parse_id_lists(test_df_format.knn_pred.values.tolist())

### __MAP@K: mean average precision at K__

In [None]:
from ml_metrics import mapk

In [None]:
# MAP@K of NeuMF for every cutoff K = 1..METRIC_TOP_N.
# The full per-user prediction lists are passed: `k` already limits how many
# items per user are scored, whereas slicing the outer list (as before)
# limited the evaluation to the first METRIC_TOP_N users.
neumf_mapk = [
    mapk(actual, neumf_predictions, k=K)
    for K in np.arange(1, METRIC_TOP_N + 1)
]
neumf_mapk

In [None]:
# MAP@K of FunkSVD for every cutoff K = 1..METRIC_TOP_N.
# The full per-user prediction lists are passed: `k` already limits how many
# items per user are scored, whereas slicing the outer list (as before)
# limited the evaluation to the first METRIC_TOP_N users.
funk_mapk = [
    mapk(actual, funk_predictions, k=K)
    for K in np.arange(1, METRIC_TOP_N + 1)
]
funk_mapk

In [None]:
# MAP@K of KNN for every cutoff K = 1..METRIC_TOP_N.
# The full per-user prediction lists are passed: `k` already limits how many
# items per user are scored, whereas slicing the outer list (as before)
# limited the evaluation to the first METRIC_TOP_N users.
knn_mapk = [
    mapk(actual, knn_predictions, k=K)
    for K in np.arange(1, METRIC_TOP_N + 1)
]
knn_mapk

In [None]:
# One column per model, one row per cutoff K (index 1..METRIC_TOP_N).
cutoffs = range(1, METRIC_TOP_N + 1)
mapk_df = pd.DataFrame(
    {'KNN': knn_mapk, 'FunkSVD': funk_mapk, 'NeuMF': neumf_mapk},
    index=cutoffs,
    columns=['KNN', 'FunkSVD', 'NeuMF'],
)

ax = plot_line(
    mapk_df,
    title='Mean Average Precision at K (MAP@K) comparison',
    ylabel='MAP@K',
    xlabel='K',
    linewidth=2.0,
    palette='tab10',
)
# Put a tick on every integer K.
plt.xticks(cutoffs)
ax

In [None]:
# Save the MAP@K figure into the reports directory.
fig = ax.get_figure()
mapk_figure_path = Path(REPORTS_CF_DIR, 'cf_map_at_k.png')
fig.savefig(mapk_figure_path, bbox_inches="tight")

### __MAR@K: mean average recall at K__

In [None]:
# MAR@K of NeuMF for every cutoff K = 1..METRIC_TOP_N.
# The full per-user prediction lists are passed: `k` already limits how many
# items per user are scored, whereas slicing the outer list (as before)
# limited the evaluation to the first METRIC_TOP_N users.
neumf_mark = [
    recmetrics.mark(actual, neumf_predictions, k=K)
    for K in np.arange(1, METRIC_TOP_N + 1)
]
neumf_mark

In [None]:
# MAR@K of FunkSVD for every cutoff K = 1..METRIC_TOP_N.
# The full per-user prediction lists are passed: `k` already limits how many
# items per user are scored, whereas slicing the outer list (as before)
# limited the evaluation to the first METRIC_TOP_N users.
funk_mark = [
    recmetrics.mark(actual, funk_predictions, k=K)
    for K in np.arange(1, METRIC_TOP_N + 1)
]
funk_mark

In [None]:
# MAR@K of KNN for every cutoff K = 1..METRIC_TOP_N.
# The full per-user prediction lists are passed: `k` already limits how many
# items per user are scored, whereas slicing the outer list (as before)
# limited the evaluation to the first METRIC_TOP_N users.
knn_mark = [
    recmetrics.mark(actual, knn_predictions, k=K)
    for K in np.arange(1, METRIC_TOP_N + 1)
]
knn_mark

In [None]:
# One column per model, one row per cutoff K (index 1..METRIC_TOP_N).
cutoffs = range(1, METRIC_TOP_N + 1)
mark_df = pd.DataFrame(
    {'KNN': knn_mark, 'FunkSVD': funk_mark, 'NeuMF': neumf_mark},
    index=cutoffs,
    columns=['KNN', 'FunkSVD', 'NeuMF'],
)

ax = plot_line(
    mark_df,
    title='Mean Average Recall at K (MAR@K) comparison',
    ylabel='MAR@K',
    xlabel='K',
    linewidth=2.0,
    palette='tab10',
)
# Put a tick on every integer K.
plt.xticks(cutoffs)
ax

In [None]:
# Save the MAR@K figure into the reports directory.
fig = ax.get_figure()
mark_figure_path = Path(REPORTS_CF_DIR, 'cf_mar_at_k.png')
fig.savefig(mark_figure_path, bbox_inches="tight")

### __Coverage__

In [None]:
# Load the test movies CSV; its movieId column is the catalogue for coverage.
test_movies = pd.read_csv(TEST_MOVIES_PATH)

In [None]:
# Number of leading recommendations per user that count towards coverage.
COVERAGE_TOP = 10

# Catalogue of distinct movie ids that coverage is measured against.
all_movies = test_movies.movieId.unique().tolist()

# Cut every user's recommendation list to its first COVERAGE_TOP entries.
funk_top_recs = [recs[0:COVERAGE_TOP] for recs in funk_predictions]
neumf_top_recs = [recs[0:COVERAGE_TOP] for recs in neumf_predictions]
knn_top_recs = [recs[0:COVERAGE_TOP] for recs in knn_predictions]

# Prediction coverage of each model over the catalogue.
funk_coverage = recmetrics.prediction_coverage(funk_top_recs, all_movies)
neumf_coverage = recmetrics.prediction_coverage(neumf_top_recs, all_movies)
knn_coverage = recmetrics.prediction_coverage(knn_top_recs, all_movies)

In [None]:
# Bar chart comparing the coverage of the three models.
fig = plt.figure(figsize=(6, 6))
model_names = ['KNN', 'FunkSVD', 'NeuMF']
coverage_scores = [knn_coverage, funk_coverage, neumf_coverage]
with sns.axes_style("darkgrid"):
    ax = sns.barplot(x=model_names, y=coverage_scores, palette='tab10')
    ax.set_title(f'Test movies coverage in top {COVERAGE_TOP} recommendations', fontsize=12.0)
    ax.set_ylabel('coverage [%]')

In [None]:
# Save the coverage figure; the file name records the cutoff used.
fig = ax.get_figure()
coverage_figure_path = Path(REPORTS_CF_DIR, f'cf_coverage_for_{COVERAGE_TOP}_top.png')
fig.savefig(coverage_figure_path, bbox_inches="tight")