# __Funk SVD matrix factorization result visualizations__

##### Initial setup

In [None]:
# Load the IPython extensions used by this notebook.
# NOTE(review): no visible cell calls %watermark or selects an %autoreload mode - confirm both are still needed.
%load_ext watermark
%load_ext autoreload

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import pickle
import os
from pathlib import Path
from pandas import option_context
sys.path.append('../../../../')
from src.settings import DATA_DIR, PROJECT_DIR,  RESULT_DIR

# Resolve the parent of the working directory and echo the configured
# locations, one per line, as a quick sanity check.
CURRENT_PATH = os.path.abspath(os.pardir)
print(CURRENT_PATH, DATA_DIR, RESULT_DIR, sep='\n')

In [None]:
import logging

# Configure the root logger to emit INFO-level and more severe records.
logging.basicConfig(level=logging.INFO)

In [None]:
from src.data_processing.visualization.plot_utils import plot_counts, plot_violin, plot_line, plot_distribution

from src.data_processing.process.funk_svd_results_process import (build_spearman_drop_frame, build_true_pred_rate_frame, 
                                                                  prepare_logs_frame)
                                                   
    
from src.data_processing.process.ratings_df_utils import (prepare_user_unrated_movies_table, prepare_user_predict_rated_movies,
                                                          prepare_user_rated_genres, merge_user_rating_with_movies, 
                                                          prepare_user_rated_movies_table)

from src.experiments.collaborative_filtering.funk_svd_exp import load_model

from src.data_processing.dataframe_utils import (start_pipeline, drop_unnecessary_cols, expand_column, unpivot_dataframe,
                                                remove_nan, rename_cols, reset_index)

##### Load data

In [None]:
# Directory of the processed, merged dataset; the bare name echoes it as cell output.
MERGED_DATA_PATH = Path(DATA_DIR, 'processed', 'merged_ml25m_kaggle')
MERGED_DATA_PATH

In [None]:
# Read the movie table and the ratings table; 32-bit dtypes are requested
# explicitly for every ratings column.
movies = pd.read_csv(Path(MERGED_DATA_PATH, 'movies_merged.csv'))
ratings = pd.read_csv(
    Path(MERGED_DATA_PATH, 'ratings_merged.csv'),
    dtype=dict(userId=np.int32, movieId=np.int32, rating=np.float32, timestamp=np.int32),
)

In [None]:
# Output directory for the FunkSVD figures saved later in this notebook.
REPORTS_CF_DIR = Path(PROJECT_DIR, 'reports/figures/sec4_cf/funk_svd')
# savefig does not create missing parent directories, so make sure the target
# exists before any figure is written (no-op when it is already there).
REPORTS_CF_DIR.mkdir(parents=True, exist_ok=True)
REPORTS_CF_DIR

In [None]:
# Model files and log files of the drop-rate runs sit in parallel sub-trees of RESULT_DIR.
MODEL_DIR = Path(RESULT_DIR, 'models', 'funk_svd', 'drop_rates')
LOGS_DIR = Path(RESULT_DIR, 'logs', 'funk_svd', 'drop_rates')

In [None]:
# Pickle file with the Spearman results of the drop-rate run.
SPEARMAN_PATH = Path(LOGS_DIR, '2020-06-01_14-58_FunkSVD_spearman-drop-rates.pkl')

In [None]:
# NOTE: unpickling can execute arbitrary code - only load result files produced by this project.
with open(SPEARMAN_PATH, mode='rb') as results_file:
    results = pickle.load(results_file)

## __Learning curve__

In [None]:
# Training-log file of a single run, located inside the drop-rate log directory.
exp_name = '2020-06-01_14-58_FunkSVD-45-factors-0.01-lr-0.02-reg-13-epochs-0.5-DROP_RATE.pkl'
LOGS_PATH = Path(LOGS_DIR, exp_name)

In [None]:
# NOTE: unpickling can execute arbitrary code - only load log files produced by this project.
with open(LOGS_PATH, mode='rb') as logs_file:
    train_logs = pickle.load(logs_file)

In [None]:
# Turn the raw training logs into a frame (its 'epoch' and 'val_loss' columns
# are plotted in the next cell) and preview the first rows.
train_df = prepare_logs_frame(train_logs)
train_df.head()

In [None]:
from matplotlib.ticker import MaxNLocator

# Validation loss per training epoch.
ax = plot_line(
    data=train_df,
    x='epoch',
    y='val_loss',
    title='Learning curve of FunkSVD model',
    xlabel='train epoch',
    ylabel='mean square error',
)
# Epoch numbers are whole numbers, so restrict the x ticks to integers.
ax.xaxis.set_major_locator(MaxNLocator(integer=True))

In [None]:
# Persist the learning-curve figure into the report directory.
fig = ax.get_figure()
fig.savefig(REPORTS_CF_DIR / 'val_loss_curve.png', bbox_inches="tight")

## __Spearman correlations__

In [None]:
# Build one Spearman frame per evaluated drop rate (0.1, 0.3, 0.5, 0.7).
spearman_df_1, spearman_df_3, spearman_df_5, spearman_df_7 = (
    build_spearman_drop_frame(results, drop_rate=rate)
    for rate in (0.1, 0.3, 0.5, 0.7)
)

In [None]:
# Stack the per-drop-rate frames into one frame; the original row labels are kept, so index values may repeat.
spearman_df = pd.concat([spearman_df_1, spearman_df_3, spearman_df_5, spearman_df_7])

In [None]:
# Preview the first rows of the combined Spearman frame.
spearman_df.head()

In [None]:
with sns.axes_style("darkgrid"):
    # One distribution panel per drop rate, stacked in rows.
    # NOTE(review): sns.distplot is deprecated in newer seaborn releases - switch to
    # histplot/displot when the environment is upgraded.
    g = sns.FacetGrid(spearman_df, row="drop_rate", height=4, aspect=2.5)
    g.map(sns.distplot, "spearman_correlation")
    g.fig.suptitle('Spearman correlation distribution across different drop rates', y=1.005, fontsize=14.0)
    # Show the x tick labels on every row, not only on the bottom one.
    # The loop variable is deliberately not named `ax`: at notebook scope that would
    # overwrite the learning-curve axes, and re-running its save cell would then
    # write the wrong figure.
    for facet_ax in g.axes.flatten():
        facet_ax.tick_params(labelbottom=True)

In [None]:
# Persist the Spearman facet figure into the report directory.
fig = g.fig
fig.savefig(REPORTS_CF_DIR / 'spearman_dist_plots.png', bbox_inches="tight")

## __True vs. predicted rates comparison__

In [None]:
# Build one true-vs-predicted frame per evaluated drop rate (0.1, 0.3, 0.5, 0.7).
true_pred_df_1, true_pred_df_3, true_pred_df_5, true_pred_df_7 = (
    build_true_pred_rate_frame(results, drop_rate=rate)
    for rate in (0.1, 0.3, 0.5, 0.7)
)

In [None]:
# Stack the per-drop-rate frames into one frame; the original row labels are kept, so index values may repeat.
true_pred_df = pd.concat([true_pred_df_1, true_pred_df_3, true_pred_df_5, true_pred_df_7])

In [None]:
# Preview the first rows of the combined true/predicted frame.
true_pred_df.head()

In [None]:
# Print column dtypes, non-null counts and memory usage of the combined frame.
true_pred_df.info()

In [None]:
# Categorical plots mapped over a FacetGrid derive their category positions from
# each facet's own subset unless `order` is given; a facet missing some true rate
# would then be drawn against the wrong shared x tick labels. Fix one order for all.
rate_order = sorted(true_pred_df['true_rate'].dropna().unique())

with sns.axes_style("darkgrid"):
    # One violin panel per drop rate, two panels per row.
    g = sns.FacetGrid(true_pred_df, col="drop_rate", col_wrap=2, height=6, aspect=1)
    g.map_dataframe(sns.violinplot, x="true_rate", y="pred_rate", order=rate_order, palette='viridis')

    g.fig.suptitle('True and predicted rates distribution across different drop rates', y=1.005, fontsize=14.0)

In [None]:
# Persist the violin facet figure into the report directory.
fig = g.fig
fig.savefig(Path(REPORTS_CF_DIR, 'true_pred_rate_violin_plots.png'), bbox_inches="tight")

## __Example user recommendations__

In [None]:
# The same file name is looked up in both the model and the log directory.
exp_name = '2020-05-23_20-57_FunkSVD-45-factors-0.01-lr-0.02-reg-13-epochs-True-shuffle.pkl'
MODEL_PATH, LOGS_PATH = (
    os.path.join(base_dir, exp_name) for base_dir in (MODEL_DIR, LOGS_DIR)
)

In [None]:
# NOTE: unpickling can execute arbitrary code - only load model files produced by this project.
with open(MODEL_PATH, mode='rb') as weights_file:
    model_weights = pickle.load(weights_file)

In [None]:
# Rebuild the model object from the unpickled weights.
model = load_model(model_weights)

##### User 847: 2701 ratings, average rating = 3.458719

In [None]:
# Id of the example user inspected in the cells below.
USER_ID = 847

In [None]:
# Build the table of movies for USER_ID from the ratings table
# (presumably the movies this user has not rated - confirm in ratings_df_utils).
user_unrated_df = prepare_user_unrated_movies_table(ratings, USER_ID)

In [None]:
# Print column dtypes, non-null counts and memory usage of the unrated-movies table.
user_unrated_df.info()

In [None]:
# Run the loaded model on the user's unrated-movies table to get predicted rates.
predict_rates = model.predict(user_unrated_df)

In [None]:
# Combine the unrated movies, the movie table and the predictions; the result
# carries the 'predict_rate' column that is thresholded below.
user_pred_df = prepare_user_predict_rated_movies(user_unrated_df, movies, predict_rates)

In [None]:
# Preview the first rows of the prediction table.
user_pred_df.head()

In [None]:
# Threshold shared by the "recommended" and "highly rated" selections.
TOP_VAL = 3.75

# Keep only the movies whose predicted rate reaches the threshold.
user_top_list = user_pred_df.loc[user_pred_df['predict_rate'].ge(TOP_VAL)]

In [None]:
# Number of rows that passed the predicted-rate threshold.
len(user_top_list)

In [None]:
# Reshape the recommended movies into a long table with a 'genre' column
# (presumably one row per movie/genre pair - confirm in dataframe_utils).
user_pred_genres = start_pipeline(user_top_list)
user_pred_genres = drop_unnecessary_cols(user_pred_genres, columns=['plot_keywords', 'predict_rate'])
user_pred_genres = expand_column(
    user_pred_genres,
    keep_cols=['userId', 'movieId', 'title', 'release_date'],
    expand_col='genres',
)
user_pred_genres = unpivot_dataframe(
    user_pred_genres,
    keep_cols=['userId', 'movieId', 'title', 'release_date'],
)
user_pred_genres = drop_unnecessary_cols(user_pred_genres, columns=['variable'])
user_pred_genres = remove_nan(user_pred_genres, columns=['value'])
user_pred_genres = rename_cols(user_pred_genres, colmap_dict={'value': 'genre'})
user_pred_genres = reset_index(user_pred_genres)

In [None]:
# Movies the user actually rated, joined with the movie table ...
user_rated_movies = merge_user_rating_with_movies(
    prepare_user_rated_movies_table(ratings, USER_ID),
    movies,
)

# ... and the genre-level table derived from them.
user_rated_genres = prepare_user_rated_genres(user_rated_movies)

In [None]:
# Keep the genres of movies rated at or above the threshold and drop the
# columns that are not needed for the genre comparison.
user_rated_genres = (
    user_rated_genres
    .loc[user_rated_genres['rating'] >= TOP_VAL]
    .drop(columns=['timestamp', 'rating'])
)
# Tag both tables so they can be told apart after concatenation.
user_rated_genres['status'] = f'rated above {TOP_VAL}'
user_pred_genres['status'] = f'recommended - predicted rate above {TOP_VAL}'

In [None]:
# Number of genre rows coming from movies rated at or above the threshold.
len(user_rated_genres)

In [None]:
# Stack the rated and the recommended genre tables; the 'status' column tells them apart.
# The original row labels are kept, so index values may repeat.
pred_rated_concat = pd.concat([user_rated_genres, user_pred_genres])

In [None]:
# Count plot of genres, split by 'status' (highly rated vs. recommended);
# the trailing bare `ax` echoes the result as cell output.
ax = plot_counts(pred_rated_concat, count_col='genre', hue='status', rotate=True, palette='cubehelix',
                 title=f'Genre count between movies highly rated and recommended for user {USER_ID}')
ax

In [None]:
# Persist the genre comparison figure into the report directory.
fig = ax.get_figure()
fig.savefig(REPORTS_CF_DIR / 'high_rate_recommend_compare.png', bbox_inches="tight")