# Matrix Factorization 

## Setup
I am using Cython because it speeds up the training of the model.

### Cython -- WIP

In [41]:
%load_ext Cython
%%cython

The Cython extension is already loaded. To reload it, use:
  %reload_ext Cython


UsageError: Line magic function `%%cython` not found.


### Imports

In [47]:
# Switch the working directory to the RecSys folder (relative to the current one).
%cd ./RecSys

c:\Progetto_Ing_Informatica\Master\RecSys


In [43]:
import time
import numpy as np
import pandas as pd
from numpy.ma import MaskedArray
import sklearn.utils.fixes
import scipy.sparse as sps
import functions
from Evaluation.Evaluator import EvaluatorHoldout

## Data

### Paths

In [44]:
# Location of the interactions/impressions CSV, relative to the working directory.
URM_PATH = "Data/interactions_and_impressions.csv"

### Load data

In [48]:
# Read the raw interaction log; the header row of the CSV is consumed and the
# columns are then renamed to the names used throughout this notebook.
URM_all_dataframe = pd.read_csv(URM_PATH, sep=",", header=0, engine="python")
URM_all_dataframe.columns = ["UserID", "ItemID", "Impressions", "Data"]

print("The number of interactions is {}".format(URM_all_dataframe.shape[0]))

The number of interactions is 5826506


In [49]:
# Preview the first five rows of the loaded interactions.
URM_all_dataframe.head(5)

Unnamed: 0,UserID,ItemID,Impressions,Data
0,0,11,012345678910111213141516171819,1
1,0,21,,0
2,0,21,,0
3,0,21,20212223242526272829,0
4,0,21,,1


In [50]:
# Distinct user and item identifiers found in the interaction log.
userID_unique, itemID_unique = (
    URM_all_dataframe[id_column].unique() for id_column in ("UserID", "ItemID")
)

### Constants

In [51]:
# Dataset sizes: distinct users, distinct items, and rows in the interaction log.
n_users, n_items = len(userID_unique), len(itemID_unique)
n_iteractions = URM_all_dataframe.shape[0]

### Analyse

In [52]:
# Report catalogue sizes, the largest identifiers and the interaction count.
print(
    "Number of items\t {}, Number of users\t {}".format(n_items, n_users),
    "Max ID items\t {}, Max Id users\t {}\n".format(max(itemID_unique), max(userID_unique)),
    "Number of interactions\t {}".format(n_iteractions),
    sep="\n",
)

Number of items	 24507, Number of users	 41629
Max ID items	 24506, Max Id users	 41628

Number of interactions	 5826506


In [53]:
# Mean number of logged interactions per user and per item.
print(
    "Average interactions per user {:.2f}".format(n_iteractions / n_users),
    "Average interactions per item {:.2f}\n".format(n_iteractions / n_items),
    sep="\n",
)

Average interactions per user 139.96
Average interactions per item 237.75



In [54]:
# Percentage of user-item cells without a logged interaction.
print("Sparsity {:.2f} %".format(100 * (1 - n_iteractions / (n_items * n_users))))

Sparsity 99.43 %


## Parameters

In [55]:
# Hold-out fractions passed to the train/validation/test split.
VALIDATION_RATIO, TEST_RATIO = 0.15, 0.1

# Model hyperparameters.
NUM_LATENT_FACTORS = 10
LEARNING_RATE = 0.0001
REGULARIZATION = 0.00001

# CSV read at the end of the notebook to obtain the users to recommend for.
DESTINATION_PATH = "Data/data_target_users_test.csv"

# If True, select the number of interactions one user at a time. Otherwise, globally.
WISE_USER = False

## Implicit URM

In [56]:
# Create a binary user-item matrix with a one for every (user, item) pair that
# has at least one logged interaction.
URM_all = sps.coo_matrix((np.ones(len(URM_all_dataframe)),
                          (URM_all_dataframe["UserID"].values, URM_all_dataframe["ItemID"].values)))
URM_all = URM_all.tocsr()  # to obtain fast access to rows (users)

# The log contains repeated (user, item) rows and the COO -> CSR conversion sums
# duplicates, so stored values can exceed 1. Reset them to keep the matrix binary.
URM_all.data = np.ones_like(URM_all.data)

# Split the interactions into train / validation / test matrices.
URM_train, URM_validation, URM_test = functions.split_train_in_three_percetanges(URM_all, WISE_USER, VALIDATION_RATIO, TEST_RATIO)



### Create the evaluator

In [57]:
# Hold-out evaluators at cutoff 10: the validation one drives hyperparameter
# tuning, the test one is kept for the final assessment.
evaluator_validation, evaluator_test = (
    EvaluatorHoldout(URM_holdout, cutoff_list=[10])
    for URM_holdout in (URM_validation, URM_test)
)

EvaluatorHoldout: Ignoring 968 ( 2.3%) Users that have less than 1 test interactions
EvaluatorHoldout: Ignoring 2825 ( 6.8%) Users that have less than 1 test interactions


## Bayesian Search

In [58]:
import Recommenders.MatrixFactorization.IALSRecommender as recsys

# Recommender class that the hyperparameter search instantiates and tunes.
recommender_class = recsys.IALSRecommender


In [59]:
import os

# Folder where search results and saved models are written.
output_folder_path = "result_experiments/"

# Create the folder if it is missing; exist_ok avoids the check-then-create race
# and makes the cell safe to re-run.
os.makedirs(output_folder_path, exist_ok=True)

In [60]:
#start Hyperparameter tuning
n_cases = 50
n_random_starts = int(n_cases*0.3)
metric_to_optimize = "MAP"
cutoff_to_optimize = 10
from skopt.space import Real, Integer, Categorical


hyperparameters_range_dictionary = {
    "l1_ratio": Real(low = 0.001, high = 0.01, prior = 'log-uniform'), #prior = log-uniform means that valeus
                                                # are sampled uniformly between log(lower, base) and log(upper, base)
                                                # (default base is 10)
    "alpha": Real(low = 0.01, high = 0.1, prior = 'log-uniform'), #low and high are the lower bound and the upper bound
    "positive_only": Categorical([True]),
    "topK": Integer(200,450)
}

#Setup the early stopping --> to save a lot of computational time
earlystopping_keywargs = {"validation_every_n": 5,
                          "stop_on_validation": True,
                          "evaluator_object": evaluator_validation,
                          "lower_validations_allowed": 5,
                          "validation_metric": metric_to_optimize,
                          }

ModuleNotFoundError: No module named 'skopt'

In [None]:
from HyperparameterTuning.SearchBayesianSkopt import SearchBayesianSkopt
from HyperparameterTuning.SearchAbstractClass import SearchInputRecommenderArgs

# Create a Bayesian optimizer object; we pass the recommender class and the evaluators.
hyperparameterSearch = SearchBayesianSkopt(recommender_class,
                                         evaluator_validation=evaluator_validation,
                                         evaluator_test=evaluator_test)

# Train + validation interactions, used to refit the best configuration before
# the final evaluation on the test split. This name was previously used without
# ever being defined, which raised a NameError.
URM_train_validation = URM_train + URM_validation

# Provide the data needed to create an instance of the model
# (one on URM_train, the other on URM_train_validation).
recommender_input_args = SearchInputRecommenderArgs(
    CONSTRUCTOR_POSITIONAL_ARGS = [URM_train],     # For a CBF model simply put [URM_train, ICM_train]
    CONSTRUCTOR_KEYWORD_ARGS = {},
    FIT_POSITIONAL_ARGS = [],
    FIT_KEYWORD_ARGS = earlystopping_keywargs
)
recommender_input_args_last_test = SearchInputRecommenderArgs(
    CONSTRUCTOR_POSITIONAL_ARGS = [URM_train_validation],
    CONSTRUCTOR_KEYWORD_ARGS = {},
    FIT_POSITIONAL_ARGS = [],
    FIT_KEYWORD_ARGS = earlystopping_keywargs
)

# Run the Bayesian search
hyperparameterSearch.search(recommender_input_args = recommender_input_args,
                       recommender_input_args_last_test = recommender_input_args_last_test,
                       hyperparameter_search_space = hyperparameters_range_dictionary,
                       n_cases = n_cases,
                       n_random_starts = n_random_starts,
                       save_model = "last",
                       output_folder_path = output_folder_path, # Where to save the results
                       output_file_name_root = recommender_class.RECOMMENDER_NAME, # How to call the files
                       metric_to_optimize = metric_to_optimize,
                       cutoff_to_optimize = cutoff_to_optimize,
                      )


In [None]:
from Recommenders.DataIO import DataIO

# Load the metadata archive written by the search and pull out the parts of interest.
data_loader = DataIO(folder_path = output_folder_path)
search_metadata = data_loader.load_data(recommender_class.RECOMMENDER_NAME + "_metadata.zip")

search_metadata.keys()
hyperparameters_df, result_on_validation_df, best_hyperparameters = (
    search_metadata[metadata_key]
    for metadata_key in ("hyperparameters_df", "result_on_validation_df", "hyperparameters_best")
)

In [None]:
# Show the explored configurations, their validation results and the best configuration.
print(hyperparameters_df, result_on_validation_df, best_hyperparameters, sep="\n")

## IALS

### Build the RecSys

In [None]:
# Instantiate the IALS recommender on the full interaction matrix (URM_all).
recommender = recsys.IALSRecommender(URM_all)

### Train the model

In [None]:
# Train the model. `fit` is called on the instance, so no explicit `self` is
# passed (the stray `self` argument raised a NameError at notebook level).
recommender.fit(epochs = 300,
            num_factors = NUM_LATENT_FACTORS,
            confidence_scaling = "linear",
            alpha = 1.0,
            epsilon = 1.0,
            reg = REGULARIZATION,
            init_mean=0.0,
            init_std=0.1,)

### Save the model

In [None]:
# Persist the trained model in the output folder under "<RECOMMENDER_NAME>_my_own_save.zip".
recommender.save_model(output_folder_path, file_name=recommender.RECOMMENDER_NAME + "_my_own_save.zip")

In [None]:
# Users for which a recommendation list must be produced.
test_users = pd.read_csv(DESTINATION_PATH)

user_id = test_users['user_id']

# Top-10 recommended items for every target user.
recommendations = [np.array(recommender.recommend(user, cutoff=10)) for user in user_id]

# Serialise each list as space-separated item ids. Joining explicitly avoids
# depending on numpy's array-to-string layout (column padding, line wrapping,
# print options), which the previous str()/strip() approach relied on.
test_users['item_list'] = [" ".join(str(item) for item in items) for items in recommendations]

test_users.to_csv('submission.csv', index=False)