# RecSys2019_DeepLearning_Evaluation

https://github.com/MaurizioFD/RecSys2019_DeepLearning_Evaluation

Установка:
pip install -r requirements.txt

$ pip install scikit-optimize

## Neural Collaborative Filtering
https://dl.acm.org/doi/pdf/10.1145/3038912.3052569

In [2]:
from Base.BaseRecommender import BaseRecommender
from Base.Incremental_Training_Early_Stopping import Incremental_Training_Early_Stopping


import numpy as np
import scipy.sparse as sps
from Base.DataIO import DataIO
import os
from keras.regularizers import l1, l2
from keras.models import Model, load_model, save_model, clone_model
from keras.layers import Embedding, Input, Dense, Reshape, Flatten, Dropout, Concatenate, Multiply
from keras.optimizers import Adagrad, Adam, SGD, RMSprop

In [3]:
def MLP_get_model(num_users, num_items, layers = [20,10], reg_layers=[0,0]):
    """
    Build the MLP branch of Neural Collaborative Filtering as an uncompiled Keras model.

    User and item ids are embedded, the two embeddings are concatenated and the
    result goes through a stack of ReLU dense layers and a final sigmoid unit.

    :param num_users: number of rows of the user embedding table
    :param num_items: number of rows of the item embedding table
    :param layers: layer sizes; layers[0] is the width of the concatenated embeddings,
                   so each embedding has int(layers[0]/2) dimensions
    :param reg_layers: L2 coefficients, one per entry of layers; reg_layers[0] is used for both embeddings
    :return: keras Model with inputs [user_input, item_input] and a single sigmoid output
    """
    assert len(layers) == len(reg_layers)
    embedding_size = int(layers[0]/2)

    # One integer id per sample for each input
    user_input = Input(shape=(1,), dtype='int32', name='user_input')
    item_input = Input(shape=(1,), dtype='int32', name='item_input')

    user_table = Embedding(input_dim=num_users, output_dim=embedding_size, name='user_embedding',
                           embeddings_initializer='random_normal',
                           embeddings_regularizer=l2(reg_layers[0]), input_length=1)
    item_table = Embedding(input_dim=num_items, output_dim=embedding_size, name='item_embedding',
                           embeddings_initializer='random_normal',
                           embeddings_regularizer=l2(reg_layers[0]), input_length=1)

    # The embedding output keeps the length-1 sequence axis, drop it before the dense stack
    user_vec = Flatten()(user_table(user_input))
    item_vec = Flatten()(item_table(item_input))

    # "Layer 0" is simply the concatenation of the two embeddings
    hidden = Concatenate()([user_vec, item_vec])

    # Hidden layers 1..len(layers)-1, named layer1, layer2, ...
    for depth, (units, reg) in enumerate(zip(layers[1:], reg_layers[1:]), start=1):
        dense = Dense(units, kernel_regularizer=l2(reg), activation='relu', name='layer%d' % depth)
        hidden = dense(hidden)

    score = Dense(1, activation='sigmoid', kernel_initializer='lecun_uniform', name='prediction')(hidden)

    return Model(inputs=[user_input, item_input], outputs=score)


def GMF_get_model(num_users, num_items, latent_dim, regs=[0,0]):
    """
    Build the Generalized Matrix Factorization branch of NCF as an uncompiled Keras model.

    :param num_users: number of rows of the user embedding table
    :param num_items: number of rows of the item embedding table
    :param latent_dim: size of both embeddings
    :param regs: L2 coefficients, regs[0] for the user embedding and regs[1] for the item embedding
    :return: keras Model with inputs [user_input, item_input] and a single sigmoid output
    """
    # One integer id per sample for each input
    user_input = Input(shape=(1,), dtype='int32', name='user_input')
    item_input = Input(shape=(1,), dtype='int32', name='item_input')

    user_reg, item_reg = regs[0], regs[1]
    user_table = Embedding(input_dim=num_users, output_dim=latent_dim, name='user_embedding',
                           embeddings_initializer='random_normal',
                           embeddings_regularizer=l2(user_reg), input_length=1)
    item_table = Embedding(input_dim=num_items, output_dim=latent_dim, name='item_embedding',
                           embeddings_initializer='random_normal',
                           embeddings_regularizer=l2(item_reg), input_length=1)

    # The embedding output keeps the length-1 sequence axis, drop it first
    user_vec = Flatten()(user_table(user_input))
    item_vec = Flatten()(item_table(item_input))

    # Element-wise product of the two latent vectors
    interaction = Multiply()([user_vec, item_vec])

    score = Dense(1, activation='sigmoid', kernel_initializer='lecun_uniform', name='prediction')(interaction)

    return Model(inputs=[user_input, item_input], outputs=score)


def NeuCF_get_model(num_users, num_items, mf_dim=10, layers=[10], reg_layers=[0], reg_mf=0.0):
    """
    Build the fused NeuMF model (GMF branch + MLP branch) as an uncompiled Keras model.

    :param num_users: number of rows of the user embedding tables
    :param num_items: number of rows of the item embedding tables
    :param mf_dim: embedding size of the GMF branch
    :param layers: MLP layer sizes; layers[0] is the width of the concatenated MLP embeddings
    :param reg_layers: L2 coefficients, one per entry of layers; reg_layers[0] is used for both MLP embeddings
    :param reg_mf: L2 coefficient for both GMF embeddings
    :return: keras Model with inputs [user_input, item_input] and a single sigmoid output
    """
    assert len(layers) == len(reg_layers)
    mlp_embedding_size = int(layers[0]/2)

    # One integer id per sample for each input
    user_input = Input(shape=(1,), dtype='int32', name='user_input')
    item_input = Input(shape=(1,), dtype='int32', name='item_input')

    # Embedding tables: each branch owns its own pair
    gmf_user_table = Embedding(input_dim=num_users, output_dim=mf_dim, name='mf_embedding_user',
                               embeddings_initializer='random_normal',
                               embeddings_regularizer=l2(reg_mf), input_length=1)
    gmf_item_table = Embedding(input_dim=num_items, output_dim=mf_dim, name='mf_embedding_item',
                               embeddings_initializer='random_normal',
                               embeddings_regularizer=l2(reg_mf), input_length=1)

    mlp_user_table = Embedding(input_dim=num_users, output_dim=mlp_embedding_size, name="mlp_embedding_user",
                               embeddings_initializer='random_normal',
                               embeddings_regularizer=l2(reg_layers[0]), input_length=1)
    mlp_item_table = Embedding(input_dim=num_items, output_dim=mlp_embedding_size, name='mlp_embedding_item',
                               embeddings_initializer='random_normal',
                               embeddings_regularizer=l2(reg_layers[0]), input_length=1)

    # GMF branch: element-wise product of the flattened embeddings
    gmf_user_vec = Flatten()(gmf_user_table(user_input))
    gmf_item_vec = Flatten()(gmf_item_table(item_input))
    gmf_branch = Multiply()([gmf_user_vec, gmf_item_vec])

    # MLP branch: concatenated embeddings followed by the dense stack layer1, layer2, ...
    mlp_user_vec = Flatten()(mlp_user_table(user_input))
    mlp_item_vec = Flatten()(mlp_item_table(item_input))
    mlp_branch = Concatenate()([mlp_user_vec, mlp_item_vec])
    for depth, (units, reg) in enumerate(zip(layers[1:], reg_layers[1:]), start=1):
        dense = Dense(units, kernel_regularizer=l2(reg), activation='relu', name="layer%d" % depth)
        mlp_branch = dense(mlp_branch)

    # Fuse both branches and score the pair
    fused = Concatenate()([gmf_branch, mlp_branch])
    score = Dense(1, activation='sigmoid', kernel_initializer='lecun_uniform', name="prediction")(fused)

    return Model(inputs=[user_input, item_input], outputs=score)



def load_pretrain_model(model, gmf_model, mlp_model, num_layers):
    """
    Initialise a NeuMF model with the weights of separately trained GMF and MLP models.

    :param model: NeuMF model whose layers receive the weights (modified in place)
    :param gmf_model: trained GMF model, source of the 'mf_embedding_*' weights
    :param mlp_model: trained MLP model, source of the 'mlp_embedding_*' and 'layer<i>' weights
    :param num_layers: number of entries of the MLP `layers` list; dense layers 1..num_layers-1 are copied
    :return: the same `model` object
    """
    # (source model, layer name in the source, layer name in the NeuMF model)
    embedding_transfers = (
        (gmf_model, 'user_embedding', 'mf_embedding_user'),
        (gmf_model, 'item_embedding', 'mf_embedding_item'),
        (mlp_model, 'user_embedding', 'mlp_embedding_user'),
        (mlp_model, 'item_embedding', 'mlp_embedding_item'),
    )
    for source, source_name, target_name in embedding_transfers:
        model.get_layer(target_name).set_weights(source.get_layer(source_name).get_weights())

    # Hidden dense layers share the same name in the MLP and in the NeuMF model
    for depth in range(1, num_layers):
        layer_name = 'layer%d' % depth
        model.get_layer(layer_name).set_weights(mlp_model.get_layer(layer_name).get_weights())

    # Prediction layer: stack the two kernels (GMF rows first, matching the
    # concatenation order of the NeuMF model), add the biases, and halve both
    gmf_kernel, gmf_bias = gmf_model.get_layer('prediction').get_weights()[:2]
    mlp_kernel, mlp_bias = mlp_model.get_layer('prediction').get_weights()[:2]
    stacked_kernel = np.concatenate((gmf_kernel, mlp_kernel), axis=0)
    summed_bias = gmf_bias + mlp_bias
    model.get_layer('prediction').set_weights([0.5*stacked_kernel, 0.5*summed_bias])

    return model


def get_train_instances(train, num_negatives, num_items):
    """
    Build one epoch of pointwise training data: every observed (user, item) pair
    labelled 1, each followed by `num_negatives` randomly sampled unobserved
    items for the same user labelled 0.

    :param train: mapping-like object whose keys() yields the observed (user, item) pairs
                  (the caller in this file passes a scipy dok_matrix)
    :param num_negatives: number of negative items sampled after each positive pair
    :param num_items: negatives are drawn uniformly from range(num_items)
    :return: three parallel lists (user_input, item_input, labels)
    """
    from collections import Counter

    # Materialise the positives once so every membership test is a set lookup
    # instead of a fresh train.keys() call per sampled candidate
    positives = set(train.keys())

    # Distinct in-range items per user: a user who already interacted with all
    # of them has no valid negative and would make the rejection loop spin forever
    items_per_user = Counter(u for (u, i) in positives if 0 <= i < num_items)

    user_input, item_input, labels = [], [], []
    for (u, i) in train.keys():
        # positive instance
        user_input.append(u)
        item_input.append(i)
        labels.append(1)

        if items_per_user[u] >= num_items:
            # nothing left to sample for this user
            continue

        # negative instances, rejection-sampled until an unobserved item is found
        for _ in range(num_negatives):
            j = np.random.randint(num_items)
            while (u, j) in positives:
                j = np.random.randint(num_items)
            user_input.append(u)
            item_input.append(j)
            labels.append(0)

    return user_input, item_input, labels



def set_learner(model, learning_rate, learner):
    """
    Compile `model` with binary cross-entropy and the optimizer selected by name.

    :param model: keras model to compile (compiled in place)
    :param learning_rate: learning rate handed to the optimizer
    :param learner: "adagrad", "rmsprop" or "adam" (case-insensitive); any other value selects SGD
    :return: the same `model` object
    """
    optimizers_by_name = {
        "adagrad": Adagrad,
        "rmsprop": RMSprop,
        "adam": Adam,
    }
    # Unknown names fall back to plain SGD
    optimizer_class = optimizers_by_name.get(learner.lower(), SGD)

    model.compile(optimizer=optimizer_class(lr=learning_rate), loss='binary_crossentropy')

    return model


def deep_clone_model(source_model):
    """
    Return an independent copy of `source_model` carrying the same weights.

    clone_model alone only rebuilds the architecture, so the weights are
    copied over explicitly afterwards.
    """
    replica = clone_model(source_model)
    current_weights = source_model.get_weights()
    replica.set_weights(current_weights)
    return replica



class NeuMF_RecommenderWrapper(BaseRecommender, Incremental_Training_Early_Stopping):
    """
    Recommender wrapper around the Keras NeuMF model.

    fit() optionally pretrains a GMF and an MLP model, uses their weights to
    initialise the fused NeuMF model and then trains it. Every training stage
    is driven by _train_with_early_stopping (expected to come from
    Incremental_Training_Early_Stopping), which calls back into _run_epoch,
    _prepare_model_for_validation and _update_best_model.
    """

    RECOMMENDER_NAME = "NeuMF_RecommenderWrapper"

    def __init__(self, URM_train):
        """
        :param URM_train: user-item interaction matrix; its shape gives (n_users, n_items)
        """
        super(NeuMF_RecommenderWrapper, self).__init__(URM_train)

        # DOK copy of the training data: get_train_instances iterates and tests its (user, item) keys
        self._train = sps.dok_matrix(self.URM_train)
        self.n_users, self.n_items = self.URM_train.shape

        # np.int was only an alias of the builtin int and has been removed from
        # NumPy; dtype=int yields exactly the same array
        self._item_indices = np.arange(0, self.n_items, dtype=int)
        self._user_ones_vector = np.ones_like(self._item_indices)

    def _compute_item_score(self, user_id_array, items_to_compute=None):
        """
        Score every item for each requested user with the current model.

        :param user_id_array: sequence of user ids
        :param items_to_compute: optional item indices; when given, only those columns
                                 receive a score and all the others stay at -inf
        :return: array of shape (len(user_id_array), n_items)
        """
        item_scores = - np.ones((len(user_id_array), self.n_items)) * np.inf

        for user_index, user_id in enumerate(user_id_array):

            # The prediction requires a list of two arrays user_id, item_id of equal length
            # To compute the recommendations for a single user, we must provide its index as many times as the
            # number of items
            item_score_user = self.model.predict([self._user_ones_vector*user_id, self._item_indices],
                                                 batch_size=100, verbose=0)
            item_score_user = item_score_user.ravel()

            if items_to_compute is not None:
                item_scores[user_index, items_to_compute] = item_score_user[items_to_compute]
            else:
                item_scores[user_index, :] = item_score_user

        return item_scores

    def get_early_stopping_final_epochs_dict(self):
        """
        This function returns a dictionary to be used as optimal parameters in the .fit() function
        It provides the flexibility to deal with multiple early-stopping in a single algorithm
        e.g. in NeuMF there are three model components each with its own optimal number of epochs
        the return dict would be {"epochs": epochs_best_neumf, "epochs_gmf": epochs_best_gmf, "epochs_mlp": epochs_best_mlp}

        When fit() ran with pretrain=False, "epochs_gmf" and "epochs_mlp" are the
        values that were passed to fit().
        :return: dict with keys "epochs", "epochs_gmf", "epochs_mlp"
        """

        return {"epochs": self.epochs_best, "epochs_gmf": self.epochs_best_gmf, "epochs_mlp": self.epochs_best_mlp}

    def fit(self,
            epochs = 100,
            epochs_gmf=100,
            epochs_mlp=100,
            batch_size = 256,
            num_factors = 8,
            layers = [64,32,16,8],
            reg_mf = 0.0,
            reg_layers = [0,0,0,0],
            num_negatives = 4,
            learning_rate = 1e-3,
            learning_rate_pretrain = 1e-3,
            learner = 'sgd',
            learner_pretrain = 'adam',
            pretrain = True,
            root_folder_pretrain = None,
            **earlystopping_kwargs):
        """
        Train NeuMF, optionally after pretraining its GMF and MLP components.

        :param epochs: maximum number of epochs for the final NeuMF training
        :param epochs_gmf: maximum number of epochs for the GMF pretraining
        :param epochs_mlp: maximum number of epochs for the MLP pretraining
        :param batch_size:
        :param num_factors: Embedding size of MF model
        :param layers: MLP layers. Note that the first layer is the concatenation of user and item embeddings. So layers[0]/2 is the embedding size.
        :param reg_mf: Regularization for MF embeddings.
        :param reg_layers: Regularization for each MLP layer. reg_layers[0] is the regularization for embeddings.
        :param num_negatives: Number of negative instances to pair with a positive instance.
        :param learning_rate: learning rate of the final NeuMF training
        :param learning_rate_pretrain: learning rate of both pretraining stages
        :param learner: adagrad, adam, rmsprop, sgd
        :param learner_pretrain: adagrad, adam, rmsprop, sgd
        :param pretrain: if True, GMF and MLP are trained first and used to initialise NeuMF
        :param root_folder_pretrain: Specify the pretrain model folder where to save MF and MLP for MF part.
        :param earlystopping_kwargs: forwarded unchanged to every _train_with_early_stopping call
        :return:
        """

        self.batch_size = batch_size
        self.mf_dim = num_factors
        # Copies, so the (mutable) arguments and defaults are never shared with the instance
        self.layers = layers.copy()
        self.reg_mf = reg_mf
        self.reg_layers = reg_layers.copy()
        self.num_negatives = num_negatives

        assert learner in ["adagrad", "adam", "rmsprop", "sgd"]
        assert learner_pretrain in ["adagrad", "adam", "rmsprop", "sgd"]

        self.pretrain = pretrain

        # Fallback values so that get_early_stopping_final_epochs_dict() also
        # works when pretraining is skipped; overwritten below otherwise
        self.epochs_best_gmf = epochs_gmf
        self.epochs_best_mlp = epochs_mlp

        if self.pretrain:

            if root_folder_pretrain is not None:
                print("NeuMF_RecommenderWrapper: pretrained models will be saved in '{}'".format(root_folder_pretrain))

                # If directory does not exist, create
                os.makedirs(root_folder_pretrain, exist_ok=True)
            else:
                print("NeuMF_RecommenderWrapper: root_folder_pretrain not provided, pretrained models will not be saved")

            print("NeuMF_RecommenderWrapper: Pretraining GMF...")

            self.model = GMF_get_model(self.n_users, self.n_items, self.mf_dim)
            self.model = set_learner(self.model, learning_rate_pretrain, learner_pretrain)

            self._best_model = deep_clone_model(self.model)

            self._train_with_early_stopping(epochs_gmf,
                                            algorithm_name = self.RECOMMENDER_NAME,
                                            **earlystopping_kwargs)

            # epochs_best is presumably set by _train_with_early_stopping and is
            # overwritten by the next stage, so keep a copy per component
            self.epochs_best_gmf = self.epochs_best

            if root_folder_pretrain is not None:
                model_out_file = "GMF_factors_{}_pretrain".format(self.mf_dim)
                # os.path.join also works when the folder has no trailing separator
                self._best_model.save_weights(os.path.join(root_folder_pretrain, model_out_file), overwrite=True)

            self.gmf_model = deep_clone_model(self._best_model)

            print("NeuMF_RecommenderWrapper: Pretraining MLP...")

            self.model = MLP_get_model(self.n_users, self.n_items, self.layers, self.reg_layers)
            self.model = set_learner(self.model, learning_rate_pretrain, learner_pretrain)

            self._best_model = deep_clone_model(self.model)

            self._train_with_early_stopping(epochs_mlp,
                                            algorithm_name = self.RECOMMENDER_NAME,
                                            **earlystopping_kwargs)

            self.epochs_best_mlp = self.epochs_best

            if root_folder_pretrain is not None:
                model_out_file = "MLP_layers_{}_reg_layers_{}_pretrain".format(self.layers, reg_layers)
                self._best_model.save_weights(os.path.join(root_folder_pretrain, model_out_file), overwrite=True)

            self.mlp_model = deep_clone_model(self._best_model)

        # Build model
        self.model = NeuCF_get_model(self.n_users, self.n_items, self.mf_dim, self.layers, self.reg_layers, self.reg_mf)
        self.model = set_learner(self.model, learning_rate, learner)

        # Load pretrain model
        if self.pretrain:
            self.model = load_pretrain_model(self.model, self.gmf_model, self.mlp_model, len(layers))
            print("NeuMF_RecommenderWrapper: Load pretrained GMF and MLP models.")

        print("NeuMF_RecommenderWrapper: Training NeuCF...")

        self._best_model = deep_clone_model(self.model)

        self._train_with_early_stopping(epochs,
                                        algorithm_name = self.RECOMMENDER_NAME,
                                        **earlystopping_kwargs)

        print("NeuMF_RecommenderWrapper: Tranining complete")

        # Expose the best model found during training, not the one of the last epoch
        self.model = deep_clone_model(self._best_model)

    def _prepare_model_for_validation(self):
        """Nothing to prepare: self.model is used for prediction as it is."""
        pass

    def _update_best_model(self):
        """Snapshot the current model as the best one seen so far."""
        # Keras only clones the structure of the model, not the weights
        self._best_model = deep_clone_model(self.model)

    def _run_epoch(self, currentEpoch):
        """
        Run one training epoch on freshly sampled negatives.

        :param currentEpoch: zero-based epoch index, only used in the log message
        """
        # Generate training instances
        user_input, item_input, labels = get_train_instances(self._train, self.num_negatives, self.n_items)

        # Training
        hist = self.model.fit([np.array(user_input), np.array(item_input)], #input
                         np.array(labels), # labels
                         batch_size=self.batch_size, epochs=1, verbose=0, shuffle=True)

        print("NeuMF_RecommenderWrapper: Epoch {}, loss {:.2E}".format(currentEpoch+1, hist.history['loss'][0]))

    def save_model(self, folder_path, file_name = None):
        """
        Save the model weights and the hyperparameters needed to rebuild the network.

        :param folder_path: destination folder; it is concatenated directly with file_name,
                            so it must end with a path separator
        :param file_name: base file name, defaults to RECOMMENDER_NAME
        """
        if file_name is None:
            file_name = self.RECOMMENDER_NAME

        self._print("Saving model in file '{}'".format(folder_path + file_name))

        self.model.save_weights(folder_path + file_name + "_weights", overwrite=True)

        # Everything NeuCF_get_model needs to rebuild the architecture in load_model
        data_dict_to_save = {
            "n_users": self.n_users,
            "n_items": self.n_items,
            "mf_dim": self.mf_dim,
            "layers": self.layers,
            "reg_layers": self.reg_layers,
            "reg_mf": self.reg_mf,
        }

        dataIO = DataIO(folder_path=folder_path)
        dataIO.save_data(file_name=file_name, data_dict_to_save = data_dict_to_save)

        self._print("Saving complete")

    def load_model(self, folder_path, file_name = None):
        """
        Rebuild the network from the saved hyperparameters and load its weights.

        :param folder_path: folder used in save_model; must end with a path separator
        :param file_name: base file name, defaults to RECOMMENDER_NAME
        """
        if file_name is None:
            file_name = self.RECOMMENDER_NAME

        self._print("Loading model from file '{}'".format(folder_path + file_name))

        dataIO = DataIO(folder_path=folder_path)
        data_dict = dataIO.load_data(file_name=file_name)

        # Restore every saved hyperparameter as an instance attribute
        for attrib_name, attrib_value in data_dict.items():
            setattr(self, attrib_name, attrib_value)

        self.model = NeuCF_get_model(self.n_users, self.n_items, self.mf_dim, self.layers, self.reg_layers, self.reg_mf)
        self.model.load_weights(folder_path + file_name + "_weights")

        self._print("Loading complete")


## run_WWW_17_NeuMF.py

In [4]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on 22/11/17

@author: Maurizio Ferrari Dacrema
"""

from Recommender_import_list import *
from Conferences.WWW.NeuMF_our_interface.NeuMF_RecommenderWrapper import NeuMF_RecommenderWrapper
from ParameterTuning.run_parameter_search import runParameterSearch_Collaborative
from ParameterTuning.SearchSingleCase import SearchSingleCase
from ParameterTuning.SearchAbstractClass import SearchInputRecommenderArgs
import os, traceback, argparse
from functools import partial
import numpy as np
from Utils.assertions_on_data_for_experiments import assert_implicit_data, assert_disjoint_matrices
from Utils.plot_popularity import plot_popularity_bias, save_popularity_statistics
from Utils.ResultFolderLoader import ResultFolderLoader, generate_latex_hyperparameters

def read_data_split_and_search(dataset_name,
                              CONFERENCE_NAME,
                              ALGORITHM_NAME,
                              flag_baselines_tune=False,
                              flag_DL_article_default=False,
                              flag_print_results=False,
                              KNN_similarity_to_report_list=None):
    """
    Load one dataset split and, depending on the flags, tune the baselines,
    train NeuMF with the article hyperparameters and write the LaTeX result tables.

    :param dataset_name: "movielens1m" or "pinterest"
    :param CONFERENCE_NAME: used to build the result folder path
    :param ALGORITHM_NAME: used to build the result folder path and the output file names
    :param flag_baselines_tune: run the hyperparameter search for every baseline algorithm
    :param flag_DL_article_default: train and evaluate NeuMF with the article hyperparameters
    :param flag_print_results: generate the LaTeX tables from the results found in the result folder
    :param KNN_similarity_to_report_list: KNN similarities included in the tables;
           defaults to ["cosine", "dice", "jaccard", "asymmetric", "tversky"]
    :raises ValueError: if dataset_name is not one of the supported datasets
    """

    from Conferences.WWW.NeuMF_our_interface.Movielens1M.Movielens1MReader import Movielens1MReader
    from Conferences.WWW.NeuMF_our_interface.Pinterest.PinterestICCVReader import PinterestICCVReader

    # This name used to be read from an enclosing scope that does not define it,
    # which made flag_print_results=True fail with a NameError
    if KNN_similarity_to_report_list is None:
        KNN_similarity_to_report_list = ["cosine", "dice", "jaccard", "asymmetric", "tversky"]

    result_folder_path = "result_experiments/{}/{}_{}/".format(CONFERENCE_NAME, ALGORITHM_NAME, dataset_name)

    if dataset_name == "movielens1m":
        dataset = Movielens1MReader(result_folder_path)
    elif dataset_name == "pinterest":
        dataset = PinterestICCVReader(result_folder_path)
    else:
        # Fail early with a clear message instead of an unbound `dataset` below
        raise ValueError("Unsupported dataset_name '{}', expected 'movielens1m' or 'pinterest'".format(dataset_name))

    URM_train = dataset.URM_DICT["URM_train"].copy()
    URM_validation = dataset.URM_DICT["URM_validation"].copy()
    URM_test = dataset.URM_DICT["URM_test"].copy()
    URM_test_negative = dataset.URM_DICT["URM_test_negative"].copy()

    # Ensure IMPLICIT data and DISJOINT sets
    assert_implicit_data([URM_train, URM_validation, URM_test, URM_test_negative])
    assert_disjoint_matrices([URM_train, URM_validation, URM_test])
    assert_disjoint_matrices([URM_train, URM_validation, URM_test_negative])

    # If directory does not exist, create
    os.makedirs(result_folder_path, exist_ok=True)

    algorithm_dataset_string = "{}_{}_".format(ALGORITHM_NAME, dataset_name)
    plot_popularity_bias([URM_train + URM_validation, URM_test],
                         ["URM train", "URM test"],
                         result_folder_path + algorithm_dataset_string + "popularity_plot")
    save_popularity_statistics([URM_train + URM_validation, URM_test],
                               ["URM train", "URM test"],
                               result_folder_path + algorithm_dataset_string + "popularity_statistics")

    # Baselines; the class names are expected to come from the star import of Recommender_import_list
    collaborative_algorithm_list = [
        Random,
        TopPop,
        UserKNNCFRecommender,
        ItemKNNCFRecommender,
        P3alphaRecommender,
        RP3betaRecommender,
        PureSVDRecommender,
        NMFRecommender,
        IALSRecommender,
        MatrixFactorization_BPR_Cython,
        MatrixFactorization_FunkSVD_Cython,
        EASE_R_Recommender,
        SLIM_BPR_Cython,
        SLIMElasticNetRecommender,
    ]

    metric_to_optimize = "HIT_RATE"
    n_cases = 50
    n_random_starts = 15

    from Base.Evaluation.Evaluator import EvaluatorNegativeItemSample
    evaluator_validation = EvaluatorNegativeItemSample(URM_validation, URM_test_negative, cutoff_list=[10])
    evaluator_test = EvaluatorNegativeItemSample(URM_test, URM_test_negative, cutoff_list=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

    # Everything except the recommender class is shared by all baseline searches
    runParameterSearch_Collaborative_partial = partial(runParameterSearch_Collaborative,
                                                       URM_train=URM_train,
                                                       URM_train_last_test=URM_train + URM_validation,
                                                       metric_to_optimize=metric_to_optimize,
                                                       evaluator_validation_earlystopping=evaluator_validation,
                                                       evaluator_validation=evaluator_validation,
                                                       evaluator_test=evaluator_test,
                                                       output_folder_path=result_folder_path,
                                                       parallelizeKNN=False,
                                                       allow_weighting=True,
                                                       resume_from_saved=True,
                                                       n_cases=n_cases,
                                                       n_random_starts=n_random_starts)

    if flag_baselines_tune:
        for recommender_class in collaborative_algorithm_list:
            # Best effort: a failing baseline is reported and the remaining ones still run
            try:
                runParameterSearch_Collaborative_partial(recommender_class)
            except Exception as e:
                print("On recommender {} Exception {}".format(recommender_class, str(e)))
                traceback.print_exc()

    if flag_DL_article_default:
        try:
            # dataset_name was validated above, so the lookup cannot fail
            num_factors = {"movielens1m": 64, "pinterest": 16}[dataset_name]

            neuMF_article_hyperparameters = {
                "epochs": 100,
                "epochs_gmf": 100,
                "epochs_mlp": 100,
                "batch_size": 256,
                "num_factors": num_factors,
                "layers": [num_factors*4, num_factors*2, num_factors],
                "reg_mf": 0.0,
                "reg_layers": [0,0,0],
                "num_negatives": 4,
                "learning_rate": 1e-3,
                "learning_rate_pretrain": 1e-3,
                "learner": "sgd",
                "learner_pretrain": "adam",
                "pretrain": True
            }

            neuMF_earlystopping_hyperparameters = {
                "validation_every_n": 5,
                "stop_on_validation": True,
                "evaluator_object": evaluator_validation,
                "lower_validations_allowed": 5,
                "validation_metric": metric_to_optimize
            }

            parameterSearch = SearchSingleCase(NeuMF_RecommenderWrapper,
                                               evaluator_validation=evaluator_validation,
                                               evaluator_test=evaluator_test)

            recommender_input_args = SearchInputRecommenderArgs(
                                                CONSTRUCTOR_POSITIONAL_ARGS=[URM_train],
                                                FIT_KEYWORD_ARGS=neuMF_earlystopping_hyperparameters)

            # The final model is refit on train + validation before the test evaluation
            recommender_input_args_last_test = recommender_input_args.copy()
            recommender_input_args_last_test.CONSTRUCTOR_POSITIONAL_ARGS[0] = URM_train + URM_validation

            parameterSearch.search(recommender_input_args,
                                   recommender_input_args_last_test=recommender_input_args_last_test,
                                   fit_hyperparameters_values=neuMF_article_hyperparameters,
                                   output_folder_path=result_folder_path,
                                   resume_from_saved=True,
                                   output_file_name_root=NeuMF_RecommenderWrapper.RECOMMENDER_NAME)

        except Exception as e:
            print("On recommender {} Exception {}".format(NeuMF_RecommenderWrapper, str(e)))
            traceback.print_exc()

    if flag_print_results:
        # Users with at least one test interaction (non-empty row of the test matrix)
        n_test_users = np.sum(np.ediff1d(URM_test.indptr) >= 1)
        file_name = "{}..//{}_{}_".format(result_folder_path, ALGORITHM_NAME, dataset_name)

        result_loader = ResultFolderLoader(result_folder_path,
                                         base_algorithm_list=None,
                                         other_algorithm_list=[NeuMF_RecommenderWrapper],
                                         KNN_similarity_list=KNN_similarity_to_report_list,
                                         ICM_names_list=None,
                                         UCM_names_list=None)

        result_loader.generate_latex_results(file_name + "{}_latex_results.txt".format("article_metrics"),
                                           metrics_list=["HIT_RATE", "NDCG"],
                                           cutoffs_list=[1, 5, 10],
                                           table_title=None,
                                           highlight_best=True)

        result_loader.generate_latex_results(file_name + "{}_latex_results.txt".format("all_metrics"),
                                           metrics_list=["PRECISION", "RECALL", "MAP", "MRR", "NDCG", "F1", "HIT_RATE", "ARHR",
                                                           "NOVELTY", "DIVERSITY_MEAN_INTER_LIST", "DIVERSITY_HERFINDAHL", "COVERAGE_ITEM", "DIVERSITY_GINI", "SHANNON_ENTROPY"],
                                           cutoffs_list=[10],
                                           table_title=None,
                                           highlight_best=True)

        result_loader.generate_latex_time_statistics(file_name + "{}_latex_results.txt".format("time"),
                                           n_evaluation_users=n_test_users,
                                           table_title=None)

def main(baseline_tune=False, DL_article_default=False, print_results=True):
    """
    Run the NeuMF (WWW) experiment on every configured dataset.

    :param baseline_tune: run the hyperparameter search for the baselines
    :param DL_article_default: train NeuMF with the article hyperparameters
    :param print_results: generate the LaTeX result and hyperparameter tables
    """
    conference = "WWW"
    algorithm = "NeuMF"
    similarities_to_report = ["cosine", "dice", "jaccard", "asymmetric", "tversky"]
    datasets = ["movielens1m"]

    for current_dataset in datasets:
        read_data_split_and_search(
            current_dataset,
            conference,
            algorithm,
            flag_baselines_tune=baseline_tune,
            flag_DL_article_default=DL_article_default,
            flag_print_results=print_results,
        )

    if not print_results:
        return

    # One hyperparameter table covering all the datasets processed above
    experiments_root = "result_experiments/{}/".format(conference)
    generate_latex_hyperparameters(
        result_folder_path=experiments_root,
        algorithm_name=algorithm,
        experiment_subfolder_list=datasets,
        other_algorithm_list=[NeuMF_RecommenderWrapper],
        KNN_similarity_to_report_list=similarities_to_report,
        split_per_algorithm_type=True,
    )

if __name__ == '__main__':
    import sys
    # Inside a Jupyter kernel ('ipykernel' is loaded) the command line presumably
    # belongs to the kernel launcher, so argument parsing is skipped and
    # everything is enabled
    if 'ipykernel' in sys.modules:
        main(baseline_tune=True, DL_article_default=True, print_results=True)
    else:
        parser = argparse.ArgumentParser()
        parser.add_argument('-b', '--baseline_tune', action='store_true',
                            help="Baseline hyperparameter search", default=False)
        parser.add_argument('-a', '--DL_article_default', action='store_true',
                            help="Train the DL model with article hyperparameters", default=False)
        # Kept for backward compatibility: printing is already on by default
        parser.add_argument('-p', '--print_results', action='store_true',
                            help="Print results", default=True)
        # A store_true flag defaulting to True can never be turned off, so
        # provide the explicit opposite writing to the same destination
        parser.add_argument('--no_print_results', dest='print_results', action='store_false',
                            help="Do not print results")
        args = parser.parse_args()
        main(baseline_tune=args.baseline_tune,
             DL_article_default=args.DL_article_default,
             print_results=args.print_results)

  LARGE_SPARSE_SUPPORTED = LooseVersion(scipy_version) >= '0.14.0'


Dataset_Movielens1M: Attempting to load pre-splitted data
Dataset_Movielens1M: Dataset loaded
Assertion assert_implicit_data: Passed
Assertion assert_disjoint_matrices: Passed
Assertion assert_disjoint_matrices: Passed
SearchSingleCase: Resuming 'RandomRecommender'... Loaded 1 configurations.
RandomRecommender: URM Detected 2 (0.05 %) cold items.
SearchSingleCase: Resuming 'RandomRecommender'... Result on last already available.
SearchSingleCase: Resuming 'TopPopRecommender'... Loaded 1 configurations.
TopPopRecommender: URM Detected 2 (0.05 %) cold items.
SearchSingleCase: Resuming 'TopPopRecommender'... Result on last already available.
SearchBayesianSkopt: Resuming 'UserKNNCFRecommender_cosine'... Loaded 50 configurations.
Iteration No: 1 started. Searching for the next optimal point.
Iteration No: 1 ended. Search finished for the next optimal point.
Time taken: 1.1864
Function value obtained: -0.8043
Current minimum: -0.8055
UserKNNCFRecommender: URM Detected 2 (0.05 %) cold items.

In [18]:
import pandas as pd
from surprise import Reader, Dataset
import tensorflow as tf

In [19]:
# Load the 'ml-100k' dataset bundled with Surprise
data = Dataset.load_builtin('ml-100k')

In [20]:
# Turn the raw rating records into a DataFrame and name its four columns
df = pd.DataFrame(data.raw_ratings)
df.columns = ['user', 'item', 'rating', 'timestamp']
# Preview the first rows
df.head()

Unnamed: 0,user,item,rating,timestamp
0,196,242,3.0,881250949
1,186,302,3.0,891717742
2,22,377,1.0,878887116
3,244,51,2.0,880606923
4,166,346,1.0,886397596


In [23]:
users_len = df['user'].nunique()  # Number of unique users
items_len = df['item'].nunique()   # Number of unique items

In [31]:
def rmse(y_true, y_pred):
    """Root mean squared error between targets and predictions, used below as a Keras metric."""
    return tf.sqrt(tf.reduce_mean(tf.square(y_true - y_pred)))

# Fully connected network: an input of width users_len + items_len, two ReLU
# hidden layers and a single sigmoid output
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(50, input_shape=(users_len + items_len,), activation='relu'),
    tf.keras.layers.Dense(25, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# `lr` is a deprecated alias in tf.keras optimizers (rejected by Keras 3);
# `learning_rate` is the supported argument name
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)
loss = tf.keras.losses.binary_crossentropy

model.compile(optimizer=optimizer,
              loss=loss,
              metrics=['accuracy', rmse])

Получим эмбеддинги пользователей и айтемов с помощью SVD.
Подадим их в нейронную сеть. Она принимает на вход эмбеддинги пользователей и айтемов,
далее объединяет их и пропускает через два полносвязных слоя. На последнем слое выдаёт прогноз с функцией активации сигмоид.
Обучаем сеть.

In [46]:
# Load the 'ml-100k' dataset bundled with Surprise again, for the SVD-embedding experiment
data2 = Dataset.load_builtin('ml-100k')

In [47]:
# Turn the raw rating records into a DataFrame and name its four columns
df2 = pd.DataFrame(data2.raw_ratings)
df2.columns = ['user', 'item', 'rating', 'timestamp']
# Preview the first ten rows
df2.head(10)

Unnamed: 0,user,item,rating,timestamp
0,196,242,3.0,881250949
1,186,302,3.0,891717742
2,22,377,1.0,878887116
3,244,51,2.0,880606923
4,166,346,1.0,886397596
5,298,474,4.0,884182806
6,115,265,2.0,881171488
7,253,465,5.0,891628467
8,305,451,3.0,886324817
9,6,86,3.0,883603013


In [48]:
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split as surprise_train_test_split
import numpy as np

# df2 is expected to be a DataFrame with columns: user, item, rating, timestamp
# Keep the announced and the printed row counts in sync via one variable.
preview_rows = 10
print(f"Первые {preview_rows} строк данных:")
print(df2.head(preview_rows))

# Convert the data to Surprise's format (ratings are on a 1..5 scale)
reader = Reader(rating_scale=(1, 5))
surprise_data = Dataset.load_from_df(df2[['user', 'item', 'rating']], reader)

# Split into Surprise train and test sets (25% held out, fixed seed)
trainset, testset = surprise_train_test_split(surprise_data, test_size=0.25, random_state=42)

# Train the SVD model on Surprise's trainset
model_svd = SVD(n_factors=50, random_state=42)
model_svd.fit(trainset)

# Extract the learned factors to reuse them as pretrained embeddings
user_embeddings = model_svd.pu  # User embedding matrix (n_users x n_factors)
item_embeddings = model_svd.qi  # Item embedding matrix (n_items x n_factors)

print(f"Размерность эмбедингов пользователей: {user_embeddings.shape}")
print(f"Размерность эмбедингов айтемов: {item_embeddings.shape}")

Первые 5 строк данных:
  user item  rating  timestamp
0  196  242     3.0  881250949
1  186  302     3.0  891717742
2   22  377     1.0  878887116
3  244   51     2.0  880606923
4  166  346     1.0  886397596
5  298  474     4.0  884182806
6  115  265     2.0  881171488
7  253  465     5.0  891628467
8  305  451     3.0  886324817
9    6   86     3.0  883603013
Размерность эмбедингов пользователей: (943, 50)
Размерность эмбедингов айтемов: (1644, 50)


In [49]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate
from tensorflow.keras.models import Model

# Vocabulary sizes are the row counts of the SVD factor matrices
n_users, n_items = user_embeddings.shape[0], item_embeddings.shape[0]

# Embedding width is the number of SVD factors (columns of the matrix)
embedding_dim = user_embeddings.shape[1]

# Model inputs: one user id and one item id per sample
user_input = Input(shape=(1,), name='user_input')
item_input = Input(shape=(1,), name='item_input')

# User branch: embedding initialised with the pretrained SVD factors,
# left trainable so the network can fine-tune it
user_embedding_layer = Embedding(
    n_users,
    embedding_dim,
    weights=[user_embeddings],
    trainable=True,
    name='user_embedding',
)(user_input)
user_flat = Flatten()(user_embedding_layer)

# Item branch: same construction with the item factors
item_embedding_layer = Embedding(
    n_items,
    embedding_dim,
    weights=[item_embeddings],
    trainable=True,
    name='item_embedding',
)(item_input)
item_flat = Flatten()(item_embedding_layer)

# Join both embeddings into a single feature vector
concat = Concatenate()([user_flat, item_flat])

# Fully connected head ending in a sigmoid output
dense1 = Dense(128, activation='relu')(concat)
dense2 = Dense(64, activation='relu')(dense1)
output = Dense(1, activation='sigmoid')(dense2)

# Assemble the two-input model
model = Model(inputs=[user_input, item_input], outputs=output)

# Compile with binary cross-entropy; report accuracy and the custom RMSE metric
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', rmse])

In [52]:
def prepare_test_data(trainset, testset, threshold=4):
    """Convert test ratings into index/label arrays, dropping unknown ids.

    Raw user and item ids are translated to the inner ids of ``trainset``.
    A pair is skipped when either lookup raises ``ValueError`` (the user or
    the item is not known to ``trainset``). Only the two lookups are guarded,
    so a ``ValueError`` from anything else (e.g. a malformed rating) is not
    silently swallowed.

    Args:
        trainset: Object exposing ``to_inner_uid`` and ``to_inner_iid``.
        testset: Iterable of ``(raw_user, raw_item, rating)`` triples.
        threshold: Ratings ``>= threshold`` are labelled 1, all others 0.

    Returns:
        Tuple ``(users, items, labels)`` of 1-D arrays with dtypes
        ``int32``, ``int32`` and ``float32``.
    """
    users, items, labels = [], [], []
    for raw_user, raw_item, rating in testset:
        try:
            inner_user = trainset.to_inner_uid(raw_user)
            inner_item = trainset.to_inner_iid(raw_item)
        except ValueError:
            continue  # Unknown user or item: no embedding row exists for it
        users.append(inner_user)
        items.append(inner_item)
        labels.append(1 if float(rating) >= threshold else 0)
    return (
        np.array(users, dtype=np.int32),
        np.array(items, dtype=np.int32),
        np.array(labels, dtype=np.float32)
    )

def prepare_train_data(trainset, threshold=4):
    """Convert the training ratings into index/label arrays (no filtering).

    Iterates ``trainset.all_ratings()``, whose triples already carry inner
    user and item ids, and binarises each rating.

    Args:
        trainset: Object exposing ``all_ratings()`` that yields
            ``(user_inner_id, item_inner_id, rating)`` triples.
        threshold: Ratings ``>= threshold`` are labelled 1, all others 0.

    Returns:
        Tuple ``(users, items, labels)`` of 1-D arrays with dtypes
        ``int32``, ``int32`` and ``float32``.
    """
    users, items, labels = [], [], []
    for user_inner_id, item_inner_id, rating in trainset.all_ratings():
        users.append(user_inner_id)
        items.append(item_inner_id)
        labels.append(1 if rating >= threshold else 0)  # Binarise the rating
    return (
        np.array(users, dtype=np.int32),
        np.array(items, dtype=np.int32),
        np.array(labels, dtype=np.float32)
    )

# Prepare the model inputs: inner-id arrays plus binary labels
train_users, train_items, train_labels = prepare_train_data(trainset)
# Test pairs whose user or item is missing from trainset are filtered out
test_users, test_items, test_labels = prepare_test_data(trainset, testset)

# Number of users/items known to the training set; used below as the
# input_dim (vocabulary size) of the embedding layers
n_users = trainset.n_users
n_items = trainset.n_items


# Создание модели нейронной сети
def rmse(y_true, y_pred):
    """Return the root-mean-square error between targets and predictions.

    The squared differences are averaged over all elements of the tensors
    before the square root is taken, so the result is a single scalar.
    """
    squared_error = tf.square(y_true - y_pred)
    mean_squared_error = tf.reduce_mean(squared_error)
    return tf.sqrt(mean_squared_error)

# Model inputs: one user id and one item id per sample
user_input = Input(shape=(1,), name='user_input')
item_input = Input(shape=(1,), name='item_input')

# Embedding layers initialised with the pretrained SVD factors. The output
# width is read from the factor matrices instead of being hard-coded, so the
# model stays valid if the SVD is retrained with a different n_factors.
user_embedding = Embedding(
    input_dim=n_users,
    output_dim=user_embeddings.shape[1],
    weights=[user_embeddings],
    trainable=True,
    name='user_embedding'
)(user_input)
user_vec = Flatten()(user_embedding)

item_embedding = Embedding(
    input_dim=n_items,
    output_dim=item_embeddings.shape[1],
    weights=[item_embeddings],
    trainable=True,
    name='item_embedding'
)(item_input)
item_vec = Flatten()(item_embedding)

# Concatenate both embeddings and pass them through the fully connected head
concat = Concatenate()([user_vec, item_vec])
dense1 = Dense(128, activation='relu')(concat)
dense2 = Dense(64, activation='relu')(dense1)
output = Dense(1, activation='sigmoid')(dense2)

# Binary classifier: cross-entropy loss, with accuracy and RMSE as metrics
model = Model(inputs=[user_input, item_input], outputs=output)
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy', rmse]
)

# Train the model; the returned History object keeps the per-epoch metrics
history = model.fit(
    [train_users, train_items],  # inputs, in the order [user_input, item_input]
    train_labels,  # binary targets
    batch_size=64,
    epochs=10,
    # The test arrays are evaluated after every epoch as validation data
    validation_data=([test_users, test_items], test_labels)
)

# Visualise the training curves
import matplotlib.pyplot as plt

# Keras stores accuracy under 'acc' in older releases and under 'accuracy' in
# newer ones; use whichever key this History actually contains.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'

plt.figure(figsize=(12, 4))

# Left panel: loss per epoch
plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.legend()
plt.title('Loss Evolution')

# Right panel: accuracy per epoch
plt.subplot(1, 2, 2)
plt.plot(history.history[acc_key], label='Train Accuracy')
plt.plot(history.history['val_' + acc_key], label='Validation Accuracy')
plt.legend()
plt.title('Accuracy Evolution')

# Save the figure next to the notebook
plt.savefig('training_metrics.png')

Train on 75000 samples, validate on 24956 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
