# Collaborative Filtering Using Matrix Factorization

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mlxtend.preprocessing import TransactionEncoder
from functools import lru_cache
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF
from sklearn.metrics import recall_score
import optuna
from utility_functions import *


In [2]:
# set seed for reproducibility
np.random.seed(0)


## Data Preparation

In [3]:
# define filepaths
products_path = "./data/products_train.csv"
train_sessions_path = "./data/sessions_train.csv"
test_sessions_path = "./data/sessions_test_task1.csv"


@lru_cache(maxsize=1)
def read_product_data():
    return pd.read_csv(products_path)


@lru_cache(maxsize=1)
def read_train_data():
    train_df = pd.read_csv(train_sessions_path)
    train_df["prev_items"] = (
        train_df["prev_items"].str.strip(
            "[']").str.replace("\n", "").str.split("' '")
    )
    train_df["all_items"] = train_df.apply(
        lambda row: list(row["prev_items"]) + [row["next_item"]], axis=1
    )
    return train_df


@lru_cache(maxsize=1)
def read_test_data():
    test_df = pd.read_csv(test_sessions_path)
    test_df["prev_items"] = (
        test_df["prev_items"].str.strip(
            "[']").str.replace("\n", "").str.split("' '")
    )
    test_df["all_items"] = test_df.apply(
        lambda row: list(row["prev_items"]) + [row["next_item"]], axis=1
    )
    return test_df


@lru_cache(maxsize=1)
def read_seperated_train_data():
    train_df = read_train_data()
    # seperate train datasets for each locale
    train_whole_de_df = train_df[train_df["locale"] == "DE"]
    train_whole_uk_df = train_df[train_df["locale"] == "UK"]
    train_whole_jp_df = train_df[train_df["locale"] == "JP"]
    # remove locale columns
    train_whole_de_df = train_whole_de_df.drop(columns="locale")
    train_whole_uk_df = train_whole_uk_df.drop(columns="locale")
    train_whole_jp_df = train_whole_jp_df.drop(columns="locale")
    # create seperate train and validation sets
    # this enables testing the models before making a submition to the competition
    train_de_df, val_de_df = train_test_split(train_whole_de_df, test_size=0.2)
    train_uk_df, val_uk_df = train_test_split(train_whole_uk_df, test_size=0.2)
    train_jp_df, val_jp_df = train_test_split(train_whole_jp_df, test_size=0.2)
    return (
        (train_de_df, val_de_df),
        (train_uk_df, val_uk_df),
        (train_jp_df, val_jp_df),
    )


@lru_cache(maxsize=1)
def read_sepereated_item_user_combinations():
    user_items_dfs = []
    for df_tuple in read_seperated_train_data():
        train_df = df_tuple[0]
        val_df = df_tuple[1]
        train_users_items = train_df["all_items"].explode()
        val_users_items = val_df["all_items"].explode()
        train_users_items = pd.DataFrame(
            {"user": train_users_items.index, "item": train_users_items}
        )
        val_users_items = pd.DataFrame(
            {"user": val_users_items.index, "item": val_users_items}
        )
        # drop duplicates -> binary values only
        train_users_items = train_users_items.drop_duplicates()
        val_users_items = val_users_items.drop_duplicates()
        # reset index
        train_users_items = train_users_items.reset_index()
        val_users_items = val_users_items.reset_index()
        user_items_dfs.append((train_users_items, val_users_items))
    return user_items_dfs


In [4]:
(
    (train_de_df, val_de_df),
    (train_uk_df, val_uk_df),
    (train_jp_df, val_jp_df),
) = read_seperated_train_data()


## Matrix Factorization

In [5]:
class MatrixFactorizationModel:
    def __init__(
        self,
        n_components,
        max_iter=200,
        init=None,
        solver="cd",
        beta_loss="frobenius",
        alpha_W=0.0,
        alpha_H="same",
        l1_ratio=0.0,
        random_state=0,
    ):
        self.n_components = n_components
        self.max_iter = max_iter
        self.init = init
        self.solver = solver
        self.beta_loss = beta_loss
        self.alpha_W = alpha_W
        self.alpha_H = alpha_H
        self.l1_ratio = l1_ratio
        self.random_state = random_state
        self.user_item_matrix_df = None
        self.W = None
        self.H = None

    def fit(self, dataset):
        # build a sparse dataframe to store sessions data
        te = TransactionEncoder()
        user_item_matrix = te.fit(dataset).transform(dataset, sparse=True)
        self.user_item_matrix_df = pd.DataFrame.sparse.from_spmatrix(
            user_item_matrix, columns=te.columns_, index=dataset.index
        )
        # perform non-negative matrix factorization and store user matrix (W) and item matrix (H) in this object
        model = NMF(
            n_components=self.n_components,
            max_iter=self.max_iter,
            init=self.init,
            solver=self.solver,
            beta_loss=self.beta_loss,
            alpha_W=self.alpha_W,
            alpha_H=self.alpha_H,
            l1_ratio=self.l1_ratio,
            random_state=self.random_state,
        )
        self.W = pd.DataFrame(
            model.fit_transform(self.user_item_matrix_df),
            index=self.user_item_matrix_df.index,
        )
        self.H = pd.DataFrame(
            model.components_, columns=self.user_item_matrix_df.columns
        )

    def get_recommendations(self, users=None, remove_already_bought_items=True, n=100):
        if type(users) != type(None):
            W_selection = self.W.loc[users]
            user_item_matrix_df_selection = self.user_item_matrix_df.loc[users]
        else:
            W_selection = self.W
            user_item_matrix_df_selection = self.user_item_matrix_df
        n_users = W_selection.shape[0]
        all_recom_items = []
        # iterate over user matrix in batches of 100 users -> memory
        for i in range(n_users // 100 + 1):
            from_idx = 100 * i
            to_idx = 100 * (i + 1) if 100 * (i + 1) <= n_users else n_users
            # reconstructed scores
            scores = np.dot(W_selection.iloc[from_idx:to_idx], self.H)
            true_scores = user_item_matrix_df_selection.iloc[from_idx:to_idx].values
            # if defined manually set scores to 0 for items that are already in the original set
            if remove_already_bought_items:
                scores[true_scores == 1] = 0
            # sort index by score, best item first...
            idx_best = np.flip(np.argsort(scores))[:, -n:]
            # ...and use index to retrieve item ids
            recom_items = user_item_matrix_df_selection.columns.values[idx_best]
            all_recom_items.append(recom_items)
        all_recom_items = np.concatenate(all_recom_items)
        return all_recom_items

In [6]:
# concatenate all_items column of train df with prev_items column of validation df
de_sessions = pd.concat([train_de_df["all_items"], val_de_df["prev_items"]])
# sample 500 users from the validation set to test the results
val_de_users_subset = np.random.choice(val_de_df.index.values, 500, replace=False)
val_de_subset_next_items = val_de_df.loc[val_de_users_subset, "next_item"]


def objective(trial):
    # build and train model on this data
    mf_model_de = MatrixFactorizationModel(
        n_components=trial.suggest_int("n_components", 5, 100),
        init=trial.suggest_categorical("init", ["random", "nndsvd", "nndsvda"]),
        # beta_loss=trial.suggest_categorical("beta_loss", ["frobenius", "kullback-leibler"]),
        alpha_W=trial.suggest_float("alpha_W_and_H", 0, 10),
        l1_ratio=trial.suggest_float("l1_ratio", 0, 1),
        max_iter=500,
    )
    mf_model_de.fit(de_sessions)

    # make recommendations for 500 users from the validation set
    val_de_subset_recoms = mf_model_de.get_recommendations(users=val_de_users_subset)

    # calculate mrr
    mrr_val_de_subset, rr_list = mean_reciprocal_rank(
        val_de_subset_recoms, val_de_subset_next_items
    )

    return mrr_val_de_subset


study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)

[32m[I 2023-04-27 15:49:14,931][0m A new study created in memory with name: no-name-d7c48af6-057a-49c0-953e-e878b8e0d183[0m
  array = array.astype(new_dtype)
[32m[I 2023-04-27 15:54:36,245][0m Trial 0 finished with value: 0.0 and parameters: {'n_components': 24, 'init': 'random', 'alpha_W_and_H': 8.337337193969478, 'l1_ratio': 0.5540777215449285}. Best is trial 0 with value: 0.0.[0m
  array = array.astype(new_dtype)
[32m[I 2023-04-27 16:02:05,046][0m Trial 1 finished with value: 0.0 and parameters: {'n_components': 63, 'init': 'nndsvda', 'alpha_W_and_H': 4.647421534933819, 'l1_ratio': 0.14685523755644592}. Best is trial 0 with value: 0.0.[0m
  array = array.astype(new_dtype)
[32m[I 2023-04-27 16:08:14,411][0m Trial 2 finished with value: 0.0 and parameters: {'n_components': 34, 'init': 'nndsvd', 'alpha_W_and_H': 7.4264728854301705, 'l1_ratio': 0.08559395773748546}. Best is trial 0 with value: 0.0.[0m
  array = array.astype(new_dtype)
[32m[I 2023-04-27 16:13:54,579][0m Tri

In [None]:
for trial in study.trials:
    hyperparams = trial.params
    result = trial.value
    print(f"Hyperparameters: {hyperparams}, Result: {result}")
