# Collaborative Filtering  
This notebook implements several collaborative filtering methods for generating recommendations: ALS, BPR, LMF, item-item nearest neighbours (NN) and BM25. ALS and BPR were implemented in the original study; I implemented the rest.

In [None]:
# Mount Google Drive so the project folder is reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')
# Root folder of the project on Drive; later cells build data and model paths from it.
path='/content/drive/MyDrive/RecSys_206894495'
# Execute the shared evaluation notebook inside this kernel.
# NOTE(review): presumably this is what defines evaluate_model, get_outfit_id_from_index,
# evaluate_val_metrics_at_n, evaluate_df_metrics_at_n and retrieve_best_model_params_from_file,
# which are called below but not defined in this notebook - confirm.
%run /content/drive/MyDrive/RecSys_206894495/models/evaluate_models.ipynb

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Install third-party packages on the Colab runtime.
# `implicit` provides the recommender models imported in the next cell.
# NOTE(review): versions are not pinned, so a later run may pick up a different release.
!pip install implicit
#!pip install surprise
# NOTE(review): presumably needed as the parquet engine behind pd.read_parquet - confirm.
!pip install pyarrow



In [None]:
#import
#warning
import warnings
warnings.filterwarnings('ignore')

#general
import pandas as pd
import numpy as np
from itertools import product
import json

#for predictions
import implicit
from implicit.bpr import BayesianPersonalizedRanking
from implicit.lmf import LogisticMatrixFactorization
from implicit.nearest_neighbours import ItemItemRecommender, BM25Recommender
from scipy.sparse import coo_matrix



In [None]:
# Load the two parquet inputs from the project folder.
# `user_splits` is the frame every model below is trained and evaluated on;
# `outfits` is loaded as well but is not referenced in the visible part of this notebook.
user_splits, outfits = (
    pd.read_parquet(path + relative_path)
    for relative_path in ('/models/user_splits.parquet', '/archive/data/outfits.parquet')
)

### ALS

In [None]:
def train_als_model(user_splits_df, outfit_column, factors=16, regularization=0.1, iterations=50, random_state=42):
    """Fit an implicit-feedback ALS model on one list-valued column of the splits frame.

    Each row of ``user_splits_df`` is treated as one user (identified by the
    frame's index) and ``outfit_column`` as the list of items that user
    interacted with. Every (user, item) pair becomes an entry of value 1 in a
    sparse user x item matrix, on which the model is trained.

    Parameters
    ----------
    user_splits_df : pandas.DataFrame
        Frame indexed by user id. It is not modified.
    outfit_column : str
        Name of the list-valued column to explode into interactions.
    factors, regularization, iterations
        Forwarded unchanged to ``implicit.als.AlternatingLeastSquares``.
    random_state : int or None, default 42
        Seed handed to the model so that its factor initialisation is
        repeatable. The default keeps the seed value used previously.

    Returns
    -------
    tuple
        ``(model, csr, outfit_to_index, user_to_index)``: the fitted model, the
        CSR user x item matrix it was fitted on, and the dictionaries mapping
        item ids / user ids to matrix column / row numbers.

    Raises
    ------
    ValueError
        If a negative row or column index is produced (sanity check).
    """
    # One row per (user, item) interaction. explode() returns a new frame, so
    # the helper columns added below never reach the caller's frame.
    flat_df = user_splits_df.explode(outfit_column)
    flat_df["value"] = 1

    # Items are numbered in order of first appearance.
    unique_outfit_ids = flat_df[outfit_column].unique()
    outfit_to_index = {outfit_id: i for i, outfit_id in enumerate(unique_outfit_ids)}
    flat_df["outfit_index"] = flat_df[outfit_column].map(outfit_to_index)

    # Users are numbered the same way, based on the (repeated) exploded index.
    unique_users = pd.unique(flat_df.index)
    user_to_index = {user_id: i for i, user_id in enumerate(unique_users)}
    flat_df['user_index'] = flat_df.index.map(user_to_index)

    # Sanity check before building the sparse matrix.
    if (flat_df['outfit_index'] < 0).any():
        raise ValueError('Negative outfit_index found')
    if (flat_df['user_index'] < 0).any():
        raise ValueError('Negative user_index found')

    # Repeated (user, item) pairs are summed when the COO matrix becomes CSR.
    coo = coo_matrix(
        (flat_df['value'].values, (flat_df['user_index'].values, flat_df['outfit_index'].values)),
        shape=(len(unique_users), len(unique_outfit_ids))
    )

    # implicit builds its own random generator from the `random_state`
    # constructor argument; seeding NumPy's global RNG does not reach it, so the
    # seed has to be passed to the model explicitly. The global seed call is
    # kept so any code relying on the global RNG state sees no change.
    np.random.seed(random_state)
    model = implicit.als.AlternatingLeastSquares(
        factors=factors,
        regularization=regularization,
        iterations=iterations,
        random_state=random_state,
    )
    csr = coo.tocsr()
    model.fit(csr)

    return model, csr, outfit_to_index, user_to_index

def run_als_training_loop(df, factors, regularization):
    """Train the two ALS models and attach their recommendations to ``df``.

    One model is fitted on the ``train_outfit_ids`` column and one on the
    ``train_group`` column. ``df`` is modified in place (columns ``user_index``,
    ``id_prediction`` and ``group_prediction`` are added) and also returned.

    ``evaluate_model`` and ``get_outfit_id_from_index`` are defined outside this
    notebook. NOTE(review): presumably the first returns the model's top-100
    item indices for a user and the second maps them back to ids - confirm.
    """
    def _invert(mapping):
        # id -> matrix position becomes matrix position -> id.
        return {position: identifier for identifier, position in mapping.items()}

    def _attach_predictions(column, model, csr, index_to_id):
        # First pass stores the raw model output per user, second pass
        # translates it through the index -> id lookup.
        df[column] = df.apply(lambda row: evaluate_model(model, csr, row["user_index"], n=100), axis=1)
        df[column] = df.apply(lambda row: get_outfit_id_from_index(row[column], index_to_id), axis=1)

    model_ind, csr_ind, outfit_id_to_index, user_to_index = train_als_model(
        df, "train_outfit_ids", factors=factors, regularization=regularization
    )
    model_group, csr_group, outfit_group_to_index, _ = train_als_model(
        df, "train_group", factors=factors, regularization=regularization
    )

    # The user numbering returned for the first model is used for both models.
    df["user_index"] = df.index.map(user_to_index)

    _attach_predictions("id_prediction", model_ind, csr_ind, _invert(outfit_id_to_index))
    _attach_predictions("group_prediction", model_group, csr_group, _invert(outfit_group_to_index))

    return df


In [None]:
# Hyper-parameter grid search for ALS.
TEST_FACTORS = [48, 32, 24]
TEST_REGULARIZATIONS = [0.02, 0.01, 0.005]

# Every (factors, regularization) pair, in the order product() yields them.
test_permutations = list(product(TEST_FACTORS, TEST_REGULARIZATIONS))

for test_factors, test_regularization in test_permutations:
    # Work on a copy: the training loop adds columns to the frame it is given.
    df = user_splits.copy()
    dft = run_als_training_loop(df, test_factors, test_regularization)
    param_dict = dict(test_factors=test_factors, test_regularization=test_regularization)
    # NOTE(review): evaluate_val_metrics_at_n is defined outside this notebook; presumably it
    # records the validation metrics together with param_dict for later lookup - confirm.
    user_splits_df, all_dict = evaluate_val_metrics_at_n(dft, 'ALS', n=10, model_params=param_dict)

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

In [None]:
# Retrieve the best model parameters from the JSON file for the specified method
best_model_params = retrieve_best_model_params_from_file(path + "/models/val.json", 'ALS', n=10)

# Work on a copy: the training loop adds columns to the frame it is given.
df = user_splits.copy()
test_factors = best_model_params['test_factors']
# Spelled exactly like the name passed to the training call below, so the
# tuned value is the one that gets used.
test_regularization = best_model_params['test_regularization']

# Train the model with the best parameters
dft = run_als_training_loop(df, test_factors, test_regularization)
# Evaluate the model
user_splits_df, all_dict = evaluate_df_metrics_at_n(dft, 'ALS', n=10)

  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

Unnamed: 0,0
id_hit_rate_at_10,0.029502
id_precision_at_10,0.003104
id_recall_at_10,0.01055
id_f1_score_at_10,0.00396
group_hit_rate_at_10,0.042842
group_precision_at_10,0.004746
group_recall_at_10,0.013871
group_f1_score_at_10,0.005597
method_name,ALS


### BPR

In [None]:
def train_bpr_model(user_splits_df, outfit_column, factors=16, regularization=0.1, learning_rate=0.1, iterations=500, random_state=43):
    """Fit a Bayesian Personalized Ranking model on one list-valued column.

    Each row of ``user_splits_df`` is treated as one user (identified by the
    frame's index) and ``outfit_column`` as the list of items that user
    interacted with. Every (user, item) pair becomes an entry of value 1 in a
    sparse user x item matrix, on which the model is trained.

    Parameters
    ----------
    user_splits_df : pandas.DataFrame
        Frame indexed by user id. It is not modified.
    outfit_column : str
        Name of the list-valued column to explode into interactions.
    factors, regularization, learning_rate, iterations
        Forwarded unchanged to ``BayesianPersonalizedRanking``.
    random_state : int or None, default 43
        Seed handed to the model so that its initialisation and sampling are
        seeded. The default keeps the seed value used previously.

    Returns
    -------
    tuple
        ``(model, csr, outfit_to_index, user_to_index)``: the fitted model, the
        CSR user x item matrix it was fitted on, and the dictionaries mapping
        item ids / user ids to matrix column / row numbers.

    Raises
    ------
    ValueError
        If a negative row or column index is produced (sanity check).
    """
    # One row per (user, item) interaction. explode() returns a new frame, so
    # the helper columns added below never reach the caller's frame.
    flat_df = user_splits_df.explode(outfit_column)
    flat_df["value"] = 1

    # Items are numbered in order of first appearance.
    unique_outfit_ids = flat_df[outfit_column].unique()
    outfit_to_index = {outfit_id: i for i, outfit_id in enumerate(unique_outfit_ids)}
    flat_df["outfit_index"] = flat_df[outfit_column].map(outfit_to_index)

    # Users are numbered the same way, based on the (repeated) exploded index.
    unique_users = pd.unique(flat_df.index)
    user_to_index = {user_id: i for i, user_id in enumerate(unique_users)}
    flat_df['user_index'] = flat_df.index.map(user_to_index)

    # Sanity check before building the sparse matrix.
    if (flat_df['outfit_index'] < 0).any():
        raise ValueError('Negative outfit_index found')
    if (flat_df['user_index'] < 0).any():
        raise ValueError('Negative user_index found')

    # Repeated (user, item) pairs are summed when the COO matrix becomes CSR.
    coo = coo_matrix(
        (flat_df['value'].values, (flat_df['user_index'].values, flat_df['outfit_index'].values)),
        shape=(len(unique_users), len(unique_outfit_ids))
    )

    # implicit builds its own random generator from the `random_state`
    # constructor argument; seeding NumPy's global RNG does not reach it, so the
    # seed has to be passed to the model explicitly. The global seed call is
    # kept so any code relying on the global RNG state sees no change.
    # NOTE(review): multi-threaded BPR training may still vary between runs - confirm.
    np.random.seed(random_state)
    model = BayesianPersonalizedRanking(
        factors=factors,
        regularization=regularization,
        iterations=iterations,
        learning_rate=learning_rate,
        random_state=random_state,
    )
    csr = coo.tocsr()
    model.fit(csr)

    return model, csr, outfit_to_index, user_to_index


def run_bpr_training_loop(df, factors, regularization, learning_rate):
    """Train the two BPR models and attach their recommendations to ``df``.

    One model is fitted on the ``train_outfit_ids`` column and one on the
    ``train_group`` column. ``df`` is modified in place (columns ``user_index``,
    ``id_prediction`` and ``group_prediction`` are added) and also returned.

    ``evaluate_model`` and ``get_outfit_id_from_index`` are defined outside this
    notebook. NOTE(review): presumably the first returns the model's top-100
    item indices for a user and the second maps them back to ids - confirm.
    """
    def _invert(mapping):
        # id -> matrix position becomes matrix position -> id.
        return {position: identifier for identifier, position in mapping.items()}

    def _attach_predictions(column, model, csr, index_to_id):
        # First pass stores the raw model output per user, second pass
        # translates it through the index -> id lookup.
        df[column] = df.apply(lambda row: evaluate_model(model, csr, row["user_index"], n=100), axis=1)
        df[column] = df.apply(lambda row: get_outfit_id_from_index(row[column], index_to_id), axis=1)

    model_ind, csr_ind, outfit_id_to_index, user_to_index = train_bpr_model(
        df, "train_outfit_ids", factors=factors, regularization=regularization, learning_rate=learning_rate
    )
    model_group, csr_group, outfit_group_to_index, _ = train_bpr_model(
        df, "train_group", factors=factors, regularization=regularization, learning_rate=learning_rate
    )

    # The user numbering returned for the first model is used for both models.
    df["user_index"] = df.index.map(user_to_index)

    _attach_predictions("id_prediction", model_ind, csr_ind, _invert(outfit_id_to_index))
    _attach_predictions("group_prediction", model_group, csr_group, _invert(outfit_group_to_index))

    return df

In [None]:
# Hyper-parameter grid search for BPR.
# NOTE(review): 216 sits between 128 and 64 in this list; it may have been meant as 256 - confirm.
TEST_FACTORS = [128, 216, 64]
TEST_REGULARIZATIONS = [0.01, 0.02, 0.005]
TEST_LEARNING_RATES = [0.01, 0.02, 0.005]

# Every (factors, regularization, learning rate) triple, in the order product() yields them.
test_permutations = list(product(TEST_FACTORS, TEST_REGULARIZATIONS, TEST_LEARNING_RATES))

for test_factors, test_regularization, test_learning_rate in test_permutations:
    # Work on a copy: the training loop adds columns to the frame it is given.
    df = user_splits.copy()
    dft = run_bpr_training_loop(df, test_factors, test_regularization, test_learning_rate)
    param_dict = dict(
        test_factors=test_factors,
        test_regularization=test_regularization,
        test_learning_rate=test_learning_rate,
    )
    # NOTE(review): evaluate_val_metrics_at_n is defined outside this notebook; presumably it
    # records the validation metrics together with param_dict for later lookup - confirm.
    user_splits_df, all_dict = evaluate_val_metrics_at_n(dft, 'BPR', n=10, model_params=param_dict)

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

In [None]:
# Retrieve the best model parameters from the JSON file for the specified method
best_model_params = retrieve_best_model_params_from_file(path + "/models/val.json", 'BPR', n=10)

# Work on a copy: the training loop adds columns to the frame it is given.
df = user_splits.copy()
test_factors = best_model_params['test_factors']
# Must be spelled exactly like the name passed to the training call below;
# otherwise the call silently picks up whatever value the grid-search loop
# left in `test_regularization` instead of the tuned one.
test_regularization = best_model_params['test_regularization']
test_learning_rate = best_model_params['test_learning_rate']

# Train the model with the best parameters
dft = run_bpr_training_loop(df, test_factors, test_regularization, test_learning_rate)
# Evaluate the model
user_splits_df, all_dict = evaluate_df_metrics_at_n(dft, 'BPR', n=10)

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

Unnamed: 0,0
id_hit_rate_at_10,0.026167
id_precision_at_10,0.002617
id_recall_at_10,0.011103
id_f1_score_at_10,0.003398
group_hit_rate_at_10,0.031811
group_precision_at_10,0.003309
group_recall_at_10,0.012674
group_f1_score_at_10,0.004222
method_name,BPR


### LMF

In [None]:
def train_lmf_model(user_splits_df, outfit_column, factors=16, regularization=0.1, learning_rate=0.1, iterations=500, random_state=43):
    """Fit a Logistic Matrix Factorization model on one list-valued column.

    Each row of ``user_splits_df`` is treated as one user (identified by the
    frame's index) and ``outfit_column`` as the list of items that user
    interacted with. Every (user, item) pair becomes an entry of value 1 in a
    sparse user x item matrix, on which the model is trained.

    Parameters
    ----------
    user_splits_df : pandas.DataFrame
        Frame indexed by user id. It is not modified.
    outfit_column : str
        Name of the list-valued column to explode into interactions.
    factors, regularization, learning_rate, iterations
        Forwarded unchanged to ``LogisticMatrixFactorization``.
    random_state : int or None, default 43
        Seed handed to the model so that its initialisation and sampling are
        seeded. The default keeps the seed value used previously.

    Returns
    -------
    tuple
        ``(model, csr, outfit_to_index, user_to_index)``: the fitted model, the
        CSR user x item matrix it was fitted on, and the dictionaries mapping
        item ids / user ids to matrix column / row numbers.

    Raises
    ------
    ValueError
        If a negative row or column index is produced (sanity check).
    """
    # One row per (user, item) interaction. explode() returns a new frame, so
    # the helper columns added below never reach the caller's frame.
    flat_df = user_splits_df.explode(outfit_column)
    flat_df["value"] = 1

    # Items are numbered in order of first appearance.
    unique_outfit_ids = flat_df[outfit_column].unique()
    outfit_to_index = {outfit_id: i for i, outfit_id in enumerate(unique_outfit_ids)}
    flat_df["outfit_index"] = flat_df[outfit_column].map(outfit_to_index)

    # Users are numbered the same way, based on the (repeated) exploded index.
    unique_users = pd.unique(flat_df.index)
    user_to_index = {user_id: i for i, user_id in enumerate(unique_users)}
    flat_df['user_index'] = flat_df.index.map(user_to_index)

    # Sanity check before building the sparse matrix.
    if (flat_df['outfit_index'] < 0).any():
        raise ValueError('Negative outfit_index found')
    if (flat_df['user_index'] < 0).any():
        raise ValueError('Negative user_index found')

    # Repeated (user, item) pairs are summed when the COO matrix becomes CSR.
    coo = coo_matrix(
        (flat_df['value'].values, (flat_df['user_index'].values, flat_df['outfit_index'].values)),
        shape=(len(unique_users), len(unique_outfit_ids))
    )

    # implicit builds its own random generator from the `random_state`
    # constructor argument; seeding NumPy's global RNG does not reach it, so the
    # seed has to be passed to the model explicitly. The global seed call is
    # kept so any code relying on the global RNG state sees no change.
    np.random.seed(random_state)
    model = LogisticMatrixFactorization(
        factors=factors,
        regularization=regularization,
        iterations=iterations,
        learning_rate=learning_rate,
        random_state=random_state,
    )
    csr = coo.tocsr()
    model.fit(csr)

    return model, csr, outfit_to_index, user_to_index

def run_lmf_training_loop(df, factors, regularization, learning_rate):
    """Train the two LMF models and attach their recommendations to ``df``.

    One model is fitted on the ``train_outfit_ids`` column and one on the
    ``train_group`` column. ``df`` is modified in place (columns ``user_index``,
    ``id_prediction`` and ``group_prediction`` are added) and also returned.

    ``evaluate_model`` and ``get_outfit_id_from_index`` are defined outside this
    notebook. NOTE(review): presumably the first returns the model's top-100
    item indices for a user and the second maps them back to ids - confirm.
    """
    def _invert(mapping):
        # id -> matrix position becomes matrix position -> id.
        return {position: identifier for identifier, position in mapping.items()}

    def _attach_predictions(column, model, csr, index_to_id):
        # First pass stores the raw model output per user, second pass
        # translates it through the index -> id lookup.
        df[column] = df.apply(lambda row: evaluate_model(model, csr, row["user_index"], n=100), axis=1)
        df[column] = df.apply(lambda row: get_outfit_id_from_index(row[column], index_to_id), axis=1)

    model_ind, csr_ind, outfit_id_to_index, user_to_index = train_lmf_model(
        df, "train_outfit_ids", factors=factors, regularization=regularization, learning_rate=learning_rate
    )
    model_group, csr_group, outfit_group_to_index, _ = train_lmf_model(
        df, "train_group", factors=factors, regularization=regularization, learning_rate=learning_rate
    )

    # The user numbering returned for the first model is used for both models.
    df["user_index"] = df.index.map(user_to_index)

    _attach_predictions("id_prediction", model_ind, csr_ind, _invert(outfit_id_to_index))
    _attach_predictions("group_prediction", model_group, csr_group, _invert(outfit_group_to_index))

    return df




In [None]:
# Hyper-parameter grid search for LMF.
# NOTE(review): 216 sits between 128 and 64 in this list; it may have been meant as 256 - confirm.
TEST_FACTORS = [128, 216, 64]
TEST_REGULARIZATIONS = [0.01, 0.02, 0.005]
TEST_LEARNING_RATES = [0.01, 0.02, 0.005]

# Every (factors, regularization, learning rate) triple, in the order product() yields them.
test_permutations = list(product(TEST_FACTORS, TEST_REGULARIZATIONS, TEST_LEARNING_RATES))

for test_factors, test_regularization, test_learning_rate in test_permutations:
    # Work on a copy: the training loop adds columns to the frame it is given.
    df = user_splits.copy()
    dft = run_lmf_training_loop(df, test_factors, test_regularization, test_learning_rate)
    param_dict = dict(
        test_factors=test_factors,
        test_regularization=test_regularization,
        test_learning_rate=test_learning_rate,
    )
    # NOTE(review): evaluate_val_metrics_at_n is defined outside this notebook; presumably it
    # records the validation metrics together with param_dict for later lookup - confirm.
    user_splits_df, all_dict = evaluate_val_metrics_at_n(dft, 'LMF', n=10, model_params=param_dict)

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

In [None]:
# Retrieve the best model parameters from the JSON file for the specified method
best_model_params = retrieve_best_model_params_from_file(path + "/models/val.json", 'LMF', n=10)

# Work on a copy: the training loop adds columns to the frame it is given.
df = user_splits.copy()
test_factors = best_model_params['test_factors']
# Must be spelled exactly like the name passed to the training call below;
# otherwise the call silently picks up whatever value the grid-search loop
# left in `test_regularization` instead of the tuned one.
test_regularization = best_model_params['test_regularization']
test_learning_rate = best_model_params['test_learning_rate']

# Train the model with the best parameters
dft = run_lmf_training_loop(df, test_factors, test_regularization, test_learning_rate)
# Evaluate the model
user_splits_df, all_dict = evaluate_df_metrics_at_n(dft, 'LMF', n=10)

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

Unnamed: 0,0
id_hit_rate_at_10,0.001796
id_precision_at_10,0.00018
id_recall_at_10,0.000531
id_f1_score_at_10,0.000233
group_hit_rate_at_10,0.003335
group_precision_at_10,0.000334
group_recall_at_10,0.001025
group_f1_score_at_10,0.000392
method_name,LMF


### NN

In [None]:
def train_nn_model(user_splits_df, outfit_column, k=20):
    """Fit an item-item nearest-neighbour recommender on one list-valued column.

    Rows of ``user_splits_df`` are users (identified by the frame's index) and
    ``outfit_column`` holds each user's list of items. Every (user, item) pair
    becomes a 1.0 entry in a float64 user x item matrix.

    Returns ``(model, csr, outfit_to_index, user_to_index)``: the fitted
    ``ItemItemRecommender(K=k)``, the CSR matrix it was fitted on, and the
    id -> column / id -> row dictionaries. The input frame is not modified.

    Raises ``ValueError`` if a negative row or column index is produced.
    """
    interactions = user_splits_df.explode(outfit_column)
    interactions["value"] = 1

    # Columns: items numbered in order of first appearance.
    distinct_outfits = interactions[outfit_column].unique()
    outfit_to_index = dict(zip(distinct_outfits, range(len(distinct_outfits))))
    interactions["outfit_index"] = interactions[outfit_column].map(outfit_to_index)

    # Rows: users numbered in order of first appearance in the exploded index.
    distinct_users = pd.unique(interactions.index)
    user_to_index = dict(zip(distinct_users, range(len(distinct_users))))
    interactions["user_index"] = interactions.index.map(user_to_index)

    # Sanity check on both index columns before building the matrix.
    for index_column, message in (
        ("outfit_index", 'Negative outfit_index found'),
        ("user_index", 'Negative user_index found'),
    ):
        if (interactions[index_column] < 0).any():
            raise ValueError(message)

    weights = interactions["value"].astype(np.float64).values
    row_ids = interactions["user_index"].values
    col_ids = interactions["outfit_index"].values
    coo = coo_matrix(
        (weights, (row_ids, col_ids)),
        shape=(len(distinct_users), len(distinct_outfits)),
    )

    np.random.seed(43)
    model = ItemItemRecommender(K=k)
    csr = coo.tocsr()
    model.fit(csr)

    return model, csr, outfit_to_index, user_to_index

def run_nn_training_loop(df, k):
    """Train the two nearest-neighbour models and attach their recommendations to ``df``.

    One model is fitted on the ``train_outfit_ids`` column and one on the
    ``train_group`` column. ``df`` is modified in place (columns ``user_index``,
    ``id_prediction`` and ``group_prediction`` are added) and also returned.

    ``evaluate_model`` and ``get_outfit_id_from_index`` are defined outside this
    notebook. NOTE(review): presumably the first returns the model's top-100
    item indices for a user and the second maps them back to ids - confirm.
    """
    def _invert(mapping):
        # id -> matrix position becomes matrix position -> id.
        return {position: identifier for identifier, position in mapping.items()}

    def _attach_predictions(column, model, csr, index_to_id):
        # First pass stores the raw model output per user, second pass
        # translates it through the index -> id lookup.
        df[column] = df.apply(lambda row: evaluate_model(model, csr, row["user_index"], n=100), axis=1)
        df[column] = df.apply(lambda row: get_outfit_id_from_index(row[column], index_to_id), axis=1)

    model_ind, csr_ind, outfit_id_to_index, user_to_index = train_nn_model(df, "train_outfit_ids", k)
    model_group, csr_group, outfit_group_to_index, _ = train_nn_model(df, "train_group", k)

    # The user numbering returned for the first model is used for both models.
    df["user_index"] = df.index.map(user_to_index)

    _attach_predictions("id_prediction", model_ind, csr_ind, _invert(outfit_id_to_index))
    _attach_predictions("group_prediction", model_group, csr_group, _invert(outfit_group_to_index))

    return df

In [None]:
# Hyper-parameter search for the item-item nearest-neighbour model:
# try each candidate value of K (passed to ItemItemRecommender as K).
K = [20, 10, 15]

for k in K:
    # Work on a copy: the training loop adds columns to the frame it is given.
    df = user_splits.copy()
    dft = run_nn_training_loop(df, k)
    param_dict = dict(k=k)
    # NOTE(review): evaluate_val_metrics_at_n is defined outside this notebook; presumably it
    # records the validation metrics together with param_dict for later lookup - confirm.
    user_splits_df, all_dict = evaluate_val_metrics_at_n(dft, 'NN', n=10, model_params=param_dict)

  0%|          | 0/10768 [00:00<?, ?it/s]

  0%|          | 0/7211 [00:00<?, ?it/s]

  0%|          | 0/10768 [00:00<?, ?it/s]

  0%|          | 0/7211 [00:00<?, ?it/s]

  0%|          | 0/10768 [00:00<?, ?it/s]

  0%|          | 0/7211 [00:00<?, ?it/s]

In [None]:
# Look up the best-scoring NN parameters recorded in val.json.
best_model_params = retrieve_best_model_params_from_file(path + "/models/val.json", 'NN', n=10)

# Work on a copy: the training loop adds columns to the frame it is given.
df = user_splits.copy()
k = best_model_params['k']

# Retrain with the selected K, then compute the final metrics at n=10.
dft = run_nn_training_loop(df, k)
user_splits_df, all_dict = evaluate_df_metrics_at_n(dft, 'NN', n=10)

  0%|          | 0/10768 [00:00<?, ?it/s]

  0%|          | 0/7211 [00:00<?, ?it/s]

Unnamed: 0,0
id_hit_rate_at_10,0.023345
id_precision_at_10,0.00236
id_recall_at_10,0.01228
id_f1_score_at_10,0.003316
group_hit_rate_at_10,0.038738
group_precision_at_10,0.004028
group_recall_at_10,0.018808
group_f1_score_at_10,0.005489
method_name,NN


### BM25

In [None]:
def train_BM25_model(user_splits_df, outfit_column, k=20, k1=100, b=0.8):
    """Fit a BM25-weighted item-item recommender on one list-valued column.

    Rows of ``user_splits_df`` are users (identified by the frame's index) and
    ``outfit_column`` holds each user's list of items. Every (user, item) pair
    becomes a 1.0 entry in a float64 user x item matrix.

    Returns ``(model, csr, outfit_to_index, user_to_index)``: the fitted
    ``BM25Recommender(K=k, K1=k1, B=b)``, the CSR matrix it was fitted on, and
    the id -> column / id -> row dictionaries. The input frame is not modified.

    Raises ``ValueError`` if a negative row or column index is produced.
    """
    interactions = user_splits_df.explode(outfit_column)
    interactions["value"] = 1

    # Columns: items numbered in order of first appearance.
    distinct_outfits = interactions[outfit_column].unique()
    outfit_to_index = dict(zip(distinct_outfits, range(len(distinct_outfits))))
    interactions["outfit_index"] = interactions[outfit_column].map(outfit_to_index)

    # Rows: users numbered in order of first appearance in the exploded index.
    distinct_users = pd.unique(interactions.index)
    user_to_index = dict(zip(distinct_users, range(len(distinct_users))))
    interactions["user_index"] = interactions.index.map(user_to_index)

    # Sanity check on both index columns before building the matrix.
    for index_column, message in (
        ("outfit_index", 'Negative outfit_index found'),
        ("user_index", 'Negative user_index found'),
    ):
        if (interactions[index_column] < 0).any():
            raise ValueError(message)

    weights = interactions["value"].astype(np.float64).values
    row_ids = interactions["user_index"].values
    col_ids = interactions["outfit_index"].values
    coo = coo_matrix(
        (weights, (row_ids, col_ids)),
        shape=(len(distinct_users), len(distinct_outfits)),
    )

    np.random.seed(43)
    model = BM25Recommender(K=k, K1=k1, B=b)
    csr = coo.tocsr()
    model.fit(csr)

    return model, csr, outfit_to_index, user_to_index

def run_BM25_training_loop(df, k, k1, b, run_name=""):
    """Train the two BM25 models and attach their recommendations to ``df``.

    One model is fitted on the ``train_outfit_ids`` column and one on the
    ``train_group`` column. ``df`` is modified in place (columns ``user_index``,
    ``id_prediction`` and ``group_prediction`` are added) and also returned.
    ``run_name`` is accepted but not used by this function.

    ``evaluate_model`` and ``get_outfit_id_from_index`` are defined outside this
    notebook. NOTE(review): presumably the first returns the model's top-100
    item indices for a user and the second maps them back to ids - confirm.
    """
    def _invert(mapping):
        # id -> matrix position becomes matrix position -> id.
        return {position: identifier for identifier, position in mapping.items()}

    def _attach_predictions(column, model, csr, index_to_id):
        # First pass stores the raw model output per user, second pass
        # translates it through the index -> id lookup.
        df[column] = df.apply(lambda row: evaluate_model(model, csr, row["user_index"], n=100), axis=1)
        df[column] = df.apply(lambda row: get_outfit_id_from_index(row[column], index_to_id), axis=1)

    model_ind, csr_ind, outfit_id_to_index, user_to_index = train_BM25_model(df, "train_outfit_ids", k, k1, b)
    model_group, csr_group, outfit_group_to_index, _ = train_BM25_model(df, "train_group", k, k1, b)

    # The user numbering returned for the first model is used for both models.
    df["user_index"] = df.index.map(user_to_index)

    _attach_predictions("id_prediction", model_ind, csr_ind, _invert(outfit_id_to_index))
    _attach_predictions("group_prediction", model_group, csr_group, _invert(outfit_group_to_index))

    return df

In [None]:
# Hyper-parameter grid search for BM25: the three lists are passed to
# BM25Recommender as K, K1 and B respectively.
K = [20, 10, 15]
K1 = [100, 150, 50, 200]
B = [0.8, 0.9, 0.75]

# Every (k, k1, b) triple, in the order product() yields them.
test_permutations = list(product(K, K1, B))

for k, k1, b in test_permutations:
    # Work on a copy: the training loop adds columns to the frame it is given.
    df = user_splits.copy()
    dft = run_BM25_training_loop(df, k, k1, b)
    param_dict = dict(k=k, k1=k1, b=b)
    # NOTE(review): evaluate_val_metrics_at_n is defined outside this notebook; presumably it
    # records the validation metrics together with param_dict for later lookup - confirm.
    user_splits_df, all_dict = evaluate_val_metrics_at_n(dft, 'BM25', n=10, model_params=param_dict)

  0%|          | 0/10768 [00:00<?, ?it/s]

  0%|          | 0/7211 [00:00<?, ?it/s]

  0%|          | 0/10768 [00:00<?, ?it/s]

  0%|          | 0/7211 [00:00<?, ?it/s]

  0%|          | 0/10768 [00:00<?, ?it/s]

  0%|          | 0/7211 [00:00<?, ?it/s]

  0%|          | 0/10768 [00:00<?, ?it/s]

  0%|          | 0/7211 [00:00<?, ?it/s]

  0%|          | 0/10768 [00:00<?, ?it/s]

  0%|          | 0/7211 [00:00<?, ?it/s]

  0%|          | 0/10768 [00:00<?, ?it/s]

  0%|          | 0/7211 [00:00<?, ?it/s]

  0%|          | 0/10768 [00:00<?, ?it/s]

  0%|          | 0/7211 [00:00<?, ?it/s]

  0%|          | 0/10768 [00:00<?, ?it/s]

  0%|          | 0/7211 [00:00<?, ?it/s]

  0%|          | 0/10768 [00:00<?, ?it/s]

  0%|          | 0/7211 [00:00<?, ?it/s]

  0%|          | 0/10768 [00:00<?, ?it/s]

  0%|          | 0/7211 [00:00<?, ?it/s]

  0%|          | 0/10768 [00:00<?, ?it/s]

  0%|          | 0/7211 [00:00<?, ?it/s]

  0%|          | 0/10768 [00:00<?, ?it/s]

  0%|          | 0/7211 [00:00<?, ?it/s]

  0%|          | 0/10768 [00:00<?, ?it/s]

  0%|          | 0/7211 [00:00<?, ?it/s]

  0%|          | 0/10768 [00:00<?, ?it/s]

  0%|          | 0/7211 [00:00<?, ?it/s]

  0%|          | 0/10768 [00:00<?, ?it/s]

  0%|          | 0/7211 [00:00<?, ?it/s]

  0%|          | 0/10768 [00:00<?, ?it/s]

  0%|          | 0/7211 [00:00<?, ?it/s]

  0%|          | 0/10768 [00:00<?, ?it/s]

  0%|          | 0/7211 [00:00<?, ?it/s]

  0%|          | 0/10768 [00:00<?, ?it/s]

  0%|          | 0/7211 [00:00<?, ?it/s]

  0%|          | 0/10768 [00:00<?, ?it/s]

  0%|          | 0/7211 [00:00<?, ?it/s]

  0%|          | 0/10768 [00:00<?, ?it/s]

  0%|          | 0/7211 [00:00<?, ?it/s]

  0%|          | 0/10768 [00:00<?, ?it/s]

  0%|          | 0/7211 [00:00<?, ?it/s]

  0%|          | 0/10768 [00:00<?, ?it/s]

  0%|          | 0/7211 [00:00<?, ?it/s]

  0%|          | 0/10768 [00:00<?, ?it/s]

  0%|          | 0/7211 [00:00<?, ?it/s]

  0%|          | 0/10768 [00:00<?, ?it/s]

  0%|          | 0/7211 [00:00<?, ?it/s]

  0%|          | 0/10768 [00:00<?, ?it/s]

  0%|          | 0/7211 [00:00<?, ?it/s]

  0%|          | 0/10768 [00:00<?, ?it/s]

  0%|          | 0/7211 [00:00<?, ?it/s]

  0%|          | 0/10768 [00:00<?, ?it/s]

  0%|          | 0/7211 [00:00<?, ?it/s]

  0%|          | 0/10768 [00:00<?, ?it/s]

  0%|          | 0/7211 [00:00<?, ?it/s]

  0%|          | 0/10768 [00:00<?, ?it/s]

  0%|          | 0/7211 [00:00<?, ?it/s]

  0%|          | 0/10768 [00:00<?, ?it/s]

  0%|          | 0/7211 [00:00<?, ?it/s]

  0%|          | 0/10768 [00:00<?, ?it/s]

  0%|          | 0/7211 [00:00<?, ?it/s]

  0%|          | 0/10768 [00:00<?, ?it/s]

  0%|          | 0/7211 [00:00<?, ?it/s]

  0%|          | 0/10768 [00:00<?, ?it/s]

  0%|          | 0/7211 [00:00<?, ?it/s]

  0%|          | 0/10768 [00:00<?, ?it/s]

  0%|          | 0/7211 [00:00<?, ?it/s]

  0%|          | 0/10768 [00:00<?, ?it/s]

  0%|          | 0/7211 [00:00<?, ?it/s]

  0%|          | 0/10768 [00:00<?, ?it/s]

  0%|          | 0/7211 [00:00<?, ?it/s]

In [None]:
# Look up the best-scoring BM25 parameters recorded in val.json.
best_model_params = retrieve_best_model_params_from_file(path + "/models/val.json", 'BM25', n=10)

# Work on a copy: the training loop adds columns to the frame it is given.
df = user_splits.copy()
k, k1, b = (best_model_params[param_name] for param_name in ('k', 'k1', 'b'))

# Retrain with the selected parameters, then compute the final metrics at n=10.
dft = run_BM25_training_loop(df, k, k1, b)
user_splits_df, all_dict = evaluate_df_metrics_at_n(dft, 'BM25', n=10)

  0%|          | 0/10768 [00:00<?, ?it/s]

  0%|          | 0/7211 [00:00<?, ?it/s]

Unnamed: 0,0
id_hit_rate_at_10,0.010518
id_precision_at_10,0.001077
id_recall_at_10,0.006748
id_f1_score_at_10,0.001562
group_hit_rate_at_10,0.01411
group_precision_at_10,0.001411
group_recall_at_10,0.009136
group_f1_score_at_10,0.002095
method_name,BM25
