In [1]:
!pip install implicit
#!pip install surprise
!pip install pyarrow



In [13]:
#import
#warning
import warnings
warnings.filterwarnings('ignore')

#general
import pandas as pd
import numpy as np
from itertools import product

#for predictions
import implicit
from implicit.bpr import BayesianPersonalizedRanking
from implicit.lmf import LogisticMatrixFactorization
from implicit.nearest_neighbours import ItemItemRecommender, BM25Recommender
from scipy.sparse import coo_matrix

%run prepare_train_test_splits.ipynb
%run evaluate_models.ipynb

In [3]:
# Read the raw order and outfit tables from the parquet archive.
orders, outfits = (
    pd.read_parquet(parquet_path, engine='pyarrow')
    for parquet_path in ('../archive/data/orders.parquet', '../archive/data/outfits.parquet')
)

In [4]:
#prepare data

def _parse_list_literal(value):
    """Evaluate the textual form of a list; return non-string values unchanged."""
    # NOTE(review): eval() executes arbitrary Python, so this is only acceptable while
    # the parquet file is trusted. If the cells hold plain literals, ast.literal_eval
    # would be the safe replacement - TODO confirm against the data.
    return eval(value) if isinstance(value, (str, bytes)) else value

# Convert tag_categories and outfit_tags to lists.
# The isinstance guard keeps this cell re-runnable: eval() raises TypeError when it is
# handed a value that has already been converted to a list.
outfits["tag_categories"] = outfits["tag_categories"].apply(_parse_list_literal)
outfits["outfit_tags"] = outfits["outfit_tags"].apply(_parse_list_literal)

outfits['group'] = outfits['group'].astype(str)

# Convert triplets into entries for each individual user
orders = remove_consecutive_duplicates(orders)
user_orders_df = translate_user_triplets_to_orders(orders, outfits)
user_orders_df = user_orders_df.dropna()

# Split the data into train and test sets: one dataframe with no restrictions on the outfits
# in the test data and one that prohibits repeated outfits.
# It prints any cases in which it is unable to construct a test set with unique outfits.
user_splits_df, user_splits_unique_df = convert_user_orders_to_train_test_splits(user_orders_df, percentage_test=0.3)

3607
No unique outfit found with groups ['group.423a23f6717e6d85adac54c051ee9832'
 'group.423a23f6717e6d85adac54c051ee9832']
No unique outfit found with groups ['group.384b8170c6a6ddfd568ff7fab5fb49c4'
 'group.384b8170c6a6ddfd568ff7fab5fb49c4']
No unique outfit found with groups ['group.a3ab26b5d2f7ef2cf102422a3dde3b46'
 'group.a3ab26b5d2f7ef2cf102422a3dde3b46']
No unique outfit found with groups ['group.9b5204b87abc93f8f0467b0a6a9c6a97'
 'group.9b5204b87abc93f8f0467b0a6a9c6a97'
 'group.9b5204b87abc93f8f0467b0a6a9c6a97']
No unique outfit found with groups ['group.8e50238120d13b31284f151941c2ee81'
 'group.8e50238120d13b31284f151941c2ee81']
No unique outfit found with groups ['group.a494d07781a1aab0e3a42989288feff2'
 'group.a494d07781a1aab0e3a42989288feff2']
No unique outfit found with groups ['group.a1d284ef1c7035dd14e57eba3838a303'
 'group.a1d284ef1c7035dd14e57eba3838a303']
No unique outfit found with groups ['group.e0cb0f6e113edc4df8a1e304376734f6'
 'group.e0cb0f6e113edc4df8a1e3043767

### ALS

In [5]:
def train_als_model(user_splits_df, outfit_column, factors=16, regularization=0.1, iterations=50):
    """Fit an implicit-feedback ALS model on the interactions stored in ``outfit_column``.

    The frame is exploded on ``outfit_column`` so that every element becomes one
    (user, item) interaction with value 1; the user is identified by the row's index
    label. Repeated (user, item) pairs are summed when the COO matrix is converted to
    CSR.

    Args:
        user_splits_df: frame indexed by user whose ``outfit_column`` is exploded.
        outfit_column: name of the column holding the training items.
        factors: number of latent factors.
        regularization: regularization weight.
        iterations: number of ALS iterations.

    Returns:
        ``(model, csr, outfit_to_index, user_to_index)``: the fitted model, the
        user x item CSR matrix it was fitted on, and the dictionaries mapping item
        and user labels to their column / row positions.

    Raises:
        ValueError: if a negative row or column index is produced.
    """
    flat_df = user_splits_df.explode(outfit_column)
    flat_df["value"] = 1

    # Items are numbered in order of first appearance.
    unique_outfit_ids = flat_df[outfit_column].unique()
    outfit_to_index = {outfit_id: i for i, outfit_id in enumerate(unique_outfit_ids)}
    flat_df["outfit_index"] = flat_df[outfit_column].map(outfit_to_index)

    # Users are numbered in order of first appearance in the index.
    unique_users = pd.unique(flat_df.index)
    user_to_index = {user_id: i for i, user_id in enumerate(unique_users)}
    flat_df['user_index'] = flat_df.index.map(user_to_index)

    # Debug: Check if there are any negative indices
    if (flat_df['outfit_index'] < 0).any():
        raise ValueError('Negative outfit_index found')
    if (flat_df['user_index'] < 0).any():
        raise ValueError('Negative user_index found')

    coo = coo_matrix(
        (flat_df['value'].values, (flat_df['user_index'].values, flat_df['outfit_index'].values)),
        shape=(len(unique_users), len(unique_outfit_ids))
    )

    # The global NumPy seed is kept for any code that relies on it, but it does not
    # reach the model: implicit initialises its factors from its own generator built
    # from `random_state`, so the seed has to be passed explicitly for repeatable runs.
    np.random.seed(42)
    model = implicit.als.AlternatingLeastSquares(
        factors=factors,
        regularization=regularization,
        iterations=iterations,
        random_state=42,
    )
    csr = coo.tocsr()
    model.fit(csr)

    return model, csr, outfit_to_index, user_to_index

def run_als_training_loop(df, factors, regularization, run_name=""):
    """Train ALS on outfit ids and on outfit groups, then score per-user hit rates.

    Args:
        df: split frame indexed by user; must provide the columns
            ``train_outfit_ids``, ``train_group``, ``test_outfit_id`` and ``test_group``.
        factors: number of latent factors forwarded to ``train_als_model``.
        regularization: regularization weight forwarded to ``train_als_model``.
        run_name: label used in the printed header and in the summary tuple.

    Returns:
        ``(scored_df, summary, result_dict)`` where ``scored_df`` is a copy of ``df``
        with the recommendation and hit-rate columns added, ``summary`` is
        ``(factors, regularization, mean group_hit_rate_at_100, run_name)`` and
        ``result_dict`` maps every hit-rate column to its mean.
    """
    # Work on a copy: the same split frames are handed to every experiment, and adding
    # the result columns to the caller's frame would leak them into later runs.
    df = df.copy()

    model_ind, csr_ind, outfit_id_to_index, user_to_index = train_als_model(df, "train_outfit_ids", factors=factors, regularization=regularization)
    model_group, csr_group, outfit_group_to_index, _ = train_als_model(df, "train_group", factors=factors, regularization=regularization)

    # Both models number users by first appearance in the same index, so the mapping
    # from the first model is used for both.
    df["user_index"] = df.index.map(user_to_index)

    # Invert the label -> position maps to translate recommendations back to labels.
    ind_index_to_id = {value : key for key, value in outfit_id_to_index.items()}
    df["ind_recommendations"] = df.apply(lambda x: evaluate_model(model_ind, csr_ind, x["user_index"], n=100), axis=1)
    df["ind_recommendations"] = df.apply(lambda x: get_outfit_id_from_index(x["ind_recommendations"], ind_index_to_id), axis=1)

    group_index_to_id = {value : key for key, value in outfit_group_to_index.items()}
    df["group_recommendations"] = df.apply(lambda x: evaluate_model(model_group, csr_group, x["user_index"], n=100), axis=1)
    df["group_recommendations"] = df.apply(lambda x: get_outfit_id_from_index(x["group_recommendations"], group_index_to_id), axis=1)

    # (result column, held-out target column, recommendation column, cut-off)
    hit_rate_specs = [
        ("id_hit_rate_at_100", "test_outfit_id", "ind_recommendations", 100),
        ("id_hit_rate_at_10", "test_outfit_id", "ind_recommendations", 10),
        ("group_hit_rate_at_100", "test_group", "group_recommendations", 100),
        ("group_hit_rate_at_10", "test_group", "group_recommendations", 10),
    ]
    HIT_RATE_COLUMNS = [spec[0] for spec in hit_rate_specs]
    for hit_column, target_column, rec_column, cutoff in hit_rate_specs:
        df[hit_column] = df.apply(lambda x: evaluate_hit_rate_at_n(x[target_column], x[rec_column], n=cutoff), axis=1)

    print(f"Run name: {run_name} Factors: {factors}, Regularization: {regularization}")
    display(df[HIT_RATE_COLUMNS].mean())
    print("="*20)

    result_dict = {column: df[column].mean() for column in HIT_RATE_COLUMNS}

    return df, (factors, regularization, df["group_hit_rate_at_100"].mean(), run_name), result_dict
    


# Hyper-parameter grid and the two evaluation splits for the ALS experiments.
TEST_FACTORS = [32]
TEST_REGULARIZATIONS = [0.01]
run_dataframes = [(user_splits_df, "All Outfits"), (user_splits_unique_df, "Unique Outfit")]

# One (factors, regularization, (frame, run name)) tuple per experiment.
test_permutations = list(product(TEST_FACTORS, TEST_REGULARIZATIONS, run_dataframes))

# NOTE: despite its name, group_hr_10_means collects the summary tuples returned by
# run_als_training_loop, which carry the mean group hit rate at 100.
group_hr_10_means, result_dicts = [], []
for test_factors, test_regularization, (df, run_name) in test_permutations:
    df, result, result_dict = run_als_training_loop(df, test_factors, test_regularization, run_name=run_name)
    result_dicts.append(result_dict)
    group_hr_10_means.append(result)

  check_blas_config()


  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

Run name: All Outfits Factors: 32, Regularization: 0.01


id_hit_rate_at_100       0.176395
id_hit_rate_at_10        0.038740
group_hit_rate_at_100    0.239411
group_hit_rate_at_10     0.058884
dtype: float64



  0%|          | 0/50 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

Run name: Unique Outfit Factors: 32, Regularization: 0.01


id_hit_rate_at_100       0.156258
id_hit_rate_at_10        0.036797
group_hit_rate_at_100    0.222078
group_hit_rate_at_10     0.062970
dtype: float64



### BPR

In [6]:
def train_bpr_model(user_splits_df, outfit_column, factors=16, regularization=0.1, learning_rate=0.1, iterations=500):
    """Fit a Bayesian Personalized Ranking model on the interactions in ``outfit_column``.

    The frame is exploded on ``outfit_column`` so that every element becomes one
    (user, item) interaction with value 1; the user is identified by the row's index
    label. Repeated (user, item) pairs are summed when the COO matrix is converted to
    CSR.

    Args:
        user_splits_df: frame indexed by user whose ``outfit_column`` is exploded.
        outfit_column: name of the column holding the training items.
        factors: number of latent factors.
        regularization: regularization weight.
        learning_rate: learning rate of the optimiser.
        iterations: number of training iterations.

    Returns:
        ``(model, csr, outfit_to_index, user_to_index)``: the fitted model, the
        user x item CSR matrix it was fitted on, and the dictionaries mapping item
        and user labels to their column / row positions.

    Raises:
        ValueError: if a negative row or column index is produced.
    """
    flat_df = user_splits_df.explode(outfit_column)
    flat_df["value"] = 1

    # Items are numbered in order of first appearance.
    unique_outfit_ids = flat_df[outfit_column].unique()
    outfit_to_index = {outfit_id: i for i, outfit_id in enumerate(unique_outfit_ids)}
    flat_df["outfit_index"] = flat_df[outfit_column].map(outfit_to_index)

    # Users are numbered in order of first appearance in the index.
    unique_users = pd.unique(flat_df.index)
    user_to_index = {user_id: i for i, user_id in enumerate(unique_users)}
    flat_df['user_index'] = flat_df.index.map(user_to_index)

    # Debug: Check if there are any negative indices
    if (flat_df['outfit_index'] < 0).any():
        raise ValueError('Negative outfit_index found')
    if (flat_df['user_index'] < 0).any():
        raise ValueError('Negative user_index found')

    coo = coo_matrix(
        (flat_df['value'].values, (flat_df['user_index'].values, flat_df['outfit_index'].values)),
        shape=(len(unique_users), len(unique_outfit_ids))
    )

    # The global NumPy seed is kept for any code that relies on it, but it does not
    # reach the model: implicit draws from its own generator built from `random_state`,
    # so the seed has to be passed explicitly.
    # NOTE(review): multi-threaded training may still not be bit-for-bit repeatable -
    # confirm whether num_threads needs to be pinned as well.
    np.random.seed(43)
    model = BayesianPersonalizedRanking(
        factors=factors,
        regularization=regularization,
        iterations=iterations,
        learning_rate=learning_rate,
        random_state=43,
    )
    csr = coo.tocsr()
    model.fit(csr)

    return model, csr, outfit_to_index, user_to_index


def run_bpr_training_loop(df, factors, regularization, learning_rate, run_name=""):
    """Train BPR on outfit ids and on outfit groups, then score per-user hit rates.

    Args:
        df: split frame indexed by user; must provide the columns
            ``train_outfit_ids``, ``train_group``, ``test_outfit_id`` and ``test_group``.
        factors: number of latent factors forwarded to ``train_bpr_model``.
        regularization: regularization weight forwarded to ``train_bpr_model``.
        learning_rate: learning rate forwarded to ``train_bpr_model``.
        run_name: label used in the printed header and in the summary tuple.

    Returns:
        ``(scored_df, summary, result_dict)`` where ``scored_df`` is a copy of ``df``
        with the recommendation and hit-rate columns added, ``summary`` is
        ``(factors, regularization, mean group_hit_rate_at_100,
        mean id_hit_rate_at_100, run_name)`` and ``result_dict`` maps every hit-rate
        column to its mean.
    """
    # Work on a copy: the same split frames are handed to every experiment, and adding
    # the result columns to the caller's frame would leak them into later runs.
    df = df.copy()

    model_ind, csr_ind, outfit_id_to_index, user_to_index = train_bpr_model(df, "train_outfit_ids", factors=factors, regularization=regularization, learning_rate=learning_rate)
    model_group, csr_group, outfit_group_to_index, _ = train_bpr_model(df, "train_group", factors=factors, regularization=regularization, learning_rate=learning_rate)

    # Both models number users by first appearance in the same index, so the mapping
    # from the first model is used for both.
    df["user_index"] = df.index.map(user_to_index)

    # Invert the label -> position maps to translate recommendations back to labels.
    ind_index_to_id = {value : key for key, value in outfit_id_to_index.items()}
    df["ind_recommendations"] = df.apply(lambda x: evaluate_model(model_ind, csr_ind, x["user_index"], n=100), axis=1)
    df["ind_recommendations"] = df.apply(lambda x: get_outfit_id_from_index(x["ind_recommendations"], ind_index_to_id), axis=1)

    group_index_to_id = {value : key for key, value in outfit_group_to_index.items()}
    df["group_recommendations"] = df.apply(lambda x: evaluate_model(model_group, csr_group, x["user_index"], n=100), axis=1)
    df["group_recommendations"] = df.apply(lambda x: get_outfit_id_from_index(x["group_recommendations"], group_index_to_id), axis=1)

    # (result column, held-out target column, recommendation column, cut-off)
    hit_rate_specs = [
        ("id_hit_rate_at_100", "test_outfit_id", "ind_recommendations", 100),
        ("id_hit_rate_at_10", "test_outfit_id", "ind_recommendations", 10),
        ("group_hit_rate_at_100", "test_group", "group_recommendations", 100),
        ("group_hit_rate_at_10", "test_group", "group_recommendations", 10),
    ]
    HIT_RATE_COLUMNS = [spec[0] for spec in hit_rate_specs]
    for hit_column, target_column, rec_column, cutoff in hit_rate_specs:
        df[hit_column] = df.apply(lambda x: evaluate_hit_rate_at_n(x[target_column], x[rec_column], n=cutoff), axis=1)

    print(f"Run name: {run_name} Factors: {factors}, Regularization: {regularization}")
    display(df[HIT_RATE_COLUMNS].mean())
    print("="*20)

    result_dict = {column: df[column].mean() for column in HIT_RATE_COLUMNS}

    return df, (factors, regularization, df["group_hit_rate_at_100"].mean(), df["id_hit_rate_at_100"].mean(), run_name), result_dict


# Hyper-parameter grid and the two evaluation splits for the BPR experiments.
TEST_FACTORS = [128]
TEST_REGULARIZATIONS = [0.01]
TEST_LEARNING_RATES = [0.01]
run_dataframes = [(user_splits_df, "All Outfits"), (user_splits_unique_df, "Unique Outfit")]

# One (factors, regularization, learning rate, (frame, run name)) tuple per experiment.
test_permutations = list(product(TEST_FACTORS, TEST_REGULARIZATIONS, TEST_LEARNING_RATES, run_dataframes))

# NOTE: despite its name, group_hr_10_means collects the summary tuples returned by
# run_bpr_training_loop, which carry the mean hit rates at 100.
group_hr_10_means, result_dicts, result_dataframes = [], [], []
for test_factors, test_regularization, test_learning_rate, (df, run_name) in test_permutations:
    df, result, result_dict = run_bpr_training_loop(df, test_factors, test_regularization, test_learning_rate, run_name=run_name)
    result_dicts.append(result_dict)
    group_hr_10_means.append(result)
    # Keep a snapshot of the scored frame of each run for the comparison that follows.
    result_dataframes.append(df.copy())

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

Run name: All Outfits Factors: 128, Regularization: 0.01


id_hit_rate_at_100       0.200671
id_hit_rate_at_10        0.033316
group_hit_rate_at_100    0.227014
group_hit_rate_at_10     0.045196
dtype: float64



  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

Run name: Unique Outfit Factors: 128, Regularization: 0.01


id_hit_rate_at_100       0.217932
id_hit_rate_at_10        0.039907
group_hit_rate_at_100    0.243068
group_hit_rate_at_10     0.051049
dtype: float64



In [7]:
# Check how often the same user gets an individual-outfit hit (at 100) in both BPR runs:
# the "All Outfits" split and the "Unique Outfit" split.

standard_result_df, unique_result_df = result_dataframes
print(standard_result_df["id_hit_rate_at_100"].mean(), unique_result_df["id_hit_rate_at_100"].mean())

unique_columns = ["u_" + column_name for column_name in unique_result_df.columns]
# Rename on a copy: relabelling the frame stored in result_dataframes in place would make
# the lookup of "id_hit_rate_at_100" above fail with a KeyError when this cell is re-run.
unique_result_df = unique_result_df.copy()
unique_result_df.columns = unique_columns

def check_if_overlap_in_hit(row, column_name):
    """Tell whether a row scores above 0.1 in both the standard and the unique run.

    ``row[column_name]`` holds the standard-run value and ``row["u_" + column_name]``
    the unique-run value. Both are read before any comparison is made.
    """
    threshold = 0.1
    standard_score = row[column_name]
    unique_score = row["u_" + column_name]

    standard_is_hit = standard_score > threshold
    # Same result as `a and b`: a falsy first comparison is returned as-is.
    return unique_score > threshold if standard_is_hit else standard_is_hit

# Put the two runs side by side (aligned on the index) and flag the rows that hit in both.
all_results = pd.concat([standard_result_df, unique_result_df], axis=1)
all_results["overlap_100"] = all_results.apply(
    check_if_overlap_in_hit, axis=1, args=("id_hit_rate_at_100",)
)
all_results["overlap_100"].mean()

0.20067148760330578 0.2179321067634102


0.08548553719008264

### LMF (Logistic Matrix Factorization)

In [8]:
def train_lmf_model(user_splits_df, outfit_column, factors=16, regularization=0.1, learning_rate=0.1, iterations=500):
    """Fit a Logistic Matrix Factorization model on the interactions in ``outfit_column``.

    The frame is exploded on ``outfit_column`` so that every element becomes one
    (user, item) interaction with value 1; the user is identified by the row's index
    label. Repeated (user, item) pairs are summed when the COO matrix is converted to
    CSR.

    Args:
        user_splits_df: frame indexed by user whose ``outfit_column`` is exploded.
        outfit_column: name of the column holding the training items.
        factors: number of latent factors.
        regularization: regularization weight.
        learning_rate: learning rate of the optimiser.
        iterations: number of training iterations.

    Returns:
        ``(model, csr, outfit_to_index, user_to_index)``: the fitted model, the
        user x item CSR matrix it was fitted on, and the dictionaries mapping item
        and user labels to their column / row positions.

    Raises:
        ValueError: if a negative row or column index is produced.
    """
    flat_df = user_splits_df.explode(outfit_column)
    flat_df["value"] = 1

    # Items are numbered in order of first appearance.
    unique_outfit_ids = flat_df[outfit_column].unique()
    outfit_to_index = {outfit_id: i for i, outfit_id in enumerate(unique_outfit_ids)}
    flat_df["outfit_index"] = flat_df[outfit_column].map(outfit_to_index)

    # Users are numbered in order of first appearance in the index.
    unique_users = pd.unique(flat_df.index)
    user_to_index = {user_id: i for i, user_id in enumerate(unique_users)}
    flat_df['user_index'] = flat_df.index.map(user_to_index)

    # Debug: Check if there are any negative indices
    if (flat_df['outfit_index'] < 0).any():
        raise ValueError('Negative outfit_index found')
    if (flat_df['user_index'] < 0).any():
        raise ValueError('Negative user_index found')

    coo = coo_matrix(
        (flat_df['value'].values, (flat_df['user_index'].values, flat_df['outfit_index'].values)),
        shape=(len(unique_users), len(unique_outfit_ids))
    )

    # The global NumPy seed is kept for any code that relies on it, but it does not
    # reach the model: implicit draws from its own generator built from `random_state`,
    # so the seed has to be passed explicitly.
    # NOTE(review): multi-threaded training may still not be bit-for-bit repeatable -
    # confirm whether num_threads needs to be pinned as well.
    np.random.seed(43)
    model = LogisticMatrixFactorization(
        factors=factors,
        regularization=regularization,
        iterations=iterations,
        learning_rate=learning_rate,
        random_state=43,
    )
    csr = coo.tocsr()
    model.fit(csr)

    return model, csr, outfit_to_index, user_to_index

def run_lmf_training_loop(df, factors, regularization, learning_rate, run_name=""):
    """Train LMF on outfit ids and on outfit groups, then score per-user hit rates.

    Args:
        df: split frame indexed by user; must provide the columns
            ``train_outfit_ids``, ``train_group``, ``test_outfit_id`` and ``test_group``.
        factors: number of latent factors forwarded to ``train_lmf_model``.
        regularization: regularization weight forwarded to ``train_lmf_model``.
        learning_rate: learning rate forwarded to ``train_lmf_model``.
        run_name: label used in the printed header and in the summary tuple.

    Returns:
        ``(scored_df, summary, result_dict)`` where ``scored_df`` is a copy of ``df``
        with the recommendation and hit-rate columns added, ``summary`` is
        ``(factors, regularization, mean group_hit_rate_at_10,
        mean id_hit_rate_at_10, run_name)`` and ``result_dict`` maps every hit-rate
        column to its mean.
    """
    # Work on a copy: the same split frames are handed to every experiment, and adding
    # the result columns to the caller's frame would leak them into later runs.
    df = df.copy()

    model_ind, csr_ind, outfit_id_to_index, user_to_index = train_lmf_model(df, "train_outfit_ids", factors=factors, regularization=regularization, learning_rate=learning_rate)
    model_group, csr_group, outfit_group_to_index, _ = train_lmf_model(df, "train_group", factors=factors, regularization=regularization, learning_rate=learning_rate)

    # Both models number users by first appearance in the same index, so the mapping
    # from the first model is used for both.
    df["user_index"] = df.index.map(user_to_index)

    # Invert the label -> position maps to translate recommendations back to labels.
    ind_index_to_id = {value : key for key, value in outfit_id_to_index.items()}
    df["ind_recommendations"] = df.apply(lambda x: evaluate_model(model_ind, csr_ind, x["user_index"], n=100), axis=1)
    df["ind_recommendations"] = df.apply(lambda x: get_outfit_id_from_index(x["ind_recommendations"], ind_index_to_id), axis=1)

    group_index_to_id = {value : key for key, value in outfit_group_to_index.items()}
    df["group_recommendations"] = df.apply(lambda x: evaluate_model(model_group, csr_group, x["user_index"], n=100), axis=1)
    df["group_recommendations"] = df.apply(lambda x: get_outfit_id_from_index(x["group_recommendations"], group_index_to_id), axis=1)

    # (result column, held-out target column, recommendation column, cut-off)
    hit_rate_specs = [
        ("id_hit_rate_at_100", "test_outfit_id", "ind_recommendations", 100),
        ("id_hit_rate_at_10", "test_outfit_id", "ind_recommendations", 10),
        ("group_hit_rate_at_100", "test_group", "group_recommendations", 100),
        ("group_hit_rate_at_10", "test_group", "group_recommendations", 10),
    ]
    HIT_RATE_COLUMNS = [spec[0] for spec in hit_rate_specs]
    for hit_column, target_column, rec_column, cutoff in hit_rate_specs:
        df[hit_column] = df.apply(lambda x: evaluate_hit_rate_at_n(x[target_column], x[rec_column], n=cutoff), axis=1)

    print(f"Run name: {run_name} Factors: {factors}, Regularization: {regularization}")
    display(df[HIT_RATE_COLUMNS].mean())
    print("="*20)

    result_dict = {column: df[column].mean() for column in HIT_RATE_COLUMNS}

    return df, (factors, regularization, df["group_hit_rate_at_10"].mean(), df["id_hit_rate_at_10"].mean(), run_name), result_dict


# Hyper-parameter grid and the two evaluation splits for the LMF experiments.
TEST_FACTORS = [128]
TEST_REGULARIZATIONS = [0.01]
TEST_LEARNING_RATES = [0.01]
run_dataframes = [(user_splits_df, "All Outfits"), (user_splits_unique_df, "Unique Outfit")]

# One (factors, regularization, learning rate, (frame, run name)) tuple per experiment.
test_permutations = list(product(TEST_FACTORS, TEST_REGULARIZATIONS, TEST_LEARNING_RATES, run_dataframes))

# group_hr_10_means collects the summary tuples returned by run_lmf_training_loop.
group_hr_10_means, result_dicts = [], []
for test_factors, test_regularization, test_learning_rate, (df, run_name) in test_permutations:
    df, result, result_dict = run_lmf_training_loop(df, test_factors, test_regularization, test_learning_rate, run_name=run_name)
    result_dicts.append(result_dict)
    group_hr_10_means.append(result)

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

Run name: All Outfits Factors: 128, Regularization: 0.01


id_hit_rate_at_100       0.036415
id_hit_rate_at_10        0.004132
group_hit_rate_at_100    0.052686
group_hit_rate_at_10     0.005424
dtype: float64



  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/500 [00:00<?, ?it/s]

Run name: Unique Outfit Factors: 128, Regularization: 0.01


id_hit_rate_at_100       0.037315
id_hit_rate_at_10        0.005442
group_hit_rate_at_100    0.053900
group_hit_rate_at_10     0.004924
dtype: float64



### NN (nearest-neighbour recommenders: item-item and BM25)
This section is not in the original code.

In [20]:
def train_nn_model(user_splits_df, outfit_column, k=20):
    """Fit an item-item nearest-neighbour recommender on ``outfit_column``.

    Every element of the exploded ``outfit_column`` is one (user, item) interaction
    of value 1.0, with the row's index label acting as the user id.

    Returns the fitted model, the user x item CSR matrix, and the item and user
    label -> position dictionaries, in that order. Raises ValueError when a negative
    position is found.
    """
    interactions = user_splits_df.explode(outfit_column)
    interactions["value"] = 1

    # Number the items by first appearance.
    outfit_ids = interactions[outfit_column].unique()
    outfit_to_index = dict(zip(outfit_ids, range(len(outfit_ids))))
    interactions["outfit_index"] = interactions[outfit_column].map(outfit_to_index)

    # Number the users by first appearance in the index.
    users = pd.unique(interactions.index)
    user_to_index = dict(zip(users, range(len(users))))
    interactions["user_index"] = interactions.index.map(user_to_index)

    # Debug guard against negative positions.
    sanity_checks = (
        ("outfit_index", 'Negative outfit_index found'),
        ("user_index", 'Negative user_index found'),
    )
    for position_column, message in sanity_checks:
        if (interactions[position_column] < 0).any():
            raise ValueError(message)

    weights = interactions["value"].astype(np.float64).values
    row_positions = interactions["user_index"].values
    col_positions = interactions["outfit_index"].values
    user_item_coo = coo_matrix(
        (weights, (row_positions, col_positions)),
        shape=(len(users), len(outfit_ids)),
    )

    np.random.seed(43)
    model = ItemItemRecommender(K=k)
    csr = user_item_coo.tocsr()
    model.fit(csr)

    return model, csr, outfit_to_index, user_to_index

def run_nn_training_loop(df, k, run_name=""):
    """Train item-item models on outfit ids and on outfit groups, then score hit rates.

    Args:
        df: split frame indexed by user; must provide the columns
            ``train_outfit_ids``, ``train_group``, ``test_outfit_id`` and ``test_group``.
        k: neighbourhood size forwarded to ``train_nn_model``.
        run_name: label used in the printed header and in the summary tuple.

    Returns:
        ``(scored_df, summary, result_dict)`` where ``scored_df`` is a copy of ``df``
        with the recommendation and hit-rate columns added, ``summary`` is
        ``(k, mean group_hit_rate_at_10, mean id_hit_rate_at_10, run_name)`` and
        ``result_dict`` maps every hit-rate column to its mean.
    """
    # Work on a copy: the same split frames are handed to every experiment, and adding
    # the result columns to the caller's frame would leak them into later runs.
    df = df.copy()

    model_ind, csr_ind, outfit_id_to_index, user_to_index = train_nn_model(df, "train_outfit_ids", k)
    model_group, csr_group, outfit_group_to_index, _ = train_nn_model(df, "train_group", k)

    # Both models number users by first appearance in the same index, so the mapping
    # from the first model is used for both.
    df["user_index"] = df.index.map(user_to_index)

    # Invert the label -> position maps to translate recommendations back to labels.
    ind_index_to_id = {value : key for key, value in outfit_id_to_index.items()}
    df["ind_recommendations"] = df.apply(lambda x: evaluate_model(model_ind, csr_ind, x["user_index"], n=100), axis=1)
    df["ind_recommendations"] = df.apply(lambda x: get_outfit_id_from_index(x["ind_recommendations"], ind_index_to_id), axis=1)

    group_index_to_id = {value : key for key, value in outfit_group_to_index.items()}
    df["group_recommendations"] = df.apply(lambda x: evaluate_model(model_group, csr_group, x["user_index"], n=100), axis=1)
    df["group_recommendations"] = df.apply(lambda x: get_outfit_id_from_index(x["group_recommendations"], group_index_to_id), axis=1)

    # (result column, held-out target column, recommendation column, cut-off)
    hit_rate_specs = [
        ("id_hit_rate_at_100", "test_outfit_id", "ind_recommendations", 100),
        ("id_hit_rate_at_10", "test_outfit_id", "ind_recommendations", 10),
        ("group_hit_rate_at_100", "test_group", "group_recommendations", 100),
        ("group_hit_rate_at_10", "test_group", "group_recommendations", 10),
    ]
    HIT_RATE_COLUMNS = [spec[0] for spec in hit_rate_specs]
    for hit_column, target_column, rec_column, cutoff in hit_rate_specs:
        df[hit_column] = df.apply(lambda x: evaluate_hit_rate_at_n(x[target_column], x[rec_column], n=cutoff), axis=1)

    print(f"Run name: {run_name}, K: {k}")
    display(df[HIT_RATE_COLUMNS].mean())
    print("="*20)

    result_dict = {column: df[column].mean() for column in HIT_RATE_COLUMNS}

    return df, (k, df["group_hit_rate_at_10"].mean(), df["id_hit_rate_at_10"].mean(), run_name), result_dict

# Neighbourhood sizes and the two evaluation splits for the item-item experiments.
K = [20]
run_dataframes = [(user_splits_df, "All Outfits"), (user_splits_unique_df, "Unique Outfit")]

# One (k, (frame, run name)) tuple per experiment.
test_permutations = list(product(K, run_dataframes))

# group_hr_10_means collects the summary tuples returned by run_nn_training_loop.
group_hr_10_means, result_dicts = [], []
for k, (df, run_name) in test_permutations:
    df, result, result_dict = run_nn_training_loop(df, k, run_name=run_name)
    result_dicts.append(result_dict)
    group_hr_10_means.append(result)

  0%|          | 0/10791 [00:00<?, ?it/s]

  0%|          | 0/7236 [00:00<?, ?it/s]

Run name: All Outfits, K: 20


id_hit_rate_at_100       0.151085
id_hit_rate_at_10        0.029184
group_hit_rate_at_100    0.205062
group_hit_rate_at_10     0.051653
dtype: float64



  0%|          | 0/11151 [00:00<?, ?it/s]

  0%|          | 0/7217 [00:00<?, ?it/s]

Run name: Unique Outfit, K: 20


id_hit_rate_at_100       0.127753
id_hit_rate_at_10        0.028764
group_hit_rate_at_100    0.188132
group_hit_rate_at_10     0.059342
dtype: float64



In [21]:
def train_BM25_model(user_splits_df, outfit_column, k=20, k1=100, b=0.8):
    """Fit a BM25 nearest-neighbour recommender on ``outfit_column``.

    Every element of the exploded ``outfit_column`` is one (user, item) interaction
    of value 1.0, with the row's index label acting as the user id. ``k``, ``k1`` and
    ``b`` are passed to ``BM25Recommender`` as ``K``, ``K1`` and ``B``.

    Returns the fitted model, the user x item CSR matrix, and the item and user
    label -> position dictionaries, in that order. Raises ValueError when a negative
    position is found.
    """
    interactions = user_splits_df.explode(outfit_column)
    interactions["value"] = 1

    # Number the items by first appearance.
    outfit_ids = interactions[outfit_column].unique()
    outfit_to_index = dict(zip(outfit_ids, range(len(outfit_ids))))
    interactions["outfit_index"] = interactions[outfit_column].map(outfit_to_index)

    # Number the users by first appearance in the index.
    users = pd.unique(interactions.index)
    user_to_index = dict(zip(users, range(len(users))))
    interactions["user_index"] = interactions.index.map(user_to_index)

    # Debug guard against negative positions.
    sanity_checks = (
        ("outfit_index", 'Negative outfit_index found'),
        ("user_index", 'Negative user_index found'),
    )
    for position_column, message in sanity_checks:
        if (interactions[position_column] < 0).any():
            raise ValueError(message)

    weights = interactions["value"].astype(np.float64).values
    row_positions = interactions["user_index"].values
    col_positions = interactions["outfit_index"].values
    user_item_coo = coo_matrix(
        (weights, (row_positions, col_positions)),
        shape=(len(users), len(outfit_ids)),
    )

    np.random.seed(43)
    model = BM25Recommender(K=k, K1=k1, B=b)
    csr = user_item_coo.tocsr()
    model.fit(csr)

    return model, csr, outfit_to_index, user_to_index

def run_BM25_training_loop(df, k, k1, b, run_name=""):
    """Train BM25 models on outfit ids and on outfit groups, then score hit rates.

    Args:
        df: split frame indexed by user; must provide the columns
            ``train_outfit_ids``, ``train_group``, ``test_outfit_id`` and ``test_group``.
        k: neighbourhood size forwarded to ``train_BM25_model``.
        k1: BM25 ``K1`` parameter forwarded to ``train_BM25_model``.
        b: BM25 ``B`` parameter forwarded to ``train_BM25_model``.
        run_name: label used in the printed header and in the summary tuple.

    Returns:
        ``(scored_df, summary, result_dict)`` where ``scored_df`` is a copy of ``df``
        with the recommendation and hit-rate columns added, ``summary`` is
        ``(k, k1, b, mean group_hit_rate_at_10, mean id_hit_rate_at_10, run_name)``
        and ``result_dict`` maps every hit-rate column to its mean.
    """
    # Work on a copy: the same split frames are handed to every experiment, and adding
    # the result columns to the caller's frame would leak them into later runs.
    df = df.copy()

    model_ind, csr_ind, outfit_id_to_index, user_to_index = train_BM25_model(df, "train_outfit_ids", k, k1, b)
    model_group, csr_group, outfit_group_to_index, _ = train_BM25_model(df, "train_group", k, k1, b)

    # Both models number users by first appearance in the same index, so the mapping
    # from the first model is used for both.
    df["user_index"] = df.index.map(user_to_index)

    # Invert the label -> position maps to translate recommendations back to labels.
    ind_index_to_id = {value : key for key, value in outfit_id_to_index.items()}
    df["ind_recommendations"] = df.apply(lambda x: evaluate_model(model_ind, csr_ind, x["user_index"], n=100), axis=1)
    df["ind_recommendations"] = df.apply(lambda x: get_outfit_id_from_index(x["ind_recommendations"], ind_index_to_id), axis=1)

    group_index_to_id = {value : key for key, value in outfit_group_to_index.items()}
    df["group_recommendations"] = df.apply(lambda x: evaluate_model(model_group, csr_group, x["user_index"], n=100), axis=1)
    df["group_recommendations"] = df.apply(lambda x: get_outfit_id_from_index(x["group_recommendations"], group_index_to_id), axis=1)

    # (result column, held-out target column, recommendation column, cut-off)
    hit_rate_specs = [
        ("id_hit_rate_at_100", "test_outfit_id", "ind_recommendations", 100),
        ("id_hit_rate_at_10", "test_outfit_id", "ind_recommendations", 10),
        ("group_hit_rate_at_100", "test_group", "group_recommendations", 100),
        ("group_hit_rate_at_10", "test_group", "group_recommendations", 10),
    ]
    HIT_RATE_COLUMNS = [spec[0] for spec in hit_rate_specs]
    for hit_column, target_column, rec_column, cutoff in hit_rate_specs:
        df[hit_column] = df.apply(lambda x: evaluate_hit_rate_at_n(x[target_column], x[rec_column], n=cutoff), axis=1)

    print(f"Run name: {run_name}, K: {k}, K1: {k1}, B: {b}")
    display(df[HIT_RATE_COLUMNS].mean())
    print("="*20)

    result_dict = {column: df[column].mean() for column in HIT_RATE_COLUMNS}

    return df, (k, k1, b, df["group_hit_rate_at_10"].mean(), df["id_hit_rate_at_10"].mean(), run_name), result_dict

# BM25 parameter grid and the two evaluation splits.
K = [20]
K1 = [100]
B = [0.8]
run_dataframes = [(user_splits_df, "All Outfits"), (user_splits_unique_df, "Unique Outfit")]

# One (k, k1, b, (frame, run name)) tuple per experiment.
test_permutations = list(product(K, K1, B, run_dataframes))

# group_hr_10_means collects the summary tuples returned by run_BM25_training_loop.
group_hr_10_means, result_dicts = [], []
for k, k1, b, (df, run_name) in test_permutations:
    df, result, result_dict = run_BM25_training_loop(df, k, k1, b, run_name=run_name)
    result_dicts.append(result_dict)
    group_hr_10_means.append(result)

  0%|          | 0/10791 [00:00<?, ?it/s]

  0%|          | 0/7236 [00:00<?, ?it/s]

Run name: All Outfits, K: 20, K1: 100, B: 0.8


id_hit_rate_at_100       0.079804
id_hit_rate_at_10        0.010331
group_hit_rate_at_100    0.089360
group_hit_rate_at_10     0.016012
dtype: float64



  0%|          | 0/11151 [00:00<?, ?it/s]

  0%|          | 0/7217 [00:00<?, ?it/s]

Run name: Unique Outfit, K: 20, K1: 100, B: 0.8


id_hit_rate_at_100       0.072558
id_hit_rate_at_10        0.013993
group_hit_rate_at_100    0.084737
group_hit_rate_at_10     0.016325
dtype: float64

