In [None]:
import ast
import os

import numpy as np
import pandas as pd
# Switch the working directory to the parent of the notebook's folder
# (presumably the project root, so that the `resources` and `src` packages
# imported further down can be found).
# The target is computed only once per kernel and remembered in _PROJECT_ROOT:
# without this guard, every re-run of the cell climbs one directory higher.
if "_PROJECT_ROOT" not in globals():
    _PROJECT_ROOT = os.path.dirname(os.getcwd())
os.chdir(_PROJECT_ROOT)
print(os.getcwd())

%load_ext autoreload
%autoreload 2
%reload_ext autoreload


from resources.constants import *

# Load the three input tables. The path and separator constants are not defined in
# this notebook; presumably they come from the star import of resources.constants.
pictures_df = pd.read_csv(PICTURE_TRIPLETS_CSV_PATH, sep=CSV_SEPARATOR)
outfits_df = pd.read_csv(OUTFITS_CSV_PATH, sep=CSV_SEPARATOR)
user_triplets_df = pd.read_csv(USER_ACTIVITY_TRIPLETS_CSV_PATH, sep=CSV_SEPARATOR)

# List-valued columns come back from the CSV as strings and must be parsed into lists.
# ast.literal_eval is used instead of eval: it accepts Python literals only, so the
# contents of the file can never be executed as code.
for list_column in ("tag_categories", "outfit_tags"):
    outfits_df[list_column] = outfits_df[list_column].apply(ast.literal_eval)

In [None]:
# Append the original orders to the user-activity triplets, renumbering the rows.
# NOTE: this cell is not idempotent -- user_triplets_df is replaced by itself plus
# the orders, so running the cell twice appends the orders twice.
original_orders_df = pd.read_csv(ORIGINAL_ORDERS_CSV_PATH, sep=CSV_SEPARATOR)
user_triplets_df = pd.concat(
    (user_triplets_df, original_orders_df),
    axis=0,
    ignore_index=True,
)

In [None]:
from src.prepare_train_test_splits import translate_user_triplets_to_orders, remove_consecutive_duplicates

# Convert triplets into entries for each individual user.
# Step 1 cleans the triplets (judging by the helper's name it drops consecutive
# duplicate rows -- see src.prepare_train_test_splits for the exact rule).
# Step 2 combines the cleaned triplets with outfits_df into user_orders_df, the
# input of the train/test split below.
user_triplets_df = remove_consecutive_duplicates(user_triplets_df)
user_orders_df = translate_user_triplets_to_orders(user_triplets_df, outfits_df)

In [None]:
import numpy as np
from src.prepare_train_test_splits import convert_user_orders_to_train_test_splits

# Split the data into train and test sets. Two dataframes are returned: one with no
# restrictions on the outfits in the test data and one that prohibits repeated outfits.
# The helper prints any cases in which it is unable to construct a test set with unique outfits.
# percentage_test=0.3 presumably reserves 30% of the orders for testing -- confirm in
# src.prepare_train_test_splits.
user_splits_df, user_splits_unique_df = convert_user_orders_to_train_test_splits(user_orders_df, percentage_test=0.3)

In [None]:
import pandas as pd
import implicit
from scipy.sparse import coo_matrix
from IPython.display import display
from itertools import product


from src.evaluate_models import evaluate_model, evaluate_hit_rate_at_n, get_outfit_id_from_index

def train_als_model(user_splits_df, outfit_column, factors=16, regularization=0.1, iterations=50):
    """Fit an implicit-feedback ALS model on one list-valued column.

    Every row of ``user_splits_df`` is one user, identified by the row's index
    label, and ``outfit_column`` holds the list of items for that user. The
    lists are exploded into (user, item) pairs, users and items are numbered in
    order of first appearance, and the pairs become a users x items sparse
    matrix. A pair that occurs several times is summed when the matrix is
    converted to CSR, so its value is the number of occurrences.

    Args:
        user_splits_df: DataFrame whose index labels identify the users.
        outfit_column: Name of the list-valued column to train on.
        factors: Number of latent factors handed to the model.
        regularization: Regularization strength handed to the model.
        iterations: Number of training iterations handed to the model.

    Returns:
        ``(model, csr, outfit_to_index, user_to_index)``: the fitted model, the
        CSR matrix it was fitted on, and the dictionaries that map item ids to
        column positions and user ids to row positions.

    Raises:
        ValueError: If a negative row or column position is found.
    """
    interactions = user_splits_df.explode(outfit_column)
    interactions["value"] = 1

    # Matrix columns: items, numbered by first appearance.
    unique_outfit_ids = interactions[outfit_column].unique()
    outfit_to_index = dict(zip(unique_outfit_ids, range(len(unique_outfit_ids))))
    interactions["outfit_index"] = interactions[outfit_column].map(outfit_to_index)

    # Matrix rows: users (index labels), numbered by first appearance.
    unique_users = pd.unique(interactions.index)
    user_to_index = dict(zip(unique_users, range(len(unique_users))))
    interactions["user_index"] = interactions.index.map(user_to_index)

    # Debug guard: positions handed to the sparse matrix must not be negative.
    for position_column, message in (
        ("outfit_index", "Negative outfit_index found"),
        ("user_index", "Negative user_index found"),
    ):
        if (interactions[position_column] < 0).any():
            raise ValueError(message)

    values = interactions["value"].values
    row_positions = interactions["user_index"].values
    column_positions = interactions["outfit_index"].values
    matrix_shape = (len(unique_users), len(unique_outfit_ids))
    user_item_coo = coo_matrix((values, (row_positions, column_positions)), shape=matrix_shape)

    # NOTE(review): this seeds NumPy's global generator. Recent `implicit` releases
    # take a `random_state` argument on the model instead -- confirm that this call
    # is what actually makes the run reproducible.
    np.random.seed(42)
    model = implicit.als.AlternatingLeastSquares(
        factors=factors,
        regularization=regularization,
        iterations=iterations,
    )
    csr = user_item_coo.tocsr()
    model.fit(csr)

    return model, csr, outfit_to_index, user_to_index

def run_als_training_loop(df, factors, regularization, run_name=""):
    """Train ALS on outfit ids and on outfit groups, then score hit rates at 10 and 100.

    Two models are trained with ``train_als_model``: one on the
    ``train_outfit_ids`` column and one on ``train_group``. For every row (user),
    ``evaluate_model`` is asked for ``n=100`` recommendations from each model, and
    the returned positions are translated back to ids with
    ``get_outfit_id_from_index``. ``evaluate_hit_rate_at_n`` then compares them with
    ``test_outfit_id`` / ``test_group`` at n=100 and n=10. The mean of each hit
    rate is printed and displayed.

    The input frame is not modified; all added columns live on the returned copy.

    Args:
        df: Split DataFrame with the columns ``train_outfit_ids``, ``train_group``,
            ``test_outfit_id`` and ``test_group``; index labels identify the users.
        factors: Number of latent factors for both models.
        regularization: Regularization strength for both models.
        run_name: Label used only in the printed summary and the result tuple.

    Returns:
        ``(df, summary, result_dict)`` where ``df`` is the copy with the
        recommendation and hit-rate columns added, ``summary`` is
        ``(factors, regularization, mean group hit rate at 100, run_name)`` and
        ``result_dict`` maps each hit-rate column name to its mean.
    """
    # Work on a copy. The columns added below used to be written straight into the
    # caller's frame, so the list-valued recommendation columns leaked into the
    # shared split DataFrames and were duplicated by `explode` in later training calls.
    df = df.copy()

    model_ind, csr_ind, outfit_id_to_index, user_to_index = train_als_model(df, "train_outfit_ids", factors=factors, regularization=regularization)
    # The group model's user mapping is discarded: both mappings are built from the
    # index of the same frame, so the one from the id model is used for both.
    model_group, csr_group, outfit_group_to_index, _ = train_als_model(df, "train_group", factors=factors, regularization=regularization)

    df["user_index"] = df.index.map(user_to_index)

    for recs_column, model, csr, item_to_index in (
        ("ind_recommendations", model_ind, csr_ind, outfit_id_to_index),
        ("group_recommendations", model_group, csr_group, outfit_group_to_index),
    ):
        # Invert id -> position so recommended positions can be mapped back to ids.
        index_to_id = {index: item for item, index in item_to_index.items()}
        df[recs_column] = df.apply(lambda x: evaluate_model(model, csr, x["user_index"], n=100), axis=1)
        df[recs_column] = df.apply(lambda x: get_outfit_id_from_index(x[recs_column], index_to_id), axis=1)

    # (result column, column holding the held-out target, recommendations column, n)
    hit_rate_specs = (
        ("id_hit_rate_at_100", "test_outfit_id", "ind_recommendations", 100),
        ("id_hit_rate_at_10", "test_outfit_id", "ind_recommendations", 10),
        ("group_hit_rate_at_100", "test_group", "group_recommendations", 100),
        ("group_hit_rate_at_10", "test_group", "group_recommendations", 10),
    )
    for hit_column, target_column, recs_column, top_n in hit_rate_specs:
        df[hit_column] = df.apply(lambda x: evaluate_hit_rate_at_n(x[target_column], x[recs_column], n=top_n), axis=1)
    HIT_RATE_COLUMNS = [spec[0] for spec in hit_rate_specs]

    print(f"Run name: {run_name} Factors: {factors}, Regularization: {regularization}")
    display(df[HIT_RATE_COLUMNS].mean())
    print("="*20)

    result_dict = {column: df[column].mean() for column in HIT_RATE_COLUMNS}

    return df, (factors, regularization, df["group_hit_rate_at_100"].mean(), run_name), result_dict
    


# Hyper-parameter grid for ALS; every combination is run on both split variants.
TEST_FACTORS = [32]
TEST_REGULARIZATIONS = [0.01]
run_dataframes = [(user_splits_df, "All Outfits"), (user_splits_unique_df, "Unique Outfit")]

# One (factors, regularization, (dataframe, run name)) tuple per combination.
test_permutations = list(product(TEST_FACTORS, TEST_REGULARIZATIONS, run_dataframes))

# NOTE: despite its name, group_hr_10_means collects the summary tuples returned by
# run_als_training_loop, which carry the mean group hit rate at 100.
group_hr_10_means, result_dicts = [], []
for test_factors, test_regularization, (df, run_name) in test_permutations:
    df, result, result_dict = run_als_training_loop(df, test_factors, test_regularization, run_name=run_name)
    result_dicts.append(result_dict)
    group_hr_10_means.append(result)


In [None]:
import pyperclip

def format_dicts_into_latex(all_dict, ind_dict, precision=4, model_name="ALS"):
    """Print two LaTeX table rows of hit rates and copy them to the clipboard.

    The first row holds the individual-outfit hit rates (``id_hit_rate_at_10`` and
    ``id_hit_rate_at_100``), the second the group hit rates
    (``group_hit_rate_at_10`` and ``group_hit_rate_at_100``). Within a row the
    values from ``all_dict`` come first, followed by those from ``ind_dict``.

    Args:
        all_dict: Mapping of hit-rate name to mean value for the first run.
        ind_dict: Mapping of hit-rate name to mean value for the second run.
        precision: Number of decimals printed for every value.
        model_name: Label that starts both rows; defaults to ``"ALS"``.

    Returns:
        None. The rows are printed and passed to ``pyperclip.copy``.
    """
    def cells(metric):
        # Four cells of one row: first run @10, @100, then second run @10, @100.
        keys = (f"{metric}_hit_rate_at_10", f"{metric}_hit_rate_at_100")
        return " & ".join(
            f"{results[key]:.{precision}f}"
            for results in (all_dict, ind_dict)
            for key in keys
        )

    first_row = f"{model_name} Ind & {cells('id')} \\\\"
    second_row = f"{model_name} Groups & {cells('group')} \\\\\\hline"
    full_string = first_row + "\n" + second_row + "\n"
    print(full_string)
    pyperclip.copy(full_string)

# result_dicts holds one entry per training run of the cell above: first the
# "All Outfits" split, then the "Unique Outfit" split (the order of run_dataframes).
# The unpacking therefore needs exactly two entries, i.e. a hyper-parameter grid
# with a single combination; otherwise this line raises ValueError.
# NOTE: ind_dict is the "Unique Outfit" result. result_dicts is reassigned by every
# model cell, so this cell reports whichever model was trained last.
all_dict, ind_dict = result_dicts
format_dicts_into_latex(all_dict, ind_dict, precision=4)

In [None]:
import pandas as pd
import implicit
from scipy.sparse import coo_matrix
from IPython.display import display
from itertools import product

from implicit.bpr import BayesianPersonalizedRanking


def train_bpr_model(user_splits_df, outfit_column, factors=16, regularization=0.1, learning_rate=0.1, iterations=500):
    """Fit a Bayesian Personalized Ranking model on one list-valued column.

    Every row of ``user_splits_df`` is one user, identified by the row's index
    label, and ``outfit_column`` holds the list of items for that user. The
    lists are exploded into (user, item) pairs, users and items are numbered in
    order of first appearance, and the pairs become a users x items sparse
    matrix. A pair that occurs several times is summed when the matrix is
    converted to CSR.

    Args:
        user_splits_df: DataFrame whose index labels identify the users.
        outfit_column: Name of the list-valued column to train on.
        factors: Number of latent factors handed to the model.
        regularization: Regularization strength handed to the model.
        learning_rate: Learning rate handed to the model.
        iterations: Number of training iterations handed to the model.

    Returns:
        ``(model, csr, outfit_to_index, user_to_index)``: the fitted model, the
        CSR matrix it was fitted on, and the dictionaries that map item ids to
        column positions and user ids to row positions.

    Raises:
        ValueError: If a negative row or column position is found.
    """
    interactions = user_splits_df.explode(outfit_column)
    interactions["value"] = 1

    # Matrix columns: items, numbered by first appearance.
    unique_outfit_ids = interactions[outfit_column].unique()
    outfit_to_index = dict(zip(unique_outfit_ids, range(len(unique_outfit_ids))))
    interactions["outfit_index"] = interactions[outfit_column].map(outfit_to_index)

    # Matrix rows: users (index labels), numbered by first appearance.
    unique_users = pd.unique(interactions.index)
    user_to_index = dict(zip(unique_users, range(len(unique_users))))
    interactions["user_index"] = interactions.index.map(user_to_index)

    # Debug guard: positions handed to the sparse matrix must not be negative.
    for position_column, message in (
        ("outfit_index", "Negative outfit_index found"),
        ("user_index", "Negative user_index found"),
    ):
        if (interactions[position_column] < 0).any():
            raise ValueError(message)

    values = interactions["value"].values
    row_positions = interactions["user_index"].values
    column_positions = interactions["outfit_index"].values
    matrix_shape = (len(unique_users), len(unique_outfit_ids))
    user_item_coo = coo_matrix((values, (row_positions, column_positions)), shape=matrix_shape)

    # NOTE(review): this seeds NumPy's global generator. Recent `implicit` releases
    # take a `random_state` argument on the model instead -- confirm that this call
    # is what actually makes the run reproducible.
    np.random.seed(43)
    model = BayesianPersonalizedRanking(
        factors=factors,
        regularization=regularization,
        iterations=iterations,
        learning_rate=learning_rate,
    )
    csr = user_item_coo.tocsr()
    model.fit(csr)

    return model, csr, outfit_to_index, user_to_index


def run_bpr_training_loop(df, factors, regularization, learning_rate, run_name=""):
    """Train BPR on outfit ids and on outfit groups, then score hit rates at 10 and 100.

    Two models are trained with ``train_bpr_model``: one on the
    ``train_outfit_ids`` column and one on ``train_group``. For every row (user),
    ``evaluate_model`` is asked for ``n=100`` recommendations from each model, and
    the returned positions are translated back to ids with
    ``get_outfit_id_from_index``. ``evaluate_hit_rate_at_n`` then compares them with
    ``test_outfit_id`` / ``test_group`` at n=100 and n=10. The mean of each hit
    rate is printed and displayed.

    The input frame is not modified; all added columns live on the returned copy.

    Args:
        df: Split DataFrame with the columns ``train_outfit_ids``, ``train_group``,
            ``test_outfit_id`` and ``test_group``; index labels identify the users.
        factors: Number of latent factors for both models.
        regularization: Regularization strength for both models.
        learning_rate: Learning rate for both models.
        run_name: Label used only in the printed summary and the result tuple.

    Returns:
        ``(df, summary, result_dict)`` where ``df`` is the copy with the
        recommendation and hit-rate columns added, ``summary`` is ``(factors,
        regularization, mean group hit rate at 100, mean id hit rate at 100,
        run_name)`` and ``result_dict`` maps each hit-rate column name to its mean.
    """
    # Work on a copy. The columns added below used to be written straight into the
    # caller's frame, so the list-valued recommendation columns leaked into the
    # shared split DataFrames and were duplicated by `explode` in later training calls.
    df = df.copy()

    model_ind, csr_ind, outfit_id_to_index, user_to_index = train_bpr_model(df, "train_outfit_ids", factors=factors, regularization=regularization, learning_rate=learning_rate)
    # The group model's user mapping is discarded: both mappings are built from the
    # index of the same frame, so the one from the id model is used for both.
    model_group, csr_group, outfit_group_to_index, _ = train_bpr_model(df, "train_group", factors=factors, regularization=regularization, learning_rate=learning_rate)

    df["user_index"] = df.index.map(user_to_index)

    for recs_column, model, csr, item_to_index in (
        ("ind_recommendations", model_ind, csr_ind, outfit_id_to_index),
        ("group_recommendations", model_group, csr_group, outfit_group_to_index),
    ):
        # Invert id -> position so recommended positions can be mapped back to ids.
        index_to_id = {index: item for item, index in item_to_index.items()}
        df[recs_column] = df.apply(lambda x: evaluate_model(model, csr, x["user_index"], n=100), axis=1)
        df[recs_column] = df.apply(lambda x: get_outfit_id_from_index(x[recs_column], index_to_id), axis=1)

    # (result column, column holding the held-out target, recommendations column, n)
    hit_rate_specs = (
        ("id_hit_rate_at_100", "test_outfit_id", "ind_recommendations", 100),
        ("id_hit_rate_at_10", "test_outfit_id", "ind_recommendations", 10),
        ("group_hit_rate_at_100", "test_group", "group_recommendations", 100),
        ("group_hit_rate_at_10", "test_group", "group_recommendations", 10),
    )
    for hit_column, target_column, recs_column, top_n in hit_rate_specs:
        df[hit_column] = df.apply(lambda x: evaluate_hit_rate_at_n(x[target_column], x[recs_column], n=top_n), axis=1)
    HIT_RATE_COLUMNS = [spec[0] for spec in hit_rate_specs]

    # The learning rate is part of the sweep, so it is reported with the other
    # hyper-parameters; without it, runs differing only in learning rate look identical.
    print(f"Run name: {run_name} Factors: {factors}, Regularization: {regularization}, Learning rate: {learning_rate}")
    display(df[HIT_RATE_COLUMNS].mean())
    print("="*20)

    result_dict = {column: df[column].mean() for column in HIT_RATE_COLUMNS}

    return df, (factors, regularization, df["group_hit_rate_at_100"].mean(), df["id_hit_rate_at_100"].mean(), run_name), result_dict


# Hyper-parameter grid for BPR; every combination is run on both split variants.
TEST_FACTORS = [128]
TEST_REGULARIZATIONS = [0.01]
TEST_LEARNING_RATES = [0.01]
run_dataframes = [(user_splits_df, "All Outfits"), (user_splits_unique_df, "Unique Outfit")]

# One (factors, regularization, learning rate, (dataframe, run name)) tuple per combination.
test_permutations = list(product(TEST_FACTORS, TEST_REGULARIZATIONS, TEST_LEARNING_RATES, run_dataframes))

# NOTE: despite its name, group_hr_10_means collects the summary tuples returned by
# run_bpr_training_loop, which carry the mean hit rates at 100.
# result_dataframes keeps a copy of every per-run frame for the overlap check below.
group_hr_10_means, result_dicts, result_dataframes = [], [], []
for test_factors, test_regularization, test_learning_rate, (df, run_name) in test_permutations:
    df, result, result_dict = run_bpr_training_loop(df, test_factors, test_regularization, test_learning_rate, run_name=run_name)
    result_dicts.append(result_dict)
    group_hr_10_means.append(result)
    result_dataframes.append(df.copy())

In [None]:
# Compare the two BPR runs (standard split vs. unique-outfit split): for which share
# of users does the individual-outfit model score a hit at 100 in BOTH runs?

# Needs exactly two entries in result_dataframes, i.e. a single hyper-parameter combination.
standard_result_df, unique_result_df = result_dataframes
print(standard_result_df["id_hit_rate_at_100"].mean(), unique_result_df["id_hit_rate_at_100"].mean())

# Prefix the unique-run columns with "u_" so both frames can sit side by side.
# The rename is done on a copy: assigning to .columns directly would rename the
# frame stored in result_dataframes, and a second run of this cell would then
# produce "u_u_*" columns and fail with a KeyError.
unique_columns = ["u_" + column_name for column_name in unique_result_df.columns]
unique_result_df = unique_result_df.copy()
unique_result_df.columns = unique_columns

def check_if_overlap_in_hit(row, column_name):
    """Tell whether a row scores a hit in both the standard and the unique run.

    Args:
        row: Mapping or Series holding ``column_name`` (standard run) and
            ``"u_" + column_name`` (unique run).
        column_name: Name of the hit-rate column to compare.

    Returns:
        Truthy only when both values are greater than 0.1.
    """
    standard_hit, unique_hit = row[column_name], row["u_" + column_name]
    return standard_hit > 0.1 and unique_hit > 0.1

# Put both runs side by side (aligned on the index), flag the rows that hit in
# both runs, and show the share of such rows as the cell output.
all_results = pd.concat((standard_result_df, unique_result_df), axis=1)
all_results["overlap_100"] = all_results.apply(
    check_if_overlap_in_hit,
    axis=1,
    args=("id_hit_rate_at_100",),
)
all_results["overlap_100"].mean()

In [None]:
import pyperclip

def format_dicts_into_latex(all_dict, ind_dict, precision=4, model_name="BPR"):
    """Print two LaTeX table rows of hit rates and copy them to the clipboard.

    The first row holds the individual-outfit hit rates (``id_hit_rate_at_10`` and
    ``id_hit_rate_at_100``), the second the group hit rates
    (``group_hit_rate_at_10`` and ``group_hit_rate_at_100``). Within a row the
    values from ``all_dict`` come first, followed by those from ``ind_dict``.

    Args:
        all_dict: Mapping of hit-rate name to mean value for the first run.
        ind_dict: Mapping of hit-rate name to mean value for the second run.
        precision: Number of decimals printed for every value.
        model_name: Label that starts both rows; defaults to ``"BPR"``.

    Returns:
        None. The rows are printed and passed to ``pyperclip.copy``.
    """
    def cells(metric):
        # Four cells of one row: first run @10, @100, then second run @10, @100.
        keys = (f"{metric}_hit_rate_at_10", f"{metric}_hit_rate_at_100")
        return " & ".join(
            f"{results[key]:.{precision}f}"
            for results in (all_dict, ind_dict)
            for key in keys
        )

    first_row = f"{model_name} Ind & {cells('id')} \\\\"
    second_row = f"{model_name} Groups & {cells('group')} \\\\\\hline"
    full_string = first_row + "\n" + second_row + "\n"
    print(full_string)
    pyperclip.copy(full_string)

# result_dicts holds one entry per training run of the BPR cell: first the
# "All Outfits" split, then the "Unique Outfit" split (the order of run_dataframes).
# The unpacking therefore needs exactly two entries, i.e. a hyper-parameter grid
# with a single combination; otherwise this line raises ValueError.
# NOTE: ind_dict is the "Unique Outfit" result. result_dicts is reassigned by every
# model cell, so this cell reports whichever model was trained last.
all_dict, ind_dict = result_dicts
format_dicts_into_latex(all_dict, ind_dict, precision=4)

In [None]:
import pandas as pd
import implicit
from scipy.sparse import coo_matrix
from IPython.display import display
from itertools import product

from implicit.lmf import LogisticMatrixFactorization

def train_lmf_model(user_splits_df, outfit_column, factors=16, regularization=0.1, learning_rate=0.1, iterations=500):
    """Fit a Logistic Matrix Factorization model on one list-valued column.

    Every row of ``user_splits_df`` is one user, identified by the row's index
    label, and ``outfit_column`` holds the list of items for that user. The
    lists are exploded into (user, item) pairs, users and items are numbered in
    order of first appearance, and the pairs become a users x items sparse
    matrix. A pair that occurs several times is summed when the matrix is
    converted to CSR.

    Args:
        user_splits_df: DataFrame whose index labels identify the users.
        outfit_column: Name of the list-valued column to train on.
        factors: Number of latent factors handed to the model.
        regularization: Regularization strength handed to the model.
        learning_rate: Learning rate handed to the model.
        iterations: Number of training iterations handed to the model.

    Returns:
        ``(model, csr, outfit_to_index, user_to_index)``: the fitted model, the
        CSR matrix it was fitted on, and the dictionaries that map item ids to
        column positions and user ids to row positions.

    Raises:
        ValueError: If a negative row or column position is found.
    """
    interactions = user_splits_df.explode(outfit_column)
    interactions["value"] = 1

    # Matrix columns: items, numbered by first appearance.
    unique_outfit_ids = interactions[outfit_column].unique()
    outfit_to_index = dict(zip(unique_outfit_ids, range(len(unique_outfit_ids))))
    interactions["outfit_index"] = interactions[outfit_column].map(outfit_to_index)

    # Matrix rows: users (index labels), numbered by first appearance.
    unique_users = pd.unique(interactions.index)
    user_to_index = dict(zip(unique_users, range(len(unique_users))))
    interactions["user_index"] = interactions.index.map(user_to_index)

    # Debug guard: positions handed to the sparse matrix must not be negative.
    for position_column, message in (
        ("outfit_index", "Negative outfit_index found"),
        ("user_index", "Negative user_index found"),
    ):
        if (interactions[position_column] < 0).any():
            raise ValueError(message)

    values = interactions["value"].values
    row_positions = interactions["user_index"].values
    column_positions = interactions["outfit_index"].values
    matrix_shape = (len(unique_users), len(unique_outfit_ids))
    user_item_coo = coo_matrix((values, (row_positions, column_positions)), shape=matrix_shape)

    # NOTE(review): this seeds NumPy's global generator. Recent `implicit` releases
    # take a `random_state` argument on the model instead -- confirm that this call
    # is what actually makes the run reproducible.
    np.random.seed(43)
    model = LogisticMatrixFactorization(
        factors=factors,
        regularization=regularization,
        iterations=iterations,
        learning_rate=learning_rate,
    )
    csr = user_item_coo.tocsr()
    model.fit(csr)

    return model, csr, outfit_to_index, user_to_index

def run_lmf_training_loop(df, factors, regularization, learning_rate, run_name=""):
    """Train LMF on outfit ids and on outfit groups, then score hit rates at 10 and 100.

    Two models are trained with ``train_lmf_model``: one on the
    ``train_outfit_ids`` column and one on ``train_group``. For every row (user),
    ``evaluate_model`` is asked for ``n=100`` recommendations from each model, and
    the returned positions are translated back to ids with
    ``get_outfit_id_from_index``. ``evaluate_hit_rate_at_n`` then compares them with
    ``test_outfit_id`` / ``test_group`` at n=100 and n=10. The mean of each hit
    rate is printed and displayed.

    The input frame is not modified; all added columns live on the returned copy.

    Args:
        df: Split DataFrame with the columns ``train_outfit_ids``, ``train_group``,
            ``test_outfit_id`` and ``test_group``; index labels identify the users.
        factors: Number of latent factors for both models.
        regularization: Regularization strength for both models.
        learning_rate: Learning rate for both models.
        run_name: Label used only in the printed summary and the result tuple.

    Returns:
        ``(df, summary, result_dict)`` where ``df`` is the copy with the
        recommendation and hit-rate columns added, ``summary`` is ``(factors,
        regularization, mean group hit rate at 10, mean id hit rate at 10,
        run_name)`` and ``result_dict`` maps each hit-rate column name to its mean.
    """
    # Work on a copy. The columns added below used to be written straight into the
    # caller's frame, so the list-valued recommendation columns leaked into the
    # shared split DataFrames and were duplicated by `explode` in later training calls.
    df = df.copy()

    model_ind, csr_ind, outfit_id_to_index, user_to_index = train_lmf_model(df, "train_outfit_ids", factors=factors, regularization=regularization, learning_rate=learning_rate)
    # The group model's user mapping is discarded: both mappings are built from the
    # index of the same frame, so the one from the id model is used for both.
    model_group, csr_group, outfit_group_to_index, _ = train_lmf_model(df, "train_group", factors=factors, regularization=regularization, learning_rate=learning_rate)

    df["user_index"] = df.index.map(user_to_index)

    for recs_column, model, csr, item_to_index in (
        ("ind_recommendations", model_ind, csr_ind, outfit_id_to_index),
        ("group_recommendations", model_group, csr_group, outfit_group_to_index),
    ):
        # Invert id -> position so recommended positions can be mapped back to ids.
        index_to_id = {index: item for item, index in item_to_index.items()}
        df[recs_column] = df.apply(lambda x: evaluate_model(model, csr, x["user_index"], n=100), axis=1)
        df[recs_column] = df.apply(lambda x: get_outfit_id_from_index(x[recs_column], index_to_id), axis=1)

    # (result column, column holding the held-out target, recommendations column, n)
    hit_rate_specs = (
        ("id_hit_rate_at_100", "test_outfit_id", "ind_recommendations", 100),
        ("id_hit_rate_at_10", "test_outfit_id", "ind_recommendations", 10),
        ("group_hit_rate_at_100", "test_group", "group_recommendations", 100),
        ("group_hit_rate_at_10", "test_group", "group_recommendations", 10),
    )
    for hit_column, target_column, recs_column, top_n in hit_rate_specs:
        df[hit_column] = df.apply(lambda x: evaluate_hit_rate_at_n(x[target_column], x[recs_column], n=top_n), axis=1)
    HIT_RATE_COLUMNS = [spec[0] for spec in hit_rate_specs]

    # The learning rate is part of the sweep, so it is reported with the other
    # hyper-parameters; without it, runs differing only in learning rate look identical.
    print(f"Run name: {run_name} Factors: {factors}, Regularization: {regularization}, Learning rate: {learning_rate}")
    display(df[HIT_RATE_COLUMNS].mean())
    print("="*20)

    result_dict = {column: df[column].mean() for column in HIT_RATE_COLUMNS}

    return df, (factors, regularization, df["group_hit_rate_at_10"].mean(), df["id_hit_rate_at_10"].mean(), run_name), result_dict


# Hyper-parameter grid for LMF; every combination is run on both split variants.
TEST_FACTORS = [128]
TEST_REGULARIZATIONS = [0.01]
TEST_LEARNING_RATES = [0.01]
run_dataframes = [(user_splits_df, "All Outfits"), (user_splits_unique_df, "Unique Outfit")]

# One (factors, regularization, learning rate, (dataframe, run name)) tuple per combination.
test_permutations = list(product(TEST_FACTORS, TEST_REGULARIZATIONS, TEST_LEARNING_RATES, run_dataframes))

# group_hr_10_means collects the summary tuples returned by run_lmf_training_loop
# (they carry the mean group and id hit rates at 10); result_dicts feeds the LaTeX cell.
group_hr_10_means, result_dicts = [], []
for test_factors, test_regularization, test_learning_rate, (df, run_name) in test_permutations:
    df, result, result_dict = run_lmf_training_loop(df, test_factors, test_regularization, test_learning_rate, run_name=run_name)
    result_dicts.append(result_dict)
    group_hr_10_means.append(result)

In [None]:
def format_dicts_into_latex(all_dict, ind_dict, precision=4, model_name="LMF"):
    """Print two LaTeX table rows of hit rates.

    The first row holds the individual-outfit hit rates (``id_hit_rate_at_10`` and
    ``id_hit_rate_at_100``), the second the group hit rates
    (``group_hit_rate_at_10`` and ``group_hit_rate_at_100``). Within a row the
    values from ``all_dict`` come first, followed by those from ``ind_dict``.
    Unlike the ALS and BPR variants of this helper, nothing is copied to the
    clipboard and the row labels end with a colon.

    Args:
        all_dict: Mapping of hit-rate name to mean value for the first run.
        ind_dict: Mapping of hit-rate name to mean value for the second run.
        precision: Number of decimals printed for every value.
        model_name: Label that starts both rows; defaults to ``"LMF"``.

    Returns:
        None. The two rows are printed one after the other.
    """
    def cells(metric):
        # Four cells of one row: first run @10, @100, then second run @10, @100.
        keys = (f"{metric}_hit_rate_at_10", f"{metric}_hit_rate_at_100")
        return " & ".join(
            f"{results[key]:.{precision}f}"
            for results in (all_dict, ind_dict)
            for key in keys
        )

    first_row = f"{model_name} Ind: & {cells('id')} \\\\"
    # The label used to read "Groupd:", a typo that ended up in the generated table.
    second_row = f"{model_name} Groups: & {cells('group')} \\\\\\hline"
    print(first_row)
    print(second_row)

# result_dicts holds one entry per training run of the LMF cell: first the
# "All Outfits" split, then the "Unique Outfit" split (the order of run_dataframes).
# The unpacking therefore needs exactly two entries, i.e. a hyper-parameter grid
# with a single combination; otherwise this line raises ValueError.
# NOTE: ind_dict is the "Unique Outfit" result. result_dicts is reassigned by every
# model cell, so this cell reports whichever model was trained last.
all_dict, ind_dict = result_dicts
format_dicts_into_latex(all_dict, ind_dict, precision=4)