In [1]:
#install
!pip install pyarrow

#import
import pandas as pd
import numpy as np

%run prepare_train_test_splits.ipynb
%run evaluate_models.ipynb
%run availability_based_methods.ipynb



In [2]:
#load data
orders=pd.read_parquet('../archive/data/orders.parquet',engine='pyarrow')
outfits=pd.read_parquet('../archive/data/outfits.parquet',engine='pyarrow')

In [3]:
#prepare data

#convert tag_categories and outfit_tags to lists
outfits["tag_categories"] = outfits["tag_categories"].apply(eval)
outfits["outfit_tags"] = outfits["outfit_tags"].apply(eval)

outfits['group']=outfits['group'].astype(str)

# Convert triplets into entries for each individual user
orders = remove_consecutive_duplicates(orders)
user_orders_df = translate_user_triplets_to_orders(orders, outfits)
user_orders_df.dropna(inplace=True)

# Split the data into train and test sets, with one dataframe with no restirictions on outfits in the test data and one that prohibits repeated outfits
# It prints any cases in which it is unable to construct a test set with unique outfits.
user_splits, user_splits_unique = convert_user_orders_to_train_test_splits(user_orders_df, percentage_test=0.3)

3607
No unique outfit found with groups ['group.423a23f6717e6d85adac54c051ee9832'
 'group.423a23f6717e6d85adac54c051ee9832']
No unique outfit found with groups ['group.384b8170c6a6ddfd568ff7fab5fb49c4'
 'group.384b8170c6a6ddfd568ff7fab5fb49c4']
No unique outfit found with groups ['group.a3ab26b5d2f7ef2cf102422a3dde3b46'
 'group.a3ab26b5d2f7ef2cf102422a3dde3b46']
No unique outfit found with groups ['group.9b5204b87abc93f8f0467b0a6a9c6a97'
 'group.9b5204b87abc93f8f0467b0a6a9c6a97'
 'group.9b5204b87abc93f8f0467b0a6a9c6a97']
No unique outfit found with groups ['group.8e50238120d13b31284f151941c2ee81'
 'group.8e50238120d13b31284f151941c2ee81']
No unique outfit found with groups ['group.a494d07781a1aab0e3a42989288feff2'
 'group.a494d07781a1aab0e3a42989288feff2']
No unique outfit found with groups ['group.a1d284ef1c7035dd14e57eba3838a303'
 'group.a1d284ef1c7035dd14e57eba3838a303']
No unique outfit found with groups ['group.e0cb0f6e113edc4df8a1e304376734f6'
 'group.e0cb0f6e113edc4df8a1e3043767

In [4]:
user_splits.columns

Index(['train_outfit_ids', 'test_outfit_id', 'train_group', 'test_group',
       'train_booking_times_start', 'test_booking_time_start',
       'train_booking_times_end', 'test_booking_time_end'],
      dtype='object')

In [5]:
user_splits.head()

Unnamed: 0,train_outfit_ids,test_outfit_id,train_group,test_group,train_booking_times_start,test_booking_time_start,train_booking_times_end,test_booking_time_end
0,"[outfit.85f26909d8334ab78f30c2fc9c73faf7, outf...",[outfit.9f5058295098471abdfaf0a7c74ddbfe],"[group.c79c907b6c94a9bd2005e038943ab529, group...",[group.f6f0b9ebb3228aab27a79ac658c76682],"[2023-11-22 00:00:00, 2023-11-24 00:00:00, 202...",[2023-12-06 00:00:00],"[2023-12-21 00:00:00, 2023-12-23 00:00:00, 202...",[2024-01-05 00:00:00]
1,"[outfit.d7bff1b799a34575a47ce0f531791c9f, outf...","[outfit.98fa1b5287182a9d, outfit.dd04098010f74...","[group.287dba5268fb7b20e8ef81c053970691, group...","[group.a4449ee16d7951f425083623efd0dcec, group...","[2021-08-02 00:00:00, 2021-08-02 00:00:00, 202...","[2021-11-01 00:00:00, 2021-12-01 00:00:00, 202...","[2021-08-27 00:00:00, 2021-08-27 00:00:00, 202...","[2021-11-30 00:00:00, 2021-12-31 00:00:00, 202..."
2,[outfit.9fde090f117fb9d9],[outfit.849ace7e1811150d],[group.27808d969027a4e243c8945176f280c0],[group.caafbed55494b0c93dab58d58d526f0a],[2018-09-06 00:00:00],[2018-09-06 00:00:00],[2018-09-09 00:00:00],[2018-09-09 00:00:00]
3,"[outfit.98eebea274f23dd6, outfit.648db79508724...","[outfit.b2c68e50868a46a8872e81bcd3a17870, outf...","[group.a02de08741b879719c3ea97e24e5f230, group...","[group.69217601bce159dcf21b4c8e6f059f42, group...","[2021-08-25 00:00:00, 2021-08-25 00:00:00, 202...","[2022-02-28 00:00:00, 2022-02-28 00:00:00, 202...","[2021-09-24 00:00:00, 2021-09-24 00:00:00, 202...","[2022-03-28 00:00:00, 2022-03-28 00:00:00, 202..."
4,"[outfit.5e1b9778e36d475699772148e5d4e27b, outf...",[outfit.7321c26a479e46cd9fb07fa3ab7d7594],"[group.0a736bffd33390d7693442e6eecd0f35, group...",[group.cce63b3a8de0f3495c0744990e88b78f],"[2019-11-20 00:00:00, 2019-11-20 00:00:00]",[2019-11-20 00:00:00],"[2019-11-21 00:00:00, 2019-11-21 00:00:00]",[2019-11-21 00:00:00]


In [6]:
orders.head()

Unnamed: 0,customer.id,outfit.id,rentalPeriod.start,rentalPeriod.end,group
0,3945,outfit.923f3fd476b5450b9582d1f525604546,2018-05-25,2018-05-28,group.6f394f7e504a39f26ca691fb02e5ed22
1,4088,outfit.8c8e922e228ba03f,2019-08-29,2019-09-02,group.9e3f83b7a4adb80992eee691fc83403f
2,4360,outfit.96f152543e7668ae,2018-08-10,2018-08-13,group.32ebcc4a2acfe62819949bb3b6c9256b
3,4697,outfit.ddba05a5ced34fa1ab3a0722c05bb11a,2018-06-14,2018-06-19,group.27f89b2927dbb3def926fa21ac1bdd26
4,3890,outfit.5ef01d4dc15243fb854ca797716fd663,2019-08-24,2019-08-27,group.9ba8ee768fcc2ca9c7dee05b23b7daa0


In [7]:
# Assorted functions for building simple heuristics for recommender systems

def get_most_popular_outfits(df, n=10):
    most_popular_train_outfit_ids = df["train_outfit_ids"].explode().value_counts().index[:n]
    most_popular_train_groups = df["train_group"].explode().value_counts().index[:n]

    return np.array(most_popular_train_outfit_ids), np.array(most_popular_train_groups)

def get_previous_rentals(df, n=10):
    df["id_prediction"] = df["train_outfit_ids"].apply(lambda x: x if len(x) < n else x[n:])
    df["group_prediction"] = df["train_group"].apply(lambda x: x if len(x) < n else x[n:])

    return df

def pad_with_most_popular(x, pop_outfits, n=10):
    if len(x) < n:
        return np.append(x, pop_outfits[:n - len(x)])
    else:
        return x[:n]

def get_previous_rentals_pad_most_popular(df, n=10):
    most_popular_train_outfit_ids, most_popular_train_groups = get_most_popular_outfits(df, n)
    df["id_prediction"] = df.apply(lambda x: pad_with_most_popular(x["train_outfit_ids"], most_popular_train_outfit_ids, n), axis=1)
    df["group_prediction"] = df.apply(lambda x: pad_with_most_popular(x["train_group"], most_popular_train_groups, n), axis=1)
    return df

In [8]:
# The maximum number of items to recommend
NUM_ITEMS = 100

# The below code represents the four baseline methods discussed.

"""
- availability_based determines whether to filter out clothes that are currently rented
- use_testing_time determines whether we are currently testing the system or making real predictions
    (True is used to test the system), only used if availability_based = True"""

# Most popular outfits prediction
def predict_most_popular(user_splits_df_pop, user_splits_unique_df_pop,availability_based=True,use_testing_time=True):
    most_popular_train_outfit_ids_pop, most_popular_train_groups_pop = get_most_popular_outfits(user_splits_df_pop, NUM_ITEMS)
    user_splits_df_pop["id_prediction"] = [most_popular_train_outfit_ids_pop] * len(user_splits_df_pop)
    user_splits_df_pop["group_prediction"] = [most_popular_train_groups_pop] * len(user_splits_df_pop)
    most_popular_train_outfit_ids_pop, most_popular_train_groups_pop = get_most_popular_outfits(user_splits_unique_df_pop, NUM_ITEMS)
    user_splits_unique_df_pop["id_prediction"] = [most_popular_train_outfit_ids_pop] * len(user_splits_unique_df_pop)
    user_splits_unique_df_pop["group_prediction"] = [most_popular_train_groups_pop] * len(user_splits_unique_df_pop)
    if availability_based==True:
        user_splits_df_pop=availability_based_filter(user_splits_df_pop,use_testing_time)
        user_splits_unique_df_pop=availability_based_filter(user_splits_unique_df_pop,use_testing_time)
    return user_splits_df_pop, user_splits_unique_df_pop

# Previous rental prediction
def predict_previous_rental(user_splits_df_rep, user_splits_unique_df_rep,availability_based=True,use_testing_time=True):
    user_splits_df_rep["id_prediction"] = user_splits_df_rep["train_outfit_ids"].apply(lambda x: x[-NUM_ITEMS:])
    user_splits_df_rep["group_prediction"] = user_splits_df_rep["train_group"].apply(lambda x: x[-NUM_ITEMS:])
    user_splits_unique_df_rep["id_prediction"] = user_splits_unique_df_rep["train_outfit_ids"].apply(lambda x: x if len(x) <= NUM_ITEMS else x[-NUM_ITEMS:])
    user_splits_unique_df_rep["group_prediction"] = user_splits_unique_df_rep["train_group"].apply(lambda x: x if len(x) <= NUM_ITEMS else x[-NUM_ITEMS:])
    if availability_based==True:
        user_splits_df_rep=availability_based_filter(user_splits_df_rep,use_testing_time)
        user_splits_unique_df_rep=availability_based_filter(user_splits_unique_df_rep,use_testing_time)
    return user_splits_df_rep, user_splits_unique_df_rep

# Previous rental + most popular outfits prediction
def predict_rental_and_most_popular(user_splits_df, user_splits_unique_df,availability_based=True,use_testing_time=True):
    def pad_with_most_popular(x, pop_outfits):
        if len(x) < NUM_ITEMS:
            return np.append(x, pop_outfits[:NUM_ITEMS - len(x)])
        else:
            return x[-NUM_ITEMS:]

    most_popular_train_outfit_ids, most_popular_train_groups = get_most_popular_outfits(user_splits_df, NUM_ITEMS)
    user_splits_df["id_prediction"] = user_splits_df.apply(lambda x: pad_with_most_popular(x["train_outfit_ids"], most_popular_train_outfit_ids), axis=1)
    user_splits_df["group_prediction"] = user_splits_df.apply(lambda x: pad_with_most_popular(x["train_group"], most_popular_train_groups), axis=1)
    user_splits_unique_df["id_prediction"] = user_splits_unique_df.apply(lambda x: pad_with_most_popular(x["train_outfit_ids"], most_popular_train_outfit_ids), axis=1)
    user_splits_unique_df["group_prediction"] = user_splits_unique_df.apply(lambda x: pad_with_most_popular(x["train_group"], most_popular_train_groups), axis=1)
    if availability_based==True:
        user_splits_df=availability_based_filter(user_splits_df,use_testing_time)
        user_splits_unique_df=availability_based_filter(user_splits_unique_df,use_testing_time)
    return user_splits_df, user_splits_unique_df

# Random prediction
def predict_random_outfit(user_splits_df_rand, user_splits_unique_df_rand,availability_based=True,use_testing_time=True):
    def get_random_outfits(x):
        return np.random.choice(all_outfit_ids_rand, NUM_ITEMS, replace=False)
    all_outfit_ids_rand = outfits["id"].values
    all_groups_rand = outfits["group"].values
    user_splits_df_rand["id_prediction"] = user_splits_df_rand.apply(lambda x: get_random_outfits(x), axis=1)
    user_splits_df_rand["group_prediction"] = user_splits_df_rand.apply(lambda x: np.random.choice(all_groups_rand, NUM_ITEMS, replace=False), axis=1)
    user_splits_unique_df_rand["id_prediction"] = user_splits_unique_df_rand.apply(lambda x: get_random_outfits(x), axis=1)
    user_splits_unique_df_rand["group_prediction"] = user_splits_unique_df_rand.apply(lambda x: np.random.choice(all_groups_rand, NUM_ITEMS, replace=False), axis=1)
    if availability_based==True:
        user_splits_df_rand=availability_based_filter(user_splits_df_rand,use_testing_time)
        user_splits_unique_df_rand=availability_based_filter(user_splits_unique_df_rand,use_testing_time)
    return user_splits_df_rand, user_splits_unique_df_rand

In [9]:
#predict and evaluat for method  Most popular 
user_splits_df_pop, user_splits_unique_df_pop = predict_most_popular(user_splits, user_splits_unique,availability_based=False)
print(f"Baseline evaluation for method:  Most popular")
user_splits_df_pop, all_dict_pop = evaluate_df_hit_rate_at_n(user_splits_df_pop, n=10)
user_splits_unique_df_pop, ind_dict_pop = evaluate_df_hit_rate_at_n(user_splits_unique_df_pop, n=10)

Baseline evaluation for method:  Most popular


id_hit_rate_at_100       0.117252
id_hit_rate_at_10        0.021952
group_hit_rate_at_100    0.206870
group_hit_rate_at_10     0.044163
dtype: float64

id_hit_rate_at_100       0.116870
id_hit_rate_at_10        0.021767
group_hit_rate_at_100    0.205494
group_hit_rate_at_10     0.044312
dtype: float64

In [10]:
#predict and evaluat for method Most popular with availability_based
user_splits_df_pop, user_splits_unique_df_pop = predict_most_popular(user_splits, user_splits_unique,availability_based=True, use_testing_time=True)
print(f"Baseline evaluation for method:  Most popular")
user_splits_df_pop, all_dict_pop = evaluate_df_hit_rate_at_n(user_splits_df_pop, n=10)
user_splits_unique_df_pop, ind_dict_pop = evaluate_df_hit_rate_at_n(user_splits_unique_df_pop, n=10)

Baseline evaluation for method:  Most popular


id_hit_rate_at_100       0.117252
id_hit_rate_at_10        0.021952
group_hit_rate_at_100    0.206870
group_hit_rate_at_10     0.044163
dtype: float64

id_hit_rate_at_100       0.116870
id_hit_rate_at_10        0.021767
group_hit_rate_at_100    0.205494
group_hit_rate_at_10     0.044312
dtype: float64

In [12]:
#predict and evaluat for method Previous rental
user_splits_df_rep, user_splits_unique_df_rep = predict_previous_rental(user_splits, user_splits_unique,availability_based=False)
print(f"Baseline evaluation for method: Previous rental")
user_splits_df_rep, all_dict_rep = evaluate_df_hit_rate_at_n(user_splits_df_rep, n=10)
user_splits_unique_df_rep, ind_dict_rep = evaluate_df_hit_rate_at_n(user_splits_unique_df_rep, n=10)

Baseline evaluation for method: Previous rental


id_hit_rate_at_100       0.125775
id_hit_rate_at_10        0.055527
group_hit_rate_at_100    0.153151
group_hit_rate_at_10     0.077221
dtype: float64

id_hit_rate_at_100       0.123089
id_hit_rate_at_10        0.052604
group_hit_rate_at_100    0.150298
group_hit_rate_at_10     0.074112
dtype: float64

In [13]:
#predict and evaluat for method Previous rental with availability_based
del user_splits_unique_df_rep
user_splits_df_rep, user_splits_unique_df_rep = predict_previous_rental(user_splits, user_splits_unique,availability_based=True, use_testing_time=True)
print(f"Baseline evaluation for method: Previous rental")
user_splits_df_rep, all_dict_rep = evaluate_df_hit_rate_at_n(user_splits_df_rep, n=10)
user_splits_unique_df_rep, ind_dict_rep = evaluate_df_hit_rate_at_n(user_splits_unique_df_rep, n=10)

Baseline evaluation for method: Previous rental


id_hit_rate_at_100       0.125775
id_hit_rate_at_10        0.055527
group_hit_rate_at_100    0.153151
group_hit_rate_at_10     0.077221
dtype: float64

id_hit_rate_at_100       0.123089
id_hit_rate_at_10        0.052604
group_hit_rate_at_100    0.150298
group_hit_rate_at_10     0.074112
dtype: float64

In [14]:
#predict and evaluat for method Previous rental + Most popular
user_splits_df_rep_pop, user_splits_unique_df_rep_pop = predict_rental_and_most_popular(user_splits, user_splits_unique,availability_based=False)
print(f"Baseline evaluation for method: Previous rental + Most popular")
user_splits_df_rep_pop, all_dict_rep_pop = evaluate_df_hit_rate_at_n(user_splits_df_rep_pop, n=10)
user_splits_unique_df_rep_pop, ind_dict_rep_pop = evaluate_df_hit_rate_at_n(user_splits_unique_df_rep_pop, n=10)

Baseline evaluation for method: Previous rental + Most popular


id_hit_rate_at_100       0.197572
id_hit_rate_at_10        0.062242
group_hit_rate_at_100    0.262913
group_hit_rate_at_10     0.080062
dtype: float64

id_hit_rate_at_100       0.195128
id_hit_rate_at_10        0.059342
group_hit_rate_at_100    0.260430
group_hit_rate_at_10     0.076963
dtype: float64

In [15]:
#predict and evaluat for method Previous rental + Most popular with ,availability_based
user_splits_df_rep_pop, user_splits_unique_df_rep_pop = predict_rental_and_most_popular(user_splits, user_splits_unique,availability_based=True, use_testing_time=True)
print(f"Baseline evaluation for method: Previous rental + Most popular")
user_splits_df_rep_pop, all_dict_rep_pop = evaluate_df_hit_rate_at_n(user_splits_df_rep_pop, n=10)
user_splits_unique_df_rep_pop, ind_dict_rep_pop = evaluate_df_hit_rate_at_n(user_splits_unique_df_rep_pop, n=10)

Baseline evaluation for method: Previous rental + Most popular


id_hit_rate_at_100       0.197572
id_hit_rate_at_10        0.062242
group_hit_rate_at_100    0.262913
group_hit_rate_at_10     0.080062
dtype: float64

id_hit_rate_at_100       0.195128
id_hit_rate_at_10        0.059342
group_hit_rate_at_100    0.260430
group_hit_rate_at_10     0.076963
dtype: float64

In [16]:
#predict and evaluat for method Random
user_splits_df_rand, user_splits_unique_df_rand = predict_random_outfit(user_splits, user_splits_unique,availability_based=False)
print(f"Baseline evaluation for method: Random")
user_splits_df_rand, all_dict_rand = evaluate_df_hit_rate_at_n(user_splits_df_rand, n=10)
user_splits_unique_df_rand, ind_dict_rand = evaluate_df_hit_rate_at_n(user_splits_unique_df_rand, n=10)

Baseline evaluation for method: Random


id_hit_rate_at_100       0.030992
id_hit_rate_at_10        0.003099
group_hit_rate_at_100    0.068957
group_hit_rate_at_10     0.007748
dtype: float64

id_hit_rate_at_100       0.030578
id_hit_rate_at_10        0.002850
group_hit_rate_at_100    0.072299
group_hit_rate_at_10     0.009070
dtype: float64

In [17]:
#predict and evaluat for method Random with ,availability_based
user_splits_df_rand, user_splits_unique_df_rand = predict_random_outfit(user_splits, user_splits_unique,availability_based=True, use_testing_time=True)
print(f"Baseline evaluation for method: Random")
user_splits_df_rand, all_dict_rand = evaluate_df_hit_rate_at_n(user_splits_df_rand, n=10)
user_splits_unique_df_rand, ind_dict_rand = evaluate_df_hit_rate_at_n(user_splits_unique_df_rand, n=10)

Baseline evaluation for method: Random


id_hit_rate_at_100       0.032025
id_hit_rate_at_10        0.003357
group_hit_rate_at_100    0.069215
group_hit_rate_at_10     0.011364
dtype: float64

id_hit_rate_at_100       0.031614
id_hit_rate_at_10        0.003887
group_hit_rate_at_100    0.069966
group_hit_rate_at_10     0.010884
dtype: float64