# Baseline Methods  
This notebook implements four baseline recommendation methods: the most popular outfits, the customer's previous rentals, previous rentals padded with the most popular outfits, and random selection.

In [1]:
# Mount Google Drive so the project folder is reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')
# Root folder of the project on the mounted drive; later cells build data paths from it.
path='/content/drive/MyDrive/RecSys_206894495'
# Execute the evaluation notebook in this kernel's namespace.
# NOTE(review): presumably this is what defines evaluate_df_metrics_at_n used by the
# evaluation cells below — confirm against evaluate_models.ipynb.
%run /content/drive/MyDrive/RecSys_206894495/models/evaluate_models.ipynb

Mounted at /content/drive
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Install pyarrow, a parquet engine that pandas can use for the pd.read_parquet calls below.
# NOTE(review): the version is not pinned, so the installed release may differ between runs.
!pip install pyarrow

#import
import pandas as pd
import numpy as np



In [3]:
# Load the two parquet inputs from the project folder on the mounted drive.
# user_splits: the frame every baseline below reads "join_outfit_ids" / "join_group" from
# and writes its "id_prediction" / "group_prediction" columns onto.
user_splits=pd.read_parquet(path+'/models/user_splits.parquet')
# outfits: catalogue frame; the random baseline samples from its "id" and "group" columns.
outfits=pd.read_parquet(path+'/archive/data/outfits.parquet')

In [4]:
# Assorted functions for building simple heuristics for recommender systems

def get_most_popular_outfits(df, n=10):
    """Return the n most frequent outfit ids and the n most frequent groups in df.

    Args:
        df: DataFrame with "join_outfit_ids" and "join_group" columns whose
            cells are list-like collections.
        n: Maximum number of entries to return for each ranking.

    Returns:
        A tuple (outfit_ids, groups) of numpy arrays, each ordered from the
        most to the least frequent value and holding at most n entries.
    """
    def _top_values(column):
        # explode() flattens the per-row collections into one long Series;
        # value_counts() then orders the distinct values by descending frequency.
        frequency = df[column].explode().value_counts()
        return np.array(frequency.index[:n])

    top_outfit_ids = _top_values("join_outfit_ids")
    top_groups = _top_values("join_group")
    return top_outfit_ids, top_groups

def get_previous_rentals(df, n=10):
    """Predict each user's own rental history, truncated to at most n items.

    Histories with n or fewer entries are used whole; longer histories are
    cut down to their last n entries, matching the `x[-NUM_ITEMS:]` slicing
    used by predict_previous_rental.
    NOTE(review): this assumes each history is ordered oldest to newest, so
    the tail holds the most recent rentals — confirm against the data.

    Args:
        df: DataFrame with "join_outfit_ids" and "join_group" columns whose
            cells are sliceable sequences (lists or arrays).
        n: Maximum number of items to keep per user.

    Returns:
        The same DataFrame object, with "id_prediction" and
        "group_prediction" columns added or overwritten in place.
    """
    def _last_n(items):
        # Clamping the start index at 0 returns short histories whole and
        # makes n=0 yield an empty prediction (a bare items[-0:] would return
        # everything).
        return items[max(len(items) - n, 0):]

    df["id_prediction"] = df["join_outfit_ids"].apply(_last_n)
    df["group_prediction"] = df["join_group"].apply(_last_n)

    return df

def pad_with_most_popular(x, pop_outfits, n=10):
    """Bring x up to n items by appending popular items, or cut it to n.

    Args:
        x: Sequence of items already selected for a user.
        pop_outfits: Sequence of fallback items, taken from the front.
        n: Target number of items.

    Returns:
        When x has fewer than n items: a numpy array of x followed by the
        first (n - len(x)) entries of pop_outfits. Otherwise: the first n
        items of x.
    """
    # NOTE(review): when x is long enough this keeps the FIRST n items, while
    # the nested helper in predict_rental_and_most_popular keeps the last ones.
    # Padding also does not skip popular items already present in x.
    shortfall = n - len(x)
    if shortfall <= 0:
        return x[:n]
    return np.append(x, pop_outfits[:shortfall])

def get_previous_rentals_pad_most_popular(df, n=10):
    """Predict each user's history, padded with popular items up to n entries.

    The popularity rankings are computed from df itself via
    get_most_popular_outfits, and each row is filled with
    pad_with_most_popular.

    Args:
        df: DataFrame with "join_outfit_ids" and "join_group" columns.
        n: Target number of items per prediction.

    Returns:
        The same DataFrame object, with "id_prediction" and
        "group_prediction" columns added or overwritten in place.
    """
    popular_ids, popular_groups = get_most_popular_outfits(df, n)

    def _padded_ids(row):
        return pad_with_most_popular(row["join_outfit_ids"], popular_ids, n)

    def _padded_groups(row):
        return pad_with_most_popular(row["join_group"], popular_groups, n)

    # Row-wise apply: each row contributes one padded sequence per column.
    df["id_prediction"] = df.apply(_padded_ids, axis=1)
    df["group_prediction"] = df.apply(_padded_groups, axis=1)
    return df

In [5]:
# The maximum number of items to recommend per user. The predict_* functions below
# read this module-level constant directly rather than taking it as a parameter.
# Note that the evaluation cells further down call evaluate_df_metrics_at_n with n=10.
NUM_ITEMS = 100

# The below code represents the four baseline methods discussed.
# Most popular outfits prediction
def predict_most_popular(user_splits_df_pop):
    """Recommend the same globally most popular items to every user.

    The rankings come from get_most_popular_outfits applied to the given
    frame, limited to NUM_ITEMS entries.

    Args:
        user_splits_df_pop: DataFrame with "join_outfit_ids" and
            "join_group" columns.

    Returns:
        The same DataFrame object, with "id_prediction" and
        "group_prediction" columns added or overwritten in place.
    """
    row_count = len(user_splits_df_pop)
    top_ids, top_groups = get_most_popular_outfits(user_splits_df_pop, NUM_ITEMS)

    # Every row refers to the very same array object; nothing is copied per user.
    user_splits_df_pop["id_prediction"] = [top_ids for _ in range(row_count)]
    user_splits_df_pop["group_prediction"] = [top_groups for _ in range(row_count)]

    return user_splits_df_pop

# Previous rental prediction
def predict_previous_rental(user_splits_df_rep):
    """Recommend each user's own history, limited to its last NUM_ITEMS entries.

    Args:
        user_splits_df_rep: DataFrame with "join_outfit_ids" and
            "join_group" columns whose cells are sliceable sequences.

    Returns:
        The same DataFrame object, with "id_prediction" and
        "group_prediction" columns added or overwritten in place.
    """
    def _tail(history):
        # A negative start index keeps the trailing NUM_ITEMS entries.
        return history[-NUM_ITEMS:]

    column_pairs = (
        ("join_outfit_ids", "id_prediction"),
        ("join_group", "group_prediction"),
    )
    for source_column, target_column in column_pairs:
        user_splits_df_rep[target_column] = user_splits_df_rep[source_column].apply(_tail)

    return user_splits_df_rep

# Previous rental + most popular outfits prediction
def predict_rental_and_most_popular(user_splits_df):
    """Recommend each user's history, topped up with popular items to NUM_ITEMS.

    Histories shorter than NUM_ITEMS get the leading entries of the
    popularity ranking appended; longer ones are cut to their last NUM_ITEMS
    entries. The rankings come from get_most_popular_outfits applied to the
    given frame.

    Args:
        user_splits_df: DataFrame with "join_outfit_ids" and "join_group"
            columns.

    Returns:
        The same DataFrame object, with "id_prediction" and
        "group_prediction" columns added or overwritten in place.
    """
    top_ids, top_groups = get_most_popular_outfits(user_splits_df, NUM_ITEMS)

    def _fill_to_limit(history, popular):
        shortfall = NUM_ITEMS - len(history)
        if shortfall > 0:
            # Popular items are appended as-is; entries already present in
            # the history are not filtered out.
            return np.append(history, popular[:shortfall])
        return history[-NUM_ITEMS:]

    user_splits_df["id_prediction"] = user_splits_df.apply(
        lambda row: _fill_to_limit(row["join_outfit_ids"], top_ids), axis=1
    )
    user_splits_df["group_prediction"] = user_splits_df.apply(
        lambda row: _fill_to_limit(row["join_group"], top_groups), axis=1
    )

    return user_splits_df

# Random prediction
def predict_random_outfit(user_splits_df_rand):
    """Recommend NUM_ITEMS randomly drawn outfit ids and groups to every user.

    Candidates are read from the module-level `outfits` frame ("id" and
    "group" columns) and drawn without replacement via np.random.choice.

    Args:
        user_splits_df_rand: DataFrame to attach the predictions to.

    Returns:
        The same DataFrame object, with "id_prediction" and
        "group_prediction" columns added or overwritten in place.
    """
    candidate_ids = outfits["id"].values
    candidate_groups = outfits["group"].values

    def _draw(pool):
        return np.random.choice(pool, NUM_ITEMS, replace=False)

    # NOTE(review): ids and groups are drawn in two independent passes, so a
    # row's group_prediction entries do not correspond to its id_prediction
    # entries. No seed is set in the visible cells, so results vary per run.
    user_splits_df_rand["id_prediction"] = user_splits_df_rand.apply(
        lambda _row: _draw(candidate_ids), axis=1
    )
    user_splits_df_rand["group_prediction"] = user_splits_df_rand.apply(
        lambda _row: _draw(candidate_groups), axis=1
    )

    return user_splits_df_rand

In [6]:
# Predict and evaluate the "Most popular" baseline.
# predict_most_popular writes the prediction columns onto user_splits itself and
# returns that same frame, so user_splits is modified by this call.
user_splits_df_pop = predict_most_popular(user_splits)
print(f"Baseline evaluation for method:  Most popular")
# NOTE(review): evaluate_df_metrics_at_n is not defined in the visible cells —
# presumably it comes from the %run of evaluate_models.ipynb; confirm.
user_splits_df_pop, all_dict_pop = evaluate_df_metrics_at_n(user_splits_df_pop,'Most popular', n=10)

Baseline evaluation for method:  Most popular


Unnamed: 0,0
id_hit_rate_at_10,0.017445
id_precision_at_10,0.001847
id_recall_at_10,0.009505
id_f1_score_at_10,0.002638
group_hit_rate_at_10,0.029502
group_precision_at_10,0.003207
group_recall_at_10,0.007545
group_f1_score_at_10,0.003467
method_name,Most popular


In [7]:
# Predict and evaluate the "Previous rental" baseline.
# predict_previous_rental writes onto user_splits in place, overwriting any
# "id_prediction" / "group_prediction" columns left there by an earlier baseline.
user_splits_df_rep = predict_previous_rental(user_splits)
print(f"Baseline evaluation for method: Previous rental")
# NOTE(review): evaluate_df_metrics_at_n is not defined in the visible cells —
# presumably it comes from the %run of evaluate_models.ipynb; confirm.
user_splits_df_rep, all_dict_rep = evaluate_df_metrics_at_n(user_splits_df_rep,'Previous rental', n=10)

Baseline evaluation for method: Previous rental


Unnamed: 0,0
id_hit_rate_at_10,0.042842
id_precision_at_10,0.004541
id_recall_at_10,0.019143
id_f1_score_at_10,0.006087
group_hit_rate_at_10,0.056439
group_precision_at_10,0.006208
group_recall_at_10,0.02369
group_f1_score_at_10,0.007998
method_name,Previous rental


In [8]:
# Predict and evaluate the "Previous rental + Most popular" baseline.
# predict_rental_and_most_popular writes onto user_splits in place, overwriting any
# "id_prediction" / "group_prediction" columns left there by an earlier baseline.
user_splits_df_rep_pop= predict_rental_and_most_popular(user_splits)
print(f"Baseline evaluation for method: Previous rental + Most popular")
# NOTE(review): evaluate_df_metrics_at_n is not defined in the visible cells —
# presumably it comes from the %run of evaluate_models.ipynb; confirm.
user_splits_df_rep_pop, all_dict_rep_pop = evaluate_df_metrics_at_n(user_splits_df_rep_pop,'Previous rental + Most popular', n=10)


Baseline evaluation for method: Previous rental + Most popular


Unnamed: 0,0
id_hit_rate_at_10,0.048743
id_precision_at_10,0.005131
id_recall_at_10,0.024915
id_f1_score_at_10,0.007156
group_hit_rate_at_10,0.058492
group_precision_at_10,0.006414
group_recall_at_10,0.025614
group_f1_score_at_10,0.008368
method_name,Previous rental + Most popular


In [9]:
# Predict and evaluate the "Random" baseline.
# predict_random_outfit writes onto user_splits in place, overwriting any
# "id_prediction" / "group_prediction" columns left there by an earlier baseline.
# The draws use np.random without a seed set in the visible cells, so the
# metrics below will differ between runs.
user_splits_df_rand = predict_random_outfit(user_splits)
print(f"Baseline evaluation for method: Random")
# NOTE(review): evaluate_df_metrics_at_n is not defined in the visible cells —
# presumably it comes from the %run of evaluate_models.ipynb; confirm.
user_splits_df_rand, all_dict_rand = evaluate_df_metrics_at_n(user_splits_df_rand,'Random', n=10)


Baseline evaluation for method: Random


Unnamed: 0,0
id_hit_rate_at_10,0.002822
id_precision_at_10,0.000282
id_recall_at_10,0.000945
id_f1_score_at_10,0.000309
group_hit_rate_at_10,0.00667
group_precision_at_10,0.000718
group_recall_at_10,0.001201
group_f1_score_at_10,0.000671
method_name,Random
