# Order by Previous Rentals  
In this notebook, I modify the *predict_previous_rental* and *predict_rental_and_most_popular* methods so that each customer's predictions are sorted in descending order of the number of times that customer has previously rented the outfit (or outfit group).

In [1]:
# Mount Google Drive so the project folder is reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')
# Root folder of the project on Drive; later cells build their data paths from it.
path='/content/drive/MyDrive/RecSys_206894495'
# Execute the shared evaluation notebook inside this kernel. A later cell calls
# evaluate_df_metrics_at_n, which is presumably defined there -- TODO confirm.
%run /content/drive/MyDrive/RecSys_206894495/models/evaluate_models.ipynb

Mounted at /content/drive
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Install pyarrow (pd.read_parquet in the next cell needs a parquet engine such as pyarrow).
!pip install pyarrow

# Imports
import pandas as pd
import numpy as np
from time import time  # NOTE(review): not used in the cells visible here



In [3]:
# Read the per-customer split table and the outfits table from the project folder.
user_splits = pd.read_parquet(f"{path}/models/user_splits.parquet")
outfits = pd.read_parquet(f"{path}/archive/data/outfits.parquet")

In [4]:
# Build a long-format (customer_id, outfit_id, count) table for one split (e.g. 'join' or 'test')
def create_new_df(df, set_type):
    """Count how often each outfit id occurs in every row's id list for one split.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``customer_id`` and ``f'{set_type}_outfit_ids'``; the latter
        holds one list-like of outfit ids per row.
    set_type : str
        Column prefix selecting the split, e.g. ``'join'`` or ``'test'``.

    Returns
    -------
    pandas.DataFrame
        One row per (input row, distinct outfit id) with the columns
        ``customer_id``, ``outfit_id`` and ``count``. Within an input row the
        outfit ids appear in the sorted order produced by ``np.unique``.
        The three columns are always present, even when there is nothing to
        count, so callers can still reference them.
    """
    columns = ['customer_id', 'outfit_id', 'count']
    id_column = f'{set_type}_outfit_ids'

    records = []
    # Iterate the two needed columns directly instead of materialising a
    # Series per row with iterrows().
    for customer_id, outfit_ids in zip(df['customer_id'], df[id_column]):
        unique_ids, counts = np.unique(outfit_ids, return_counts=True)
        records.extend(
            {'customer_id': customer_id, 'outfit_id': outfit_id, 'count': count}
            for outfit_id, count in zip(unique_ids, counts)
        )

    # Passing `columns` keeps the schema stable when `records` is empty;
    # pd.DataFrame([]) alone would have no columns at all.
    return pd.DataFrame(records, columns=columns)

# Per-customer outfit counts for the 'join' split (kept as train_df) and the 'test' split
train_df, test_df = (
    create_new_df(user_splits, split_name) for split_name in ('join', 'test')
)

In [5]:
# Build a long-format (customer_id, group, count) table for one split (e.g. 'join' or 'test')
def create_new_df_group(df, set_type):
    """Count how often each group occurs in every row's group list for one split.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``customer_id`` and ``f'{set_type}_group'``; the latter
        holds one list-like of group values per row.
    set_type : str
        Column prefix selecting the split, e.g. ``'join'`` or ``'test'``.

    Returns
    -------
    pandas.DataFrame
        One row per (input row, distinct group) with the columns
        ``customer_id``, ``group`` and ``count``. Within an input row the
        groups appear in the sorted order produced by ``np.unique``.
        The three columns are always present, even when there is nothing to
        count, so callers can still reference them.
    """
    columns = ['customer_id', 'group', 'count']
    group_column = f'{set_type}_group'

    records = []
    # Iterate the two needed columns directly instead of materialising a
    # Series per row with iterrows().
    for customer_id, groups in zip(df['customer_id'], df[group_column]):
        unique_groups, counts = np.unique(groups, return_counts=True)
        records.extend(
            {'customer_id': customer_id, 'group': group, 'count': count}
            for group, count in zip(unique_groups, counts)
        )

    # Passing `columns` keeps the schema stable when `records` is empty;
    # pd.DataFrame([]) alone would have no columns at all.
    return pd.DataFrame(records, columns=columns)

# Per-customer group counts for the 'join' split (kept as train_df_group) and the 'test' split
train_df_group, test_df_group = (
    create_new_df_group(user_splits, split_name) for split_name in ('join', 'test')
)

In [6]:
# Assorted functions for building simple heuristics for recommender systems

def get_most_popular_outfits(df, n=10):
    """Return the ``n`` most frequent outfit ids and groups over all 'join' histories.

    Both results are numpy arrays ordered from most to least frequent, taken
    from the ``join_outfit_ids`` and ``join_group`` columns respectively.
    """
    def top_n(column):
        # Flatten the per-row lists, count occurrences, keep the n most common labels.
        frequencies = df[column].explode().value_counts()
        return np.array(frequencies.index[:n])

    return top_n("join_outfit_ids"), top_n("join_group")

def get_previous_rentals(df, n=10):
    """Predict each row's last ``n`` previously rented outfits and groups.

    Writes two columns onto ``df`` in place and returns the same frame:
    ``id_prediction`` (from ``join_outfit_ids``) and ``group_prediction``
    (from ``join_group``). Histories with at most ``n`` entries are used
    unchanged; longer ones are cut down to their final ``n`` entries.

    The previous implementation sliced with ``x[n:]``, which dropped the first
    ``n`` entries instead of keeping ``n`` of them (a history of exactly ``n``
    items produced an empty prediction). Keeping the tail matches the
    ``x[-NUM_ITEMS:]`` slicing used by ``predict_previous_rental``.
    """
    def last_n(history):
        if len(history) <= n:
            return history
        # Explicit start index: history[-n:] would return everything when n == 0.
        return history[len(history) - n:]

    df["id_prediction"] = df["join_outfit_ids"].apply(last_n)
    df["group_prediction"] = df["join_group"].apply(last_n)

    return df

def pad_with_most_popular(x, pop_outfits, n=10):
    """Return ``x`` topped up to ``n`` items from ``pop_outfits``, or its first ``n`` items.

    When ``x`` is shorter than ``n`` the leading entries of ``pop_outfits`` are
    appended (result is a numpy array); otherwise ``x[:n]`` is returned.
    """
    shortfall = n - len(x)
    if shortfall <= 0:
        return x[:n]
    return np.append(x, pop_outfits[:shortfall])

def get_previous_rentals_pad_most_popular(df, n=10):
    """Fill ``id_prediction`` / ``group_prediction`` with padded rental histories.

    Each row's ``join_outfit_ids`` and ``join_group`` are passed through
    ``pad_with_most_popular`` together with the ``n`` most popular ids / groups
    from ``get_most_popular_outfits``. ``df`` is modified in place and returned.
    """
    top_ids, top_groups = get_most_popular_outfits(df, n)

    targets = (
        ("join_outfit_ids", "id_prediction", top_ids),
        ("join_group", "group_prediction", top_groups),
    )
    for history_col, prediction_col, fillers in targets:
        # The lambda is consumed inside this iteration, so the loop variables are bound correctly.
        df[prediction_col] = df.apply(
            lambda row: pad_with_most_popular(row[history_col], fillers, n),
            axis=1,
        )
    return df

In [7]:
# Shared implementation for the two public ordering functions below
def _order_predictions_by_count(user_splits_df, pair_counts, prediction_col, key_col):
    """Reorder every customer's ``prediction_col`` list by descending pair count.

    Parameters
    ----------
    user_splits_df : pandas.DataFrame
        Must contain ``customer_id`` and ``prediction_col`` (one list-like per row).
    pair_counts : pandas.DataFrame
        Must contain ``customer_id``, ``key_col`` and ``count``.
    prediction_col : str
        Name of the list column to reorder.
    key_col : str
        Column of ``pair_counts`` that is matched against the predicted items.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows as ``user_splits_df``; ``prediction_col``
        is moved to the last column and holds plain Python lists. Items without
        a (customer, item) entry in ``pair_counts`` are treated as count 0.
        Items with equal counts keep their original relative order. Rows whose
        prediction list is empty get ``[]``.

    NOTE(review): repeated items inside a prediction list are kept as-is (each
    repeat receives the same count), so an item rented several times can occupy
    several consecutive slots -- confirm this is intended.
    """
    # Only the two columns that matter are exploded; exploding the full frame
    # duplicates every other column per item and breaks if the frame already
    # has a column named 'count'.
    exploded = (
        user_splits_df[['customer_id', prediction_col]]
        .explode(prediction_col)
        # explode() turns an empty list into a single NaN row; dropping it
        # prevents an empty prediction from coming back as [nan].
        .dropna(subset=[prediction_col])
    )

    merged = exploded.merge(
        pair_counts[['customer_id', key_col, 'count']],
        left_on=['customer_id', prediction_col],
        right_on=['customer_id', key_col],
        how='left',
    ).fillna({'count': 0})

    # A stable sort on count alone, followed by a groupby (which preserves row
    # order within each group), gives "count descending, ties in original order".
    merged = merged.sort_values(by='count', ascending=False, kind='mergesort')
    ordered_df = (
        merged.groupby('customer_id', sort=False)[prediction_col]
        .apply(list)
        .reset_index()
    )

    # Merge back to the original dataframe
    result = user_splits_df.drop(columns=[prediction_col]).merge(
        ordered_df, on='customer_id', how='left'
    )
    # Customers that had no predictions are absent from ordered_df and come
    # back as NaN from the left merge; represent them as empty lists instead.
    result[prediction_col] = result[prediction_col].apply(
        lambda value: value if isinstance(value, list) else []
    )
    return result


# Function to order id_prediction based on counts in pair_counts_outfits
def order_id_prediction_outfit(user_splits_df, pair_counts_outfits):
    """Sort each customer's ``id_prediction`` by that customer's per-outfit ``count``.

    ``pair_counts_outfits`` needs the columns ``customer_id``, ``outfit_id`` and
    ``count``. Returns a new frame; see ``_order_predictions_by_count``.
    """
    return _order_predictions_by_count(
        user_splits_df, pair_counts_outfits, 'id_prediction', 'outfit_id'
    )


# Function to order group_prediction based on counts in pair_counts_groups
def order_id_prediction_group(user_splits_df, pair_counts_groups):
    """Sort each customer's ``group_prediction`` by that customer's per-group ``count``.

    ``pair_counts_groups`` needs the columns ``customer_id``, ``group`` and
    ``count``. Returns a new frame; see ``_order_predictions_by_count``.
    """
    return _order_predictions_by_count(
        user_splits_df, pair_counts_groups, 'group_prediction', 'group'
    )

In [8]:
# The maximum number of items to recommend: the predict_* functions in this cell
# cut rental histories and the popularity ranking down to NUM_ITEMS entries.
NUM_ITEMS = 100

# Most popular outfits prediction
def predict_most_popular(user_splits_df_pop):
    """Assign the same NUM_ITEMS most popular outfit ids / groups to every row.

    The frame is modified in place (``id_prediction`` and ``group_prediction``
    are written) and returned. Every row references the same array object.
    """
    top_ids, top_groups = get_most_popular_outfits(user_splits_df_pop, NUM_ITEMS)

    for column, ranking in (("id_prediction", top_ids), ("group_prediction", top_groups)):
        user_splits_df_pop[column] = [ranking] * len(user_splits_df_pop)

    return user_splits_df_pop

# Previous rental prediction
def predict_previous_rental(user_splits_df_rep, pair_counts_outfits,pair_counts_groups):
    """Predict the last NUM_ITEMS entries of each history, ordered by pair counts.

    ``id_prediction`` / ``group_prediction`` are first written onto the input
    frame in place (tails of ``join_outfit_ids`` / ``join_group``); the frame
    returned is the result of the two ordering steps.
    """
    def tail(history):
        return history[-NUM_ITEMS:]

    user_splits_df_rep["id_prediction"] = user_splits_df_rep["join_outfit_ids"].apply(tail)
    user_splits_df_rep["group_prediction"] = user_splits_df_rep["join_group"].apply(tail)

    # Reorder both prediction columns: outfits first, then groups
    ordered = order_id_prediction_outfit(user_splits_df_rep, pair_counts_outfits)
    ordered = order_id_prediction_group(ordered, pair_counts_groups)
    return ordered

# Previous rental + most popular outfits prediction
def predict_rental_and_most_popular(user_splits_df, pair_counts_outfits,pair_counts_groups):
    """Predict rental history padded with popular items, ordered by pair counts.

    Histories shorter than NUM_ITEMS are extended with the most popular ids /
    groups; longer ones are cut to their last NUM_ITEMS entries. The two
    prediction columns are written onto the input frame in place, and the frame
    returned is the result of the two ordering steps.
    """
    def tail_or_pad(history, fillers):
        # Local helper, named differently from the module-level
        # pad_with_most_popular because it keeps the *last* NUM_ITEMS entries.
        shortfall = NUM_ITEMS - len(history)
        if shortfall > 0:
            return np.append(history, fillers[:shortfall])
        return history[-NUM_ITEMS:]

    top_ids, top_groups = get_most_popular_outfits(user_splits_df, NUM_ITEMS)

    user_splits_df["id_prediction"] = user_splits_df.apply(
        lambda row: tail_or_pad(row["join_outfit_ids"], top_ids), axis=1
    )
    user_splits_df["group_prediction"] = user_splits_df.apply(
        lambda row: tail_or_pad(row["join_group"], top_groups), axis=1
    )

    # Reorder both prediction columns: outfits first, then groups
    ordered = order_id_prediction_outfit(user_splits_df, pair_counts_outfits)
    return order_id_prediction_group(ordered, pair_counts_groups)


In [9]:
# Predict with the "Previous rental" method, then evaluate with n=10
user_splits_df_rep = predict_previous_rental(
    user_splits,
    pair_counts_outfits=train_df,
    pair_counts_groups=train_df_group,
)
print("Baseline evaluation for method: Previous rental")
user_splits_df_rep, all_dict_rep = evaluate_df_metrics_at_n(
    user_splits_df_rep, 'Previous rental with order', n=10
)

Baseline evaluation for method: Previous rental


Unnamed: 0,0
id_hit_rate_at_10,0.044638
id_precision_at_10,0.004746
id_recall_at_10,0.019124
id_f1_score_at_10,0.00613
group_hit_rate_at_10,0.061827
group_precision_at_10,0.006978
group_recall_at_10,0.023833
group_f1_score_at_10,0.008354
method_name,Previous rental with order


In [10]:
# Predict with the "Previous rental + Most popular" method, then evaluate with n=10
user_splits_df_rep_pop = predict_rental_and_most_popular(
    user_splits,
    pair_counts_outfits=train_df,
    pair_counts_groups=train_df_group,
)
print("Baseline evaluation for method: Previous rental + Most popular")
user_splits_df_rep_pop, all_dict_rep_pop = evaluate_df_metrics_at_n(
    user_splits_df_rep_pop, 'Previous rental + Most popular with order', n=10
)


Baseline evaluation for method: Previous rental + Most popular


Unnamed: 0,0
id_hit_rate_at_10,0.050026
id_precision_at_10,0.005285
id_recall_at_10,0.024511
id_f1_score_at_10,0.007109
group_hit_rate_at_10,0.063109
group_precision_at_10,0.007081
group_recall_at_10,0.025103
group_f1_score_at_10,0.00857
method_name,Previous rental + Most popular with order
