# Train-Test Split
This notebook divides each customer's orders into training, validation, and test sets in chronological order.

In [1]:
# Mount Google Drive so that the project folder is reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')
# Root folder of the project on Drive; the data and output paths used further down are built from it.
path='/content/drive/MyDrive/RecSys_206894495'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
#import
import numpy as np
import pandas as pd
import math

In [3]:

def leave_one_out_split(outfit_ids, groups, derived_booking_times_start, derived_booking_times_end, customer_id):
    """
    Splits one customer's bookings into training and test sets by holding out the latest booking.

    Bookings are ordered by their start time. The sort is stable, so bookings that share the
    same start time keep their input order and the split is reproducible.

    Parameters:
    outfit_ids (list): List of outfit IDs.
    groups (list): List of group IDs corresponding to the outfits.
    derived_booking_times_start (list): List of booking start times (used for ordering).
    derived_booking_times_end (list): List of booking end times.
    customer_id: Identifier of the customer; passed through unchanged.

    Returns:
    tuple: (customer_id,
            train_outfit_ids, test_outfit_ids,
            train_groups, test_groups,
            train_booking_times_start, test_booking_times_start,
            train_booking_times_end, test_booking_times_end).
           Everything except customer_id is a NumPy array; each test array holds exactly one entry.

    Raises:
    ValueError: If fewer than two bookings are given, since one training and one test entry are required.
    """
    outfit_ids = np.asarray(outfit_ids)
    groups = np.asarray(groups)
    derived_booking_times_start = np.asarray(derived_booking_times_start)
    derived_booking_times_end = np.asarray(derived_booking_times_end)

    # Two bookings are enough for a 1/1 split; this matches the upstream filter that keeps
    # customers with more than one order.
    if len(outfit_ids) < 2:
        raise ValueError("Not enough data to create training and test sets.")

    # Stable sort: ties on the start time must not be reordered arbitrarily.
    chronological = np.argsort(derived_booking_times_start, kind="stable")
    train_indices, test_indices = chronological[:-1], chronological[-1:]

    return (
        customer_id, outfit_ids[train_indices], outfit_ids[test_indices],
        groups[train_indices], groups[test_indices],
        derived_booking_times_start[train_indices], derived_booking_times_start[test_indices],
        derived_booking_times_end[train_indices], derived_booking_times_end[test_indices]
    )


def leave_percentage_out_split(outfit_ids, groups, derived_booking_times_start, derived_booking_times_end, customer_id, test_percentage=0.2, val_percentage=0.1):
    """
    Splits one customer's bookings into training, validation, and test sets by holding out the
    latest bookings.

    Bookings are ordered by their start time (stable sort, so bookings sharing a start time keep
    their input order). The latest bookings form the test set, the ones right before them the
    validation set, and everything earlier the training set.

    Parameters:
    outfit_ids (list): List of outfit IDs.
    groups (list): List of group IDs corresponding to the outfits.
    derived_booking_times_start (list): List of booking start times (used for ordering).
    derived_booking_times_end (list): List of booking end times.
    customer_id: Identifier of the customer; passed through unchanged.
    test_percentage (float): Fraction of the bookings held out for the test set (rounded down, at least one booking).
    val_percentage (float): Fraction of the bookings held out for the validation set (rounded down,
        at least one booking, or none when the customer has two bookings or fewer).

    Returns:
    tuple: (customer_id,
            train/val/test outfit IDs,
            train/val/test groups,
            train/val/test booking start times,
            train/val/test booking end times) - 13 elements in total.
           Everything except customer_id is a NumPy array.
    """
    outfit_ids = np.asarray(outfit_ids)
    groups = np.asarray(groups)
    derived_booking_times_start = np.asarray(derived_booking_times_start)
    derived_booking_times_end = np.asarray(derived_booking_times_end)

    num_entries = len(outfit_ids)
    num_to_leave_test = max(math.floor(num_entries * test_percentage), 1)
    num_to_leave_val = max(math.floor(num_entries * val_percentage), 1)
    if num_entries <= 2:
        # Not enough bookings to afford a validation set as well.
        num_to_leave_val = 0

    # Stable sort: ties on the start time must not be reordered arbitrarily.
    chronological = np.argsort(derived_booking_times_start, kind="stable")

    # Explicit, clamped boundaries instead of negative slices: [train | val | test].
    test_begin = max(num_entries - num_to_leave_test, 0)
    val_begin = max(test_begin - num_to_leave_val, 0)
    train_indices = chronological[:val_begin]
    val_indices = chronological[val_begin:test_begin]
    test_indices = chronological[test_begin:]

    return (
        customer_id, outfit_ids[train_indices], outfit_ids[val_indices], outfit_ids[test_indices],
        groups[train_indices], groups[val_indices], groups[test_indices],
        derived_booking_times_start[train_indices], derived_booking_times_start[val_indices], derived_booking_times_start[test_indices],
        derived_booking_times_end[train_indices], derived_booking_times_end[val_indices], derived_booking_times_end[test_indices]
    )



def convert_user_orders_to_train_test_splits(user_orders_df, date_column_start="rentalPeriod.start", date_column_end="rentalPeriod.end", percentage_test=None, percentage_val=None):
    """
    Converts user orders DataFrame to training, validation, and test splits.

    Each row of user_orders_df (one customer, with list-valued columns) is split chronologically:
    - percentage_test given: percentage-based split via leave_percentage_out_split; when
      percentage_val is omitted a validation share of 0.1 is used.
    - percentage_test omitted: leave-one-out split via leave_one_out_split. That split has no
      validation part, so the validation columns hold empty arrays. percentage_val is ignored
      in this mode.

    Parameters:
    user_orders_df (DataFrame): DataFrame containing user orders, with the columns "customer.id",
        "outfit.id", "group" and the two date columns.
    date_column_start (str): Column name for booking start times.
    date_column_end (str): Column name for booking end times.
    percentage_test (float, optional): Percentage of data to leave out for the test set.
    percentage_val (float, optional): Percentage of data to leave out for the validation set.

    Returns:
    DataFrame: One row per customer with the train/val/test arrays for outfit IDs, groups and
        booking start/end times, plus "join_outfit_ids" and "join_group", which hold the
        training and validation entries concatenated.
    """
    split_columns = ["customer_id","train_outfit_ids", "val_outfit_ids", "test_outfit_ids", "train_group", "val_group", "test_group", "train_booking_times_start", "val_booking_times_start", "test_booking_times_start", "train_booking_times_end", "val_booking_times_end", "test_booking_times_end"]

    def split_customer(outfit_ids, groups, starts, ends, customer_id):
        """Returns the 13-element split tuple (in split_columns order) for one customer."""
        if percentage_test is not None:
            val_share = 0.1 if percentage_val is None else percentage_val
            return leave_percentage_out_split(outfit_ids, groups, starts, ends, customer_id, test_percentage=percentage_test, val_percentage=val_share)

        # leave_one_out_split returns (customer_id, train, test, train, test, ...); insert an empty
        # validation array between every train/test pair so the result matches split_columns.
        customer, *train_test_parts = leave_one_out_split(outfit_ids, groups, starts, ends, customer_id)
        padded = [customer]
        for train_part, test_part in zip(train_test_parts[0::2], train_test_parts[1::2]):
            padded.extend((train_part, train_part[:0], test_part))
        return tuple(padded)

    rows = []
    per_customer_fields = zip(
        user_orders_df["outfit.id"], user_orders_df["group"],
        user_orders_df[date_column_start], user_orders_df[date_column_end],
        user_orders_df["customer.id"],
    )
    for fields in per_customer_fields:
        split = tuple(split_customer(*fields))
        # Joined train + validation entries (indices follow split_columns: 1/2 outfits, 4/5 groups).
        join_outfit_ids = np.concatenate((split[1], split[2]), axis=None)
        join_group = np.concatenate((split[4], split[5]), axis=None)
        rows.append(split + (join_outfit_ids, join_group))

    # Building the rows explicitly (instead of DataFrame.apply) also yields a well-formed,
    # empty result for an empty input frame.
    return pd.DataFrame(rows, columns=split_columns + ["join_outfit_ids", "join_group"])

# Some outfits appear in the triplet data as rented twice by the same customer within a short time interval.
# Often these are mistaken entries; other times the outfit was rented for two consecutive months.
# In both cases the repeated entries are removed.
def remove_consecutive_duplicates(user_triplets_df, date_column="rentalPeriod.start", min_days_between=30):
    """
    Removes repeated rentals of the same outfit by the same customer that follow each other too closely.

    For every (customer.id, outfit.id) pair the rentals are ordered by date_column. A rental is
    dropped when it starts fewer than min_days_between days after the most recent rental of that
    pair that was kept.

    Parameters:
    user_triplets_df (DataFrame): DataFrame containing user triplets, with the columns "customer.id",
        "outfit.id" and date_column. The input is not modified. Rows are dropped by index label,
        so the index is expected to have unique labels.
    date_column (str): Column name for booking times; converted with pd.to_datetime in the result.
    min_days_between (int): Minimum number of whole days between two kept rentals of the same outfit.

    Returns:
    DataFrame: Copy of the input with date_column converted to datetime and the repeated rentals removed.
    """
    # Work on a copy so the date conversion does not leak into the caller's DataFrame.
    user_triplets_df = user_triplets_df.copy()
    user_triplets_df[date_column] = pd.to_datetime(user_triplets_df[date_column])

    # Only (customer, outfit) pairs that occur more than once can contain repeated rentals.
    pair_columns = ["customer.id", "outfit.id"]
    is_repeated = user_triplets_df.duplicated(subset=pair_columns, keep=False)
    repeated_rentals = user_triplets_df.loc[is_repeated]

    drop_indexes = []
    for _, booking_times in repeated_rentals.groupby(pair_columns, sort=False)[date_column]:
        # The comparison below is only meaningful in chronological order; do not rely on the input order.
        booking_times = booking_times.sort_values(kind="stable")
        # More than two repeats can occur, so always compare against the last rental that was kept.
        last_kept_time = None
        for row_label, booking_time in booking_times.items():
            if last_kept_time is not None and (booking_time - last_kept_time).days < min_days_between:
                drop_indexes.append(row_label)
            else:
                last_kept_time = booking_time

    print(len(drop_indexes))
    return user_triplets_df.drop(drop_indexes)

# Convert the triplet format (one transaction per row) into one row per user that holds all of the user's transactions
def translate_user_triplets_to_orders(user_triplets_df, outfits_df):
    """
    Aggregates the per-transaction triplets into one row per customer.

    A "group" column is added to user_triplets_df (in place) by looking up each outfit id in
    outfits_df. The outfit ids, groups and rental start/end values are then collected into
    lists per customer, and customers with a single order are discarded.

    Parameters:
    user_triplets_df (DataFrame): DataFrame containing user triplets.
    outfits_df (DataFrame): DataFrame containing outfit information ("id" and "group" columns).

    Returns:
    DataFrame: DataFrame with user orders (list-valued columns plus "num_orders").
    """
    # Lookup table: outfit id -> outfit group
    group_by_outfit_id = dict(zip(outfits_df["id"], outfits_df["group"]))
    user_triplets_df["group"] = user_triplets_df["outfit.id"].map(group_by_outfit_id)

    # Collect every per-transaction column into a list for each customer
    list_columns = ["outfit.id", "group", "rentalPeriod.start", "rentalPeriod.end"]
    orders_per_customer = (
        user_triplets_df
        .groupby("customer.id")
        .agg({column: list for column in list_columns})
        .reset_index()
    )
    orders_per_customer["num_orders"] = orders_per_customer["outfit.id"].apply(len)

    # Keep only customers that have more than one order
    has_several_orders = orders_per_customer["num_orders"] > 1
    return orders_per_customer[has_several_orders]

In [4]:
# Load the orders and outfits tables from the project folder
orders = pd.read_parquet(f"{path}/archive/data/orders.parquet", engine="pyarrow")
outfits = pd.read_parquet(f"{path}/archive/data/outfits.parquet", engine="pyarrow")

In [5]:
# Inner join on the outfit id: orders whose outfit does not appear in `outfits` are dropped
orders = pd.merge(orders, outfits["id"], how="inner", left_on="outfit.id", right_on="id")

In [6]:
# Prepare the data: parse the outfit columns, clean the orders and build the per-customer splits.

# Convert tag_categories and outfit_tags to lists.
# NOTE(review): eval executes whatever expression each stored string contains. If these columns only
# hold list literals, ast.literal_eval would be a safer choice - confirm the stored format before switching.
# eval only accepts strings, bytes or code objects, so re-running this cell on already converted values fails.
outfits["tag_categories"] = outfits["tag_categories"].apply(eval)
outfits["outfit_tags"] = outfits["outfit_tags"].apply(eval)

# Store the group identifiers as strings.
outfits['group']=outfits['group'].astype(str)

# Remove repeated rentals, then convert the triplets into one entry per individual user
# and drop entries that contain missing values.
orders = remove_consecutive_duplicates(orders)
user_orders_df = translate_user_triplets_to_orders(orders, outfits)
user_orders_df.dropna(inplace=True)

# Split each user's orders into train, validation and test sets (20% test, 10% validation).
user_splits = convert_user_orders_to_train_test_splits(user_orders_df, percentage_test=0.2, percentage_val=0.1)

3607


In [7]:
# Display the resulting splits (one row per customer).
user_splits

Unnamed: 0,customer_id,train_outfit_ids,val_outfit_ids,test_outfit_ids,train_group,val_group,test_group,train_booking_times_start,val_booking_times_start,test_booking_times_start,train_booking_times_end,val_booking_times_end,test_booking_times_end,join_outfit_ids,join_group
0,0,"[outfit.85f26909d8334ab78f30c2fc9c73faf7, outf...",[outfit.d4b6896b1ae74cdabebfdcf948fe64e2],[outfit.9f5058295098471abdfaf0a7c74ddbfe],"[group.c79c907b6c94a9bd2005e038943ab529, group...",[group.e5d92e41f348d9c05919685917e77de9],[group.f6f0b9ebb3228aab27a79ac658c76682],"[2023-11-22 00:00:00, 2023-11-24 00:00:00]",[2023-11-24 00:00:00],[2023-12-06 00:00:00],"[2023-12-21 00:00:00, 2023-12-23 00:00:00]",[2023-12-23 00:00:00],[2024-01-05 00:00:00],"[outfit.85f26909d8334ab78f30c2fc9c73faf7, outf...","[group.c79c907b6c94a9bd2005e038943ab529, group..."
1,3,"[outfit.d7bff1b799a34575a47ce0f531791c9f, outf...","[outfit.98fa1b5287182a9d, outfit.dd04098010f74...","[outfit.d7bff1b799a34575a47ce0f531791c9f, outf...","[group.287dba5268fb7b20e8ef81c053970691, group...","[group.a4449ee16d7951f425083623efd0dcec, group...","[group.287dba5268fb7b20e8ef81c053970691, group...","[2021-08-02 00:00:00, 2021-08-02 00:00:00, 202...","[2021-11-01 00:00:00, 2021-12-01 00:00:00]","[2021-12-01 00:00:00, 2021-12-01 00:00:00, 202...","[2021-08-27 00:00:00, 2021-08-27 00:00:00, 202...","[2021-11-30 00:00:00, 2021-12-31 00:00:00]","[2021-12-31 00:00:00, 2021-12-31 00:00:00, 202...","[outfit.d7bff1b799a34575a47ce0f531791c9f, outf...","[group.287dba5268fb7b20e8ef81c053970691, group..."
2,5,[outfit.9fde090f117fb9d9],[],[outfit.849ace7e1811150d],[group.27808d969027a4e243c8945176f280c0],[],[group.caafbed55494b0c93dab58d58d526f0a],[2018-09-06 00:00:00],[],[2018-09-06 00:00:00],[2018-09-09 00:00:00],[],[2018-09-09 00:00:00],[outfit.9fde090f117fb9d9],[group.27808d969027a4e243c8945176f280c0]
3,6,"[outfit.98eebea274f23dd6, outfit.648db79508724...","[outfit.c50c68dca68a4c8e857fb9330c6251f7, outf...","[outfit.53760b48aca64f54b952881a5c05294b, outf...","[group.a02de08741b879719c3ea97e24e5f230, group...","[group.cec301c724823ba9569a0226bb713228, group...","[group.b434d9587453feab911063bf66cae433, group...","[2021-08-25 00:00:00, 2021-08-25 00:00:00, 202...","[2022-02-28 00:00:00, 2022-03-29 00:00:00]","[2022-03-29 00:00:00, 2022-03-29 00:00:00, 202...","[2021-09-24 00:00:00, 2021-09-24 00:00:00, 202...","[2022-03-28 00:00:00, 2022-04-28 00:00:00]","[2022-04-28 00:00:00, 2022-04-28 00:00:00, 202...","[outfit.98eebea274f23dd6, outfit.648db79508724...","[group.a02de08741b879719c3ea97e24e5f230, group..."
4,7,[outfit.5e1b9778e36d475699772148e5d4e27b],[outfit.af3cb94171784fff803b50e9a99cd890],[outfit.7321c26a479e46cd9fb07fa3ab7d7594],[group.0a736bffd33390d7693442e6eecd0f35],[group.7dd15ae70f7b1df744862cc06aa33d4b],[group.cce63b3a8de0f3495c0744990e88b78f],[2019-11-20 00:00:00],[2019-11-20 00:00:00],[2019-11-20 00:00:00],[2019-11-21 00:00:00],[2019-11-21 00:00:00],[2019-11-21 00:00:00],"[outfit.5e1b9778e36d475699772148e5d4e27b, outf...","[group.0a736bffd33390d7693442e6eecd0f35, group..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3893,7409,"[outfit.efeca2ca71b54ba49ec487475dfae85c, outf...","[outfit.cbacb3a0c6f84276a388ba4ab7420f5e, outf...","[outfit.1a901d539bb14226b44722adfb562c3d, outf...","[group.2bd6562e31c164659847062516f75556, group...","[group.3e81d1c657e1c5e5978e490e65310589, group...","[group.8311167d22bcb8fbca6447bec8c75c99, group...","[2022-05-04 00:00:00, 2022-06-04 00:00:00, 202...","[2023-09-04 00:00:00, 2023-09-04 00:00:00, 202...","[2023-10-04 00:00:00, 2023-11-04 00:00:00, 202...","[2022-06-03 00:00:00, 2022-07-03 00:00:00, 202...","[2023-10-03 00:00:00, 2023-10-03 00:00:00, 202...","[2023-11-03 00:00:00, 2023-12-03 00:00:00, 202...","[outfit.efeca2ca71b54ba49ec487475dfae85c, outf...","[group.2bd6562e31c164659847062516f75556, group..."
3894,7411,"[outfit.501f59ab11f94aa1a28a566911f8a8be, outf...","[outfit.1c1ffa63507249c89f4f3575ec337bd6, outf...","[outfit.4e0d5e39a3124323bb5fb24df9e59d9c, outf...","[group.1ca0aabd88c25cd6b20ad80abd2f737c, group...","[group.399cf5787fbe0447733ead0d23c32bc5, group...","[group.e9df097d663ce1f79e83de4ec4258dc5, group...","[2020-06-17 00:00:00, 2020-06-17 00:00:00, 202...","[2023-07-19 00:00:00, 2023-07-19 00:00:00, 202...","[2023-10-19 00:00:00, 2023-10-19 00:00:00, 202...","[2020-07-20 00:00:00, 2020-07-20 00:00:00, 202...","[2023-08-18 00:00:00, 2023-08-18 00:00:00, 202...","[2023-11-18 00:00:00, 2023-11-18 00:00:00, 202...","[outfit.501f59ab11f94aa1a28a566911f8a8be, outf...","[group.1ca0aabd88c25cd6b20ad80abd2f737c, group..."
3895,7412,"[outfit.d8f20c3b2cae42b29b56f2b61cb6e715, outf...",[outfit.9a52065e9e7b4814bb5707173819674f],"[outfit.947da84896894ca7ad2663758be2297b, outf...","[group.cbc4dd5bb5185507dddc53b22513d182, group...",[group.e2ce75b28cfb15775399b0deca082670],"[group.47c89eceb6420f2c1a406a33beb8df53, group...","[2020-11-05 00:00:00, 2020-11-05 00:00:00, 202...",[2021-03-02 00:00:00],"[2021-03-02 00:00:00, 2021-03-02 00:00:00]","[2020-12-21 00:00:00, 2020-12-21 00:00:00, 202...",[2021-04-02 00:00:00],"[2021-04-02 00:00:00, 2021-04-02 00:00:00]","[outfit.d8f20c3b2cae42b29b56f2b61cb6e715, outf...","[group.cbc4dd5bb5185507dddc53b22513d182, group..."
3896,7413,"[outfit.965435bd6a4e96e0, outfit.d78cbfab14414...",[outfit.02b2458596a944b2ac409198765f3438],[outfit.74b8a62f6ecf484d854c4360dcb8a761],"[group.589ca1b26d8f77a078d9f9b92ea8e19e, group...",[group.f587a1a05d3f5b86d9327989ef6fce47],[group.ff6a23d4ad34e4ff219051e9bac2f48d],"[2019-06-13 00:00:00, 2020-09-01 00:00:00, 202...",[2023-10-29 00:00:00],[2023-10-29 00:00:00],"[2019-06-18 00:00:00, 2020-09-30 00:00:00, 202...",[2023-11-28 00:00:00],[2023-11-28 00:00:00],"[outfit.965435bd6a4e96e0, outfit.d78cbfab14414...","[group.589ca1b26d8f77a078d9f9b92ea8e19e, group..."


In [8]:
# Persist the splits to the project folder
user_splits.to_parquet(f"{path}/models/user_splits.parquet")