In [1]:
!pip install pyarrow



In [1]:
#import
#warning
import warnings
warnings.filterwarnings('ignore')

#general
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from itertools import product
import pickle
from sklearn.preprocessing import MultiLabelBinarizer #for Multi Label of the tags

#for predictions
from sklearn.neighbors import NearestNeighbors


# Run the helper notebooks. Several names used further down are not defined in
# this notebook (remove_consecutive_duplicates, translate_user_triplets_to_orders,
# convert_user_orders_to_train_test_splits, evaluate_df_hit_rate_at_n,
# EMBEDDING_MODEL_DICT_PICKLE_PATH, load_embeddings_from_folder) and are
# presumably provided by these runs -- verify against the helper notebooks.
%run prepare_train_test_splits.ipynb
%run evaluate_models.ipynb
%run load_embeddings.ipynb

In [2]:
# Loading embeddings
# To rebuild the pickle cache from the raw embedding files, uncomment:
#loaded_embeddings_dict = load_embeddings_from_folder()
#with open(EMBEDDING_MODEL_DICT_PICKLE_PATH, "wb") as embeddings_file:
#    pickle.dump(loaded_embeddings_dict, embeddings_file)

# NOTE(review): pickle.load can execute arbitrary code; only load a cache file
# that was produced locally by the lines above.
# The context manager closes the file handle (the bare open() call leaked it).
with open(EMBEDDING_MODEL_DICT_PICKLE_PATH, "rb") as embeddings_file:
    loaded_embeddings_dict = pickle.load(embeddings_file)

#load data
pictures_df = pd.read_parquet('../archive/data/picture_triplets.parquet',engine='pyarrow')
orders=pd.read_parquet('../archive/data/orders.parquet',engine='pyarrow')
outfits=pd.read_parquet('../archive/data/outfits.parquet',engine='pyarrow')
my_encoding=pd.read_parquet('my_encoding.parquet',engine='pyarrow')


In [4]:
# Preview the first rows of the encoding table read from my_encoding.parquet.
my_encoding.head()

Unnamed: 0,id,Spring,Summer,Winter,Fall,Sandro,ILAG,Rodebjer,Stylein,Kupong knit.wear,...,Dressed-up,Business,Formal,Active,tag_x,category_x,tag_y,category_y,group,normalized_pricePerMonth
0,outfit.ff154c3d8e2f49099b9a633559af8d97,True,True,True,True,False,False,False,False,False,...,False,False,False,False,3,Size,Midi,Length,group.07bb1ae875636605fdff099dcdd81058,0.112381
1,outfit.ff0b0e4e4b3044a4b36d9f3c889e0f6e,True,True,False,False,False,False,False,False,False,...,False,False,False,False,3,Size,Midi,Length,group.ec8a6b7c8f6046c755483e578d37c99e,0.112381
2,outfit.fef39a12939f457eaf3c48515a6bd85f,True,True,True,True,False,False,False,False,False,...,False,False,False,False,5,Size,Midi,Length,group.de54a3691add1e46d5946c79e0fc7e82,0.154286
3,outfit.fec66293d37940b79dde2c1acd761fe3,True,True,True,True,False,False,False,False,False,...,False,False,False,False,5,Size,Midi,Length,group.fcc139e28abeaf451c19d46bf311fcb0,0.125714
4,outfit.fea3bb7d8ff54872ad84977465e29da4,True,True,True,True,False,False,False,False,False,...,False,False,False,False,3,Size,Midi,Length,group.fc687f43497199532a11b0f991ebee04,0.112381


In [3]:
#prepare data

def _parse_list_literal(value):
    """Evaluate a stringified list; values that are not strings pass through.

    The pass-through makes this cell safe to re-run after the columns have
    already been converted.

    NOTE(review): eval executes arbitrary Python taken from the parquet
    contents. If these columns only ever hold list literals,
    ast.literal_eval is the safe drop-in -- confirm before switching.
    """
    return eval(value) if isinstance(value, str) else value


# Attach the embedding of every picture, then collect pictures/embeddings per outfit.
pictures_df["embeddings"] = pictures_df["picture.id"].map(loaded_embeddings_dict)
outfit_pictures_df = (
    pictures_df
    .groupby("outfit.id")
    .agg({"picture.id": list, "embeddings": list})
    .reset_index()
)
outfits["embeddings"] = outfits["id"].map(outfit_pictures_df.set_index("outfit.id")["embeddings"])

# Outfits that have no pictures at all map to NaN and are dropped here.
# NOTE(review): a picture id missing from loaded_embeddings_dict leaves a NaN
# *inside* the list, which this dropna does not catch -- confirm every picture
# has an embedding.
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the original (pandas chained-assignment ambiguity).
outfits = outfits.dropna(subset=["embeddings"]).copy()

#convert tag_categories and outfit_tags to lists
outfits["tag_categories"] = outfits["tag_categories"].apply(_parse_list_literal)
outfits["outfit_tags"] = outfits["outfit_tags"].apply(_parse_list_literal)

# NOTE(review): astype(str) turns missing group values into the literal string
# "nan"; this looks like a source of the "nan" entries that
# find_rental_history_embeddings filters out -- confirm.
outfits['group'] = outfits['group'].astype(str)

# Outfit ids present in both the orders and the outfits table
common_ids = set(orders['outfit.id']).intersection(set(outfits['id']))

# Filter the dataframes to keep only the common ids; copy so that later cells
# can add columns to these frames without touching a slice.
orders = orders[orders['outfit.id'].isin(common_ids)].copy()
outfits = outfits[outfits['id'].isin(common_ids)].copy()

KeyboardInterrupt: 

In [None]:
# Prepare tags: multi-hot encode the tag list of every outfit and store one
# numpy vector per row in the "one_hot_encoded" column.
all_tags = outfits["outfit_tags"].values.tolist()
mlb = MultiLabelBinarizer()
one_hot_encoded = mlb.fit_transform(all_tags)
outfits["one_hot_encoded"] = list(map(np.array, one_hot_encoded.tolist()))

In [4]:
#prepare my encoding
def create_encoding_column(row):
    """Return all values of `row` except the 'id' entry as a plain Python list.

    row: a pandas Series that contains an 'id' label (KeyError otherwise).
    """
    without_id = row.drop('id')
    return without_id.values.tolist()

# Collapse every column of `my_encoding` except 'id' into one list per row and
# store it in a new 'my_encoding' column.
# Fixes a NameError: this cell referred to the undefined names `df` and
# `create_array_column`; the merge below reads my_encoding['my_encoding'], so
# the frame is `my_encoding` and the helper is `create_encoding_column`.
# Dropping a pre-existing 'my_encoding' column first keeps the cell re-runnable
# (otherwise the previous result would be folded into the new lists).
# NOTE(review): the lists include every non-'id' column, also non-numeric ones
# such as tag_x/category_x/group if present -- confirm that is intended.
my_encoding['my_encoding'] = (
    my_encoding
    .drop(columns=['my_encoding'], errors='ignore')
    .apply(create_encoding_column, axis=1)
)

# Display-only: merge returns a new frame and `outfits` itself is not modified.
# Assign the result if the encoding should be carried on `outfits`.
outfits.merge(my_encoding[['id','my_encoding']],on='id')

In [5]:
#prepare embeddings
def get_mean_embedding(embeddings):
    """Average a collection of embedding vectors element-wise.

    embeddings: sequence of equally sized vectors (anything np.array accepts).
    Returns the mean taken along axis 0.
    """
    stacked = np.array(embeddings)
    return np.mean(stacked, axis=0)

def concatenate_embeddings(oh_embeddings, image_embeddings, oh_weighting):
    """Join a weighted tag vector and an image vector into one embedding.

    oh_embeddings: tag vector; converted with np.array and multiplied by
        `oh_weighting`.
    image_embeddings: vector appended unchanged after the weighted tag vector.
    Returns the 1-D concatenation (tag part first).
    """
    weighted_tags = np.array(oh_embeddings) * oh_weighting
    return np.concatenate([weighted_tags, image_embeddings])

# One vector per outfit: the element-wise mean of its picture embeddings.
outfits["mean_embeddings"] = outfits["embeddings"].apply(get_mean_embedding)

# Tag vector (scaled by oh_weighting=4) followed by the mean image embedding.
outfits["concatenated_embeddings"] = outfits.apply(
    lambda row: concatenate_embeddings(
        row["one_hot_encoded"],
        row["mean_embeddings"],
        oh_weighting=4,
    ),
    axis=1,
)

In [6]:
# Turn the order triplets into one entry per user and discard incomplete rows.
orders = remove_consecutive_duplicates(orders)
user_orders_df = translate_user_triplets_to_orders(orders, outfits).dropna()

# Split into train and test sets with 30% held out. Two frames are returned:
# one with no restrictions on the outfits in the test data, and one that
# prohibits repeated outfits. Cases where no test set with unique outfits can
# be built are printed.
user_splits_df, user_splits_unique_df = convert_user_orders_to_train_test_splits(
    user_orders_df,
    percentage_test=0.3,
)

3598
No unique outfit found with groups ['group.423a23f6717e6d85adac54c051ee9832'
 'group.423a23f6717e6d85adac54c051ee9832']
No unique outfit found with groups ['group.384b8170c6a6ddfd568ff7fab5fb49c4'
 'group.384b8170c6a6ddfd568ff7fab5fb49c4']
No unique outfit found with groups ['group.a3ab26b5d2f7ef2cf102422a3dde3b46'
 'group.a3ab26b5d2f7ef2cf102422a3dde3b46']
No unique outfit found with groups ['group.9b5204b87abc93f8f0467b0a6a9c6a97'
 'group.9b5204b87abc93f8f0467b0a6a9c6a97'
 'group.9b5204b87abc93f8f0467b0a6a9c6a97']
No unique outfit found with groups ['group.8e50238120d13b31284f151941c2ee81'
 'group.8e50238120d13b31284f151941c2ee81']
No unique outfit found with groups ['group.a494d07781a1aab0e3a42989288feff2'
 'group.a494d07781a1aab0e3a42989288feff2']
No unique outfit found with groups ['group.a1d284ef1c7035dd14e57eba3838a303'
 'group.a1d284ef1c7035dd14e57eba3838a303']
No unique outfit found with groups ['group.e0cb0f6e113edc4df8a1e304376734f6'
 'group.e0cb0f6e113edc4df8a1e3043767

In [7]:
# Number of neighbours kept per user. predict_nearest_neighbors queries
# NUM_ITEMS+1 neighbours because get_nearest_neighbors_batch drops the first one.
NUM_ITEMS = 100

def find_rental_history_embeddings(outfit_ids, outfit_to_embedding_dict):
    """Look up the embedding of every id in a rental history.

    Entries equal to the string "nan" are skipped. Every other id must be a
    key of `outfit_to_embedding_dict`; a missing key raises KeyError.
    Returns the embeddings as a list, in the order of `outfit_ids`.
    """
    # TODO: Find out where these nan values are coming from, only two of them for now, as far as I can tell.
    return [
        outfit_to_embedding_dict[outfit_id]
        for outfit_id in outfit_ids
        if outfit_id != "nan"
    ]

def get_mean_embedding(embeddings):
    """Return the element-wise mean (axis 0) of a sequence of embedding vectors.

    Note: this repeats the definition from the embedding-preparation cell
    earlier in the notebook; the two are functionally the same.
    """
    return np.array(embeddings).mean(axis=0)

def get_nearest_neighbors_batch(embeddings, nn, num_items, index_to_id):
    """Query `nn` for a batch of vectors and translate the hits to ids.

    embeddings: query vectors passed straight to nn.kneighbors.
    nn: fitted object exposing kneighbors(X, n_neighbors=...) that returns
        (distances, indices).
    num_items: number of neighbours kept per query; num_items+1 are requested
        and the first hit of every row is discarded.
    index_to_id: mapping from row index in the fitted data to an id.

    Returns (ids, distances): a list of id lists and a list of the matching
    distance rows, both without the first hit.

    NOTE(review): dropping the first hit assumes the query itself is in the
    index. The callers pass mean vectors of a rental history, for which the
    closest hit may be a genuine candidate -- confirm this is intended.
    """
    neighbor_distances, neighbor_indices = nn.kneighbors(embeddings, n_neighbors=num_items + 1)

    ids = []
    trimmed_distances = []
    for row_indices, row_distances in zip(neighbor_indices, neighbor_distances):
        ids.append([index_to_id[i] for i in row_indices[1:]])
        trimmed_distances.append(row_distances[1:])
    return ids, trimmed_distances


def predict_nearest_neighbors(df, outfits_df, embeddings_column="embeddings"):
    """Recommend outfits/groups per user via cosine nearest neighbours.

    For every row of `df` the embeddings of the rental history are averaged
    (once looked up by outfit id, once by group) and the NUM_ITEMS closest
    outfits to that mean vector are returned.

    Parameters
    ----------
    df : DataFrame with the columns "train_outfit_ids" and "train_group".
        The result columns are added to this frame in place and the same
        frame is returned.
    outfits_df : DataFrame with the columns "id", "group" and
        `embeddings_column`. It is NOT modified: rows without an embedding are
        filtered into a local copy. (Previously dropna(inplace=True) silently
        removed those rows from the caller's frame.)
    embeddings_column : name of the column holding one vector per outfit.

    Returns
    -------
    `df` with the added columns train_id_embeddings, train_group_embeddings,
    rental_history_id_embedding, rental_history_group_embedding,
    id_prediction, id_prediction_distances, group_prediction and
    group_prediction_distances.
    """
    # Local, filtered frame -- leaves the caller's outfits frame untouched.
    outfits_df = outfits_df.dropna(subset=[embeddings_column])

    outfit_to_embedding_dict = outfits_df.set_index("id")[embeddings_column].to_dict()
    index_to_outfit_dict = dict(enumerate(outfits_df["id"].values))
    # If several outfits share a group, the dict keeps the embedding of the
    # last such outfit.
    group_to_embedding_dict = outfits_df.set_index("group")[embeddings_column].to_dict()
    index_to_group_dict = dict(enumerate(outfits_df["group"].values))

    df["train_id_embeddings"] = df["train_outfit_ids"].apply(
        lambda ids: find_rental_history_embeddings(ids, outfit_to_embedding_dict)
    )
    df["train_group_embeddings"] = df["train_group"].apply(
        lambda groups: find_rental_history_embeddings(groups, group_to_embedding_dict)
    )

    # One query vector per user: the mean of the rental-history embeddings.
    df["rental_history_id_embedding"] = df["train_id_embeddings"].apply(get_mean_embedding)
    df["rental_history_group_embedding"] = df["train_group_embeddings"].apply(get_mean_embedding)

    # NUM_ITEMS+1 because get_nearest_neighbors_batch discards the first hit.
    nearest_neighbors = NearestNeighbors(n_neighbors=NUM_ITEMS+1, metric="cosine")
    embeddings = np.stack(outfits_df[embeddings_column].values)
    nearest_neighbors.fit(embeddings)

    id_embeddings = np.stack(df["rental_history_id_embedding"].values)
    group_embeddings = np.stack(df["rental_history_group_embedding"].values)

    # Both queries search the same outfit index; only the translation of the
    # hit positions differs (outfit id vs. group of the hit).
    id_predictions, id_distances = get_nearest_neighbors_batch(id_embeddings, nearest_neighbors, NUM_ITEMS, index_to_outfit_dict)
    group_predictions, group_distances = get_nearest_neighbors_batch(group_embeddings, nearest_neighbors, NUM_ITEMS, index_to_group_dict)

    df["id_prediction"], df["id_prediction_distances"] = id_predictions, id_distances
    df["group_prediction"], df["group_prediction_distances"] = group_predictions, group_distances

    return df

def predict_nearest_neighbors_images(df, outfits_df, embeddings_column="embeddings"):
    """Image-based predictions: average each outfit's picture embeddings, then
    run predict_nearest_neighbors on the averaged vectors.

    df: user split frame, forwarded to predict_nearest_neighbors.
    outfits_df: frame whose `embeddings_column` holds a list of picture
        embeddings per outfit. It is not modified; the "mean_embeddings"
        column is added to a copy created by assign() instead of being written
        into the caller's (possibly sliced) frame.
    embeddings_column: column with the per-picture embeddings.

    Returns whatever predict_nearest_neighbors returns for the averaged
    embeddings.
    """
    outfits_with_mean = outfits_df.assign(
        mean_embeddings=outfits_df[embeddings_column].apply(get_mean_embedding)
    )
    return predict_nearest_neighbors(df, outfits_with_mean, embeddings_column="mean_embeddings")

# Apply to dataframes
# Registers tqdm's pandas integration (progress_apply).
# NOTE(review): no progress_apply call is visible in this notebook -- confirm
# this registration is still needed.
tqdm.pandas()

In [11]:
# Tag based predictions: neighbours are searched in the "one_hot_encoded" tag
# space, on fresh copies of both split frames, then evaluated with n=10.
user_splits = predict_nearest_neighbors(
    user_splits_df.copy(), outfits, embeddings_column="one_hot_encoded"
)
user_splits_unique = predict_nearest_neighbors(
    user_splits_unique_df.copy(), outfits, embeddings_column="one_hot_encoded"
)
user_splits = evaluate_df_hit_rate_at_n(user_splits, n=10)
user_splits_unique = evaluate_df_hit_rate_at_n(user_splits_unique, n=10)

Unnamed: 0,train_outfit_ids,test_outfit_id,train_group,test_group,train_booking_times,test_booking_time,train_id_embeddings,train_group_embeddings,rental_history_id_embedding,rental_history_group_embedding,id_prediction,id_prediction_distances,group_prediction,group_prediction_distances
0,"[outfit.85f26909d8334ab78f30c2fc9c73faf7, outf...",[outfit.9f5058295098471abdfaf0a7c74ddbfe],"[group.c79c907b6c94a9bd2005e038943ab529, group...",[group.f6f0b9ebb3228aab27a79ac658c76682],"[2023-11-22 00:00:00, 2023-11-24 00:00:00, 202...",[2023-12-06 00:00:00],"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[outfit.ba8159724e7d416a984c32fa60dbd2b4, outf...","[0.125, 0.125, 0.125, 0.125, 0.130373643453695...","[group.323d1cf17db24ffe526aa32445fc6781, group...","[0.125, 0.125, 0.125, 0.125, 0.130373643453695..."
1,"[outfit.d7bff1b799a34575a47ce0f531791c9f, outf...","[outfit.98fa1b5287182a9d, outfit.dd04098010f74...","[group.287dba5268fb7b20e8ef81c053970691, group...","[group.a4449ee16d7951f425083623efd0dcec, group...","[2021-08-02 00:00:00, 2021-08-02 00:00:00, 202...","[2021-11-01 00:00:00, 2021-12-01 00:00:00, 202...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[outfit.c6cb1f5d9a7e4a88b26f31213949f781, outf...","[0.24045705335058343, 0.24045705335058354, 0.2...","[group.6cf4a47327e1b1f938c07bcc054e3d1c, group...","[0.25039433966700675, 0.25039433966700675, 0.2..."
2,[outfit.9fde090f117fb9d9],[outfit.849ace7e1811150d],[group.27808d969027a4e243c8945176f280c0],[group.caafbed55494b0c93dab58d58d526f0a],[2018-09-06 00:00:00],[2018-09-06 00:00:00],"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[outfit.c0cf06b8f4354675a443486c308830d5, outf...","[0.15672595728843208, 0.1567259572884322, 0.20...","[group.c3dd483ba4bfbb7afa918b9a63bb911a, group...","[0.15672595728843208, 0.1567259572884322, 0.20..."
3,"[outfit.98eebea274f23dd6, outfit.648db79508724...","[outfit.b2c68e50868a46a8872e81bcd3a17870, outf...","[group.a02de08741b879719c3ea97e24e5f230, group...","[group.69217601bce159dcf21b4c8e6f059f42, group...","[2021-08-25 00:00:00, 2021-08-25 00:00:00, 202...","[2022-02-28 00:00:00, 2022-02-28 00:00:00, 202...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[outfit.aba62e508b55432e95eb510fcfe61caa, outf...","[0.20052492158760415, 0.2035086339196126, 0.20...","[group.73b1598cc2dfda33e9cc302fdfb8351a, group...","[0.23525328040824267, 0.23829403487442047, 0.2..."
4,"[outfit.5e1b9778e36d475699772148e5d4e27b, outf...",[outfit.7321c26a479e46cd9fb07fa3ab7d7594],"[group.0a736bffd33390d7693442e6eecd0f35, group...",[group.cce63b3a8de0f3495c0744990e88b78f],"[2019-11-20 00:00:00, 2019-11-20 00:00:00]",[2019-11-20 00:00:00],"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[outfit.df81c6ba86ef4c3c8fd2976934eb4078, outf...","[0.2583801512904337, 0.28848752646211473, 0.28...","[group.4615c13c2896753806ad273a6ba091f5, group...","[0.2583801512904337, 0.28848752646211473, 0.28..."


id_hit_rate_at_100       0.241818
id_hit_rate_at_10        0.062597
group_hit_rate_at_100    0.248312
group_hit_rate_at_10     0.070390
dtype: float64

id_hit_rate_at_100       0.222309
id_hit_rate_at_10        0.045869
group_hit_rate_at_100    0.217618
group_hit_rate_at_10     0.048215
dtype: float64

In [10]:
# Image based predictions: neighbours are searched on the averaged picture
# embeddings, on fresh copies of both split frames, then evaluated with n=10.
user_splits = predict_nearest_neighbors_images(
    user_splits_df.copy(), outfits, embeddings_column="embeddings"
)
user_splits_unique = predict_nearest_neighbors_images(
    user_splits_unique_df.copy(), outfits, embeddings_column="embeddings"
)
user_splits = evaluate_df_hit_rate_at_n(user_splits, n=10)
user_splits_unique = evaluate_df_hit_rate_at_n(user_splits_unique, n=10)

id_hit_rate_at_100       0.140519
id_hit_rate_at_10        0.037143
group_hit_rate_at_100    0.150649
group_hit_rate_at_10     0.039481
dtype: float64

id_hit_rate_at_100       0.158197
id_hit_rate_at_10        0.035966
group_hit_rate_at_100    0.168100
group_hit_rate_at_10     0.037790
dtype: float64

In [11]:
# Combined predictions
# Disabled experiment, kept as comments instead of a string literal (a bare
# string is evaluated and echoed as the cell's output).
# NOTE(review): no "outfit_embeddings" column is created in the visible cells
# of this notebook -- restore or delete this block.
#user_splits=user_splits_df.copy()
#user_splits_unique=user_splits_unique_df.copy()
#user_splits = predict_nearest_neighbors(user_splits, outfits, embeddings_column="outfit_embeddings")
#user_splits_unique = predict_nearest_neighbors(user_splits_unique, outfits, embeddings_column="outfit_embeddings")
#user_splits = evaluate_df_hit_rate_at_n(user_splits, n=10)
#user_splits_unique = evaluate_df_hit_rate_at_n(user_splits_unique, n=10)

'\nuser_splits=user_splits_df.copy()\nuser_splits_unique=user_splits_unique_df.copy()\nuser_splits = predict_nearest_neighbors(user_splits, outfits, embeddings_column="outfit_embeddings")\nuser_splits_unique = predict_nearest_neighbors(user_splits_unique, outfits, embeddings_column="outfit_embeddings")\nuser_splits = evaluate_df_hit_rate_at_n(user_splits, n=10)\nuser_splits_unique = evaluate_df_hit_rate_at_n(user_splits_unique, n=10)\n'

In [12]:
# Concat predictions: neighbours are searched on "concatenated_embeddings"
# (weighted tag vector + mean image embedding), on fresh copies of both split
# frames, then evaluated with n=10.
user_splits = predict_nearest_neighbors(
    user_splits_df.copy(), outfits, embeddings_column="concatenated_embeddings"
)
user_splits_unique = predict_nearest_neighbors(
    user_splits_unique_df.copy(), outfits, embeddings_column="concatenated_embeddings"
)
user_splits = evaluate_df_hit_rate_at_n(user_splits, n=10)
user_splits_unique = evaluate_df_hit_rate_at_n(user_splits_unique, n=10)

id_hit_rate_at_100       0.213247
id_hit_rate_at_10        0.052987
group_hit_rate_at_100    0.211948
group_hit_rate_at_10     0.056883
dtype: float64

id_hit_rate_at_100       0.227000
id_hit_rate_at_10        0.052645
group_hit_rate_at_100    0.223873
group_hit_rate_at_10     0.053948
dtype: float64