In [1]:
!pip install pyarrow



In [2]:
#import
#warning
import warnings
warnings.filterwarnings('ignore')

#general
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from itertools import product
import pickle
from sklearn.preprocessing import MultiLabelBinarizer #for Multi Label of the tags

#for predictions
from sklearn.ensemble import GradientBoostingClassifier

%run prepare_train_test_splits.ipynb
%run evaluate_models.ipynb
%run load_embeddings.ipynb

In [3]:
# Loading embeddings
#loaded_embeddings_dict = load_embeddings_from_folder()
#pickle.dump(loaded_embeddings_dict, open(EMBEDDING_MODEL_DICT_PICKLE_PATH, "wb"))
loaded_embeddings_dict = pickle.load(open(EMBEDDING_MODEL_DICT_PICKLE_PATH, "rb"))

#load data
pictures_df = pd.read_parquet('../archive/data/picture_triplets.parquet',engine='pyarrow')
orders=pd.read_parquet('../archive/data/orders.parquet',engine='pyarrow')
outfits=pd.read_parquet('../archive/data/outfits.parquet',engine='pyarrow')

In [4]:
#prepare data

# Introduce the embeddings for each outfit, if the outfit has no embeddings, we drop it
pictures_df["embeddings"] = pictures_df["picture.id"].map(loaded_embeddings_dict)
outfit_pictures_df = pictures_df.groupby("outfit.id").agg({"picture.id": list, "embeddings": list}).reset_index()
outfits["embeddings"] = outfits["id"].map(outfit_pictures_df.set_index("outfit.id")["embeddings"])
outfits = outfits.dropna(subset=["embeddings"])

#convert tag_categories and outfit_tags to lists
outfits["tag_categories"] = outfits["tag_categories"].apply(eval)
outfits["outfit_tags"] = outfits["outfit_tags"].apply(eval)

outfits['group']=outfits['group'].astype(str)

# Convert to sets and find common IDs
common_ids = set(orders['outfit.id']).intersection(set(outfits['id']))

# Filter the dataframes to keep only the common ids
orders = orders[orders['outfit.id'].isin(common_ids)]
outfits = outfits[outfits['id'].isin(common_ids)]

In [5]:
#prepare tags
all_tags = outfits["outfit_tags"].values.tolist()
mlb = MultiLabelBinarizer()
one_hot_encoded = mlb.fit_transform(all_tags)
outfits["one_hot_encoded"] = [np.array(oh_list) for oh_list in one_hot_encoded.tolist()]

In [6]:
#prepare embeddings
def get_mean_embedding(embeddings):
    embeddings = np.array(embeddings)
    mean_embedding = np.mean(embeddings, axis=0)
    return mean_embedding

def concatenate_embeddings(oh_embeddings, image_embeddings, oh_weighting):
    oh_embeddings = np.array(oh_embeddings) * oh_weighting
    return np.concatenate((oh_embeddings, image_embeddings))

outfits["mean_embeddings"] = outfits["embeddings"].apply(lambda x: get_mean_embedding(x))
#one_hot_encoded = np.array(outfits_df["one_hot_encoded"].tolist())
#mean_embeddings = np.array(outfits_df["mean_embeddings"].tolist())

outfits["concatenated_embeddings"] = outfits.apply(lambda x: concatenate_embeddings(x["one_hot_encoded"], x["mean_embeddings"], oh_weighting=4), axis=1)

In [7]:
# Convert triplets into entries for each individual user
orders = remove_consecutive_duplicates(orders)
user_orders_df = translate_user_triplets_to_orders(orders, outfits)
user_orders_df.dropna(inplace=True)

# Split the data into train and test sets, with one dataframe with no restirictions on outfits in the test data and one that prohibits repeated outfits
# It prints any cases in which it is unable to construct a test set with unique outfits.
user_splits_df, user_splits_unique_df = convert_user_orders_to_train_test_splits(user_orders_df, percentage_test=0.3)

3598
No unique outfit found with groups ['group.423a23f6717e6d85adac54c051ee9832'
 'group.423a23f6717e6d85adac54c051ee9832']
No unique outfit found with groups ['group.384b8170c6a6ddfd568ff7fab5fb49c4'
 'group.384b8170c6a6ddfd568ff7fab5fb49c4']
No unique outfit found with groups ['group.a3ab26b5d2f7ef2cf102422a3dde3b46'
 'group.a3ab26b5d2f7ef2cf102422a3dde3b46']
No unique outfit found with groups ['group.9b5204b87abc93f8f0467b0a6a9c6a97'
 'group.9b5204b87abc93f8f0467b0a6a9c6a97'
 'group.9b5204b87abc93f8f0467b0a6a9c6a97']
No unique outfit found with groups ['group.8e50238120d13b31284f151941c2ee81'
 'group.8e50238120d13b31284f151941c2ee81']
No unique outfit found with groups ['group.a494d07781a1aab0e3a42989288feff2'
 'group.a494d07781a1aab0e3a42989288feff2']
No unique outfit found with groups ['group.a1d284ef1c7035dd14e57eba3838a303'
 'group.a1d284ef1c7035dd14e57eba3838a303']
No unique outfit found with groups ['group.e0cb0f6e113edc4df8a1e304376734f6'
 'group.e0cb0f6e113edc4df8a1e3043767

In [8]:
user_splits_df.columns

Index(['train_outfit_ids', 'test_outfit_id', 'train_group', 'test_group',
       'train_booking_times', 'test_booking_time'],
      dtype='object')

In [9]:
user_splits_df.head()

Unnamed: 0,train_outfit_ids,test_outfit_id,train_group,test_group,train_booking_times,test_booking_time
0,"[outfit.85f26909d8334ab78f30c2fc9c73faf7, outf...",[outfit.9f5058295098471abdfaf0a7c74ddbfe],"[group.c79c907b6c94a9bd2005e038943ab529, group...",[group.f6f0b9ebb3228aab27a79ac658c76682],"[2023-11-22 00:00:00, 2023-11-24 00:00:00, 202...",[2023-12-06 00:00:00]
1,"[outfit.d7bff1b799a34575a47ce0f531791c9f, outf...","[outfit.98fa1b5287182a9d, outfit.dd04098010f74...","[group.287dba5268fb7b20e8ef81c053970691, group...","[group.a4449ee16d7951f425083623efd0dcec, group...","[2021-08-02 00:00:00, 2021-08-02 00:00:00, 202...","[2021-11-01 00:00:00, 2021-12-01 00:00:00, 202..."
2,[outfit.9fde090f117fb9d9],[outfit.849ace7e1811150d],[group.27808d969027a4e243c8945176f280c0],[group.caafbed55494b0c93dab58d58d526f0a],[2018-09-06 00:00:00],[2018-09-06 00:00:00]
3,"[outfit.98eebea274f23dd6, outfit.648db79508724...","[outfit.b2c68e50868a46a8872e81bcd3a17870, outf...","[group.a02de08741b879719c3ea97e24e5f230, group...","[group.69217601bce159dcf21b4c8e6f059f42, group...","[2021-08-25 00:00:00, 2021-08-25 00:00:00, 202...","[2022-02-28 00:00:00, 2022-02-28 00:00:00, 202..."
4,"[outfit.5e1b9778e36d475699772148e5d4e27b, outf...",[outfit.7321c26a479e46cd9fb07fa3ab7d7594],"[group.0a736bffd33390d7693442e6eecd0f35, group...",[group.cce63b3a8de0f3495c0744990e88b78f],"[2019-11-20 00:00:00, 2019-11-20 00:00:00]",[2019-11-20 00:00:00]


In [10]:
"""NUM_ITEMS = 100

def find_rental_history_embeddings(outfit_ids, outfit_to_embedding_dict):
    outfit_ids = [outfit_id for outfit_id in outfit_ids if outfit_id != "nan"]
    return [outfit_to_embedding_dict[outfit_id] for outfit_id in outfit_ids]

def get_mean_embedding(embeddings):
    embeddings = np.array(embeddings)
    mean_embedding = np.mean(embeddings, axis=0)
    return mean_embedding

def train_gradient_boosting_classifier(embeddings, targets):
    model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3)
    model.fit(embeddings, targets)
    return model

def predict_with_gradient_boosting_classifier(model, embeddings):
    predictions = model.predict(embeddings)
    return predictions

def create_target_column(df, column_name):
    df['rented'] = df[column_name].apply(lambda x: 1 if len(x) > 0 else 0)
    return df

def predict_outfit_rental(df, outfits_df, embeddings_column="embeddings"):
    df = create_target_column(df, 'train_outfit_ids')
    
    outfits_df.dropna(subset=embeddings_column, inplace=True)
    outfit_to_embedding_dict = outfits_df.set_index("id")[embeddings_column].to_dict()
    
    df["train_id_embeddings"] = df["train_outfit_ids"].apply(lambda x: find_rental_history_embeddings(x, outfit_to_embedding_dict))
    df["rental_history_id_embedding"] = df["train_id_embeddings"].apply(lambda x: get_mean_embedding(x))
    
    embeddings = np.stack(outfits_df[embeddings_column].values)
    targets = df["rented"].values

    id_embeddings = np.stack(df["rental_history_id_embedding"].values)

    id_model = train_gradient_boosting_classifier(embeddings, targets)
    id_predictions = predict_with_gradient_boosting_classifier(id_model, id_embeddings)

    df["id_prediction"] = id_predictions
    
    return df

def predict_group_rental(df, outfits_df, embeddings_column="embeddings"):
    df = create_target_column(df, 'train_group')
    
    outfits_df.dropna(subset=embeddings_column, inplace=True)
    group_to_embedding_dict = outfits_df.set_index("group")[embeddings_column].to_dict()
    
    df["train_group_embeddings"] = df["train_group"].apply(lambda x: find_rental_history_embeddings(x, group_to_embedding_dict))
    df["rental_history_group_embedding"] = df["train_group_embeddings"].apply(lambda x: get_mean_embedding(x))
    
    embeddings = np.stack(outfits_df[embeddings_column].values)
    targets = df["rented"].values

    group_embeddings = np.stack(df["rental_history_group_embedding"].values)

    group_model = train_gradient_boosting_classifier(embeddings, targets)
    group_predictions = predict_with_gradient_boosting_classifier(group_model, group_embeddings)

    df["group_prediction"] = group_predictions
    
    return df

def predict_nearest_neighbors(df, outfits_df, embeddings_column="embeddings"):
    df = predict_outfit_rental(df, outfits_df, embeddings_column=embeddings_column)
    df = predict_group_rental(df, outfits_df, embeddings_column=embeddings_column)
    return df
def predict_nearest_neighbors_images(df, outfits_df, embeddings_column="embeddings"):
    outfits_df["mean_embeddings"] = outfits_df[embeddings_column].apply(lambda x: get_mean_embedding(x))
    df = predict_outfit_rental(df, outfits_df, embeddings_column="mean_embeddings")
    df = predict_group_rental(df, outfits_df, embeddings_column="mean_embeddings")
    return df


# Apply to dataframes
tqdm.pandas()"""

'NUM_ITEMS = 100\n\ndef find_rental_history_embeddings(outfit_ids, outfit_to_embedding_dict):\n    outfit_ids = [outfit_id for outfit_id in outfit_ids if outfit_id != "nan"]\n    return [outfit_to_embedding_dict[outfit_id] for outfit_id in outfit_ids]\n\ndef get_mean_embedding(embeddings):\n    embeddings = np.array(embeddings)\n    mean_embedding = np.mean(embeddings, axis=0)\n    return mean_embedding\n\ndef train_gradient_boosting_classifier(embeddings, targets):\n    model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3)\n    model.fit(embeddings, targets)\n    return model\n\ndef predict_with_gradient_boosting_classifier(model, embeddings):\n    predictions = model.predict(embeddings)\n    return predictions\n\ndef create_target_column(df, column_name):\n    df[\'rented\'] = df[column_name].apply(lambda x: 1 if len(x) > 0 else 0)\n    return df\n\ndef predict_outfit_rental(df, outfits_df, embeddings_column="embeddings"):\n    df = create_target_column

In [24]:
"""
def find_rental_history_embeddings(outfit_ids, outfit_to_embedding_dict):
    outfit_ids = [outfit_id for outfit_id in outfit_ids if outfit_id != "nan"]
    return [outfit_to_embedding_dict[outfit_id] for outfit_id in outfit_ids]

def get_mean_embedding(embeddings):
    embeddings = np.array(embeddings)
    mean_embedding = np.mean(embeddings, axis=0)
    return mean_embedding

def prepare_data(df, outfits_df, embeddings_column="embeddings"):
    outfits_df.dropna(subset=[embeddings_column], inplace=True)
    outfit_to_embedding_dict = outfits_df.set_index("id")[embeddings_column].to_dict()
    group_to_embedding_dict = outfits_df.set_index("group")[embeddings_column].to_dict()
    
    df["train_id_embeddings"] = df["train_outfit_ids"].apply(lambda x: find_rental_history_embeddings(x, outfit_to_embedding_dict))
    df["train_group_embeddings"] = df["train_group"].apply(lambda x: find_rental_history_embeddings(x, group_to_embedding_dict))

    df["rental_history_id_embedding"] = df["train_id_embeddings"].apply(lambda x: get_mean_embedding(x))
    df["rental_history_group_embedding"] = df["train_group_embeddings"].apply(lambda x: get_mean_embedding(x))
    
    return df

def generate_negative_samples(df, outfits_df, embeddings_column="embeddings"):
    all_outfit_ids = set(outfits_df["id"].values)
    all_group_ids = set(outfits_df["group"].values)
    
    df["negative_outfit_ids"] = df["train_outfit_ids"].apply(lambda x: list(all_outfit_ids - set(x)))
    df["negative_group_ids"] = df["train_group"].apply(lambda x: list(all_group_ids - set(x)))
    
    return df

def create_balanced_dataset(df, embeddings_column="embeddings", sample_size=10000):
    positive_samples = df[["rental_history_id_embedding", "test_outfit_id"]].copy()
    positive_samples["label"] = 1
    
    negative_samples = df[["rental_history_id_embedding", "negative_outfit_ids"]].explode("negative_outfit_ids").copy()
    negative_samples.rename(columns={"negative_outfit_ids": "test_outfit_id"}, inplace=True)
    negative_samples["label"] = 0
    
    balanced_df = pd.concat([positive_samples, negative_samples], ignore_index=True)
    
    # Sample a subset of the data
    balanced_df = balanced_df.sample(n=sample_size, random_state=42)
    
    return balanced_df

def train_gradient_boosting(df, outfits_df, embeddings_column="embeddings", sample_size=10000):
    df = prepare_data(df, outfits_df, embeddings_column)
    df = generate_negative_samples(df, outfits_df, embeddings_column)
    
    balanced_df = create_balanced_dataset(df, embeddings_column, sample_size)
    
    X_outfit = np.stack(balanced_df["rental_history_id_embedding"].values)
    y_outfit = balanced_df["label"].values
    
    model_outfit = GradientBoostingClassifier()
    model_outfit.fit(X_outfit, y_outfit)
    
    return model_outfit

def predict_with_gradient_boosting(df, model_outfit, outfits_df, embeddings_column="embeddings"):
    df = prepare_data(df, outfits_df, embeddings_column)
    
    X_outfit = np.stack(df["rental_history_id_embedding"].values)
    
    outfit_predictions = model_outfit.predict(X_outfit)
    
    df["id_prediction"] = outfit_predictions
    return df
"""

In [38]:
NUM_ITEMS = 100

def find_rental_history_embeddings(outfit_ids, outfit_to_embedding_dict):
    outfit_ids = [outfit_id for outfit_id in outfit_ids if outfit_id != "nan"]
    return [outfit_to_embedding_dict[outfit_id] for outfit_id in outfit_ids]

def get_mean_embedding(embeddings):
    embeddings = np.array(embeddings)
    mean_embedding = np.mean(embeddings, axis=0)
    return mean_embedding

def prepare_data(df, outfits_df, embeddings_column="embeddings"):
    outfits_df.dropna(subset=[embeddings_column], inplace=True)
    outfit_to_embedding_dict = outfits_df.set_index("id")[embeddings_column].to_dict()
    group_to_embedding_dict = outfits_df.set_index("group")[embeddings_column].to_dict()
    
    df["train_id_embeddings"] = df["train_outfit_ids"].apply(lambda x: find_rental_history_embeddings(x, outfit_to_embedding_dict))
    df["train_group_embeddings"] = df["train_group"].apply(lambda x: find_rental_history_embeddings(x, group_to_embedding_dict))

    df["rental_history_id_embedding"] = df["train_id_embeddings"].apply(lambda x: get_mean_embedding(x))
    df["rental_history_group_embedding"] = df["train_group_embeddings"].apply(lambda x: get_mean_embedding(x))
    
    return df

def generate_negative_samples(df, outfits_df, embeddings_column="embeddings"):
    all_outfit_ids = set(outfits_df["id"].values)
    all_group_ids = set(outfits_df["group"].values)
    
    df["negative_outfit_ids"] = df["train_outfit_ids"].apply(lambda x: list(all_outfit_ids - set(x)))
    df["negative_group_ids"] = df["train_group"].apply(lambda x: list(all_group_ids - set(x)))
    
    return df

def create_balanced_dataset(df, embeddings_column="embeddings"):
    positive_samples = df[["rental_history_id_embedding", "test_outfit_id"]].copy()
    positive_samples["label"] = 1
    
    # Ensure negative samples are generated
    if "negative_outfit_ids" not in df.columns:
        df = generate_negative_samples(df, outfits_df, embeddings_column)
    
    negative_samples = df[["rental_history_id_embedding", "negative_outfit_ids"]].explode("negative_outfit_ids").copy()
    negative_samples.rename(columns={"negative_outfit_ids": "test_outfit_id"}, inplace=True)
    negative_samples["label"] = 0
    
    # Balance the dataset by resampling
    n_positive = len(positive_samples)
    negative_samples = negative_samples.sample(n=n_positive, random_state=42)
    
    balanced_df = pd.concat([positive_samples, negative_samples], ignore_index=True)
    
    return balanced_df

def train_gradient_boosting(df, outfits_df, embeddings_column="embeddings", sample_size=10000):
    df = prepare_data(df, outfits_df, embeddings_column)
    df = generate_negative_samples(df, outfits_df, embeddings_column)
    
    balanced_df = create_balanced_dataset(df, embeddings_column)
    
    X_outfit = np.stack(balanced_df["rental_history_id_embedding"].values)
    y_outfit = balanced_df["label"].values
    
    model_outfit = GradientBoostingClassifier()
    model_outfit.fit(X_outfit, y_outfit)
    
    return model_outfit

def predict_with_gradient_boosting(df, model_outfit, outfits_df, embeddings_column="embeddings"):
    df = prepare_data(df, outfits_df, embeddings_column)
    
    X_outfit = np.stack(df["rental_history_id_embedding"].values)
    
    outfit_predictions = model_outfit.predict_proba(X_outfit)[:, 1]  # Get probabilities for the positive class
    df["predicted_probabilities"] = outfit_predictions
    
    # Get top N recommendations
    df["id_prediction"] = df["predicted_probabilities"].apply(lambda x: np.argsort(x)[-100:][::-1].tolist())
    df["group_prediction"] = df["predicted_probabilities"].apply(lambda x: np.argsort(x)[-100:][::-1].tolist())
    
    return df

In [25]:
# Apply to dataframes
tqdm.pandas()

In [55]:
def find_rental_history_embeddings(outfit_ids, outfit_to_embedding_dict):
    outfit_ids = [outfit_id for outfit_id in outfit_ids if outfit_id != "nan"]
    return [outfit_to_embedding_dict[outfit_id] for outfit_id in outfit_ids]

def get_mean_embedding(embeddings):
    embeddings = np.array(embeddings)
    mean_embedding = np.mean(embeddings, axis=0)
    return mean_embedding

def prepare_data(df, outfits_df, embeddings_column="embeddings"):
    outfits_df.dropna(subset=[embeddings_column], inplace=True)
    outfit_to_embedding_dict = outfits_df.set_index("id")[embeddings_column].to_dict()
    group_to_embedding_dict = outfits_df.set_index("group")[embeddings_column].to_dict()
    
    df["train_id_embeddings"] = df["train_outfit_ids"].apply(lambda x: find_rental_history_embeddings(x, outfit_to_embedding_dict))
    df["train_group_embeddings"] = df["train_group"].apply(lambda x: find_rental_history_embeddings(x, group_to_embedding_dict))

    df["rental_history_id_embedding"] = df["train_id_embeddings"].apply(lambda x: get_mean_embedding(x))
    df["rental_history_group_embedding"] = df["train_group_embeddings"].apply(lambda x: get_mean_embedding(x))
    
    return df

def generate_negative_samples(df, outfits_df, embeddings_column="embeddings"):
    all_outfit_ids = set(outfits_df["id"].values)
    all_group_ids = set(outfits_df["group"].values)
    
    df["negative_outfit_ids"] = df["train_outfit_ids"].apply(lambda x: list(all_outfit_ids - set(x)))
    df["negative_group_ids"] = df["train_group"].apply(lambda x: list(all_group_ids - set(x)))
    
    return df

def create_balanced_dataset(df,outfits_df, embeddings_column="embeddings"):
    positive_samples = df[["rental_history_id_embedding", "test_outfit_id"]].copy()
    positive_samples["label"] = 1
    
    # Ensure negative samples are generated
    if "negative_outfit_ids" not in df.columns:
        df = generate_negative_samples(df, outfits_df, embeddings_column)
    
    negative_samples = df[["rental_history_id_embedding", "negative_outfit_ids"]].explode("negative_outfit_ids").copy()
    negative_samples.rename(columns={"negative_outfit_ids": "test_outfit_id"}, inplace=True)
    negative_samples["label"] = 0
    
    # Balance the dataset by resampling
    n_positive = len(positive_samples)
    negative_samples = negative_samples.sample(n=n_positive, random_state=42)
    
    balanced_df = pd.concat([positive_samples, negative_samples], ignore_index=True)
    
    return balanced_df

def train_gradient_boosting(df, outfits_df, embeddings_column="embeddings"):
    df = prepare_data(df, outfits_df, embeddings_column)
    
    balanced_df = create_balanced_dataset(df,outfits_df, embeddings_column)
    
    X_outfit = np.stack(balanced_df["rental_history_id_embedding"].values)
    y_outfit = balanced_df["label"].values
    
    model_outfit = GradientBoostingClassifier()
    model_outfit.fit(X_outfit, y_outfit)
    
    return model_outfit

def predict_with_gradient_boosting(df, model_outfit, outfits_df, embeddings_column="embeddings"):
    df = prepare_data(df, outfits_df, embeddings_column)
    
    X_outfit = np.stack(df["rental_history_id_embedding"].values)
    
    outfit_predictions = model_outfit.predict_proba(X_outfit)[:, 1]  # Get probabilities for the positive class
    df["predicted_probabilities"] = outfit_predictions
    
    # Map indices to outfit IDs
    outfit_id_map = {i: outfit_id for i, outfit_id in enumerate(outfits_df["id"].values)}
    
    # Get top N recommendations
    df["id_prediction"] = df["predicted_probabilities"].apply(lambda x: [outfit_id_map[i] for i in np.argsort(x)[-100:][::-1]])
    df["group_prediction"] = df["predicted_probabilities"].apply(lambda x: [outfit_id_map[i] for i in np.argsort(x)[-100:][::-1]])
    
    return df

In [61]:
def find_rental_history_embeddings(outfit_ids, outfit_to_embedding_dict):
    outfit_ids = [outfit_id for outfit_id in outfit_ids if outfit_id != "nan"]
    return [outfit_to_embedding_dict[outfit_id] for outfit_id in outfit_ids]

def get_mean_embedding(embeddings):
    embeddings = np.array(embeddings)
    mean_embedding = np.mean(embeddings, axis=0)
    return mean_embedding

def prepare_data(df, outfits_df, embeddings_column="embeddings"):
    outfits_df.dropna(subset=[embeddings_column], inplace=True)
    outfit_to_embedding_dict = outfits_df.set_index("id")[embeddings_column].to_dict()
    group_to_embedding_dict = outfits_df.set_index("group")[embeddings_column].to_dict()
    
    df["train_id_embeddings"] = df["train_outfit_ids"].apply(lambda x: find_rental_history_embeddings(x, outfit_to_embedding_dict))
    df["train_group_embeddings"] = df["train_group"].apply(lambda x: find_rental_history_embeddings(x, group_to_embedding_dict))

    df["rental_history_id_embedding"] = df["train_id_embeddings"].apply(lambda x: get_mean_embedding(x))
    df["rental_history_group_embedding"] = df["train_group_embeddings"].apply(lambda x: get_mean_embedding(x))
    
    return df

def generate_negative_samples(df, outfits_df, embeddings_column="embeddings"):
    all_outfit_ids = set(outfits_df["id"].values)
    all_group_ids = set(outfits_df["group"].values)
    
    df["negative_outfit_ids"] = df["train_outfit_ids"].apply(lambda x: list(all_outfit_ids - set(x)))
    df["negative_group_ids"] = df["train_group"].apply(lambda x: list(all_group_ids - set(x)))
    
    return df

def create_balanced_dataset(df,outfits_df, embeddings_column="embeddings"):
    positive_samples = df[["rental_history_id_embedding", "test_outfit_id"]].copy()
    positive_samples["label"] = 1
    
    # Ensure negative samples are generated
    if "negative_outfit_ids" not in df.columns:
        df = generate_negative_samples(df, outfits_df, embeddings_column)
    
    negative_samples = df[["rental_history_id_embedding", "negative_outfit_ids"]].explode("negative_outfit_ids").copy()
    negative_samples.rename(columns={"negative_outfit_ids": "test_outfit_id"}, inplace=True)
    negative_samples["label"] = 0
    
    # Balance the dataset by resampling
    n_positive = len(positive_samples)
    negative_samples = negative_samples.sample(n=n_positive, random_state=42)
    
    balanced_df = pd.concat([positive_samples, negative_samples], ignore_index=True)
    
    return balanced_df

def train_gradient_boosting(df, outfits_df, embeddings_column="embeddings"):
    df = prepare_data(df, outfits_df, embeddings_column)
    
    balanced_df = create_balanced_dataset(df,outfits_df, embeddings_column)
    
    X_outfit = np.stack(balanced_df["rental_history_id_embedding"].values)
    y_outfit = balanced_df["label"].values
    
    model_outfit = GradientBoostingClassifier()
    model_outfit.fit(X_outfit, y_outfit)
    
    return model_outfit

def predict_with_gradient_boosting(df, model_outfit, outfits_df, embeddings_column="embeddings"):
    df = prepare_data(df, outfits_df, embeddings_column)
    
    X_outfit = np.stack(df["rental_history_id_embedding"].values)
    
    outfit_predictions = model_outfit.predict_proba(X_outfit)[:, 1]  # Get probabilities for the positive class
    df["predicted_probabilities"] = outfit_predictions
    
    # Map indices to outfit IDs
    outfit_id_map = {i: outfit_id for i, outfit_id in enumerate(outfits_df["id"].values)}
    
    # Get top N recommendations for each user
    df["id_prediction"] = df.apply(lambda row: [outfit_id_map[i] for i in np.argsort(row["predicted_probabilities"])[-100:][::-1]], axis=1)
    df["group_prediction"] = df.apply(lambda row: [outfit_id_map[i] for i in np.argsort(row["predicted_probabilities"])[-100:][::-1]], axis=1)
    
    return df

In [None]:
def train_images(df, outfits_df, embeddings_column="embeddings"):
    outfits_df["mean_embeddings"] = outfits_df[embeddings_column].apply(lambda x: get_mean_embedding(x))
    return train_gradient_boosting(df, outfits_df, embeddings_column="mean_embeddings")
    
def predict_nearest_neighbors_images(df, outfits_df, embeddings_column="embeddings"):
    outfits_df["mean_embeddings"] = outfits_df[embeddings_column].apply(lambda x: get_mean_embedding(x))
    return predict_with_gradient_boosting(df, outfits_df, embeddings_column="mean_embeddings")

In [58]:
predictions_df['contains_value'] = predictions_df['id_prediction'].apply(lambda x: 'outfit.fffdaa715c3646f8b1c0f04d549ff07e' in x)
predictions_df[predictions_df['contains_value']!=True]

Unnamed: 0,train_outfit_ids,test_outfit_id,train_group,test_group,train_booking_times,test_booking_time,train_id_embeddings,train_group_embeddings,rental_history_id_embedding,rental_history_group_embedding,negative_outfit_ids,negative_group_ids,predicted_probabilities,id_prediction,group_prediction,id_hit_rate_at_100,id_hit_rate_at_10,group_hit_rate_at_100,group_hit_rate_at_10,contains_value


In [57]:
predictions_df.head()

Unnamed: 0,train_outfit_ids,test_outfit_id,train_group,test_group,train_booking_times,test_booking_time,train_id_embeddings,train_group_embeddings,rental_history_id_embedding,rental_history_group_embedding,negative_outfit_ids,negative_group_ids,predicted_probabilities,id_prediction,group_prediction,id_hit_rate_at_100,id_hit_rate_at_10,group_hit_rate_at_100,group_hit_rate_at_10
0,"[outfit.85f26909d8334ab78f30c2fc9c73faf7, outf...",[outfit.9f5058295098471abdfaf0a7c74ddbfe],"[group.c79c907b6c94a9bd2005e038943ab529, group...",[group.f6f0b9ebb3228aab27a79ac658c76682],"[2023-11-22 00:00:00, 2023-11-24 00:00:00, 202...",[2023-12-06 00:00:00],"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[outfit.7815f82c055c4ecebb71e3ba2dbb764f, outf...","[group.178ab9526dc35eaa207170039c1381d2, group...",0.553572,[outfit.fffdaa715c3646f8b1c0f04d549ff07e],[outfit.fffdaa715c3646f8b1c0f04d549ff07e],0,0,0,0
1,"[outfit.d7bff1b799a34575a47ce0f531791c9f, outf...","[outfit.98fa1b5287182a9d, outfit.dd04098010f74...","[group.287dba5268fb7b20e8ef81c053970691, group...","[group.a4449ee16d7951f425083623efd0dcec, group...","[2021-08-02 00:00:00, 2021-08-02 00:00:00, 202...","[2021-11-01 00:00:00, 2021-12-01 00:00:00, 202...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[outfit.7815f82c055c4ecebb71e3ba2dbb764f, outf...","[group.178ab9526dc35eaa207170039c1381d2, group...",0.518006,[outfit.fffdaa715c3646f8b1c0f04d549ff07e],[outfit.fffdaa715c3646f8b1c0f04d549ff07e],0,0,0,0
2,[outfit.9fde090f117fb9d9],[outfit.849ace7e1811150d],[group.27808d969027a4e243c8945176f280c0],[group.caafbed55494b0c93dab58d58d526f0a],[2018-09-06 00:00:00],[2018-09-06 00:00:00],"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[outfit.7815f82c055c4ecebb71e3ba2dbb764f, outf...","[group.178ab9526dc35eaa207170039c1381d2, group...",0.499688,[outfit.fffdaa715c3646f8b1c0f04d549ff07e],[outfit.fffdaa715c3646f8b1c0f04d549ff07e],0,0,0,0
3,"[outfit.98eebea274f23dd6, outfit.648db79508724...","[outfit.b2c68e50868a46a8872e81bcd3a17870, outf...","[group.a02de08741b879719c3ea97e24e5f230, group...","[group.69217601bce159dcf21b4c8e6f059f42, group...","[2021-08-25 00:00:00, 2021-08-25 00:00:00, 202...","[2022-02-28 00:00:00, 2022-02-28 00:00:00, 202...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[outfit.7815f82c055c4ecebb71e3ba2dbb764f, outf...","[group.178ab9526dc35eaa207170039c1381d2, group...",0.45034,[outfit.fffdaa715c3646f8b1c0f04d549ff07e],[outfit.fffdaa715c3646f8b1c0f04d549ff07e],0,0,0,0
4,"[outfit.5e1b9778e36d475699772148e5d4e27b, outf...",[outfit.7321c26a479e46cd9fb07fa3ab7d7594],"[group.0a736bffd33390d7693442e6eecd0f35, group...",[group.cce63b3a8de0f3495c0744990e88b78f],"[2019-11-20 00:00:00, 2019-11-20 00:00:00]",[2019-11-20 00:00:00],"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[outfit.7815f82c055c4ecebb71e3ba2dbb764f, outf...","[group.178ab9526dc35eaa207170039c1381d2, group...",0.501527,[outfit.fffdaa715c3646f8b1c0f04d549ff07e],[outfit.fffdaa715c3646f8b1c0f04d549ff07e],0,0,0,0


In [62]:
# Tag based predictions
user_splits=user_splits_df.copy()
#user_splits_unique=user_splits_unique_df.copy()
model_outfit = train_gradient_boosting(user_splits, outfits ,embeddings_column="one_hot_encoded")
predictions_df = predict_with_gradient_boosting(user_splits, model_outfit, outfits,embeddings_column="one_hot_encoded")
#user_splits_unique = predict_nearest_neighbors(user_splits_unique, outfits, embeddings_column="one_hot_encoded")
final_df, hit_rate_results = evaluate_df_hit_rate_at_n(predictions_df, n=10)
#user_splits_unique = evaluate_df_hit_rate_at_n(user_splits_unique, n=10)

id_hit_rate_at_100       0.0
id_hit_rate_at_10        0.0
group_hit_rate_at_100    0.0
group_hit_rate_at_10     0.0
dtype: float64

In [40]:
final_df[final_df['id_hit_rate_at_100']!=0]

Unnamed: 0,train_outfit_ids,test_outfit_id,train_group,test_group,train_booking_times,test_booking_time,train_id_embeddings,train_group_embeddings,rental_history_id_embedding,rental_history_group_embedding,negative_outfit_ids,negative_group_ids,predicted_probabilities,id_prediction,group_prediction,id_hit_rate_at_100,id_hit_rate_at_10,group_hit_rate_at_100,group_hit_rate_at_10


In [None]:
# Image based predictions
user_splits=user_splits_df.copy()
user_splits_unique=user_splits_unique_df.copy()
user_splits = predict_nearest_neighbors_images(user_splits, outfits, embeddings_column="embeddings")
user_splits_unique = predict_nearest_neighbors_images(user_splits_unique, outfits, embeddings_column="embeddings")
user_splits = evaluate_df_hit_rate_at_n(user_splits, n=10)
user_splits_unique = evaluate_df_hit_rate_at_n(user_splits_unique, n=10)

In [None]:
# Combined predictions
"""
user_splits=user_splits_df.copy()
user_splits_unique=user_splits_unique_df.copy()
user_splits = predict_nearest_neighbors(user_splits, outfits, embeddings_column="outfit_embeddings")
user_splits_unique = predict_nearest_neighbors(user_splits_unique, outfits, embeddings_column="outfit_embeddings")
user_splits = evaluate_df_hit_rate_at_n(user_splits, n=10)
user_splits_unique = evaluate_df_hit_rate_at_n(user_splits_unique, n=10)
"""

In [None]:
# Concat predictions
user_splits=user_splits_df.copy()
user_splits_unique=user_splits_unique_df.copy()
user_splits = predict_nearest_neighbors(user_splits, outfits, embeddings_column="concatenated_embeddings")
user_splits_unique = predict_nearest_neighbors(user_splits_unique, outfits, embeddings_column="concatenated_embeddings")
user_splits = evaluate_df_hit_rate_at_n(user_splits, n=10)
user_splits_unique = evaluate_df_hit_rate_at_n(user_splits_unique, n=10)