# Gradient Boosting
In this notebook, I implement a Gradient Boosting model trained on a binary encoding of the outfit tags.  

This approach failed: the final evaluation returned a score of 0 on every metric.

In [1]:
# Mount Google Drive so the project folder is reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')
# Root folder of the project on Drive; later cells build their file paths from it.
path='/content/drive/MyDrive/RecSys_206894495'
# Execute the shared evaluation notebook.
# NOTE(review): presumably this is where the evaluate_* and
# retrieve_best_model_params_* helpers called further down come from - confirm.
%run /content/drive/MyDrive/RecSys_206894495/models/evaluate_models.ipynb

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
empty json


In [2]:
!pip install pyarrow



In [3]:
# JSON files under <path>/models, one for outfit-level and one for group-level runs.
# NOTE(review): not read directly in this notebook; presumably consumed by the
# evaluation helpers - confirm.
file_path_outfit = f"{path}/models/val_outfit.json"
file_path_group = f"{path}/models/val_group.json"

In [4]:
#import
#warning
import warnings
warnings.filterwarnings('ignore')

#general
import pandas as pd
import numpy as np
from itertools import product
import itertools
import json
from tqdm.notebook import tqdm
from datetime import datetime
import gc

#for predictions
from sklearn.ensemble import GradientBoostingClassifier as GradientBoosting
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer

In [5]:
#load data
# One row per customer with the '<split>_outfit_ids' / '<split>_group' columns used below.
user_splits = pd.read_parquet(f"{path}/models/user_splits.parquet")

In [6]:
# Function to create a new DataFrame for a specific set (train, val, test)
def create_new_df(df, set_type):
    """Expand per-customer outfit collections into one positive row per pair.

    Args:
        df: DataFrame with a 'customer_id' column and a '<set_type>_outfit_ids'
            column holding a collection of outfit ids in every row.
        set_type: Prefix selecting which '<set_type>_outfit_ids' column to expand.

    Returns:
        DataFrame with columns 'customer_id', 'outfit_id' and 'label' (always 1).
        Within each input row the outfit ids are de-duplicated and sorted by
        np.unique. An input without rows yields an empty DataFrame.
    """
    source_col = f'{set_type}_outfit_ids'
    records = [
        {'customer_id': row['customer_id'], 'outfit_id': outfit_id, 'label': 1}
        for _, row in df.iterrows()
        for outfit_id in np.unique(row[source_col])
    ]
    return pd.DataFrame(records)

# Positive (customer, outfit) pairs for the train, val and test splits.
train_df, val_df, test_df = [
    create_new_df(user_splits, split_name) for split_name in ('train', 'val', 'test')
]


In [7]:
# Function to create a new DataFrame for a specific set (train, val, test)
def create_new_df_group(df, set_type):
    """Expand per-customer group collections into one positive row per pair.

    Args:
        df: DataFrame with a 'customer_id' column and a '<set_type>_group'
            column holding a collection of group values in every row.
        set_type: Prefix selecting which '<set_type>_group' column to expand.

    Returns:
        DataFrame with columns 'customer_id', 'group' and 'label' (always 1).
        Within each input row the groups are de-duplicated and sorted by
        np.unique. An input without rows yields an empty DataFrame.
    """
    source_col = f'{set_type}_group'
    records = [
        {'customer_id': row['customer_id'], 'group': group_value, 'label': 1}
        for _, row in df.iterrows()
        for group_value in np.unique(row[source_col])
    ]
    return pd.DataFrame(records)

# Positive (customer, group) pairs for the train, val and test splits.
train_df_group, val_df_group, test_df_group = [
    create_new_df_group(user_splits, split_name) for split_name in ('train', 'val', 'test')
]

In [8]:
# Outfit id -> group lookup, only needed to annotate the orders.
groups = pd.read_parquet(f"{path}/archive/data/outfits.parquet")[['id', 'group']]
# Left join keeps every order; the helper key column 'id' is dropped afterwards.
orders = (
    pd.read_parquet(f"{path}/archive/data/orders.parquet")
    .merge(groups, how='left', left_on='outfit.id', right_on='id')
    .drop(columns='id')
)
del groups

In [9]:
def generate_balanced_dataset(df,all_combinations, customer_col='customer_id', outfit_col='outfit_id', random_state=42):
    """Build a labelled set of positive pairs plus sampled negative pairs.

    Args:
        df: DataFrame whose (customer_col, outfit_col) rows are the positives.
        all_combinations: DataFrame of candidate (customer_col, outfit_col)
            pairs from which the negatives are drawn.
        customer_col: Name of the customer key column.
        outfit_col: Name of the item key column (outfit id or group).
        random_state: Seed for the negative sampling (42 reproduces the
            previous hard-coded behaviour).

    Returns:
        DataFrame with the positives (label 1) followed by the sampled
        negatives (label 0), with a fresh integer index. As many negatives as
        positives are drawn; if fewer candidates exist, all of them are used.
    """
    key_cols = [customer_col, outfit_col]

    # Explicit copy: a label column is added below and must not be written
    # through a slice of the caller's frame.
    positive_examples = df[key_cols].copy()

    # Candidates found only on the left side of the merge are not positives.
    candidates = pd.merge(all_combinations, positive_examples, on=key_cols, how='left', indicator=True)
    negative_examples = candidates[candidates['_merge'] == 'left_only'].drop(columns=['_merge'])

    # DataFrame.sample raises when asked for more rows than exist, so cap the
    # request at the number of available negatives.
    n_negatives = min(len(positive_examples), len(negative_examples))
    negative_examples = negative_examples.sample(n=n_negatives, random_state=random_state)

    positive_examples['label'] = 1
    negative_examples['label'] = 0

    return pd.concat([positive_examples, negative_examples], ignore_index=True)

In [10]:
# Get unique customer_ids and outfit_ids
customer_ids = orders['customer.id'].unique()
outfit_ids = orders['outfit.id'].unique()
# Every (customer_id, outfit_id) pair, in the same customer-major order that
# itertools.product would give. MultiIndex.from_product builds the cartesian
# product with arrays instead of materialising one Python tuple per pair.
all_combinations_outfits = pd.MultiIndex.from_product(
    [customer_ids, outfit_ids], names=['customer_id', 'outfit_id']
).to_frame(index=False)

In [11]:
# Add sampled negative pairs to each outfit-level split.
train_df, val_df, test_df = [
    generate_balanced_dataset(split_df, all_combinations_outfits)
    for split_df in (train_df, val_df, test_df)
]
# The full customer x outfit table is no longer needed.
del all_combinations_outfits

In [12]:
# Get unique customer_ids and groups
customer_ids = orders['customer.id'].unique()
groups = orders['group'].unique()
# Every (customer_id, group) pair, in the same customer-major order that
# itertools.product would give. MultiIndex.from_product builds the cartesian
# product with arrays instead of materialising one Python tuple per pair.
all_combinations_group = pd.MultiIndex.from_product(
    [customer_ids, groups], names=['customer_id', 'group']
).to_frame(index=False)

In [13]:
# Add sampled negative pairs to each group-level split.
train_df_group, val_df_group, test_df_group = [
    generate_balanced_dataset(split_df, all_combinations_group, outfit_col='group')
    for split_df in (train_df_group, val_df_group, test_df_group)
]
# The full customer x group table is no longer needed.
del all_combinations_group

In [14]:
# Spot-check five random rows of the balanced training set.
train_df.sample(n=5)

Unnamed: 0,customer_id,outfit_id,label
55002,2663,outfit.5bea48bb837d43a381bec3cd63635c95,0
5108,892,outfit.1fb2b83eb1be4926a2668d100e7b7041,1
80955,1153,outfit.8ca9084a3dbf45dcab6459962a407480,0
52102,3823,outfit.6a39c9ed2e3c40d7a84ba3e42c87f7a0,0
6852,1192,outfit.abe970f711444589b6ce9e81bf35615d,1


In [15]:
# Stack train and validation examples and keep one row per (customer, outfit):
# taking the max label marks a pair positive if it is positive in either split.
join_df = (
    pd.concat([train_df, val_df])
    .groupby(['customer_id', 'outfit_id'], as_index=False)['label']
    .max()
)

# Same construction for the (customer, group) examples.
join_df_group = (
    pd.concat([train_df_group, val_df_group])
    .groupby(['customer_id', 'group'], as_index=False)['label']
    .max()
)

In [16]:
# Spot-check five random rows of the combined train+val set.
join_df.sample(n=5)

Unnamed: 0,customer_id,outfit_id,label
23703,1683,outfit.89b7c4936135cfb5,1
1307,89,outfit.011102e2453d4812931e342a46cddbbf,0
57217,3988,outfit.a294399ed5bfba8f,1
74874,5215,outfit.33beac6c203e41a2934cb611bc1c14bb,1
32237,2293,outfit.8d93ab54d4f348038868e96c0e430504,1


In [17]:
# Outfit metadata; the 'id', 'group' and 'outfit_tags' columns are used below.
outfits = pd.read_parquet(f"{path}/archive/data/outfits.parquet")

In [18]:
#prepare tags
# Binarize the tag collection of every outfit: one indicator column per distinct tag.
all_tags = outfits["outfit_tags"].values.tolist()
mlb = MultiLabelBinarizer()
one_hot_encoded = mlb.fit_transform(all_tags)
# pd.concat(axis=1) aligns rows on index labels, so the tag frame must carry the
# same index as `outfits`; a default RangeIndex would misalign (and add NaN rows)
# whenever `outfits` is not indexed 0..n-1.
outfits_encoded=pd.concat(
    [outfits[['id','group']], pd.DataFrame(one_hot_encoded,columns=mlb.classes_,index=outfits.index)],
    axis=1,
)
del all_tags, one_hot_encoded, outfits

In [19]:
# Group-level tag profile: the per-column max over the outfits of a group, so a
# tag indicator is set when any outfit of the group has it.
groups = outfits_encoded.drop(columns='id').groupby('group').max()
# The outfit-level feature table keeps only 'id' and the tag indicators.
outfits_encoded = outfits_encoded.drop(columns='group')

In [20]:
# Attach the tag indicator columns to every outfit-level example set
# (default inner join on the outfit id; the duplicate key column 'id' is dropped).
train_df, val_df, test_df, join_df = [
    frame.merge(outfits_encoded, left_on='outfit_id', right_on='id').drop(columns='id')
    for frame in (train_df, val_df, test_df, join_df)
]

In [21]:
# Attach the group-level tag profile to every group-level example set
# (default inner join on 'group').
train_df_group, val_df_group, test_df_group, join_df_group = [
    frame.merge(groups, on='group')
    for frame in (train_df_group, val_df_group, test_df_group, join_df_group)
]

In [22]:
# The feature lookup tables have been merged into the example sets.
del outfits_encoded
del groups

In [23]:
"""customers_outfits=pd.concat([train_df[['customer_id','outfit_id']],val_df[['customer_id','outfit_id']],test_df[['customer_id','outfit_id']]])
customers_outfits.drop_duplicates(inplace=True)"""

"customers_outfits=pd.concat([train_df[['customer_id','outfit_id']],val_df[['customer_id','outfit_id']],test_df[['customer_id','outfit_id']]])\ncustomers_outfits.drop_duplicates(inplace=True)"

In [24]:
"""enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(customers_outfits)
del customers_outfits

df=enc.transform(train_df[['customer_id', 'outfit_id']])
train_df.drop(['customer_id', 'outfit_id'], axis=1, inplace=True)
train_df=pd.concat([pd.DataFrame(df.toarray()),train_df],axis=1)

df=enc.transform(val_df[['customer_id', 'outfit_id']])
val_df.drop(['customer_id', 'outfit_id'], axis=1, inplace=True)
val_df=pd.concat([pd.DataFrame(df.toarray()),val_df],axis=1)

df=enc.transform(test_df[['customer_id', 'outfit_id']])
test_df.drop(['customer_id', 'outfit_id'], axis=1, inplace=True)
test_df=pd.concat([pd.DataFrame(df.toarray()),test_df],axis=1)

df=enc.transform(join_df[['customer_id', 'outfit_id']])
join_df.drop(['customer_id', 'outfit_id'], axis=1, inplace=True)
join_df=pd.concat([pd.DataFrame(df.toarray()),join_df],axis=1)

del enc, df"""

"enc = OneHotEncoder(handle_unknown='ignore')\nenc.fit(customers_outfits)\ndel customers_outfits\n\ndf=enc.transform(train_df[['customer_id', 'outfit_id']])\ntrain_df.drop(['customer_id', 'outfit_id'], axis=1, inplace=True)\ntrain_df=pd.concat([pd.DataFrame(df.toarray()),train_df],axis=1)\n\ndf=enc.transform(val_df[['customer_id', 'outfit_id']])\nval_df.drop(['customer_id', 'outfit_id'], axis=1, inplace=True)\nval_df=pd.concat([pd.DataFrame(df.toarray()),val_df],axis=1)\n\ndf=enc.transform(test_df[['customer_id', 'outfit_id']])\ntest_df.drop(['customer_id', 'outfit_id'], axis=1, inplace=True)\ntest_df=pd.concat([pd.DataFrame(df.toarray()),test_df],axis=1)\n\ndf=enc.transform(join_df[['customer_id', 'outfit_id']])\njoin_df.drop(['customer_id', 'outfit_id'], axis=1, inplace=True)\njoin_df=pd.concat([pd.DataFrame(df.toarray()),join_df],axis=1)\n\ndel enc, df"

In [25]:
"""customers_groups=pd.concat([train_df_group[['customer_id','group']],val_df_group[['customer_id','group']],test_df_group[['customer_id','group']]])
customers_groups.drop_duplicates(inplace=True)"""

"customers_groups=pd.concat([train_df_group[['customer_id','group']],val_df_group[['customer_id','group']],test_df_group[['customer_id','group']]])\ncustomers_groups.drop_duplicates(inplace=True)"

In [26]:
"""enc=OneHotEncoder(handle_unknown='ignore')
enc.fit(customers_groups)
del customers_groups

df=enc.transform(train_df_group[['customer_id', 'group']])
train_df_group.drop(['customer_id', 'group'], axis=1, inplace=True)
train_df_group=pd.concat([pd.DataFrame(df.toarray()),train_df_group],axis=1)

df=enc.transform(val_df_group[['customer_id', 'group']])
val_df_group.drop(['customer_id', 'group'], axis=1, inplace=True)
val_df_group=pd.concat([pd.DataFrame(df.toarray()),val_df_group],axis=1)

df=enc.transform(test_df_group[['customer_id', 'group']])
test_df_group.drop(['customer_id', 'group'], axis=1, inplace=True)
test_df_group=pd.concat([pd.DataFrame(df.toarray()),test_df_group],axis=1)

df=enc.transform(join_df_group[['customer_id', 'group']])
join_df_group.drop(['customer_id', 'group'], axis=1, inplace=True)
join_df_group=pd.concat([pd.DataFrame(df.toarray()),join_df_group],axis=1)

del df, enc"""

"enc=OneHotEncoder(handle_unknown='ignore')\nenc.fit(customers_groups)\ndel customers_groups\n\ndf=enc.transform(train_df_group[['customer_id', 'group']])\ntrain_df_group.drop(['customer_id', 'group'], axis=1, inplace=True)\ntrain_df_group=pd.concat([pd.DataFrame(df.toarray()),train_df_group],axis=1)\n\ndf=enc.transform(val_df_group[['customer_id', 'group']])\nval_df_group.drop(['customer_id', 'group'], axis=1, inplace=True)\nval_df_group=pd.concat([pd.DataFrame(df.toarray()),val_df_group],axis=1)\n\ndf=enc.transform(test_df_group[['customer_id', 'group']])\ntest_df_group.drop(['customer_id', 'group'], axis=1, inplace=True)\ntest_df_group=pd.concat([pd.DataFrame(df.toarray()),test_df_group],axis=1)\n\ndf=enc.transform(join_df_group[['customer_id', 'group']])\njoin_df_group.drop(['customer_id', 'group'], axis=1, inplace=True)\njoin_df_group=pd.concat([pd.DataFrame(df.toarray()),join_df_group],axis=1)\n\ndel df, enc"

In [27]:
# Run a garbage-collection pass now that the large intermediates were deleted.
gc.collect()

0

## Gradient Boosting

In [28]:
def train_gb(x,y,n_estimators,learning_rate,max_depth,random_state=42):
  """Fit a gradient boosting classifier on the given features and labels.

  Args:
    x: Feature matrix passed to `fit`.
    y: Target labels passed to `fit`.
    n_estimators: Number of boosting stages.
    learning_rate: Shrinkage applied to each stage.
    max_depth: Maximum depth of the individual trees.
    random_state: Seed forwarded to the estimator so repeated fits with the
      same data and hyper-parameters give the same model.

  Returns:
    The fitted estimator.
  """
  algo=GradientBoosting(n_estimators=n_estimators,learning_rate=learning_rate,max_depth=max_depth,
                        random_state=random_state)
  algo.fit(x,y)
  return algo
def test_gb(algo,x_test,user_splits_df,col,group=False):
  x_test['label_predicted']=algo.predict(x_test.drop(['customer_id',col],axis=1))
  results=x_test[x_test['label_predicted']==1]
  df_merged = user_splits_df.merge(results, on='customer_id', how='left').groupby('customer_id').agg({
      **{col: 'first' for col in user_splits_df.columns if col != 'customer_id'},
      'label_predicted': lambda x: list(x)
  }).reset_index()
  # Rename the aggregated column to id_prediction
  if group:
    df_merged.rename(columns={'label_predicted': 'group_prediction'}, inplace=True)
  else:
    df_merged.rename(columns={'label_predicted': 'id_prediction'}, inplace=True)
  return x_test,algo,df_merged

def train_and_test_gb(x_train,y_train,x_test,user_splits_df,n_estimators,learning_rate,max_depth,group=False,col='outfit_id'):
  """Train a gradient boosting model and collect its predictions per customer.

  Args:
    x_train: Training frame containing 'customer_id', the item column and the
      feature columns.
    y_train: Training labels.
    x_test: Candidate frame to score, with the same layout as `x_train`.
    user_splits_df: Per-customer frame the predictions are attached to.
    n_estimators, learning_rate, max_depth: Hyper-parameters forwarded to
      `train_gb`.
    group: When truthy the item column is forced to 'group'.
    col: Item column name used when `group` is falsy.

  Returns:
    The tuple returned by `test_gb`: (scored test frame, fitted model,
    per-customer prediction frame).
  """
  # Plain truthiness, the same test `test_gb` applies to this flag, so both
  # functions always agree on whether this is a group-level run.
  if group:
    col='group'
  # The identifier columns are keys, not features.
  features=x_train.drop(['customer_id',col],axis=1)
  algo=train_gb(features,y_train,n_estimators,learning_rate,max_depth)
  return test_gb(algo,x_test,user_splits_df,col,group)

In [29]:
# Hyper-parameter grid; the tuning loops try every combination of these values.
n_estimators_list = [100, 150, 200]
learning_rate_list = [0.1, 0.05, 0.2]
max_depth_list = [3, 4, 5]

In [30]:
# Separate the outfit-level frames into model inputs and binary targets.
x_train, y_train = train_df.drop(columns='label'), train_df['label']
x_val, y_val = val_df.drop(columns='label'), val_df['label']

In [31]:
## Hyper-parameter tuning for outfit

# Every (n_estimators, learning_rate, max_depth) combination of the grid.
test_permutations = list(product(n_estimators_list, learning_rate_list, max_depth_list))

for n_estimators, learning_rate, max_depth in tqdm(test_permutations):
    param_dict = {'n_estimators': n_estimators, 'learning_rate': learning_rate, 'max_depth': max_depth}
    # Fresh copies per configuration so no state carries over between runs.
    predictions, algo, user_splits_df = train_and_test_gb(
        x_train, y_train, x_val.copy(), user_splits.copy(), **param_dict
    )
    # NOTE(review): helper comes from outside this notebook; presumably it scores
    # the validation predictions and records them with the parameters - confirm.
    user_splits_df, all_dict = evaluate_val_metrics_at_n_outfit(
        user_splits_df, 'Gradient_Boosting_tag', n=10, model_params=param_dict
    )

  0%|          | 0/27 [00:00<?, ?it/s]

In [32]:
# Outfit-level tuning is done; release its matrices.
del x_train, y_train
del x_val, y_val

In [36]:
# Separate the group-level frames into model inputs and binary targets.
x_train_group, y_train_group = train_df_group.drop(columns='label'), train_df_group['label']
x_val_group, y_val_group = val_df_group.drop(columns='label'), val_df_group['label']

In [37]:
## Hyper-parameter tuning for group

# Every (n_estimators, learning_rate, max_depth) combination of the grid.
test_permutations = list(product(n_estimators_list, learning_rate_list, max_depth_list))

for n_estimators, learning_rate, max_depth in tqdm(test_permutations):
    param_dict = {'n_estimators': n_estimators, 'learning_rate': learning_rate, 'max_depth': max_depth}
    # Fresh copies per configuration so no state carries over between runs.
    predictions, algo, user_splits_df = train_and_test_gb(
        x_train_group, y_train_group, x_val_group.copy(), user_splits.copy(),
        group=True, **param_dict
    )
    # NOTE(review): helper comes from outside this notebook; presumably it scores
    # the validation predictions and records them with the parameters - confirm.
    user_splits_df, all_dict = evaluate_val_metrics_at_n_group(
        user_splits_df, 'Gradient_Boosting_tag', n=10, model_params=param_dict
    )

  0%|          | 0/27 [00:00<?, ?it/s]

In [38]:
# Group-level tuning is done; release its matrices.
del x_train_group, y_train_group
del x_val_group, y_val_group

In [40]:
# Look up the best recorded hyper-parameters for the outfit and group tasks.
best_model_params_outfit = retrieve_best_model_params_from_file_outfit('Gradient_Boosting_tag', n=10)
best_model_params_group = retrieve_best_model_params_from_file_group('Gradient_Boosting_tag', n=10)

# The final run starts from an untouched copy of the splits.
user_splits_df = user_splits.copy()

# Unpack the three tuned values for each task.
n_estimators_outfit, learning_rate_outfit, max_depth_outfit = [
    best_model_params_outfit[key] for key in ('n_estimators', 'learning_rate', 'max_depth')
]
n_estimators_group, learning_rate_group, max_depth_group = [
    best_model_params_group[key] for key in ('n_estimators', 'learning_rate', 'max_depth')
]

In [41]:
# Inputs and targets for the final outfit-level fit (train+val) and the test set.
x_test, y_test = test_df.drop(columns='label'), test_df['label']
x_join, y_join = join_df.drop(columns='label'), join_df['label']

In [43]:
# Fit on train+val with the best outfit parameters and predict on the test set.
predictions, algo, user_splits_df = train_and_test_gb(
    x_join, y_join, x_test, user_splits_df,
    n_estimators=n_estimators_outfit,
    learning_rate=learning_rate_outfit,
    max_depth=max_depth_outfit,
)

In [44]:
# Release the outfit-level matrices of the final run.
del x_join, y_join
del x_test, y_test

In [45]:
# Inputs and targets for the final group-level fit (train+val) and the test set.
x_test_group, y_test_group = test_df_group.drop(columns='label'), test_df_group['label']
x_join_group, y_join_group = join_df_group.drop(columns='label'), join_df_group['label']

In [46]:
# Fit on train+val with the best group parameters and predict on the test set;
# the group predictions are added to the frame that already holds the outfit ones.
predictions, algo, user_splits_df = train_and_test_gb(
    x_join_group, y_join_group, x_test_group, user_splits_df,
    n_estimators=n_estimators_group,
    learning_rate=learning_rate_group,
    max_depth=max_depth_group,
    group=True,
)

In [47]:
# Release the group-level matrices of the final run.
del x_join_group, y_join_group
del x_test_group, y_test_group

In [48]:
# Evaluate the outfit- and group-level predictions on the test split at n=10.
user_splits_df, all_dict = evaluate_df_metrics_at_n(
    user_splits_df,
    'Gradient_Boosting_tag',
    n=10,
)

Unnamed: 0,0
id_hit_rate_at_10,0.0
id_precision_at_10,0.0
id_recall_at_10,0.0
id_f1_score_at_10,0.0
id_ndcg_at_10,0.0
group_hit_rate_at_10,0.0
group_precision_at_10,0.0
group_recall_at_10,0.0
group_f1_score_at_10,0.0
group_ndcg_at_10,0.0
