# Embedding Creation  
In this notebook, different types of embeddings and encodings for outfits are created or organized. These will later be used for content-based recommendations.

In [None]:
# Mount Google Drive so that the project folder below is reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')
# Root of the project on Drive; the data, embedding and model paths used later are built from it.
path='/content/drive/MyDrive/RecSys_206894495'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#import
#warning
import warnings
warnings.filterwarnings('ignore')

#general
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from itertools import product
import pickle
import os
from sklearn.preprocessing import MultiLabelBinarizer #for Multi Label of the tags

#torch
import torch
import torch.nn as nn
import torch.optim as optim

In [None]:
# Locations of the image-embedding artefacts, all relative to the Drive project root `path`.
LOCAL_EMBEDDINGS_FOLDER_PATH = path+"/archive/embeddings/"

# Name of the embedding model; every file and folder name below is derived from it.
EMBEDDING_MODEL_NAME = "EfficientNet_V2_L_final"

# Folder of saved embedding files (read by load_embeddings_from_folder).
COMPUTED_EMBEDDINGS_PATH = f"{LOCAL_EMBEDDINGS_FOLDER_PATH}{EMBEDDING_MODEL_NAME}"

# Pickled embeddings DataFrame (read by load_embeddings_from_pickle).
EMBEDDING_MODEL_PICKLE_PATH = f"{LOCAL_EMBEDDINGS_FOLDER_PATH}{EMBEDDING_MODEL_NAME}.pkl"

# Pickled {name: embedding} dictionary cache. Built from EMBEDDING_MODEL_NAME rather than a
# hard-coded copy of the name, so switching models cannot silently load the old model's cache.
EMBEDDING_MODEL_DICT_PICKLE_PATH = f"{path}/archive/{EMBEDDING_MODEL_NAME}_dict.pkl"


In [None]:
def load_embeddings_from_folder(folder_path=None):
    """Load every saved embedding file of a folder into a dictionary.

    Args:
        folder_path: Directory to read. Defaults to ``COMPUTED_EMBEDDINGS_PATH``.

    Returns:
        dict mapping each file name, stripped of its final extension, to the
        object returned by ``np.load`` for that file.
    """
    if folder_path is None:
        folder_path = COMPUTED_EMBEDDINGS_PATH

    embeddings_dict = {}
    # Sorted so the load order (and the dict's insertion order) is the same on every run;
    # os.listdir returns entries in arbitrary order.
    for file_name in tqdm(sorted(os.listdir(folder_path))):
        file_path = os.path.join(folder_path, file_name)
        if not os.path.isfile(file_path):
            # Sub-directories cannot be handed to np.load; skip them instead of crashing.
            continue
        # splitext keeps the whole name when there is no extension, whereas splitting on "."
        # mapped every extension-less or dot-file name to the same key "".
        embedding_name = os.path.splitext(file_name)[0]
        embeddings_dict[embedding_name] = np.load(file_path)
    return embeddings_dict

def load_embeddings_from_pickle():
    """Read the pickled embeddings table and collapse it to one row per outfit.

    Returns:
        DataFrame with one row per ``outfit.id``; its ``picture.id`` and
        ``embeddings`` columns hold lists of the grouped values.
    """
    per_picture = pd.read_pickle(EMBEDDING_MODEL_PICKLE_PATH)
    per_outfit = per_picture.groupby("outfit.id").agg({"picture.id": list, "embeddings": list})
    return per_outfit.reset_index()

In [None]:
# Load the {name: embedding} dictionary. On the first run the pickle cache does not exist yet,
# so it is built from the per-file embeddings and saved; later runs just read the cache.
if os.path.exists(EMBEDDING_MODEL_DICT_PICKLE_PATH):
    # NOTE: unpickling can execute arbitrary code; only load cache files this notebook wrote.
    # The `with` block closes the file handle, which a bare open() inside pickle.load leaks.
    with open(EMBEDDING_MODEL_DICT_PICKLE_PATH, "rb") as cache_file:
        loaded_embeddings_dict = pickle.load(cache_file)
else:
    loaded_embeddings_dict = load_embeddings_from_folder()
    with open(EMBEDDING_MODEL_DICT_PICKLE_PATH, "wb") as cache_file:
        pickle.dump(loaded_embeddings_dict, cache_file)

# Load the project tables.
pictures_df = pd.read_parquet(path+'/archive/data/picture_triplets.parquet',engine='pyarrow')
orders=pd.read_parquet(path+'/archive/data/orders.parquet',engine='pyarrow')
outfits=pd.read_parquet(path+'/archive/data/outfits.parquet',engine='pyarrow')
my_encoding=pd.read_parquet(path+'/models/my_encoding.parquet',engine='pyarrow')
#my_encoding_count_pairs_outfit=pd.read_parquet(path+'/models/my_encoding_count_pairs_outfit.parquet',engine='pyarrow')

In [None]:
# Spot-check: show the picture rows that belong to one specific outfit.
pictures_df.loc[pictures_df['outfit.id'] == 'outfit.875f6f3519ddf095']

Unnamed: 0,picture.id,outfit.id,displayOrder,file_name,embeddings


In [None]:
# Prepare data: attach image embeddings to pictures, then collect them per outfit.

# Look up each picture's embedding by picture id; ids missing from the dictionary become NaN.
pictures_df["embeddings"] = pictures_df["picture.id"].map(loaded_embeddings_dict)

# Group only the pictures that actually have an embedding. Otherwise a missing picture leaves
# a bare NaN inside the outfit's list, which breaks the later float cast and the mean.
outfit_pictures_df = (
    pictures_df[pictures_df["embeddings"].notna()]
    .groupby("outfit.id")
    .agg({"picture.id": list, "embeddings": list})
    .reset_index()
)

# Outfits without any embedded picture are NOT dropped here: the lookup leaves NaN in their
# "embeddings" cell, which replace_nan_with_zeros fills with a zero placeholder afterwards.
outfits["embeddings"] = outfits["id"].map(outfit_pictures_df.set_index("outfit.id")["embeddings"])


In [None]:
# Spot-check one outfit after the per-outfit embedding lists were attached.
outfits.loc[outfits['id'] == 'outfit.875f6f3519ddf095']

Unnamed: 0,id,name,description,group,owner,timeCreated,retailPrice,pricePerWeek,pricePerMonth,outfit_tags,tag_categories,embeddings


In [None]:
def replace_nan_with_zeros(df, column_name):
    """Fill missing embedding cells with a zero placeholder and cast arrays to float.

    Each cell of the column is expected to be either a list of numpy arrays or a
    missing value (NaN / None). The frame is modified in place.

    * A missing cell becomes ``[np.zeros(shape)]``, where ``shape`` is the shape of
      the first array found in the column. If the column holds no array at all,
      missing cells are left untouched.
    * In a list cell, missing items are removed and every remaining array is cast
      to float. A non-empty list that held only missing items gets the zero
      placeholder as well.
    * Any other cell is left as it is.

    Args:
        df: DataFrame to modify in place.
        column_name: Name of the column holding the lists of arrays.

    Returns:
        None.
    """
    def _is_missing(value):
        # NaN produced by pandas lookups is a plain float; None is accepted as well.
        return value is None or (isinstance(value, float) and pd.isna(value))

    # All arrays are assumed to share one shape, so the first one found decides the
    # placeholder shape. This is deterministic, unlike popping from a set of shapes.
    array_shape = next(
        (
            item.shape
            for cell in df[column_name]
            if isinstance(cell, list)
            for item in cell
            if isinstance(item, np.ndarray)
        ),
        None,
    )

    def _clean_cell(cell):
        if _is_missing(cell):
            # Compare with None, not truthiness: the shape () of a 0-d array is falsy but valid.
            return [np.zeros(array_shape)] if array_shape is not None else cell
        if not isinstance(cell, list):
            return cell
        arrays = [item.astype(float) for item in cell if not _is_missing(item)]
        if cell and not arrays and array_shape is not None:
            return [np.zeros(array_shape)]
        return arrays

    # Single pass over the column: fill and cast together.
    df[column_name] = df[column_name].apply(_clean_cell)

# Outfits with no embedded picture get a single all-zero embedding; all arrays become float.
replace_nan_with_zeros(outfits, column_name='embeddings')

In [None]:
# Spot-check the same outfit after missing embeddings were filled in.
outfits.loc[outfits['id'] == 'outfit.875f6f3519ddf095']

Unnamed: 0,id,name,description,group,owner,timeCreated,retailPrice,pricePerWeek,pricePerMonth,outfit_tags,tag_categories,embeddings


In [None]:
# Convert tag_categories and outfit_tags from their stored string form to Python objects.
# Only strings are parsed, so re-running this cell on already-converted values is a no-op
# instead of a TypeError.
# NOTE(review): eval executes whatever expression the data contains. If these columns only
# ever hold plain literals, ast.literal_eval would be the safer parser; confirm the stored
# format before switching.
outfits["tag_categories"] = outfits["tag_categories"].apply(
    lambda value: eval(value) if isinstance(value, str) else value
)
outfits["outfit_tags"] = outfits["outfit_tags"].apply(
    lambda value: eval(value) if isinstance(value, str) else value
)

outfits['group'] = outfits['group'].astype(str)

# Outfit ids that appear both in the orders and in the outfit catalogue.
common_ids = set(orders['outfit.id']).intersection(set(outfits['id']))

# Keep only rows for those ids. .copy() makes each result an independent frame, so the
# column assignments in later cells do not write into a slice of the unfiltered data.
orders = orders[orders['outfit.id'].isin(common_ids)].copy()
outfits = outfits[outfits['id'].isin(common_ids)].copy()

In [None]:
# Spot-check the same outfit after tags were parsed and rows were filtered to common ids.
outfits.loc[outfits['id'] == 'outfit.875f6f3519ddf095']

Unnamed: 0,id,name,description,group,owner,timeCreated,retailPrice,pricePerWeek,pricePerMonth,outfit_tags,tag_categories,embeddings


In [None]:
# Prepare tags: turn every outfit's tag collection into a fixed-length indicator vector.
all_tags = outfits["outfit_tags"].tolist()
mlb = MultiLabelBinarizer()
one_hot_encoded = mlb.fit_transform(all_tags)

# Store each row of the indicator matrix as its own array in the outfits table.
outfits["MultiLabel_encoded"] = list(map(np.array, one_hot_encoded.tolist()))

In [None]:
# Spot-check the same outfit after the MultiLabel_encoded column was added.
outfits.loc[outfits['id'] == 'outfit.875f6f3519ddf095']

Unnamed: 0,id,name,description,group,owner,timeCreated,retailPrice,pricePerWeek,pricePerMonth,outfit_tags,tag_categories,embeddings,MultiLabel_encoded


In [None]:
#prepare my encoding
def create_encoding_column(row):
    """Return every value of a row except its 'id' entry, as a plain list.

    Args:
        row: pandas Series with an 'id' label (a KeyError is raised if it is absent).

    Returns:
        list of the remaining values, in their original order.
    """
    without_id = row.drop(labels='id')
    return without_id.values.tolist()

# Build the 'my_encoding' column: one list per row holding every value except 'id'.
# A 'my_encoding' column left over from an earlier run of this cell is dropped first;
# otherwise the old lists would be nested inside the new ones on re-run.
my_encoding['my_encoding'] = (
    my_encoding.drop(columns='my_encoding', errors='ignore')
    .apply(create_encoding_column, axis=1)
)

# Inner join on 'id': outfits that have no row in my_encoding are removed.
# Dropping a previously merged 'my_encoding' column keeps a re-run from producing
# 'my_encoding_x' / 'my_encoding_y' duplicates.
outfits = (
    outfits.drop(columns='my_encoding', errors='ignore')
    .merge(my_encoding[['id', 'my_encoding']], on='id')
)

# Disabled variant for the pair-count encoding (its parquet load is commented out as well).
#my_encoding_count_pairs_outfit['my_encoding_count_pairs_outfit'] = my_encoding_count_pairs_outfit.apply(create_encoding_column, axis=1)
#outfits=outfits.merge(my_encoding_count_pairs_outfit[['id','my_encoding_count_pairs_outfit']],on='id')

In [None]:
# Spot-check the same outfit after the my_encoding column was merged in.
outfits.loc[outfits['id'] == 'outfit.875f6f3519ddf095']

Unnamed: 0,id,name,description,group,owner,timeCreated,retailPrice,pricePerWeek,pricePerMonth,outfit_tags,tag_categories,embeddings,MultiLabel_encoded,my_encoding


In [None]:
#prepare embeddings
def get_mean_embedding(embeddings):
    """Average a collection of embedding arrays element-wise.

    Args:
        embeddings: Sequence of arrays; it is converted with ``np.array`` before
            averaging, so the arrays are assumed to share one shape.

    Returns:
        The mean taken along the first axis of the converted array.
    """
    stacked = np.array(embeddings)
    return stacked.mean(axis=0)

def concatenate_embeddings(oh_embeddings, image_embeddings, oh_weighting):
    """Join a weighted tag-indicator vector and an image embedding into one vector.

    Args:
        oh_embeddings: Tag indicator values (array-like).
        image_embeddings: Image embedding array appended after the tag part.
        oh_weighting: Factor every tag indicator value is multiplied by.

    Returns:
        Array holding the scaled tag values followed by the image embedding.
    """
    weighted_tags = oh_weighting * np.array(oh_embeddings)
    return np.concatenate([weighted_tags, image_embeddings])

# One vector per outfit: the element-wise mean of the arrays in its "embeddings" list.
outfits["mean_embeddings"] = outfits["embeddings"].apply(get_mean_embedding)

# Content vector per outfit: tag indicators scaled by 4, followed by the mean image embedding.
# NOTE(review): the weight 4 is a hand-picked constant; nothing in this notebook derives it.
outfits["concatenated_embeddings"] = outfits.apply(
    lambda outfit_row: concatenate_embeddings(
        outfit_row["MultiLabel_encoded"],
        outfit_row["mean_embeddings"],
        oh_weighting=4,
    ),
    axis=1,
)

In [None]:
# Spot-check the same outfit after the mean and concatenated embeddings were added.
outfits.loc[outfits['id'] == 'outfit.875f6f3519ddf095']

Unnamed: 0,id,name,description,group,owner,timeCreated,retailPrice,pricePerWeek,pricePerMonth,outfit_tags,tag_categories,embeddings,MultiLabel_encoded,my_encoding,mean_embeddings,concatenated_embeddings


In [None]:
# Persist the outfits table with all encodings built so far. As the file name says, it does
# not contain the autoencoder-based "outfit_embeddings" column (that cell is disabled below).
outfits.to_parquet(path+'/models/outfits_embeddings_without_outfit_embeddings.parquet')

In [None]:
#outfits=pd.read_parquet(path+'/models/outfits_embeddings_without_outfit_embeddings.parquet',engine='pyarrow')

In [None]:
"""# Converting lists to tensors is inefficient, so we convert them to numpy arrays first. Saves a couple of seconds.
input_embeddings = np.vstack(outfits["concatenated_embeddings"].values)
input_embeddings = input_embeddings.astype(np.float32)
input_embeddings = torch.tensor(input_embeddings)"""

In [None]:
"""class Autoencoder(nn.Module):
    def __init__(self, input_dim, hidden_dim, latent_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, latent_dim),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim),
            nn.Sigmoid()
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return encoded, decoded

input_dim = input_embeddings.shape[1]
hidden_dim = 2048
latent_dim = 512

model = Autoencoder(input_dim, hidden_dim, latent_dim)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 3
batch_size = 32

for epoch in tqdm(range(num_epochs)):
    permutation = torch.randperm(input_embeddings.size()[0])

    for i in range(0, input_embeddings.size()[0], batch_size):
        indices = permutation[i:i+batch_size]
        batch_inputs = input_embeddings[indices]

        encoded, decoded = model(batch_inputs)
        loss = criterion(decoded, batch_inputs)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Save the model
torch.save(model.state_dict(), path+'/models/autoencoder_model.pth')"""

  0%|          | 0/3 [00:00<?, ?it/s]

Epoch [1/3], Loss: 0.1968
Epoch [2/3], Loss: 0.1806
Epoch [3/3], Loss: 0.1619


In [None]:
"""def get_outfit_embeddings(outfits_df, model):
    MultiLabel_encoded = np.array(outfits_df["MultiLabel_encoded"].tolist())
    mean_embeddings = np.array(outfits_df["mean_embeddings"].tolist())
    input_embeddings = np.concatenate((MultiLabel_encoded, mean_embeddings), axis=1)
    input_embeddings = torch.tensor(input_embeddings, dtype=torch.float32)
    with torch.no_grad():
        encoded, decoded = model(input_embeddings)
    return encoded

outfit_embeddings = get_outfit_embeddings(outfits, model)
outfits["outfit_embeddings"] = [x.numpy() for x in outfit_embeddings]
print(np.stack(outfits["outfit_embeddings"].values).shape)
outfits.to_parquet(path+'/models/outfits_embeddings.parquet')"""

(12427, 512)
