# Content-Based Using KNN
This notebook uses different types of encoding (see embedding.ipynb for details on their creation) to generate recommendations.

Please run embedding.ipynb before running this notebook for the first time.


In [None]:
# Mount Google Drive so the project folder is reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')
# Root folder of the project on Drive; the parquet inputs below are read relative to it.
path='/content/drive/MyDrive/RecSys_206894495'
# Execute the evaluation notebook inside this kernel. Presumably this is what defines
# evaluate_df_metrics_at_n, which the evaluation cells call -- confirm in that notebook.
%run /content/drive/MyDrive/RecSys_206894495/models/evaluate_models.ipynb

Mounted at /content/drive
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install pyarrow



In [None]:
#import
#warning
import warnings
warnings.filterwarnings('ignore')

#general
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

#for predictions
from sklearn.neighbors import NearestNeighbors

In [None]:
# Load the outfit encodings (see embedding.ipynb for how they are created) and the
# user splits frame; both live in the `models` folder under the project root.
outfits = pd.read_parquet(f"{path}/models/outfits_embeddings.parquet")
user_splits_df = pd.read_parquet(f"{path}/models/user_splits.parquet")

In [None]:
# Show the columns of the outfits frame, i.e. which encoding columns can be
# passed as `embeddings_column` further down.
outfits.columns

Index(['id', 'name', 'description', 'group', 'owner', 'timeCreated',
       'retailPrice', 'pricePerWeek', 'pricePerMonth', 'outfit_tags',
       'tag_categories', 'embeddings', 'MultiLabel_encoded', 'my_encoding',
       'mean_embeddings', 'concatenated_embeddings', 'outfit_embeddings'],
      dtype='object')

In [None]:
def check_same_shape(df, column_name):
    """Tell whether every array stored in ``df[column_name]`` has the same shape.

    Args:
        df: DataFrame holding the column to inspect.
        column_name: name of a column whose cells expose a ``.shape`` attribute
            (e.g. numpy arrays).

    Returns:
        True when exactly one distinct shape occurs in the column, else False
        (an empty column therefore yields False).
    """
    distinct_shape_count = df[column_name].apply(lambda arr: arr.shape).nunique()
    return distinct_shape_count == 1
def check_same_length(df, column_name):
    """Tell whether every value in ``df[column_name]`` has the same ``len()``.

    Args:
        df: DataFrame holding the column to inspect.
        column_name: name of a column whose cells support ``len``.

    Returns:
        True when exactly one distinct length occurs in the column, else False
        (an empty column therefore yields False).
    """
    distinct_length_count = df[column_name].apply(len).nunique()
    return distinct_length_count == 1

In [None]:
# Number of neighbours kept per user by predict_nearest_neighbors
# (the k-NN index is queried for NUM_ITEMS + 1 and the first hit is dropped).
NUM_ITEMS = 10

def find_rental_history_embeddings(outfit_ids, outfit_to_embedding_dict):
    """Look up the embedding of every id in a rental history.

    Args:
        outfit_ids: iterable of ids (outfit ids or group ids, depending on the
            mapping passed in).
        outfit_to_embedding_dict: mapping from id to its embedding.

    Returns:
        A list of embeddings in the same order as ``outfit_ids``, with entries
        equal to the string ``"nan"`` skipped.

    Raises:
        KeyError: if an id other than ``"nan"`` is missing from the mapping.
    """
    # TODO: Find out where these nan values are coming from, only two of them for now, as far as I can tell.
    history_embeddings = []
    for candidate_id in outfit_ids:
        if candidate_id != "nan":
            history_embeddings.append(outfit_to_embedding_dict[candidate_id])
    return history_embeddings

def get_mean_embedding(embeddings):
    """Average a collection of embeddings element-wise.

    Args:
        embeddings: sequence of equally-shaped vectors (lists or numpy arrays).

    Returns:
        The mean taken along axis 0 of the stacked input, i.e. one vector with
        the same shape as a single embedding.
    """
    stacked = np.array(embeddings)
    return stacked.mean(axis=0)

def get_nearest_neighbors_batch(embeddings, nn, num_items, index_to_id):
    """Query a fitted neighbour index for a batch of vectors.

    Args:
        embeddings: 2-D array-like with one query vector per row.
        nn: object exposing ``kneighbors(X, n_neighbors=...)`` that returns a
            ``(distances, indices)`` pair (e.g. a fitted sklearn
            ``NearestNeighbors``).
        num_items: number of neighbours to return per query.
        index_to_id: mapping from a row position in the fitted index to the
            identifier reported for it.

    Returns:
        ``(ids, distances)``: two lists with one entry per query row. ``ids``
        holds lists of identifiers, ``distances`` holds the matching distance
        slices, each of length ``num_items``.
    """
    # One extra neighbour is requested because the first hit of every row is
    # discarded below.
    # NOTE(review): presumably the first hit is dropped to skip a self-match;
    # confirm this is still intended when the query vector is not itself a row
    # of the fitted index, since then the closest real match is thrown away.
    raw_distances, raw_indices = nn.kneighbors(embeddings, n_neighbors=num_items + 1)

    ids = []
    kept_distances = []
    for row_indices, row_distances in zip(raw_indices, raw_distances):
        ids.append([index_to_id[position] for position in row_indices[1:]])
        kept_distances.append(row_distances[1:])
    return ids, kept_distances


def predict_nearest_neighbors(df, outfits_df, embeddings_column="embeddings"):
    """Recommend NUM_ITEMS outfits and groups per user with cosine k-NN.

    Every user is represented by the mean of the embeddings found for the ids
    in ``join_outfit_ids`` (respectively ``join_group``). That mean vector is
    used to query a ``NearestNeighbors`` index fitted on all outfits that have
    a value in ``embeddings_column``.

    Args:
        df: user frame with list-like columns ``join_outfit_ids`` and
            ``join_group``. New columns are added to it in place and the same
            frame is returned, so pass a copy to keep the original untouched.
        outfits_df: outfit catalogue with ``id``, ``group`` and
            ``embeddings_column`` columns. It is not modified.
        embeddings_column: name of the column holding one vector per outfit.

    Returns:
        ``df`` with these columns added: ``train_id_embeddings``,
        ``train_group_embeddings``, ``rental_history_id_embedding``,
        ``rental_history_group_embedding``, ``id_prediction``,
        ``id_prediction_distances``, ``group_prediction`` and
        ``group_prediction_distances``.

    Raises:
        KeyError: if a history contains an id/group (other than ``"nan"``)
            with no row left in the filtered catalogue.
    """
    # Filter into a new frame instead of dropna(inplace=True): the in-place
    # version shrank the caller's catalogue, so every later call (with another
    # embeddings column) silently ran on whatever rows earlier calls had left.
    # `subset` is given as a list, which every pandas version accepts.
    outfits_df = outfits_df.dropna(subset=[embeddings_column])

    outfit_to_embedding_dict = outfits_df.set_index("id")[embeddings_column].to_dict()
    index_to_outfit_dict = dict(enumerate(outfits_df["id"].values))
    # NOTE(review): if several outfits share a group, to_dict() keeps only one
    # embedding per group, and the group predictions below can repeat the same
    # group several times -- confirm this is the intended group-level behaviour.
    group_to_embedding_dict = outfits_df.set_index("group")[embeddings_column].to_dict()
    index_to_group_dict = dict(enumerate(outfits_df["group"].values))

    # Per-user list of embeddings for the outfits / groups in their history.
    df["train_id_embeddings"] = df["join_outfit_ids"].apply(
        lambda history: find_rental_history_embeddings(history, outfit_to_embedding_dict)
    )
    df["train_group_embeddings"] = df["join_group"].apply(
        lambda history: find_rental_history_embeddings(history, group_to_embedding_dict)
    )

    # Collapse each history to a single query vector.
    df["rental_history_id_embedding"] = df["train_id_embeddings"].apply(get_mean_embedding)
    df["rental_history_group_embedding"] = df["train_group_embeddings"].apply(get_mean_embedding)

    # Row i of the fitted matrix is row i of the filtered catalogue, which is
    # what the index_to_* dictionaries above translate back to ids / groups.
    nearest_neighbors = NearestNeighbors(n_neighbors=NUM_ITEMS+1, metric="cosine")
    embeddings = np.stack(outfits_df[embeddings_column].values)
    nearest_neighbors.fit(embeddings)

    id_embeddings = np.stack(df["rental_history_id_embedding"].values)
    group_embeddings = np.stack(df["rental_history_group_embedding"].values)

    id_predictions, id_distances = get_nearest_neighbors_batch(id_embeddings, nearest_neighbors, NUM_ITEMS, index_to_outfit_dict)
    group_predictions, group_distances = get_nearest_neighbors_batch(group_embeddings, nearest_neighbors, NUM_ITEMS, index_to_group_dict)

    df["id_prediction"], df["id_prediction_distances"] = id_predictions, id_distances
    df["group_prediction"], df["group_prediction_distances"] = group_predictions, group_distances

    return df


# Register tqdm with pandas (this is what makes `progress_apply` available).
# NOTE(review): no `progress_apply` call is visible in this section, so this
# registration may be unused -- confirm before removing.
tqdm.pandas()

In [None]:
# Tag based predictions: k-NN over the "MultiLabel_encoded" column.
# Start from a fresh copy of the splits, because predict_nearest_neighbors adds
# its result columns to the frame it receives.
user_splits=user_splits_df.copy()
user_splits = predict_nearest_neighbors(user_splits, outfits, embeddings_column="MultiLabel_encoded")
# evaluate_df_metrics_at_n is not defined in this notebook; presumably it comes from
# the evaluate_models.ipynb run at the top and scores the top-10 predictions under
# the label 'Tag_encoding' -- confirm there.
user_splits = evaluate_df_metrics_at_n(user_splits,'Tag_encoding', n=10)

Unnamed: 0,0
id_hit_rate_at_10,0.046691
id_precision_at_10,0.004823
id_recall_at_10,0.027905
id_f1_score_at_10,0.00688
group_hit_rate_at_10,0.053617
group_precision_at_10,0.005567
group_recall_at_10,0.031491
group_f1_score_at_10,0.007961
method_name,Tag_encoding


In [None]:
# Image based predictions: k-NN over the "mean_embeddings" column.
# A fresh copy of the splits is used because predict_nearest_neighbors adds
# columns to the frame it receives.
user_splits=user_splits_df.copy()
user_splits = predict_nearest_neighbors(user_splits, outfits, embeddings_column="mean_embeddings")
# Scored under the label 'picture_embeddings' (evaluate_df_metrics_at_n is defined
# outside this notebook; presumably in evaluate_models.ipynb -- confirm).
user_splits = evaluate_df_metrics_at_n(user_splits,'picture_embeddings', n=10)


Unnamed: 0,0
id_hit_rate_at_10,0.029502
id_precision_at_10,0.00295
id_recall_at_10,0.02435
id_f1_score_at_10,0.004877
group_hit_rate_at_10,0.031811
group_precision_at_10,0.003181
group_recall_at_10,0.025867
group_f1_score_at_10,0.005243
method_name,picture_embeddings


In [None]:
# Combined predictions: k-NN over the "outfit_embeddings" column.
# A fresh copy of the splits is used because predict_nearest_neighbors adds
# columns to the frame it receives.
user_splits=user_splits_df.copy()
user_splits = predict_nearest_neighbors(user_splits, outfits, embeddings_column="outfit_embeddings")
# Scored under the label 'outfit_embeddings' (evaluate_df_metrics_at_n is defined
# outside this notebook; presumably in evaluate_models.ipynb -- confirm).
user_splits = evaluate_df_metrics_at_n(user_splits,'outfit_embeddings', n=10)

Unnamed: 0,0
id_hit_rate_at_10,0.027707
id_precision_at_10,0.002771
id_recall_at_10,0.020085
id_f1_score_at_10,0.004376
group_hit_rate_at_10,0.030272
group_precision_at_10,0.003027
group_recall_at_10,0.021079
group_f1_score_at_10,0.00469
method_name,outfit_embeddings


In [None]:
# Concat predictions: k-NN over the "concatenated_embeddings" column.
# A fresh copy of the splits is used because predict_nearest_neighbors adds
# columns to the frame it receives.
user_splits=user_splits_df.copy()
user_splits = predict_nearest_neighbors(user_splits, outfits, embeddings_column="concatenated_embeddings")
# Scored under the label 'concatenated_embeddings' (evaluate_df_metrics_at_n is
# defined outside this notebook; presumably in evaluate_models.ipynb -- confirm).
user_splits = evaluate_df_metrics_at_n(user_splits,'concatenated_embeddings', n=10)


Unnamed: 0,0
id_hit_rate_at_10,0.044382
id_precision_at_10,0.004592
id_recall_at_10,0.030765
id_f1_score_at_10,0.007022
group_hit_rate_at_10,0.047204
group_precision_at_10,0.004874
group_recall_at_10,0.032606
group_f1_score_at_10,0.007466
method_name,concatenated_embeddings


In [None]:
# My Encoding based predictions: k-NN over the "my_encoding" column.
# A fresh copy of the splits is used because predict_nearest_neighbors adds
# columns to the frame it receives.
user_splits=user_splits_df.copy()
user_splits = predict_nearest_neighbors(user_splits, outfits, embeddings_column="my_encoding")
# Scored under the label 'my_encoding' (evaluate_df_metrics_at_n is defined
# outside this notebook; presumably in evaluate_models.ipynb -- confirm).
user_splits = evaluate_df_metrics_at_n(user_splits,'my_encoding', n=10)

Unnamed: 0,0
id_hit_rate_at_10,0.024885
id_precision_at_10,0.00254
id_recall_at_10,0.022343
id_f1_score_at_10,0.004309
group_hit_rate_at_10,0.037712
group_precision_at_10,0.003822
group_recall_at_10,0.026244
group_f1_score_at_10,0.00584
method_name,my_encoding
