# Content-Based Recommendations Using KNN – With Rental Count
This notebook combines several outfit encodings (see *embedding.ipynb* for details on how they are created) **with the number of times a customer has rented each outfit** to generate recommendations.

In [1]:
# Mount Google Drive so the project folder is reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')
# Root folder of the project on Drive; later cells build the parquet paths from it.
path='/content/drive/MyDrive/RecSys_206894495'
# Run the shared evaluation notebook.
# Presumably this is what defines evaluate_df_metrics_at_n used by the cells below — TODO confirm.
%run /content/drive/MyDrive/RecSys_206894495/models/evaluate_models.ipynb

Mounted at /content/drive
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Presumably installed as the parquet engine for the pd.read_parquet calls below — TODO confirm.
# NOTE(review): consider `%pip install` with a pinned version so the install targets the kernel's environment.
!pip install pyarrow



In [3]:
#import
#warning
import warnings
warnings.filterwarnings('ignore')

#general
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from collections import Counter

#for predictions
from sklearn.neighbors import NearestNeighbors

In [4]:
# Outfit catalogue with the encoding columns (listed by the next cell).
outfits=pd.read_parquet(path+'/models/outfits_embeddings.parquet')
# Per-user data; the prediction function reads its `join_outfit_ids` and `join_group` columns.
user_splits_df=pd.read_parquet(path+'/models/user_splits.parquet')

In [5]:
# List the available columns; the encoding columns are passed as `embeddings_column` further down.
outfits.columns

Index(['id', 'name', 'description', 'group', 'owner', 'timeCreated',
       'retailPrice', 'pricePerWeek', 'pricePerMonth', 'outfit_tags',
       'tag_categories', 'embeddings', 'MultiLabel_encoded', 'my_encoding',
       'mean_embeddings', 'concatenated_embeddings', 'outfit_embeddings'],
      dtype='object')

In [6]:
# Number of recommendations kept per user. The KNN queries request NUM_ITEMS + 1
# neighbours because get_nearest_neighbors_batch discards the first hit of every query.
NUM_ITEMS = 10

def find_rental_history_embeddings(outfit_ids, outfit_to_embedding_dict):
    """Look up the embedding of every rented outfit, skipping "nan" placeholders.

    Parameters
    ----------
    outfit_ids : iterable
        Identifiers from a user's rental history; entries equal to the
        string "nan" are ignored.
    outfit_to_embedding_dict : dict
        Mapping from identifier to embedding.

    Returns
    -------
    list
        Embeddings in the same order as the retained identifiers
        (repeated identifiers yield repeated embeddings).

    Raises
    ------
    KeyError
        If a retained identifier is missing from the mapping.
    """
    history = []
    for outfit_id in outfit_ids:
        if outfit_id != "nan":
            history.append(outfit_to_embedding_dict[outfit_id])
    return history

def get_mean_embedding(embeddings):
    """Return the element-wise mean of a collection of embeddings.

    The input is converted to an array and averaged along the first axis,
    so a list of equally sized vectors yields one vector of that size.
    """
    return np.asarray(embeddings).mean(axis=0)

def get_nearest_neighbors_batch(embeddings, nn, num_items, index_to_id):
    """Query a fitted neighbour model for a batch and translate hits to ids.

    ``num_items + 1`` neighbours are requested per query and the first
    (closest) one is always discarded, leaving ``num_items`` results.

    Parameters
    ----------
    embeddings : array-like
        One query vector per row.
    nn : object
        Fitted model exposing ``kneighbors(X, n_neighbors=...)`` that
        returns ``(distances, indices)``.
    num_items : int
        Number of neighbours to keep per query.
    index_to_id : mapping
        Translates a row index of the fitted data into an identifier.

    Returns
    -------
    tuple
        ``(ids, distances)``: two lists with one entry per query.
    """
    raw_distances, raw_indices = nn.kneighbors(embeddings, n_neighbors=num_items + 1)

    neighbor_ids = []
    for row in raw_indices:
        # Skip position 0: the closest hit is dropped for every query.
        neighbor_ids.append([index_to_id[position] for position in row[1:]])

    kept_distances = []
    for row in raw_distances:
        kept_distances.append(row[1:])

    return neighbor_ids, kept_distances

def count_occurrences(column):
    """Map every entry of ``column`` to a dict of value -> number of occurrences.

    Each entry is expected to be an iterable of hashable values; the
    result has one plain ``dict`` per entry.
    """
    def _tally(values):
        return dict(Counter(values))

    return column.apply(_tally)

def pad_sequences(sequences, maxlen=None, dtype='int32', padding='post', truncating='post', value=0.):
    """Pad (and, if needed, truncate) variable-length sequences into one array.

    Parameters
    ----------
    sequences : sequence of sequences
        Input samples; must support ``len()`` and repeated iteration.
    maxlen : int, optional
        Target length of every row. Defaults to the longest input sequence
        (0 when ``sequences`` is empty).
    dtype : str or numpy dtype
        dtype of the returned array.
    padding : {'post', 'pre'}
        Whether the fill value goes after or before the sequence content.
    truncating : {'post', 'pre'}
        Which end of an over-long sequence is removed: 'post' keeps the
        first ``maxlen`` entries, 'pre' keeps the last ``maxlen`` entries.
    value : scalar
        Fill value used for padding.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(sequences), maxlen) + sample_shape``.

    Raises
    ------
    ValueError
        If ``padding`` or ``truncating`` is not understood, or if a sample's
        trailing shape differs from that of the first non-empty sample.
    """
    lengths = [len(s) for s in sequences]
    nb_samples = len(sequences)
    if maxlen is None:
        # default=0 keeps an empty batch from failing on the max of nothing.
        maxlen = max(lengths, default=0)

    # The trailing shape of the first non-empty sample defines the shape
    # every other sample is checked against.
    sample_shape = tuple()
    for s in sequences:
        if len(s) > 0:
            sample_shape = np.asarray(s).shape[1:]
            break

    x = (np.ones((nb_samples, maxlen) + sample_shape) * value).astype(dtype)
    for idx, s in enumerate(sequences):
        if len(s) == 0:
            continue  # empty sequence: the row stays entirely padding

        if truncating == 'post':
            trunc = s[:maxlen]
        elif truncating == 'pre':
            # Explicit start index so that maxlen == 0 yields an empty slice
            # (s[-0:] would return the whole sequence).
            trunc = s[max(len(s) - maxlen, 0):]
        else:
            raise ValueError('Truncating type "%s" not understood' % truncating)

        trunc = np.asarray(trunc, dtype=dtype)
        if trunc.shape[1:] != sample_shape:
            raise ValueError('Shape of sample %s of sequence at position %s is different from expected shape %s' %
                             (trunc.shape[1:], idx, sample_shape))
        if padding == 'post':
            x[idx, :len(trunc)] = trunc
        elif padding == 'pre':
            x[idx, -len(trunc):] = trunc
        else:
            raise ValueError('Padding type "%s" not understood' % padding)
    return x

def _knn_with_count_padding(item_matrix, query_embeddings, padded_counts, index_to_label, num_items):
    """Fit a cosine KNN on zero-padded item vectors and query it with count-augmented vectors."""
    # Item rows get one zero column per count column so both sides share a dimensionality.
    fit_matrix = np.hstack((item_matrix, np.zeros((len(item_matrix), padded_counts.shape[1]))))
    model = NearestNeighbors(n_neighbors=num_items + 1, metric="cosine")
    model.fit(fit_matrix)

    query_matrix = np.hstack((np.stack(query_embeddings), padded_counts))
    return get_nearest_neighbors_batch(query_matrix, model, num_items, index_to_label)


def predict_nearest_neighbors(df, outfits_df, embeddings_column="embeddings"):
    """Recommend outfits and groups per user from rental-history embeddings plus rental counts.

    For every row of ``df`` the embeddings of the entries in ``join_outfit_ids``
    (and ``join_group``) are averaged, the per-entry occurrence counts are
    appended as extra dimensions, and the result is used to query a cosine
    nearest-neighbour index built from ``outfits_df[embeddings_column]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``join_outfit_ids`` and ``join_group``. It is modified
        in place (columns are added) and also returned.
    outfits_df : pandas.DataFrame
        Must contain ``id``, ``group`` and ``embeddings_column``. Rows whose
        embedding is missing are ignored; the caller's frame is NOT modified.
    embeddings_column : str
        Name of the column in ``outfits_df`` holding the item vectors.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the added columns ``train_id_embeddings``,
        ``train_group_embeddings``, ``rental_history_id_embedding``,
        ``rental_history_group_embedding``, ``count_join_outfit_ids``,
        ``count_join_group``, ``id_prediction``, ``id_prediction_distances``,
        ``group_prediction`` and ``group_prediction_distances``.

    Notes
    -----
    NOTE(review): the item vectors are zero in the count dimensions, so the
    appended counts never change a query's dot product with any item. Under
    the cosine metric they only enlarge the query norm, which rescales the
    reported distances of that query but should leave its neighbour ordering
    unchanged.
    """
    # Filter into a new frame instead of dropping in place: the same catalogue
    # is passed to several calls with different embedding columns, and an
    # in-place drop would shrink it for every later call.
    outfits_df = outfits_df.dropna(subset=[embeddings_column])

    outfit_to_embedding_dict = outfits_df.set_index("id")[embeddings_column].to_dict()
    index_to_outfit_dict = dict(enumerate(outfits_df["id"].values))
    # If several outfits share a group, to_dict keeps the last embedding seen for that group.
    group_to_embedding_dict = outfits_df.set_index("group")[embeddings_column].to_dict()
    index_to_group_dict = dict(enumerate(outfits_df["group"].values))

    df["train_id_embeddings"] = df["join_outfit_ids"].apply(lambda x: find_rental_history_embeddings(x, outfit_to_embedding_dict))
    df["train_group_embeddings"] = df["join_group"].apply(lambda x: find_rental_history_embeddings(x, group_to_embedding_dict))

    df["rental_history_id_embedding"] = df["train_id_embeddings"].apply(lambda x: get_mean_embedding(x))
    df["rental_history_group_embedding"] = df["train_group_embeddings"].apply(lambda x: get_mean_embedding(x))

    # Occurrence counts per distinct entry of each history.
    df['count_join_outfit_ids'] = count_occurrences(df['join_outfit_ids'])
    df['count_join_group'] = count_occurrences(df['join_group'])

    join_counts_list_outfit = [list(count.values()) for count in df['count_join_outfit_ids']]
    join_counts_list_group = [list(count.values()) for count in df['count_join_group']]

    # Pad every count list to the longest one so the lists form a rectangular matrix.
    join_counts_padded_outfit = pad_sequences(
        join_counts_list_outfit, maxlen=max(len(lst) for lst in join_counts_list_outfit))
    join_counts_padded_group = pad_sequences(
        join_counts_list_group, maxlen=max(len(lst) for lst in join_counts_list_group))

    # Both indexes are built from the same item vectors; stack them once.
    item_matrix = np.stack(outfits_df[embeddings_column].values)

    id_predictions, id_distances = _knn_with_count_padding(
        item_matrix, df["rental_history_id_embedding"].values,
        join_counts_padded_outfit, index_to_outfit_dict, NUM_ITEMS)
    group_predictions, group_distances = _knn_with_count_padding(
        item_matrix, df["rental_history_group_embedding"].values,
        join_counts_padded_group, index_to_group_dict, NUM_ITEMS)

    df["id_prediction"], df["id_prediction_distances"] = id_predictions, id_distances
    df["group_prediction"], df["group_prediction_distances"] = group_predictions, group_distances

    return df



In [7]:
# Tag-based predictions: neighbours are searched in the `MultiLabel_encoded` space.
user_splits = predict_nearest_neighbors(
    user_splits_df.copy(),  # fresh copy so the loaded splits stay untouched
    outfits,
    embeddings_column="MultiLabel_encoded",
)
user_splits = evaluate_df_metrics_at_n(user_splits, 'Tag_encoding_with_repeat_count', n=10)

Unnamed: 0,0
id_hit_rate_at_10,0.04823
id_precision_at_10,0.004977
id_recall_at_10,0.028821
id_f1_score_at_10,0.007113
group_hit_rate_at_10,0.053874
group_precision_at_10,0.005593
group_recall_at_10,0.032614
group_f1_score_at_10,0.008085
method_name,Tag_encoding_with_repeat_count


In [8]:
# Image-based predictions: neighbours are searched in the `mean_embeddings` space.
user_splits = predict_nearest_neighbors(
    user_splits_df.copy(),  # fresh copy so the loaded splits stay untouched
    outfits,
    embeddings_column="mean_embeddings",
)
user_splits = evaluate_df_metrics_at_n(user_splits, 'picture_embeddings_with_repeat_count', n=10)


Unnamed: 0,0
id_hit_rate_at_10,0.029502
id_precision_at_10,0.00295
id_recall_at_10,0.02435
id_f1_score_at_10,0.004877
group_hit_rate_at_10,0.031811
group_precision_at_10,0.003181
group_recall_at_10,0.025867
group_f1_score_at_10,0.005243
method_name,picture_embeddings_with_repeat_count


In [9]:
# Combined predictions: neighbours are searched in the `outfit_embeddings` space.
user_splits = predict_nearest_neighbors(
    user_splits_df.copy(),  # fresh copy so the loaded splits stay untouched
    outfits,
    embeddings_column="outfit_embeddings",
)
user_splits = evaluate_df_metrics_at_n(user_splits, 'outfit_embeddings_with_repeat_count', n=10)

Unnamed: 0,0
id_hit_rate_at_10,0.027707
id_precision_at_10,0.002771
id_recall_at_10,0.020085
id_f1_score_at_10,0.004376
group_hit_rate_at_10,0.030272
group_precision_at_10,0.003027
group_recall_at_10,0.021079
group_f1_score_at_10,0.00469
method_name,outfit_embeddings_with_repeat_count


In [10]:
# Concatenated predictions: neighbours are searched in the `concatenated_embeddings` space.
user_splits = predict_nearest_neighbors(
    user_splits_df.copy(),  # fresh copy so the loaded splits stay untouched
    outfits,
    embeddings_column="concatenated_embeddings",
)
user_splits = evaluate_df_metrics_at_n(user_splits, 'concatenated_embeddings_with_repeat_count', n=10)


Unnamed: 0,0
id_hit_rate_at_10,0.044382
id_precision_at_10,0.004592
id_recall_at_10,0.030765
id_f1_score_at_10,0.007022
group_hit_rate_at_10,0.047204
group_precision_at_10,0.004874
group_recall_at_10,0.032606
group_f1_score_at_10,0.007466
method_name,concatenated_embeddings_with_repeat_count


In [11]:
# Custom-encoding predictions: neighbours are searched in the `my_encoding` space.
user_splits = predict_nearest_neighbors(
    user_splits_df.copy(),  # fresh copy so the loaded splits stay untouched
    outfits,
    embeddings_column="my_encoding",
)
user_splits = evaluate_df_metrics_at_n(user_splits, 'my_encoding_with_repeat_count', n=10)

Unnamed: 0,0
id_hit_rate_at_10,0.024628
id_precision_at_10,0.002514
id_recall_at_10,0.022319
id_f1_score_at_10,0.004285
group_hit_rate_at_10,0.037712
group_precision_at_10,0.003822
group_recall_at_10,0.026244
group_f1_score_at_10,0.00584
method_name,my_encoding_with_repeat_count
