In [45]:
import pandas as pd
import pickle
from tqdm import tqdm
import numpy as np

from utils import *

In [46]:
# Load the full interaction table; later cells read its 'UserID' and 'ItemID' columns.
train_set_raw=pd.read_csv('data/train.csv')

# Splitting the data into train and validation

In [47]:
# hold out 3 sampled rows per user as the validation set (fixed seed for reproducibility)
df_val = train_set_raw.groupby('UserID').sample(n=3, random_state=10)
# keep every remaining row (index not present in df_val) as the training set
df_train = train_set_raw[~train_set_raw.index.isin(df_val.index)].copy()

In [48]:
# unique item ids in the full data, in the training split and in the validation split
items_list = list(train_set_raw['ItemID'].unique())
train_items_list = list(df_train['ItemID'].unique())
val_items_list = list(df_val['ItemID'].unique())
# report how many distinct items each split contains
print(f'Number of items in train set: {len(train_items_list)}')
print(f'Number of items in validation set: {len(val_items_list)}')

Number of items in train set: 3705
Number of items in validation set: 2468


In [49]:
# unique user ids taken from the full (pre-split) data
users_list = list(train_set_raw['UserID'].unique())
# NOTE(review): the "train set" count below is computed from train_set_raw, not from df_train
print(f'Number of users in train set: {len(users_list)}')
print(f'Number of users in validation set: {len(df_val["UserID"].unique())}')

Number of users in train set: 6040
Number of users in validation set: 6040


## Creating the train and validation data sets with negative and positive samples

In [50]:
# One lookup structure per split, built by create_user_items_dict
# (presumably user id -> items of that user; the helper is not defined here — confirm in utils).
user_items_dict_train = create_user_items_dict(df_train)
user_items_dict_val = create_user_items_dict(df_val)

In [51]:
# Built from the full data (train_set_raw), so validation rows are included in the statistics.
# Presumably maps item id -> sampling probability; verify against create_item_popularity_dict.
item_probability_dict = create_item_popularity_dict(train_set_raw)

### Load negative samples

In [52]:
# Two negative-sample sets per split: 'random' and 'popularity'.
# The 'popularity' variant additionally receives item_probability_dict.
# train set
train_negative_random = load_negative_samples(user_items_dict_train, items_list, 'train', 'random')
train_negative_popularity = load_negative_samples(user_items_dict_train, items_list, 'train', 'popularity', item_probability_dict)
# validation set
val_negative_random = load_negative_samples(user_items_dict_val, items_list, 'validation', 'random')
val_negative_popularity = load_negative_samples(user_items_dict_val, items_list, 'validation', 'popularity', item_probability_dict)

In [54]:
def validation_regularization(user_embs, item_embs, alpha_item, alpha_user):
    """
    Calculate the L2 regularization penalty for user and item embeddings.

    Both terms use the squared L2 norm:
        (alpha_user / 2) * sum_u ||u||^2 + (alpha_item / 2) * sum_i ||i||^2
    Args:
        user_embs (dict): dictionary of user embeddings
        item_embs (dict): dictionary of item embeddings
        alpha_item (float): regularization weight of the item embeddings
        alpha_user (float): regularization weight of the user embeddings
    Returns:
        float: weighted sum of the squared norms (0 when both dictionaries are empty)
    """
    # Squared norm for users as well as items, so both penalties are on the same scale.
    user_reg = sum(np.sum(np.square(vec)) for vec in user_embs.values())
    item_reg = sum(np.sum(np.square(vec)) for vec in item_embs.values())
    return (alpha_user/2) * user_reg + (alpha_item/2) * item_reg

In [55]:
def validation_log_loss(positive_samples: dict,
                        negative_samples: dict,
                        user_embeddings: dict,
                        item_embeddings: dict,
                        )-> float:
    """
    Calculate log loss for a given set of positive and negative samples per user.

    For every user the mean log-probability of the positive items and the mean
    log-probability of rejecting the negative items are added up; the result is
    the negated average of that sum over all users.
    Args:
        positive_samples (dict): dictionary of positive samples per user
        negative_samples (dict): dictionary of negative samples per user
        user_embeddings (dict): dictionary of user vectors
        item_embeddings (dict): dictionary of item vectors
    Returns:
        float: average log loss over the users in positive_samples
    """
    loss = 0
    for user in tqdm(positive_samples):
        user_vector = user_embeddings[user]
        # stack the item vectors of this user into (n_items, k) matrices
        pos_items_matrix = np.vstack([item_embeddings[x] for x in positive_samples[user]])
        neg_items_matrix = np.vstack([item_embeddings[x] for x in negative_samples[user]])

        pos_scores = np.dot(user_vector, pos_items_matrix.T)
        # negatives must be scored against the negative items, not the positive ones
        neg_scores = np.dot(user_vector, neg_items_matrix.T)

        # log-probability of accepting each positive item
        pos_loss = np.log(np.array([sigmoid(x) for x in pos_scores]))
        # log-probability of rejecting each negative item
        neg_loss = np.log(1 - np.array([sigmoid(x) for x in neg_scores]))
        # average within each group so users with many samples do not dominate
        loss += np.sum(pos_loss)/len(pos_loss) + np.sum(neg_loss)/len(neg_loss)

    log_loss = -loss/(len(positive_samples))
    return log_loss

In [56]:
def validation_loss_func(  positive_samples,
                            negative_samples,
                            user_embeddings,
                            item_embeddings,
                            alpha_user,
                            alpha_item)-> float:
    """
    Calculate the regularized validation loss: log loss plus regularization penalty.
    Args:
        positive_samples (dict): dictionary of positive samples per user
        negative_samples (dict): dictionary of negative samples per user
        user_embeddings (dict): dictionary of user embeddings
        item_embeddings (dict): dictionary of item embeddings
        alpha_user (float): regularization weight of the user embeddings
        alpha_item (float): regularization weight of the item embeddings
    Returns:
        float: validation_log_loss(...) + validation_regularization(...)
    """
    log_loss = validation_log_loss(positive_samples, negative_samples, user_embeddings, item_embeddings)
    # validation_regularization declares alpha_item before alpha_user; pass by
    # keyword so the two weights cannot be swapped.
    reg = validation_regularization(user_embeddings, item_embeddings,
                                    alpha_item=alpha_item, alpha_user=alpha_user)
    return log_loss + reg

In [75]:
def train_loss_func(prediction: float,
                    rating:int,
                    user_embedding:np.ndarray,
                    item_embedding:np.ndarray,
                    alpha_item:float,
                    alpha_user:float
                    )-> float:
    """
    Calculate the regularized log-likelihood of a single user-item pair.

    The returned value is the NEGATIVE of the loss (log-likelihood minus the L2
    penalty): training_loop sums these values and negates the total, which then
    yields log loss + penalty.
    Args:
        prediction (float): predicted probability for a given user-item pair
        rating (int): actual label (1 or 0) for a given user-item pair
        user_embedding (np.ndarray): user embedding
        item_embedding (np.ndarray): item embedding
        alpha_item (float): regularization weight of the item embedding
        alpha_user (float): regularization weight of the user embedding
    Returns:
        float: log-likelihood minus (alpha_user/2)*||u||^2 + (alpha_item/2)*||i||^2
    """
    # keep the probability strictly inside (0, 1) so log() never yields -inf / nan
    eps = 1e-12
    prob = np.clip(prediction, eps, 1 - eps)
    log_likelihood = rating * np.log(prob) + (1 - rating) * np.log(1 - prob)
    # squared norms: their gradient (alpha * vector) is the one used in the SGD step
    regularization = ((alpha_user/2) * np.sum(np.square(user_embedding))
                      + (alpha_item/2) * np.sum(np.square(item_embedding)))
    # subtract, because the caller flips the sign of the accumulated total
    return log_likelihood - regularization

### Creating datasets for training loop

In [69]:
# Build the training table from the random negative samples and df_train
# (create_dataset is not defined in this notebook — presumably from utils).
df = create_dataset(train_negative_random, df_train)
# display (rows, columns) of the resulting table
df.shape

100%|██████████| 6040/6040 [01:27<00:00, 68.82it/s]


(1939105, 3)

In [76]:
def training_loop(  train_df: pd.DataFrame,
                    user_items_dict_validation: dict,
                    negative_samples_validation: dict,
                    user_list: list,
                    items_list: list,
                    alpha_item: float,
                    alpha_user: float,
                    epochs: int,
                    k: int,
                    lr: float,
                    ) -> tuple:
    """
    Train user and item embeddings with stochastic gradient descent.
    Args:
        train_df (pd.DataFrame): rows are unpacked as (user, item, rating)
        user_items_dict_validation (dict): accepted for interface compatibility, currently unused
        negative_samples_validation (dict): accepted for interface compatibility, currently unused
        user_list (list): user ids to create embeddings for
        items_list (list): item ids to create embeddings for
        alpha_item (float): regularization weight of the item embeddings
        alpha_user (float): regularization weight of the user embeddings
        epochs (int): number of passes over the training rows
        k (int): forwarded to create_embeddings (presumably the embedding size — confirm)
        lr (float): learning rate
    Returns:
        tuple: (users_embeddings, items_embeddings)
    """
    items_embeddings = create_embeddings(items_list, alpha_item, k)
    users_embeddings = create_embeddings(user_list, alpha_user, k)
    # Work on a private copy: .values may be a view of the frame, in which case the
    # in-place shuffle would reorder the caller's data (or fail on a read-only view).
    train = train_df.to_numpy(copy=True)
    for e in range(epochs):
        np.random.shuffle(train)
        loss = 0
        for user, item, rating in tqdm(train, desc=f'Epoch {e+1}'):
            user_vec = users_embeddings[user]
            item_vec = items_embeddings[item]
            prediction = sigmoid(np.dot(user_vec, item_vec))
            error = rating - prediction
            # loss of the current sample, using the same embeddings that produced the prediction
            loss += train_loss_func(prediction, rating, user_vec, item_vec, alpha_item, alpha_user)
            # compute both gradients from the pre-update vectors before touching either one
            user_grad = error * item_vec - alpha_user * user_vec
            item_grad = error * user_vec - alpha_item * item_vec
            users_embeddings[user] += lr * user_grad
            items_embeddings[item] += lr * item_grad
        print(f'Epoch {e+1} loss: {-loss/len(train)}')
    return users_embeddings, items_embeddings

In [78]:
# Train on the random-negative dataset `df`; the validation arguments are passed
# through to training_loop together with the hyper-parameters below.
users_embeddings, items_embeddings = training_loop( df,
                                                    user_items_dict_val, val_negative_random,
                                                    users_list, items_list,
                                                    alpha_item = 0.0001,
                                                    alpha_user = 0.0001,
                                                    epochs = 2,
                                                    k = 16,
                                                    lr = 0.1)

Epoch 1: 100%|██████████| 1939105/1939105 [01:05<00:00, 29414.68it/s]


Epoch 1 loss: 0.5976826048234564


Epoch 2: 100%|██████████| 1939105/1939105 [01:06<00:00, 29240.47it/s]

Epoch 2 loss: 0.4531605242126664





In [None]:
def prediction_on_test_set(row:pd.Series, users_embeddings:dict, items_embeddings:dict)->pd.Series:
    """
    Decide which of the two candidate items of a test row scores higher for its user.
    Args:
        row (pd.Series): must contain 'UserID', 'Item1' and 'Item2'
        users_embeddings (dict): dictionary of user embeddings
        items_embeddings (dict): dictionary of item embeddings
    Returns:
        pd.Series: the same row with a 'prediction' entry: 0 when Item1 scores
        strictly higher than Item2, otherwise 1 (ties resolve to 1)
    """
    user_vector = users_embeddings[row['UserID']]

    item_1_score = np.dot(user_vector, items_embeddings[row['Item1']])
    item_2_score = np.dot(user_vector, items_embeddings[row['Item2']])

    row['prediction'] = 0 if item_1_score > item_2_score else 1
    # hand the row back so the declared pd.Series return type is actually honoured
    return row