In [1]:
import pandas as pd
import pickle
from tqdm import tqdm
import numpy as np

from utils import *

In [2]:
# Raw interaction table; every later split is derived from this frame.
train_set_raw = pd.read_csv(filepath_or_buffer='data/train.csv')

# Splitting the data into train and validation sets

In [3]:
# Per-user 80% sample of the raw rows becomes the train split (seeded for reproducibility).
df_train = train_set_raw.groupby('UserID').sample(frac=0.8, random_state=42)
# Every row whose index label was not sampled above goes to validation.
held_out_mask = ~train_set_raw.index.isin(df_train.index)
df_val = train_set_raw.loc[held_out_mask].copy()

In [4]:
# Distinct item IDs in the full raw data and in the train split.
items_list = list(train_set_raw['ItemID'].unique())
train_items_list = list(df_train['ItemID'].unique())
# Items that exist in the raw data but did not land in the train split.
items_not_in_train = set(items_list).difference(train_items_list)
print(f'Number of items in train set: {len(train_items_list)}')
print(f'Number of items not in train set that will be moved: {len(items_not_in_train)}')


Number of items in train set: 3682
Number of items not in train set that will be moved: 23


In [5]:
# For each item missing from the train split, move one of its validation rows
# into train so that every item appears in training at least once.
unseen_item_rows = df_val.loc[df_val['ItemID'].isin(items_not_in_train)]
moving_rows = unseen_item_rows.groupby('ItemID').sample(n=1, random_state=42)
# Drop the moved rows from validation, then append them to train.
df_val = df_val.loc[~df_val.index.isin(moving_rows.index)]
df_train = pd.concat([df_train, moving_rows])

In [6]:
# Recompute coverage after the move; the printed count is expected to be zero.
train_items_list = list(df_train['ItemID'].unique())
items_not_in_train = set(items_list).difference(train_items_list)
print(f'Number of items not in train set after moving: {len(items_not_in_train)}')

Number of items not in train set after moving: 0


In [17]:
# Distinct user IDs, taken from the full raw data (not only the train split).
unique_user_ids = train_set_raw['UserID'].unique()
users_list = list(unique_user_ids)
print(f'Number of users in train set: {len(users_list)}')

Number of users in train set: 6040


## Creating the train and validation data sets with negative and positive samples

In [7]:
# Build the per-user item lookup for each split (train first, then validation).
user_items_dict_train, user_items_dict_val = (
    create_user_items_dict(df_train),
    create_user_items_dict(df_val),
)

In [8]:
# Item popularity is computed from the full raw data (train + validation rows);
# it is passed to the 'popularity' negative-sample loader below.
# NOTE(review): presumably maps ItemID -> sampling probability — confirm in utils.
item_probability_dict = create_item_popularity_dict(train_set_raw)

### Load negative samples

In [9]:
# Negative samples for the train split: one set per sampling strategy.
train_negative_random, train_negative_popularity = (
    load_negative_samples(user_items_dict_train, items_list, 'train', 'random'),
    load_negative_samples(user_items_dict_train, items_list, 'train', 'popularity', item_probability_dict),
)
# Same two strategies for the validation split.
val_negative_random, val_negative_popularity = (
    load_negative_samples(user_items_dict_val, items_list, 'validation', 'random'),
    load_negative_samples(user_items_dict_val, items_list, 'validation', 'popularity', item_probability_dict),
)

### Creating datasets for training loop

In [14]:
# Build the training table from the random negative samples and the train split.
# NOTE(review): presumably yields one (user, item, label) row per positive and
# negative example — training_loop unpacks exactly three columns; confirm in utils.
df = create_dataset(train_negative_random, df_train)
# Display the resulting (rows, columns) as the cell output.
df.shape

100%|██████████| 6040/6040 [01:04<00:00, 93.14it/s] 


(1580992, 3)

In [23]:
def training_loop(
    train_df: pd.DataFrame,
    user_items_dict_validation: dict,
    negative_samples_validation: dict,
    user_list: list,
    items_list: list,
    alpha_item: float,
    alpha_user: float,
    epochs: int,
    k: int,
    lr: float,
) -> tuple:
    """Fit user and item embeddings with per-sample SGD on a sigmoid dot-product model.

    Args:
        train_df: Table whose rows unpack as (user, item, rating); the first two
            values are used as keys into the embedding containers.
        user_items_dict_validation: Currently unused; kept for interface compatibility.
        negative_samples_validation: Currently unused; kept for interface compatibility.
        user_list: User IDs passed to ``create_embeddings``.
        items_list: Item IDs passed to ``create_embeddings``.
        alpha_item: Shrinkage coefficient applied to the item vector on every
            update; also forwarded to ``create_embeddings`` (its role there is
            not visible from this function).
        alpha_user: Same as ``alpha_item`` but for user vectors.
        epochs: Number of full passes over the training rows.
        k: Embedding dimension forwarded to ``create_embeddings``.
        lr: Learning rate applied to the error-driven part of each update.

    Returns:
        ``(users_embeddings, items_embeddings)`` after the final epoch.
    """
    items_embeddings = create_embeddings(items_list, alpha_item, k)
    users_embeddings = create_embeddings(user_list, alpha_user, k)

    # Work on a private copy: `.values` may share memory with the DataFrame, so
    # shuffling it in place could reorder the caller's data (or fail on a
    # read-only array when pandas Copy-on-Write is enabled).
    train = train_df.to_numpy(copy=True)

    for e in range(epochs):
        # Reorders the rows of the private copy in place (global NumPy RNG).
        np.random.shuffle(train)
        for user, item, rating in tqdm(train, desc=f'Epoch {e+1}'):
            user_vec = users_embeddings[user]
            item_vec = items_embeddings[item]
            prediction = sigmoid(np.dot(user_vec, item_vec))
            error = rating - prediction

            # Compute both steps from the pre-update vectors so the item step
            # does not see a user vector that was already moved this iteration.
            # NOTE(review): the alpha_* terms are not scaled by lr; left as-is
            # because changing it would alter the meaning of the hyperparameters.
            user_step = lr * error * item_vec - alpha_user * user_vec
            item_step = lr * error * user_vec - alpha_item * item_vec

            users_embeddings[user] += user_step
            items_embeddings[item] += item_step

    return users_embeddings, items_embeddings

In [24]:
# Train on the random-negative dataset; validation inputs are passed through to the loop.
users_embeddings, items_embeddings = training_loop(
    train_df=df,
    user_items_dict_validation=user_items_dict_val,
    negative_samples_validation=val_negative_random,
    user_list=users_list,
    items_list=items_list,
    alpha_item=0.01,
    alpha_user=0.01,
    epochs=2,
    k=16,
    lr=0.01,
)

Epoch 1: 100%|██████████| 1580992/1580992 [00:21<00:00, 72495.96it/s]
Epoch 2: 100%|██████████| 1580992/1580992 [00:21<00:00, 72690.07it/s]


In [None]:
def prediction_on_test_set(row:pd.Series, users_embeddings:dict, items_embeddings:dict)->pd.Series:
    user = row['UserID']
    item_1 = row['Item1']
    item_2 = row['Item2']

    item_1_score = np.dot(users_embeddings[user], items_embeddings[item_1])
    item_2_score = np.dot(users_embeddings[user], items_embeddings[item_2])

    if item_1_score > item_2_score:
        row['prediction'] = 0
    else:
        row['prediction'] = 1