In [1]:
import pandas as pd
import pickle
from tqdm import tqdm
import numpy as np

from utils import *

In [2]:
# Load the raw interaction table; the cells below rely on its 'UserID' and 'ItemID' columns.
train_set_raw=pd.read_csv('data/train.csv')

# Splitting the data into train and validation sets

In [3]:
# Hold out 3 randomly sampled rows per user (fixed seed for reproducibility) as the validation set
df_val = train_set_raw.groupby('UserID').sample(n=3, random_state=10)
# Every row that was not held out becomes the training set (copied so later edits do not touch the raw frame)
df_train = train_set_raw[~train_set_raw.index.isin(df_val.index)].copy()

In [4]:
# Item vocabularies: every item in the raw data, plus the items that appear in each split.
# items_list (the full vocabulary) is what the sampling and training cells below receive.
items_list = list(train_set_raw['ItemID'].unique())
train_items_list = list(df_train['ItemID'].unique())
val_items_list = list(df_val['ItemID'].unique())
print(f'Number of items in train set: {len(train_items_list)}')
print(f'Number of items in validation set: {len(val_items_list)}')

Number of items in train set: 3705
Number of items in validation set: 2468


In [5]:
# Full user vocabulary, taken from the raw (unsplit) data.
# NOTE(review): the first print is labelled "train set" but counts users in train_set_raw, not df_train.
users_list = list(train_set_raw['UserID'].unique())
print(f'Number of users in train set: {len(users_list)}')
print(f'Number of users in validation set: {len(df_val["UserID"].unique())}')

Number of users in train set: 6040
Number of users in validation set: 6040


## Creating the train and validation data sets with negative and positive samples

In [6]:
# Per-split lookups built by the create_user_items_dict helper
# (presumably {UserID: items the user interacted with} -- confirm against the helper's definition).
user_items_dict_train = create_user_items_dict(df_train)
user_items_dict_val = create_user_items_dict(df_val)

In [7]:
# Item popularity statistics, later passed to the 'popularity' negative sampler.
# NOTE(review): built from train_set_raw, so the held-out validation rows also contribute -- confirm this is intended.
item_probability_dict = create_item_popularity_dict(train_set_raw)

### Load negative samples

In [8]:
# Negative samples for each split under two strategies, 'random' and 'popularity'.
# The popularity variant additionally receives item_probability_dict.
# train set
train_negative_random = load_negative_samples(user_items_dict_train, items_list, 'train', 'random')
train_negative_popularity = load_negative_samples(user_items_dict_train, items_list, 'train', 'popularity', item_probability_dict)
# validation set
val_negative_random = load_negative_samples(user_items_dict_val, items_list, 'validation', 'random')
val_negative_popularity = load_negative_samples(user_items_dict_val, items_list, 'validation', 'popularity', item_probability_dict)

In [9]:
# Spot check: size of the entry stored under key 1 of the validation popularity negatives
# (presumably the negatives sampled for user 1 -- confirm against load_negative_samples).
len(val_negative_popularity[1])

97

### Creating datasets for training loop

In [10]:
# Build one training table per negative-sampling strategy from df_train and the matching train negatives
# (presumably positives and negatives combined with a label -- confirm against create_dataset).
df_random = create_dataset(train_negative_random, df_train)
df_popularity = create_dataset(train_negative_popularity, df_train)

100%|██████████| 6040/6040 [01:33<00:00, 64.44it/s]
100%|██████████| 6040/6040 [01:33<00:00, 64.39it/s]


---

# Training the model

In [11]:
# Train on the dataset built with randomly sampled negatives; validation uses the random validation negatives.
# NOTE(review): `radnom_users_embeddings` is a misspelling of "random"; the evaluation cells below use the
# same spelling, so rename all occurrences together if this is corrected.
# NOTE(review): the recorded run shows an inf validation loss, a nan train loss at epoch 2 and an
# UnboundLocalError during epoch 3, so these settings (lr = 0.1, init noise = 1) may need revisiting.
radnom_users_embeddings, random_items_embeddings = training_loop( 
                                                    df_random,
                                                    user_items_dict_val, val_negative_random,
                                                    users_list, items_list,
                                                    alpha_item = 1e-5,
                                                    alpha_user = 1e-5,
                                                    item_init_noise=1,
                                                    user_init_noise=1,
                                                    epochs = 10,
                                                    k = 16,
                                                    lr = 0.1)

Epoch 1: 100%|██████████| 1939105/1939105 [01:05<00:00, 29477.74it/s]
  neg_loss = np.log(1 - np.array([sigmoid(x) for x in np.dot(user_vector, neg_items_matrix.T)]))
100%|██████████| 6040/6040 [00:01<00:00, 5101.48it/s]


Epoch 1 train loss: 0.752 validation loss: inf


  log_loss =  rating * np.log(prediction + epsilon) + (1 - rating) * np.log(1 -prediction + epsilon)
  log_loss =  rating * np.log(prediction + epsilon) + (1 - rating) * np.log(1 -prediction + epsilon)
Epoch 2: 100%|██████████| 1939105/1939105 [01:05<00:00, 29383.20it/s]
  pos_loss = np.log(np.array([sigmoid(x) for x in np.dot(user_vector, pos_items_matrix.T)]))
100%|██████████| 6040/6040 [00:01<00:00, 5045.07it/s]


Epoch 2 train loss: nan validation loss: inf


Epoch 3:   1%|          | 19351/1939105 [00:00<01:10, 27337.42it/s]


UnboundLocalError: local variable 'regularization' referenced before assignment

In [None]:
# Same training routine on the dataset built with popularity-sampled negatives.
# NOTE(review): validation still uses val_negative_random rather than val_negative_popularity --
# confirm this is intentional (e.g. to keep both runs comparable).
# item_init_noise / user_init_noise are not passed here, so whatever defaults training_loop defines apply.
popularity_users_embeddings, popularity_items_embeddings = training_loop( 
                                                    df_popularity,
                                                    user_items_dict_val, val_negative_random,
                                                    users_list, items_list,
                                                    alpha_item = 0.0001,
                                                    alpha_user = 0.0001,
                                                    epochs = 20,
                                                    k = 16,
                                                    lr = 0.1)

In [None]:
# Compare Hit Rate@k for freshly created embeddings (baseline) against the embeddings trained with random negatives.
# NOTE(review): the result variables are named hitrate_10_* although k is 50, and both prints use the
# same label, so the two output lines can only be told apart by their order (baseline first, trained second).
k = 50
base_user_emb = create_embeddings(users_list, 0.1,16)
base_item_emb = create_embeddings(items_list, 0.1,16)
hitrate_10_base = Hit_Rate_at_k(user_items_dict_val, val_negative_random, base_user_emb, base_item_emb, k)
print(f'Hit rate at {k}: {hitrate_10_base}')
hitrate_10_trained = Hit_Rate_at_k(user_items_dict_val, val_negative_random, radnom_users_embeddings, random_items_embeddings, k)
print(f'Hit rate at {k}: {hitrate_10_trained}')

In [None]:
# MPR (presumably mean percentile rank -- confirm in utils) on the validation set:
# first for the baseline embeddings, then for the embeddings trained with random negatives.
MPR_base = MPR_calculation(user_items_dict_val, val_negative_random, base_user_emb, base_item_emb)
print(f'MPR: {MPR_base}')
MPR_trained = MPR_calculation(user_items_dict_val, val_negative_random, radnom_users_embeddings, random_items_embeddings)
print(f'MPR: {MPR_trained}')


In [None]:
def prediction_on_test_set(row:pd.Series, users_embeddings:dict, items_embeddings:dict)->pd.Series:
    """Score the two candidate items of a test row and record which one scores higher.

    Each item's score is the dot product of the user's embedding with that
    item's embedding.

    Args:
        row: A record with the fields 'UserID', 'Item1' and 'Item2'.
        users_embeddings: Mapping from user id to that user's embedding vector.
        items_embeddings: Mapping from item id to that item's embedding vector.

    Returns:
        The same ``row`` with a 'prediction' field added: 0 when Item1 scores
        strictly higher than Item2, otherwise 1 (ties resolve to 1).
        Returning the row is what makes the function usable with
        ``DataFrame.apply(..., axis=1)``; without it every result is ``None``.

    Raises:
        KeyError: If the user or either item has no entry in the embedding dicts.
    """
    user_vector = users_embeddings[row['UserID']]

    item_1_score = np.dot(user_vector, items_embeddings[row['Item1']])
    item_2_score = np.dot(user_vector, items_embeddings[row['Item2']])

    # Strict comparison: a tie is assigned to 1.
    # The row is still updated in place so callers relying on that keep working.
    row['prediction'] = 0 if item_1_score > item_2_score else 1
    return row

In [11]:
# Scratch check: np.log(0) evaluates to -inf and emits a warning instead of raising.
np.log(0)

  np.log(0)


-inf

In [22]:
# Scratch check: math.exp raises OverflowError for a large argument.
# NOTE(review): `math` is not imported in the visible import cell, so this cell depends on earlier kernel state.
math.exp(1e5)

OverflowError: math range error