In [1]:
import pandas as pd
import pickle
from tqdm import tqdm
import numpy as np

from utils import *

In [2]:
# Load the raw interaction table; the cells below read its 'UserID' and 'ItemID' columns.
train_set_raw=pd.read_csv('data/train.csv')

# Splitting the data into train and validation sets

In [3]:
# Hold out 3 randomly sampled interactions per user as the validation set (seed fixed for reproducibility)
df_val = train_set_raw.groupby('UserID').sample(n=3, random_state=10)
# Every row that was NOT sampled into the validation set becomes the training set
# (index-based exclusion: assumes train_set_raw has a unique index, as read_csv produces by default)
df_train = train_set_raw[~train_set_raw.index.isin(df_val.index)].copy()

In [4]:
# Distinct item IDs in the full data, the training split and the validation split.
items_list, train_items_list, val_items_list = (
    list(frame['ItemID'].unique())
    for frame in (train_set_raw, df_train, df_val)
)

# Report how many distinct items each split covers.
print(f'Number of items in train set: {len(train_items_list)}')
print(f'Number of items in validation set: {len(val_items_list)}')

Number of items in train set: 3705
Number of items in validation set: 2468


In [5]:
# Distinct user IDs across the full raw data.
# NOTE(review): the first message says "train set", but the count is taken from train_set_raw
# (train + validation rows), not from df_train.
users_list = list(train_set_raw['UserID'].unique())
print(f'Number of users in train set: {len(users_list)}')
print(f'Number of users in validation set: {len(df_val["UserID"].unique())}')

Number of users in train set: 6040
Number of users in validation set: 6040


## Creating the train and validation data sets with negative and positive samples

In [6]:
# Per-split lookups built by the utils helper. The validation dict is later passed to the metric
# functions as `positive_samples`, where it is indexed by user and iterated as a collection of items.
# presumably the train dict has the same user -> items layout; confirm in utils.create_user_items_dict
user_items_dict_train = create_user_items_dict(df_train)
user_items_dict_val = create_user_items_dict(df_val)

In [7]:
# Built from the full raw data (train + validation rows) and passed to the 'popularity' negative sampler below.
# presumably maps each item to a popularity-based sampling probability; confirm in utils.create_item_popularity_dict
item_probability_dict = create_item_popularity_dict(train_set_raw)

### Load negative samples

In [8]:
# Negative samples for each split under two strategies: 'random' and 'popularity'.
# The popularity variant additionally receives item_probability_dict.
# The results are later indexed by user and iterated as items (see the metric functions).
# NOTE(review): whether the helper reads cached samples from disk or generates them is not visible here; check utils.
# train set
train_negative_random = load_negative_samples(user_items_dict_train, items_list, 'train', 'random')
train_negative_popularity = load_negative_samples(user_items_dict_train, items_list, 'train', 'popularity', item_probability_dict)
# validation set
val_negative_random = load_negative_samples(user_items_dict_val, items_list, 'validation', 'random')
val_negative_popularity = load_negative_samples(user_items_dict_val, items_list, 'validation', 'popularity', item_probability_dict)

In [9]:
# Sanity check: number of negative items stored under key 1 (the dict is indexed by user in the metric functions)
len(val_negative_popularity[1])

97

### Creating datasets for training loop

In [10]:
def _load_or_create_dataset(cache_path, negative_samples):
    """
    Return the training dataset cached at `cache_path`, building and caching it on a miss.

    Only a missing cache file triggers a rebuild. Any other failure (interrupted load,
    corrupted pickle, permission error) is raised instead of silently recomputing the
    dataset and overwriting the cache, which a bare `except:` would do.

    Args:
        cache_path (str): location of the pickle cache
        negative_samples: negative samples passed to `create_dataset` together with `df_train`
    Returns:
        the object loaded from the cache, or the freshly created dataset
    """
    import os

    try:
        # NOTE: pickle.load can execute arbitrary code; only load cache files you created yourself.
        with open(cache_path, 'rb') as f:
            return pickle.load(f)
    except FileNotFoundError:
        dataset = create_dataset(negative_samples, df_train)
        # Make sure the cache directory exists so the freshly built dataset is not lost on save.
        cache_dir = os.path.dirname(cache_path)
        if cache_dir:
            os.makedirs(cache_dir, exist_ok=True)
        with open(cache_path, 'wb') as f:
            pickle.dump(dataset, f)
        return dataset


# NOTE(review): the cache is keyed by file name only. If the split or the negative samples change,
# delete the .pkl files, otherwise a stale dataset is silently reused.
df_random = _load_or_create_dataset('data/train_datasets/train_random.pkl', train_negative_random)
df_popularity = _load_or_create_dataset('data/train_datasets/train_popularity.pkl', train_negative_popularity)

---

# Training the model

In [11]:
# Train on the dataset built from randomly sampled negatives; validation uses the random negatives too.
# NOTE(review): in the log recorded below, validation loss climbs from 0.918 (epoch 1) to 137.721
# (epoch 6) and train loss jumps to 29.396, so lr=0.1 appears too high (the popularity run uses 0.01).
# "Early stopping" is printed after epoch 3 yet epochs 4-6 still run; check training_loop in utils.
# The recorded run stops mid-epoch 7, so these two variables may never have been assigned.
# `radnom_users_embeddings` (sic) is read by the hit-rate@50 cell; rename both places together.
radnom_users_embeddings, random_items_embeddings = training_loop(
    df_random,
    user_items_dict_val,
    val_negative_random,
    users_list,
    items_list,
    alpha_item=1e-4,
    alpha_user=1e-4,
    item_init_noise=0.1,
    user_init_noise=0.1,
    epochs=10,
    k=32,
    lr=0.1,
)

Epoch 1: 100%|██████████| 1939105/1939105 [00:53<00:00, 36461.01it/s]
100%|██████████| 6040/6040 [00:01<00:00, 5412.26it/s]


Epoch 1 train loss: 0.658 validation loss: 0.918


Epoch 2: 100%|██████████| 1939105/1939105 [00:52<00:00, 36811.42it/s]
100%|██████████| 6040/6040 [00:01<00:00, 5934.47it/s]


Epoch 2 train loss: 0.606 validation loss: 1.415


Epoch 3: 100%|██████████| 1939105/1939105 [00:52<00:00, 36975.35it/s]
100%|██████████| 6040/6040 [00:01<00:00, 5811.73it/s]


Epoch 3 train loss: 0.61 validation loss: 5.966
Early stopping


Epoch 4: 100%|██████████| 1939105/1939105 [00:52<00:00, 36887.13it/s]
100%|██████████| 6040/6040 [00:01<00:00, 5071.95it/s]


Epoch 4 train loss: 0.651 validation loss: 11.629


Epoch 5: 100%|██████████| 1939105/1939105 [00:54<00:00, 35498.65it/s]
100%|██████████| 6040/6040 [00:01<00:00, 4705.53it/s]


Epoch 5 train loss: 0.793 validation loss: 13.267


Epoch 6: 100%|██████████| 1939105/1939105 [00:54<00:00, 35382.11it/s]
100%|██████████| 6040/6040 [00:01<00:00, 4743.75it/s]


Epoch 6 train loss: 29.396 validation loss: 137.721


Epoch 7:  46%|████▌     | 896466/1939105 [00:25<00:29, 34929.36it/s]

In [12]:
# Train on the dataset built from popularity-sampled negatives; validation uses the popularity negatives.
# Same hyper-parameters as the random-negatives run except lr=0.01. In the log recorded below,
# train loss falls from 0.724 to 0.616 and validation loss from 1.386 to 0.885 over the 10 epochs.
popularity_users_embeddings, popularity_items_embeddings = training_loop(
    df_popularity,
    user_items_dict_val,
    val_negative_popularity,
    users_list,
    items_list,
    alpha_item=1e-4,
    alpha_user=1e-4,
    item_init_noise=0.1,
    user_init_noise=0.1,
    epochs=10,
    k=32,
    lr=0.01,
)

Epoch 1: 100%|██████████| 1939105/1939105 [00:56<00:00, 34488.86it/s]
100%|██████████| 6040/6040 [00:01<00:00, 4949.13it/s]


Epoch 1 train loss: 0.724 validation loss: 1.386


Epoch 2: 100%|██████████| 1939105/1939105 [00:56<00:00, 34458.73it/s]
100%|██████████| 6040/6040 [00:01<00:00, 4964.22it/s]


Epoch 2 train loss: 0.72 validation loss: 1.332


Epoch 3: 100%|██████████| 1939105/1939105 [00:56<00:00, 34601.03it/s]
100%|██████████| 6040/6040 [00:01<00:00, 5322.30it/s]


Epoch 3 train loss: 0.689 validation loss: 1.17


Epoch 4: 100%|██████████| 1939105/1939105 [00:55<00:00, 35050.05it/s]
100%|██████████| 6040/6040 [00:01<00:00, 4982.83it/s]


Epoch 4 train loss: 0.661 validation loss: 1.087


Epoch 5: 100%|██████████| 1939105/1939105 [00:56<00:00, 34422.61it/s]
100%|██████████| 6040/6040 [00:01<00:00, 4879.93it/s]


Epoch 5 train loss: 0.649 validation loss: 1.034


Epoch 6: 100%|██████████| 1939105/1939105 [00:56<00:00, 34324.67it/s]
100%|██████████| 6040/6040 [00:01<00:00, 5006.73it/s]


Epoch 6 train loss: 0.64 validation loss: 0.992


Epoch 7: 100%|██████████| 1939105/1939105 [00:56<00:00, 34550.25it/s]
100%|██████████| 6040/6040 [00:01<00:00, 4599.44it/s]


Epoch 7 train loss: 0.633 validation loss: 0.957


Epoch 8: 100%|██████████| 1939105/1939105 [00:55<00:00, 34728.47it/s]
100%|██████████| 6040/6040 [00:01<00:00, 5089.76it/s]


Epoch 8 train loss: 0.627 validation loss: 0.929


Epoch 9: 100%|██████████| 1939105/1939105 [00:56<00:00, 34611.41it/s]
100%|██████████| 6040/6040 [00:01<00:00, 4922.72it/s]


Epoch 9 train loss: 0.621 validation loss: 0.905


Epoch 10: 100%|██████████| 1939105/1939105 [00:56<00:00, 34414.57it/s]
100%|██████████| 6040/6040 [00:01<00:00, 4981.03it/s]

Epoch 10 train loss: 0.616 validation loss: 0.885





In [17]:
def Hit_Rate_at_k(positive_samples:dict, negative_samples:dict, users_embeddings:dict, items_embeddings:dict, k):
    """
    Calculate the average hit rate@k for a given set of positive and negative samples per user.

    For each user, the positive and negative items are scored with the dot product of the
    user and item embeddings and ranked by descending score. The user's hit rate is the number
    of positive items among the top `k` candidates divided by that user's number of positive
    items. Because the divisor is the number of positives, a user with more than `k` positives
    cannot reach 1.0.

    Ties: positives are listed before negatives and the sort is stable, so a positive item
    outranks a negative item with the same score.

    Args:
        positive_samples (dict): dictionary of positive samples per user
        negative_samples (dict): dictionary of negative samples per user
        users_embeddings (dict): dictionary of user embeddings
        items_embeddings (dict): dictionary of item embeddings
        k (int): number of items to consider for hitrate calculation
    Returns:
        float: hit rate@k averaged over the users that have at least one positive item;
            0.0 when there is no such user (previously a ZeroDivisionError).
    """
    total_hit_rate = 0.0
    evaluated_users = 0
    for user in tqdm(positive_samples.keys()):
        user_positives = positive_samples[user]
        if len(user_positives) == 0:
            # Nothing to hit for this user; skip instead of dividing by zero.
            continue
        user_vector = users_embeddings[user]

        # (score, label) pairs with label 1 = positive, 0 = negative. Positives go first (see tie note above).
        scored_items = [(np.dot(user_vector, items_embeddings[item]), 1) for item in user_positives]
        scored_items.extend(
            (np.dot(user_vector, items_embeddings[item]), 0) for item in negative_samples[user]
        )
        scored_items.sort(key=lambda pair: pair[0], reverse=True)

        hits_in_top_k = sum(label for _, label in scored_items[:k])
        total_hit_rate += hits_in_top_k / len(user_positives)
        evaluated_users += 1

    if evaluated_users == 0:
        return 0.0
    return total_hit_rate / evaluated_users

In [23]:
# Hit rate@10 on the validation positives vs. popularity-sampled negatives:
# freshly created (untrained) embeddings as the baseline, then the popularity-trained embeddings.
k = 10
base_user_emb = create_embeddings(users_list, 0.1, 32)
base_item_emb = create_embeddings(items_list, 0.1, 32)

hitrate_10_base = Hit_Rate_at_k(
    user_items_dict_val,
    val_negative_popularity,
    base_user_emb,
    base_item_emb,
    k,
)
print(f'Baseline: Hit rate at {k}: {hitrate_10_base}')

hitrate_10_trained = Hit_Rate_at_k(
    user_items_dict_val,
    val_negative_popularity,
    popularity_users_embeddings,
    popularity_items_embeddings,
    k,
)
print(f'Hit rate at {k}: {hitrate_10_trained}')

100%|██████████| 6040/6040 [00:01<00:00, 4675.60it/s]


Baseline: Hit rate at 10: 0.10104856512141253


100%|██████████| 6040/6040 [00:01<00:00, 4939.36it/s]

Hit rate at 10: 0.5788631346578321





In [None]:
# Hit rate@50 on the validation positives vs. randomly sampled negatives:
# fresh (untrained) baseline embeddings first, then the embeddings trained on random negatives.
# NOTE(review): the recorded log of the random-negatives training stops mid-epoch 7, so
# `radnom_users_embeddings` / `random_items_embeddings` only exist once that cell has run to completion.
k = 50
# Dedicated names so `base_user_emb` / `base_item_emb` (read by the MPR cell) are not overwritten.
# Size 32 matches the k=32 used for training (assuming the third argument is the embedding size).
base_user_emb_random = create_embeddings(users_list, 0.1, 32)
base_item_emb_random = create_embeddings(items_list, 0.1, 32)
hitrate_50_base = Hit_Rate_at_k(user_items_dict_val, val_negative_random, base_user_emb_random, base_item_emb_random, k)
print(f'Baseline: Hit rate at {k}: {hitrate_50_base}')
hitrate_50_trained = Hit_Rate_at_k(user_items_dict_val, val_negative_random, radnom_users_embeddings, random_items_embeddings, k)
print(f'Hit rate at {k}: {hitrate_50_trained}')

In [38]:
def MPR_calculation(positive_samples:dict, negative_samples:dict, users_embeddings:dict, items_embeddings:dict)->float:
    """
    Calculate the mean percentile rank (MPR) of the positive items, averaged over users.

    Each positive item of a user is ranked on its own against all of that user's negative
    items by dot-product score (descending). Its percentile rank is `rank / (n_negatives + 1)`
    with a 1-based rank, so the best possible value is `1 / (n_negatives + 1)` and the worst is 1.0.
    Percentile ranks are averaged per user, then over users. Lower is better.

    Ties: a positive item ranks below every negative item that has the same score.

    Args:
        positive_samples (dict): dictionary of positive samples per user
        negative_samples (dict): dictionary of negative samples per user
        users_embeddings (dict): dictionary of user embeddings
        items_embeddings (dict): dictionary of item embeddings
    Returns:
        float: MPR averaged over the users that have at least one positive item;
            NaN when there is no such user (previously a ZeroDivisionError).
    """
    total_mpr = 0.0
    evaluated_users = 0
    for user in tqdm(positive_samples.keys(), desc='MPR calculation'):
        user_positives = positive_samples[user]
        if len(user_positives) == 0:
            # No positive item to rank for this user; skip instead of dividing by zero.
            continue
        user_vector = users_embeddings[user]

        # Negative scores do not depend on the positive item: compute them once per user.
        negative_scores = np.array(
            [np.dot(user_vector, items_embeddings[item]) for item in negative_samples[user]]
        )
        n_candidates = len(negative_scores) + 1

        user_mpr = 0.0
        for item in user_positives:
            positive_score = np.dot(user_vector, items_embeddings[item])
            # 1-based rank: every negative scoring at least as high sits above the positive.
            rank = int(np.count_nonzero(negative_scores >= positive_score)) + 1
            user_mpr += rank / n_candidates

        total_mpr += user_mpr / len(user_positives)
        evaluated_users += 1

    if evaluated_users == 0:
        return float('nan')
    return total_mpr / evaluated_users

In [39]:
# MPR on the validation positives vs. popularity-sampled negatives:
# first with the untrained baseline embeddings, then with the popularity-trained embeddings.
# NOTE(review): both lines print the same "MPR:" label; the first value is the baseline.
MPR_base = MPR_calculation(user_items_dict_val, val_negative_popularity, base_user_emb, base_item_emb)
print(f'MPR: {MPR_base}')
MPR_trained = MPR_calculation(user_items_dict_val, val_negative_popularity, popularity_users_embeddings, popularity_items_embeddings)
print(f'MPR: {MPR_trained}')


MPR calculation: 100%|██████████| 6040/6040 [00:03<00:00, 1633.48it/s]


MPR: 0.5013188719196281


MPR calculation: 100%|██████████| 6040/6040 [00:03<00:00, 1621.13it/s]

MPR: 0.12622370140108344





In [22]:
# Ratio of trained MPR to baseline MPR (below 1 means the trained model ranks positives higher).
# NOTE(review): the recorded output comes from an earlier execution (In [22]) than the MPR cell (In [39])
# and does not match the values printed there (0.126 / 0.501); re-run this cell.
MPR_trained/MPR_base

1.760289930897027

In [None]:
def prediction_on_test_set(row:pd.Series, users_embeddings:dict, items_embeddings:dict)->pd.Series:
    """
    Choose between the two candidate items of a test row using embedding dot products.

    The user embedding is scored against the embeddings of `Item1` and `Item2`.
    `prediction` is set to 0 when `Item1` scores strictly higher and to 1 otherwise
    (including ties).

    Args:
        row (pd.Series): row with 'UserID', 'Item1' and 'Item2' entries; 'prediction' is written to it in place
        users_embeddings (dict): dictionary of user embeddings
        items_embeddings (dict): dictionary of item embeddings
    Returns:
        pd.Series: the same row with the 'prediction' entry set, so the function can be used
            with `DataFrame.apply(..., axis=1)` (previously nothing was returned).
    Raises:
        KeyError: if the user or one of the items has no embedding
    """
    user_vector = users_embeddings[row['UserID']]
    item_1_score = np.dot(user_vector, items_embeddings[row['Item1']])
    item_2_score = np.dot(user_vector, items_embeddings[row['Item2']])

    row['prediction'] = 0 if item_1_score > item_2_score else 1
    return row

In [11]:
# Scratch check: np.log(0) evaluates to -inf (see output), relevant when a log-loss term reaches 0.
np.log(0)

  np.log(0)


-inf

In [21]:
# Scratch check: exact comparison against 0 on a small non-zero vector
a = np.array([0.01,0.01,0.01])
# True only if every element is exactly 0 (False here)
np.all(a == 0)

False

In [24]:
# Tolerance-based zero check on the array `a` from the previous cell: 0.01 exceeds atol=1e-3, hence False
np.allclose(a,0,atol=1e-3)

False

In [None]:
# Debugging snippet: report a user/item whose embedding is numerically all zeros.
# NOTE(review): `users_embeddings`, `user`, `items_embeddings` and `item` are not defined at notebook
# level in any visible cell, so this raises NameError on Restart & Run All. It looks like it was
# meant to be pasted inside a per-user/item loop; move it there or remove the cell.
if np.allclose(users_embeddings[user], 0, atol=1e-8):
    print('user', user)
if np.allclose(items_embeddings[item], 0, atol=1e-8): 
    print('item', item)

In [36]:
# Worked example of a pooled percentile-rank average: all positives and negatives are ranked
# together by descending score and the percentile ranks of the positives are averaged.
positive_score = np.array([4,2,3,1,6])
negative_scores = np.array([7,2,3,4,5])
# (score, label) pairs: label 1 = positive, 0 = negative. Dedicated names avoid overwriting `a`.
labelled_scores = [(x,1) for x in positive_score] + [(x,0) for x in negative_scores]
# Stable sort: on equal scores the positives, listed first, stay ahead of the negatives.
sorted_scores = sorted(labelled_scores, key=lambda x: x[0], reverse=True)
mpr = 0
for i in range(len(sorted_scores)):
    rating = sorted_scores[i][1]
    # Only positives (rating 1) contribute their 1-based percentile rank.
    mpr += rating*(i+1)/len(sorted_scores)
mpr/len(positive_score)

0.6

In [37]:
# Scratch check: `+` on two lists concatenates them into a new list
[1,2,3] + [4,5,6]

[1, 2, 3, 4, 5, 6]