In [1]:
import os
import pickle

import numpy as np
import pandas as pd
from tqdm import tqdm

from utils import *

In [2]:
# Raw user-item interactions; the train/validation split and the ID lists
# below are all derived from this frame.
train_set_raw = pd.read_csv('data/train.csv')

# Splitting the data into train and validation sets

In [3]:
# Hold out 3 randomly chosen rows per user as the validation set
# (fixed random_state so the split is reproducible).
df_val = (
    train_set_raw
    .groupby('UserID')
    .sample(n=3, random_state=10)
)
# Every row that was NOT held out forms the training set.
df_train = train_set_raw.loc[~train_set_raw.index.isin(df_val.index)].copy()

In [4]:
# Distinct item IDs: over all raw interactions, and within each split.
items_list = [*train_set_raw['ItemID'].unique()]
train_items_list = [*df_train['ItemID'].unique()]
val_items_list = [*df_val['ItemID'].unique()]

# Compare how many distinct items each split covers.
print(f'Number of items in train set: {len(train_items_list)}')
print(f'Number of items in validation set: {len(val_items_list)}')

Number of items in train set: 3705
Number of items in validation set: 2468


In [5]:
# Distinct user IDs over ALL raw interactions (not only df_train).
users_list = [*train_set_raw['UserID'].unique()]

# Note: the first count is taken from train_set_raw (the full raw file),
# the second from the held-out validation rows.
print(f'Number of users in train set: {len(users_list)}')
print(f'Number of users in validation set: {len(df_val["UserID"].unique())}')

Number of users in train set: 6040
Number of users in validation set: 6040


## Creating the train and validation data sets with negative and positive samples

In [6]:
# Build one user->items lookup per split (train first, then validation).
# NOTE(review): presumably maps each UserID to the ItemIDs it has in that
# split -- confirm against create_user_items_dict in utils.
user_items_dict_train, user_items_dict_val = map(
    create_user_items_dict, (df_train, df_val)
)

In [7]:
# Item popularity statistics computed from ALL raw interactions (training and
# held-out validation rows together); passed below as the extra argument for
# 'popularity' negative sampling.
# NOTE(review): presumably maps ItemID -> sampling probability -- confirm in utils.
item_probability_dict = create_item_popularity_dict(train_set_raw)

### Load negative samples

In [8]:
# Negative samples for the training split, one set per sampling strategy.
# The 'popularity' strategy additionally receives the item popularity dict.
train_negative_random = load_negative_samples(
    user_items_dict_train, items_list, 'train', 'random'
)
train_negative_popularity = load_negative_samples(
    user_items_dict_train, items_list, 'train', 'popularity', item_probability_dict
)

# Negative samples for the validation split, same two strategies.
# NOTE(review): only the validation positives are passed here, so whether a
# user's training items can be drawn as validation negatives depends on
# load_negative_samples -- confirm in utils.
val_negative_random = load_negative_samples(
    user_items_dict_val, items_list, 'validation', 'random'
)
val_negative_popularity = load_negative_samples(
    user_items_dict_val, items_list, 'validation', 'popularity', item_probability_dict
)

### Creating datasets for training loop

In [26]:
def _load_or_create_dataset(cache_path, negative_samples, positives_df):
    """Return the training dataset cached at ``cache_path``, building it if needed.

    Args:
        cache_path: Pickle file used as the on-disk cache.
        negative_samples: Negative samples forwarded to ``create_dataset``.
        positives_df: Training interactions forwarded to ``create_dataset``.

    Returns:
        The unpickled dataset when a readable cache exists; otherwise the
        freshly created dataset (which is also written to ``cache_path``).

    Only a missing or truncated/corrupt cache triggers a rebuild. Any other
    failure (e.g. KeyboardInterrupt, MemoryError, an import error raised while
    unpickling) propagates instead of silently starting a minute-long rebuild
    that overwrites the cache.

    NOTE: the cache is not invalidated when the train/validation split or the
    negative samples change -- delete the file to force a rebuild.
    NOTE: pickle.load can execute arbitrary code; only load trusted files.
    """
    try:
        with open(cache_path, 'rb') as f:
            return pickle.load(f)
    except (FileNotFoundError, EOFError, pickle.UnpicklingError):
        pass  # no usable cache -> fall through and build it

    # Create the cache directory up front so a missing folder fails (or is
    # fixed) before the expensive build rather than after it.
    cache_dir = os.path.dirname(cache_path)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)

    dataset = create_dataset(negative_samples, positives_df)
    with open(cache_path, 'wb') as f:
        pickle.dump(dataset, f)
    return dataset


# Training dataset built with the 'random' negative samples.
df_random = _load_or_create_dataset(
    'data/train_datasets/train_random.pkl', train_negative_random, df_train
)
# Training dataset built with the 'popularity' negative samples.
df_popularity = _load_or_create_dataset(
    'data/train_datasets/train_popularity.pkl', train_negative_popularity, df_train
)

100%|██████████| 6040/6040 [01:13<00:00, 81.85it/s] 
100%|██████████| 6040/6040 [01:10<00:00, 86.12it/s] 


---

# Training the model

In [None]:
# Train the model on the dataset built with 'random' negatives, validating
# against the held-out positives and the 'random' validation negatives.
# NOTE(review): lr here is 0.1 while the popularity run uses 0.01 -- confirm
# the difference is intentional.
random_users_embeddings, random_items_embeddings = training_loop(
    df_random,
    user_items_dict_val, val_negative_random,
    users_list, items_list,
    alpha_item=1e-4,
    alpha_user=1e-4,
    item_init_noise=0.1,
    user_init_noise=0.1,
    epochs=10,
    k=32,
    lr=0.1,
)
# Backward-compatible alias: the original cell bound the user embeddings to
# the misspelled name `radnom_users_embeddings`.
radnom_users_embeddings = random_users_embeddings

In [None]:
# Train the model on the dataset built with 'popularity' negatives, validating
# against the held-out positives and the 'popularity' validation negatives.
# NOTE(review): lr here is 0.01 while the random-negatives run uses 0.1 --
# confirm the difference is intentional.
popularity_users_embeddings, popularity_items_embeddings = training_loop(
    df_popularity,
    user_items_dict_val, val_negative_popularity,
    users_list, items_list,
    alpha_item=1e-4,
    alpha_user=1e-4,
    item_init_noise=0.1,
    user_init_noise=0.1,
    epochs=10,
    k=32,
    lr=0.01,
)

---

# Model Evaluation

In [10]:
# Load the stored user and item embeddings of both models into one dict,
# keyed '<sample_type>_<users|items>'.
# NOTE(review): presumably these files are written by the training step --
# confirm where data/results/*.pkl comes from.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
res_dic = {}
for sample_type in ('random', 'popularity'):
    for emb in ('users', 'items'):
        embeddings_path = f'data/results/{sample_type}_{emb}_embeddings.pkl'
        with open(embeddings_path, 'rb') as f:
            res_dic[f'{sample_type}_{emb}'] = pickle.load(f)

In [11]:
# Validation negatives, keyed by the sampling strategy that produced them.
negative_sample_dics = {
    'random': val_negative_random,
    'popularity': val_negative_popularity,
}
# The held-out positives are the same for every train/validation combination.
positive_samples = user_items_dict_val

# Evaluate each trained model against each kind of validation negatives.
res_list = []
for train_sample_type in ['random', 'popularity']:
    for validation_sample_type in ['random', 'popularity']:
        print(f'Calculating measures on model trained on {train_sample_type} using {validation_sample_type} as validation set')
        negative_samples = negative_sample_dics[validation_sample_type]
        user_emb = res_dic[f'{train_sample_type}_users']
        item_emb = res_dic[f'{train_sample_type}_items']

        # Arguments shared by every metric call for this combination.
        eval_args = (positive_samples, negative_samples, user_emb, item_emb)

        # Metrics are computed in the same order as they appear in the table.
        mpr = MPR_calculation(*eval_args)
        hr1, hr10, hr50 = (Hit_Rate_at_k(*eval_args, k) for k in (1, 10, 50))
        # The two 1e-4 values match alpha_item / alpha_user used in training.
        validation_loss = validation_loss_func(*eval_args, 1e-4, 1e-4)

        res_list.append(
            [train_sample_type, validation_sample_type, mpr, hr1, hr10, hr50, validation_loss]
        )

# One row per (train sampling, validation sampling) combination.
df_res = pd.DataFrame(
    res_list,
    columns=['train_sample_type', 'validation_sample_type', 'MPR', 'HR@1', 'HR@10', 'HR@50', 'validation_loss'],
)

Calculating measures on model trained on random using random as validation set


MPR calculation: 100%|██████████| 6040/6040 [00:03<00:00, 1923.91it/s]
Hit Rate @1 calculation: 100%|██████████| 6040/6040 [00:00<00:00, 6144.92it/s]
Hit Rate @10 calculation: 100%|██████████| 6040/6040 [00:00<00:00, 6174.62it/s]
Hit Rate @50 calculation: 100%|██████████| 6040/6040 [00:01<00:00, 5692.97it/s]
100%|██████████| 6040/6040 [00:01<00:00, 5880.90it/s]


Calculating measures on model trained on random using popularity as validation set


MPR calculation: 100%|██████████| 6040/6040 [00:03<00:00, 1929.50it/s]
Hit Rate @1 calculation: 100%|██████████| 6040/6040 [00:00<00:00, 6110.07it/s]
Hit Rate @10 calculation: 100%|██████████| 6040/6040 [00:01<00:00, 5829.02it/s]
Hit Rate @50 calculation: 100%|██████████| 6040/6040 [00:01<00:00, 5931.96it/s]
100%|██████████| 6040/6040 [00:00<00:00, 6077.15it/s]


Calculating measures on model trained on popularity using random as validation set


MPR calculation: 100%|██████████| 6040/6040 [00:03<00:00, 1919.38it/s]
Hit Rate @1 calculation: 100%|██████████| 6040/6040 [00:00<00:00, 6126.39it/s]
Hit Rate @10 calculation: 100%|██████████| 6040/6040 [00:00<00:00, 6203.70it/s]
Hit Rate @50 calculation: 100%|██████████| 6040/6040 [00:01<00:00, 5733.72it/s]
100%|██████████| 6040/6040 [00:01<00:00, 5814.50it/s]


Calculating measures on model trained on popularity using popularity as validation set


MPR calculation: 100%|██████████| 6040/6040 [00:03<00:00, 1971.91it/s]
Hit Rate @1 calculation: 100%|██████████| 6040/6040 [00:01<00:00, 5979.09it/s]
Hit Rate @10 calculation: 100%|██████████| 6040/6040 [00:01<00:00, 5580.35it/s]
Hit Rate @50 calculation: 100%|██████████| 6040/6040 [00:01<00:00, 5912.03it/s]
100%|██████████| 6040/6040 [00:00<00:00, 6089.94it/s]


In [31]:
# Display the metrics table: one row per (train sampling, validation sampling) pair.
df_res

Unnamed: 0,train_sample_type,validation_sample_type,MPR,HR@1,HR@10,HR@50,validation_loss
0,random,random,0.110508,0.116943,0.632726,0.971523,0.818106
1,random,popularity,0.110477,0.11628,0.634106,0.973068,0.817685
2,popularity,random,0.110242,0.119592,0.635265,0.972461,0.818295
3,popularity,popularity,0.11016,0.121689,0.635541,0.972848,0.817602


# Prediction on test data

In [63]:
def prediction_on_test_set(row:pd.Series, users_embeddings:dict, items_embeddings:dict, rng=None)->int:
    """Predict which of the two candidate items in ``row`` the user prefers.

    Each item is scored by the dot product of the user's embedding with the
    item's embedding; the higher-scoring item wins.

    Args:
        row: Record providing the 'UserID', 'Item1' and 'Item2' entries.
        users_embeddings: Lookup from user ID to that user's embedding vector.
        items_embeddings: Lookup from item ID to that item's embedding vector.
        rng: Optional ``numpy.random.Generator`` used for the fallback guess.
            When omitted, numpy's global random state is used (the original
            behaviour); pass a seeded generator for reproducible predictions.

    Returns:
        0 if Item1 scores strictly higher than Item2, otherwise 1 (ties go to
        Item2). If either item or the user has no embedding, a uniformly
        random 0/1 guess is returned instead.
    """
    user = row['UserID']
    item_1 = row['Item1']
    item_2 = row['Item2']

    def _random_guess() -> int:
        # No usable embedding: fall back to a coin flip.
        if rng is not None:
            return int(rng.integers(0, 2))
        return int(np.random.randint(0, 2))

    if item_1 not in items_embeddings or item_2 not in items_embeddings:
        return _random_guess()

    # An unknown user used to raise KeyError and abort the whole apply();
    # treat it like an unknown item instead.
    try:
        user_vector = users_embeddings[user]
    except KeyError:
        return _random_guess()

    item_1_score = np.dot(user_vector, items_embeddings[item_1])
    item_2_score = np.dot(user_vector, items_embeddings[item_2])

    return 0 if item_1_score > item_2_score else 1

In [12]:
# load test set
popularity_test = pd.read_csv('data/PopularityTest.csv')
random_test = pd.read_csv('data/RandomTest.csv')

In [14]:
# Use the model trained with 'popularity' negatives for the test predictions.
item_embeddings, user_embeddings = (
    res_dic['popularity_items'],
    res_dic['popularity_users'],
)

In [64]:
# Score every row of both test sets (popularity first, then random) with the
# selected embeddings and store the 0/1 prediction in 'bitClassification'.
for test_df in (popularity_test, random_test):
    test_df['bitClassification'] = test_df.apply(
        prediction_on_test_set,
        axis=1,
        args=(user_embeddings, item_embeddings),
    )