In [1]:
import pandas as pd
import pickle
from tqdm import tqdm
import numpy as np

from utils import *

In [2]:
train_set=pd.read_csv('data/train.csv')

In [3]:
items_list = list(train_set['ItemID'].unique())

In [4]:
user_items_dict = create_user_items_dict(train_set)

In [5]:
item_probability_dict = create_item_popularity_dict(train_set)

In [6]:
#check if user_negative_samples_by_popularity.pkl exists
try:
    with open('users_negative_samples/user_negative_samples_by_popularity.pkl', 'rb') as f:
        user_negative_samples_by_popularity = pickle.load(f)
except:
    user_negative_samples_by_popularity = sample_negative_examples_by_popularity(user_items_dict, items_list, item_probability_dict)
    #save user_negative_samples_by_popularity.pkl
    with open('users_negative_samples/user_negative_samples_by_popularity.pkl', 'wb') as f:
        pickle.dump(user_negative_samples_by_popularity, f)

In [10]:
try:
    with open('users_negative_samples/user_negative_samples_randomly.pkl', 'rb') as f:
        user_negative_samples_randomly = pickle.load(f)
except:
    user_negative_samples_randomly = sample_negative_examples_randomly(user_items_dict, items_list)
    #save user_negative_samples_randomly.pkl
    with open('users_negative_samples/user_negative_samples_randomly.pkl', 'wb') as f:
        pickle.dump(user_negative_samples_randomly, f)

  8%|▊         | 475/6040 [01:44<20:20,  4.56it/s]  


KeyboardInterrupt: 

In [8]:
def training_loop(user_items_dict:dict,
                    items_list:list,
                    alpha_user:float,
                    alpha_item:float,
                    k:int,
                    lr:float,
                    epochs:int,
                    user_negative_samples_by_popularity:dict,
                    user_negative_samples_randomly:dict,
                    sample_negative_by_popularity:bool=False)->tuple:
    items_embeddings = create_items_embeddings(items_list, alpha_item, k)
    users_embeddings = create_users_embeddings(user_items_dict, alpha_user, k)

    for epoch in tqdm(range(epochs)):
        for user in tqdm(user_items_dict):
            if sample_negative_by_popularity:
                negative_item = user_negative_samples_by_popularity(user)
            else:
                negative_item = user_negative_samples_randomly(user)
            for item in user_items_dict[user]:
                prediction = sigmoid(np.dot(users_embeddings[user], items_embeddings[item]))
                error = 1 - prediction
                users_embeddings[user] += lr * error * items_embeddings[item] - alpha_user * users_embeddings[user]
                items_embeddings[item] += lr * error * users_embeddings[user] - alpha_item * items_embeddings[item]

            for item in negative_item:
                prediction = sigmoid(-1*(users_embeddings[user]).T.dot(items_embeddings[item]))
                error = 0 - prediction
                users_embeddings[user] += lr * error * items_embeddings[item] - alpha_user * users_embeddings[user]
                items_embeddings[item] += lr * error * users_embeddings[user] - alpha_item * items_embeddings[item]
            
    return users_embeddings, items_embeddings

In [9]:
users_embeddings, items_embeddings = training_loop(user_items_dict, items_list, 0.1, 0.1, 20, 0.01, 10, sample_negative_by_popularity=True)

TypeError: training_loop() missing 2 required positional arguments: 'user_negative_samples_by_popularity' and 'user_negative_samples_randomly'

In [None]:
def prediction_on_test_set(row:pd.Series, users_embeddings:dict, items_embeddings:dict)->pd.Series:
    user = row['UserID']
    item_1 = row['Item1']
    item_2 = row['Item2']

    item_1_score = np.dot(users_embeddings[user], items_embeddings[item_1])
    item_2_score = np.dot(users_embeddings[user], items_embeddings[item_2])

    if item_1_score > item_2_score:
        row['prediction'] = 0
    else:
        row['prediction'] = 1