In [1]:
import pandas as pd
import numpy as np
import torch
import math
from tqdm import tqdm

In [2]:
train_set=pd.read_csv('train.csv')

In [3]:
items_list = list(train_set['ItemID'].unique())

In [4]:
def create_user_items_dict(df: pd.DataFrame) -> dict:
    user_items_dict = {}
    for user, item in zip(df['UserID'], df['ItemID']):
        if user not in user_items_dict:
            user_items_dict[user] = []
        user_items_dict[user].append(item)
    return user_items_dict

user_items_dict = create_user_items_dict(train_set)

In [5]:
def sample_negative_examples_randomly(user_items_dict:dict, user:int, items_list:list)->list:
    relevant_samples = [x for x in items_list if x not in user_items_dict[user]]
    return list(np.random.choice(relevant_samples, len(user_items_dict[user]), replace=False))

In [6]:
def sample_negative_examples_by_popularity(user_items_dict:dict, user:int, items_list:list, train_set:pd.DataFrame)->list:
    popularity_df = train_set.groupby('ItemID').size().reset_index(name='counts')
    popularity_df['probability'] = popularity_df['counts'] / popularity_df['counts'].sum()
    item_probability_dict = dict(zip(popularity_df['ItemID'], popularity_df['probability']))
    relevant_samples = [x for x in items_list if x not in user_items_dict[user]]
    probabilities = [item_probability_dict[x] for x in relevant_samples]
    return list(np.random.choice(relevant_samples, len(user_items_dict[user]), replace=False, p=probabilities))

In [7]:
def create_items_embeddings(items_list:list, alpha_item ,k:int)->dict:
    items_embeddings = {}
    for item in items_list:
        items_embeddings[item] = np.random.normal(0, alpha_item, k)
    return items_embeddings

In [8]:
def create_users_embeddings(user_items_dict:dict, alpha_user, k:int)->dict:
    users_embeddings = {}
    for user in user_items_dict:
        users_embeddings[user] = np.random.normal(0, alpha_user, k)
    return users_embeddings

In [9]:
def sigmoid(x):
    return 1 / (1 + math.exp(-x))

In [22]:
def training_loop(user_items_dict, items_list, alpha_user, alpha_item, k, lr, epochs, sample_negative_by_popularity=False)->tuple:
    items_embeddings = create_items_embeddings(items_list, alpha_item, k)
    users_embeddings = create_users_embeddings(user_items_dict, alpha_user, k)

    for epoch in tqdm(range(epochs)):
        for user in user_items_dict:
            if sample_negative_by_popularity:
                negative_item = sample_negative_examples_by_popularity(user_items_dict, user, items_list, train_set)
            else:
                negative_item = sample_negative_examples_randomly(user_items_dict, user, items_list)
            for item in user_items_dict[user]:
                print(np.dot(users_embeddings[user], items_embeddings[item]))
                prediction = math.log2(sigmoid(np.dot(users_embeddings[user], items_embeddings[item])))
                error = 1 - prediction
                users_embeddings[user] += -lr * -error * items_embeddings[item] + alpha_user * users_embeddings[user]
                items_embeddings[item] += -lr * -error * users_embeddings[user] + alpha_item * items_embeddings[item]

            for item in negative_item:
                prediction = math.log2(sigmoid(-1*(users_embeddings[user]).T.dot(items_embeddings[item])))
                error = 0 - prediction
                users_embeddings[user] += -lr * -error * items_embeddings[item] + alpha_user * users_embeddings[user]
                items_embeddings[item] += -lr * -error * users_embeddings[user] + alpha_item * items_embeddings[item]
            
    return users_embeddings, items_embeddings

In [23]:
users_embeddings, items_embeddings = training_loop(user_items_dict, items_list, 0.1, 0.1, 20, 0.01, 10, sample_negative_by_popularity=False)

  0%|          | 0/10 [00:00<?, ?it/s]

-0.03349641210852546
-0.012208793319974732
-0.00016418435439653745
0.09875591048637698
0.0004447559691591425
-0.05241870567690552
0.03912948631129917
-0.022594963503161786
-0.1764321042117724
0.022851211791438977
0.028955221815384277
-0.15433471628835504
-0.0949828739937689
-0.1507943355565096
0.3879970293974609
-0.17521007310923073
-0.21531740823676587
-0.4049316475075954
-0.3804923560725815
0.3773473376610537
0.14029198984137894
0.34458952403637577
-0.544891838403028
0.0881081139741447
0.21422437096289887
-0.01567479707979763
0.7092430195176087
-0.5270941451740879
0.09832678268350173
0.9570004237510079
0.6738242400673814
-0.6766410840129422
-0.0385469552550214
0.6275493330727093
-0.7820003471960738
-0.031221636284405913
0.42011471255223154
-1.0917141049013857
-0.2544413114476536
-2.008661491369921
1.5532757383576818
-2.4399589383469364
3.3647749061613177
-0.15416955306443042
0.7941099566284389
-1.46302916986631
8.161654407266907
1.4060066708823693
-0.08170228149736802
1.0362081688881

  0%|          | 0/10 [00:00<?, ?it/s]

0.028258012011809684
-0.00022216148582771336
0.0072549292363601105
0.08170302004442688
0.12133108721769455
-0.0439146837190533
-0.01691344721406558
0.03315169207196703
-0.038453433021355295
-0.1037779722074211
0.05664026845666503
0.10401246415674006
-0.15188469709665775
0.024172419138931422
-0.0787982005872884
-0.3303527573623083
-0.3247324425112008
-0.10409176486639055
0.09834666556733751
-0.04853861401869465
0.28901616906357647
-0.08513307986995046
0.12923574358507126
-0.054758221758446024
0.17619821319491752
-0.34382046467140936
-0.051610351270104105
-0.5295566443540202
1.0109288395098328
-0.3220689466359552
-0.6216070842484278
-0.38901144630058726
0.2918415253382285
-0.18545958685181876
0.2972796129282317
-0.7680685085593566
0.8608190797442685
-1.7574375880385056
2.105169099923748
-1.3696462231042816
-0.16219015018291494
1.7825879352606275
1.0175631939550778
-1.2654367342210477
2.950954207322851
0.7177975690759648
0.6127643672865176
-2.290413446716829
2.4887161876219848
-0.38072645




OverflowError: math range error

In [None]:
def prediction_on_test_set(row:pd.Series, users_embeddings:dict, items_embeddings:dict)->pd.Series:
    user = row['UserID']
    item_1 = row['Item1']
    item_2 = row['Item2']

    item_1_score = np.dot(users_embeddings[user], items_embeddings[item_1])
    item_2_score = np.dot(users_embeddings[user], items_embeddings[item_2])

    if item_1_score > item_2_score:
        row['prediction'] = 0
    else:
        row['prediction'] = 1