In [1]:
import pandas as pd
import numpy as np
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import random as rnd

# Load Data

In [2]:
def load_data(file, sep='\t'):
    return pd.read_csv(f'./lfm-challenge-data/{file}', delimiter=sep)

In [3]:
users = load_data('lfm-challenge.user')
items = load_data('lfm-challenge.item')
inter_train = load_data('lfm-challenge.inter_train')
inter_test = load_data('lfm-challenge.inter_test')
test_users = pd.read_csv(f'./lfm-challenge-data/test_indices.txt')['users'].values

n_users = users['user_id'].values.size
n_items = items.index.values.size

In [4]:
users

Unnamed: 0,user_id,country,age_at_registration,gender,registration_date
0,0,RU,27,m,2007-03-27 19:50:20
1,1,IT,33,m,2006-06-18 21:07:33
2,2,BR,19,m,2010-01-14 05:55:11
3,3,RU,25,m,2007-10-12 18:42:00
4,4,UK,25,m,2005-06-15 22:02:11
...,...,...,...,...,...
9694,9694,RU,24,m,2010-10-25 00:55:47
9695,9695,US,24,f,2005-04-15 04:29:35
9696,9696,RU,20,m,2011-07-29 21:09:24
9697,9697,DE,18,m,2010-05-14 14:07:05


# Interaction Matrix

In [5]:
def create_interaction_matrix(users, items, inter, threshold=1, binary=False):
    interaction_matrix = np.zeros((n_users, n_items), dtype=np.int8)
    
    for user in range(n_users):
        interacted_items = inter.loc[inter['user_id'] == user, 'item_id'].values
        rate_of_items = inter.loc[inter['user_id'] == user, 'listening_events'].values
        
        for item in range(interacted_items.size):
            rating = rate_of_items[item]
            if binary:
                rating = 0 if rating < threshold else 1
            
            interaction_matrix[user, interacted_items[item]] = rating
    
    return interaction_matrix

In [6]:
interaction_matrix = create_interaction_matrix(users, items, inter_train, binary=True)
test_interaction_matrix = create_interaction_matrix(users, items, inter_test, binary=True)

## Test Predictions

In [12]:
def get_ndcg_score(predictions: np.ndarray, test_interaction_matrix: np.ndarray, topK=10) -> float:
    """
    predictions - np.ndarray - predictions of the recommendation algorithm for each user.
    test_interaction_matrix - np.ndarray - test interaction matrix for each user.
    topK - int - topK recommendations should be evaluated.
    
    returns - average ndcg score over all users.
    """
    n_users = predictions.shape[0]
    discounts = np.log2(np.arange(2, topK+2)) # discounts for positions 1 to topK (0-indexed)
    ndcg_scores = np.zeros(n_users)

    for user in range(n_users):
        top_items = predictions[user]
        relevant_items = test_interaction_matrix[user].nonzero()[0]
        if len(relevant_items) == 0:
            continue

        # calculate DCG
        dcg = 0
        for i, item in enumerate(top_items):
            if item in relevant_items:
                dcg += 1 / discounts[i]

        # calculate IDCG
        n_relevant = min(topK, len(relevant_items))
        idcg = np.sum(1 / discounts[:n_relevant]) # it is 1 for each relevant item in ideal case
        
        ndcg_scores[user] = dcg / idcg

    return np.mean(ndcg_scores)


In [13]:
predictions = np.load('itemknn_rec.npy')
predictions

array([[7258,  148, 3432, 6904, 7767,  970,  157, 1518,  152,  271],
       [3015, 2349,  170, 5423, 8566, 5190, 3394, 2707, 6955, 4144],
       [1999, 3580, 2011, 9302,  410, 3999, 5210, 8126, 3174, 8167],
       [ 986, 3867, 7682, 4293, 7038, 3490, 6921, 3717,  255, 1561],
       [1358,  351, 9408, 1263,  203,  345, 1274, 1273, 9410, 1189],
       [ 593,  114,   56,  383, 2359,   86,  400, 2605,   68,  152],
       [5941,  458, 7802, 6212, 8773, 8320, 5666,  710, 2366, 5820],
       [ 410,  420,  449,  415,  728, 2359, 2203,  593, 2605,  928],
       [7580, 9391, 1697, 5086, 9702,  104,   88, 7846, 2664, 8259],
       [7700, 8364, 1526, 1556,  897, 4914,  649, 6151,  352, 5682],
       [ 502,  251, 8664,  231,  239, 5340, 1545,  298, 5342,    2],
       [ 157, 1509,  313,  148,  368, 1134, 3305,  160, 1540, 4144],
       [5286, 2282, 5324, 5330, 2214, 5328,  157, 7513, 1305, 3808],
       [1069, 8607, 3194,  669, 3063, 2963, 7136, 8478,  647, 3173],
       [3173, 4094, 1851, 1040, 10

In [14]:
score = get_ndcg_score(predictions, test_interaction_matrix)
score

0.0013886244387355457

# POP Recommender Country

In [70]:
def recTopKPopByCountry(inter_matr: np.array,
               user: int,
               top_k: int,
               users: pd.DataFrame) -> np.array:
    '''
    inter_matr - np.array from the task 1;
    user - user_id, integer;
    top_k - expected length of the resulting list;
    users: pandas Dataframe consisting of user information for all users, requires a "country" column

    returns - list/array of top K popular items that the user has never seen
              (sorted in the order of descending popularity);
    '''
    # TODO: YOUR IMPLEMENTATION
    top_pop = None

    rec_items_before = np.where(inter_matr[user] == 1)[0]
    age_at_registration = users.loc[users['user_id'] == user, 'age_at_registration'].values[0]
    users_from_same_age_at_registration = users.loc[users['age_at_registration'] == age_at_registration, 'user_id'].values
    new_inter_matr = inter_matr[users_from_same_age_at_registration]
    
    interactions = pd.Series(np.sum(new_inter_matr, axis=0)).drop(rec_items_before).sort_values(ascending=False, kind='mergesort')
    top_pop = interactions.index[:top_k].to_numpy()
    
    return top_pop

In [72]:
predictions = []

for user in range(n_users):
    print(f'User {user}/{n_users}', end='\r')
    top_10 = recTopKPopByCountry(inter_matr=interaction_matrix, user=user, top_k=10, users=users)
    predictions.append(top_10)

predictions = np.array(predictions)

User 9698/9699

In [73]:
predictions.shape

(9699, 10)

In [74]:
score = get_ndcg_score(predictions, test_interaction_matrix)
score

0.027594118519486035

In [14]:
inter_train

Unnamed: 0,user_id,item_id,listening_events
0,0,3,4
1,0,4,2
2,0,6,2
3,0,7,3
4,1,11,3
...,...,...,...
138767,9698,720,7
138768,9698,727,3
138769,9698,2586,2
138770,9698,2963,2


# Neural Network

In [15]:
class MF(nn.Module):

    def __init__(self, n_users: int, n_items: int, n_factors: int):
        """
        n_users - int - number of users.
        n_items - int - number of items.
        n_factors - int - dimensionality of the latent space.
        """

        super(MF, self).__init__()

        self.embedding_user = nn.Embedding(n_users, n_factors)
        self.embedding_item = nn.Embedding(n_items, n_factors)
        
        # TODO: YOUR IMPLEMENTATION.


    def forward(self, user: torch.Tensor, item: torch.Tensor) -> torch.Tensor:
        """
        We allow for some flexibility giving lists of ids as inputs:
        if the training data is small we can deal with it in a single forward pass,
        otherwise we could fall back to mini-batches, limiting users and items we pass
        every time.
        
        user - torch.Tensor - user_ids.
        item - torch.Tensor - item_ids.
        
        returns - torch.Tensor - Reconstructed Interaction matrix of shape (n_users, n_items).
        """
        u = self.embedding_user(user)
        v = self.embedding_item(item)

        # TODO: YOUR IMPLEMENTATION.
        
        return (u @ v.T)

In [16]:
def compute_loss(logits: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
    """
    logits - torch.Tensor - output of model.
    labels - torch.Tensor - labels / interaction matrix model should learn to reconstruct.
    
    returns - torch.Tensor - BCELoss over all logits and labels.
    """
    loss = nn.BCELoss()(torch.sigmoid(logits), labels)
    
    # TODO: YOUR IMPLEMENTATION.

    return loss

In [17]:
def train(model: nn.Module, train_data_inter: np.ndarray, epochs: int, optimizer, loss_func) -> list:
    """
    model - nn.Module - torch module to train.
    train_data_inter - np.ndarray - interaction matrix of the training data.
    epochs - int - number of epochs to perform.
    optimizer - optim - optimizer for training.
    loss_func - loss function for training.
    
    returns - list - list of loss values over all epochs.
    """
    losses = []

    model.train()

    user_ids = torch.Tensor(list(range(train_data_inter.shape[0]))).long()
    item_ids = torch.Tensor(list(range(train_data_inter.shape[1]))).long()
    y = torch.Tensor(train_data_inter).long()

    for e in range(epochs):
        optimizer.zero_grad()

        y_hat = model(user_ids, item_ids)

        loss = loss_func(y_hat.unsqueeze(0).float(), y.unsqueeze(0).float())
        loss.backward()
        optimizer.step()

        if e % 100 == 0:
            print("Loss ", e, ": ", loss.item())

        losses.append(loss.item())

    return losses

In [18]:
# Do not change the seed.
torch.manual_seed(1234)
rnd.seed(1234)
np.random.seed(1234)

model_128 = MF(n_users=n_users, n_items=n_items, n_factors=128)
optimizer = optim.Adam(model_128.parameters(), lr=0.001)

# TODO: YOUR IMPLEMENATION.
# Initialize the model and optimizer as prescribed


In [19]:
loss_model_128 = train(model=model_128,
                       train_data_inter=interaction_matrix,
                       epochs=1000,
                       optimizer=optimizer,
                       loss_func=compute_loss)

assert len(loss_model_128) == 1000, "Loss should have 1000 elements, one for each epoch."

Loss  0 :  10.083086013793945
Loss  100 :  7.214507579803467
Loss  200 :  3.5739328861236572
Loss  300 :  0.8108418583869934
Loss  400 :  0.21792972087860107
Loss  500 :  0.09922955930233002
Loss  600 :  0.064073845744133
Loss  700 :  0.050044916570186615
Loss  800 :  0.04316549748182297
Loss  900 :  0.039255231618881226


In [20]:
def itMF_recommend(user_id: int, seen_item_ids: list, model=None, topK=10) -> np.ndarray:
    """
    Recommend with the trained model to selected users
    
    user_id - int - id of target user.
    seen_item_ids - list[list[int]] ids of items already seen by the users (to exclude from recommendation)
    model - trainted factorization model to use for scoring
    topK - number of recommendations per user to be returned
    
    returns - np.ndarray - np.ndarray - list of ids of recommended items in the order of descending score
                           use -1 as a place holder item index, when it is impossible to recommend topK items
    """
    recs = None

    model.eval()
    user_embedding = model.embedding_user(torch.tensor([user_id]))
    item_embeddings = model.embedding_item.weight
    
    # Compute the score for each item
    all_ratings = torch.matmul(user_embedding, item_embeddings.T).flatten()
    
    # Set scores of seen items to -inf to ignore them
    all_ratings[seen_item_ids] = float('-inf')
    
    # Get the indices of the top K items
    _, top_indices = torch.topk(all_ratings, topK)
    recs = top_indices.tolist()

    return np.array(recs)

In [22]:
user_id = 5
seen_list = np.where(interaction_matrix[user_id] != 0)

itMF_recommend(user_id, seen_list, model_128)

array([2401, 5887, 1043, 8751, 4345, 8473, 1708, 5059, 5805, 4029])

In [25]:
def recommend():
    recommendations = []
    
    for user_id in range(n_users):
        print(f'{user_id}/{n_users}', end='\r')
        seen_list = np.where(interaction_matrix[user_id] != 0)
        rec = itMF_recommend(user_id, seen_list, model_128)
        recommendations.append(rec)
    
    return np.array(recommendations)

In [26]:
recs = recommend()

0/96991/96992/96993/96994/96995/96996/96997/96998/96999/969910/969911/969912/969913/969914/969915/969916/969917/969918/969919/969920/969921/969922/969923/969924/969925/969926/969927/969928/969929/969930/969931/969932/969933/969934/969935/969936/969937/969938/969939/969940/969941/969942/969943/969944/969945/969946/969947/969948/969949/969950/969951/969952/969953/969954/969955/969956/969957/969958/969959/969960/969961/969962/969963/969964/969965/969966/969967/969968/969969/969970/969971/969972/969973/969974/969975/969976/969977/969978/969979/969980/969981/969982/969983/969984/969985/969986/969987/969988/969989/969990/969991/969992/969993/969994/969995/969996/969997/969998/969999/9699100/9699101/9699102/9699103/9699104/9699105/9699106/9699107/9699108/9699109/9699110/9699111/9699112/9699113/9699114/9699115/9699116/9699117/9699118/9699119/9699120/9699121/9699122/9699123

1677/96991678/96991679/96991680/96991681/96991682/96991683/96991684/96991685/96991686/96991687/96991688/96991689/96991690/96991691/96991692/96991693/96991694/96991695/96991696/96991697/96991698/96991699/96991700/96991701/96991702/96991703/96991704/96991705/96991706/96991707/96991708/96991709/96991710/96991711/96991712/96991713/96991714/96991715/96991716/96991717/96991718/96991719/96991720/96991721/96991722/96991723/96991724/96991725/96991726/96991727/96991728/96991729/96991730/96991731/96991732/96991733/96991734/96991735/96991736/96991737/96991738/96991739/96991740/96991741/96991742/96991743/96991744/96991745/96991746/96991747/96991748/96991749/96991750/96991751/96991752/96991753/96991754/96991755/96991756/96991757/96991758/96991759/96991760/96991761/96991762/96991763/96991764/96991765/96991766/96991767/96991768/96991769/96991770/96991771/96991772/96991773/96991774/96991775/96991776/9699

3638/96993639/96993640/96993641/96993642/96993643/96993644/96993645/96993646/96993647/96993648/96993649/96993650/96993651/96993652/96993653/96993654/96993655/96993656/96993657/96993658/96993659/96993660/96993661/96993662/96993663/96993664/96993665/96993666/96993667/96993668/96993669/96993670/96993671/96993672/96993673/96993674/96993675/96993676/96993677/96993678/96993679/96993680/96993681/96993682/96993683/96993684/96993685/96993686/96993687/96993688/96993689/96993690/96993691/96993692/96993693/96993694/96993695/96993696/96993697/96993698/96993699/96993700/96993701/96993702/96993703/96993704/96993705/96993706/96993707/96993708/96993709/96993710/96993711/96993712/96993713/96993714/96993715/96993716/96993717/96993718/96993719/96993720/96993721/96993722/96993723/96993724/96993725/96993726/96993727/96993728/96993729/96993730/96993731/96993732/96993733/96993734/96993735/96993736/96993737/9699

5645/96995646/96995647/96995648/96995649/96995650/96995651/96995652/96995653/96995654/96995655/96995656/96995657/96995658/96995659/96995660/96995661/96995662/96995663/96995664/96995665/96995666/96995667/96995668/96995669/96995670/96995671/96995672/96995673/96995674/96995675/96995676/96995677/96995678/96995679/96995680/96995681/96995682/96995683/96995684/96995685/96995686/96995687/96995688/96995689/96995690/96995691/96995692/96995693/96995694/96995695/96995696/96995697/96995698/96995699/96995700/96995701/96995702/96995703/96995704/96995705/96995706/96995707/96995708/96995709/96995710/96995711/96995712/96995713/96995714/96995715/96995716/96995717/96995718/96995719/96995720/96995721/96995722/96995723/96995724/96995725/96995726/96995727/96995728/96995729/96995730/96995731/96995732/96995733/96995734/96995735/96995736/96995737/96995738/96995739/96995740/96995741/96995742/96995743/96995744/9699

6943/96996944/96996945/96996946/96996947/96996948/96996949/96996950/96996951/96996952/96996953/96996954/96996955/96996956/96996957/96996958/96996959/96996960/96996961/96996962/96996963/96996964/96996965/96996966/96996967/96996968/96996969/96996970/96996971/96996972/96996973/96996974/96996975/96996976/96996977/96996978/96996979/96996980/96996981/96996982/96996983/96996984/96996985/96996986/96996987/96996988/96996989/96996990/96996991/96996992/96996993/96996994/96996995/96996996/96996997/96996998/96996999/96997000/96997001/96997002/96997003/96997004/96997005/96997006/96997007/96997008/96997009/96997010/96997011/96997012/96997013/96997014/96997015/96997016/96997017/96997018/96997019/96997020/96997021/96997022/96997023/96997024/96997025/96997026/96997027/96997028/96997029/96997030/96997031/96997032/96997033/96997034/96997035/96997036/96997037/96997038/96997039/96997040/96997041/96997042/9699

8209/96998210/96998211/96998212/96998213/96998214/96998215/96998216/96998217/96998218/96998219/96998220/96998221/96998222/96998223/96998224/96998225/96998226/96998227/96998228/96998229/96998230/96998231/96998232/96998233/96998234/96998235/96998236/96998237/96998238/96998239/96998240/96998241/96998242/96998243/96998244/96998245/96998246/96998247/96998248/96998249/96998250/96998251/96998252/96998253/96998254/96998255/96998256/96998257/96998258/96998259/96998260/96998261/96998262/96998263/96998264/96998265/96998266/96998267/96998268/96998269/96998270/96998271/96998272/96998273/96998274/96998275/96998276/96998277/96998278/96998279/96998280/96998281/96998282/96998283/96998284/96998285/96998286/96998287/96998288/96998289/96998290/96998291/96998292/96998293/96998294/96998295/96998296/96998297/96998298/96998299/96998300/96998301/96998302/96998303/96998304/96998305/96998306/96998307/96998308/9699

9517/96999518/96999519/96999520/96999521/96999522/96999523/96999524/96999525/96999526/96999527/96999528/96999529/96999530/96999531/96999532/96999533/96999534/96999535/96999536/96999537/96999538/96999539/96999540/96999541/96999542/96999543/96999544/96999545/96999546/96999547/96999548/96999549/96999550/96999551/96999552/96999553/96999554/96999555/96999556/96999557/96999558/96999559/96999560/96999561/96999562/96999563/96999564/96999565/96999566/96999567/96999568/96999569/96999570/96999571/96999572/96999573/96999574/96999575/96999576/96999577/96999578/96999579/96999580/96999581/96999582/96999583/96999584/96999585/96999586/96999587/96999588/96999589/96999590/96999591/96999592/96999593/96999594/96999595/96999596/96999597/96999598/96999599/96999600/96999601/96999602/96999603/96999604/96999605/96999606/96999607/96999608/96999609/96999610/96999611/96999612/96999613/96999614/96999615/96999616/9699

In [35]:
recs.shape

(9699, 10)

In [39]:
a = recs[test_users]
b = test_interaction_matrix[test_users]
b.shape

(100, 10000)

In [40]:
score = get_ndcg_score(a, b)
score

0.0010778915452451094