In [14]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [15]:
class MovieLensDataset(Dataset):
    """Dataset that interleaves every rating row with sampled negative items.

    Each row of ``ratings`` produces ``num_negatives + 1`` consecutive samples:
    first the row itself (with its own label), then ``num_negatives`` pairs of
    the same user with a randomly drawn item, labelled 0.

    Args:
        ratings: DataFrame providing ``userId``, ``movie_idx`` and ``label``
            columns.
        num_negatives: Number of negative samples generated per rating row.
        all_movies: Collection of item ids that negatives are drawn from.
            Defaults to the unique ``movie_idx`` values of ``ratings``.
        user_item_set: Set of ``(user, item)`` pairs that must never be drawn
            as negatives. Defaults to the pairs present in ``ratings``.

    Note:
        The candidate pool is snapshotted from ``all_movies`` at construction
        time; later mutation of ``all_movies`` is not reflected in sampling.
    """

    # Random draws attempted before falling back to explicit enumeration of
    # the items a user has not interacted with.
    _MAX_REJECTIONS = 100

    def __init__(self, ratings, num_negatives=4, all_movies=None, user_item_set=None):
        self.ratings = ratings
        self.num_negatives = num_negatives
        self.users = ratings['userId'].values
        self.items = ratings['movie_idx'].values
        self.labels = ratings['label'].values

        self.all_movies = all_movies if all_movies is not None else set(ratings['movie_idx'].unique())
        self.user_item_set = user_item_set if user_item_set is not None else set(zip(self.users, self.items))
        self.num_users = max(ratings['userId'].max() + 1, ratings['userId'].nunique())
        self.num_items = max(ratings['movie_idx'].max() + 1, ratings['movie_idx'].nunique())

        # Materialise the candidate pool once; rebuilding a list from the set
        # on every draw costs O(#items) per sample.
        self._movie_pool = np.array(list(self.all_movies))

    def __len__(self):
        """Return the number of samples: one positive plus the negatives per row."""
        return len(self.ratings) * (self.num_negatives + 1)

    def __getitem__(self, idx):
        """Return ``(user, item, label)`` as one-element tensors for sample ``idx``.

        Within each group of ``num_negatives + 1`` consecutive indices, the
        first is the original rating row and the rest are freshly sampled
        negatives for the same user.
        """
        group_size = self.num_negatives + 1
        real_idx = idx // group_size
        user = self.users[real_idx]

        if idx % group_size == 0:  # positive: the rating row itself
            item = self.items[real_idx]
            label = self.labels[real_idx]
            return torch.LongTensor([user]), torch.LongTensor([item]), torch.FloatTensor([label])

        neg_item = self._negative_sampling(user)  # negative: unseen item
        return torch.LongTensor([user]), torch.LongTensor([neg_item]), torch.FloatTensor([0])

    def _negative_sampling(self, user):
        """Draw a random item id that ``user`` has no recorded interaction with.

        Raises:
            ValueError: If the candidate pool is empty or every candidate item
                is already paired with ``user`` in ``user_item_set``.
        """
        pool = self._movie_pool
        if pool.size == 0:
            raise ValueError("Cannot sample a negative item: the candidate pool is empty")

        # Rejection sampling: cheap as long as the user has seen few items.
        for _ in range(self._MAX_REJECTIONS):
            neg_item = np.random.choice(pool)
            if (user, neg_item) not in self.user_item_set:
                return neg_item

        # Repeated rejections: enumerate the valid candidates instead of
        # looping forever when the user has interacted with (almost) all items.
        candidates = [movie for movie in pool if (user, movie) not in self.user_item_set]
        if not candidates:
            raise ValueError(
                f"Cannot sample a negative item for user {user}: "
                "every candidate item is already in user_item_set"
            )
        return np.random.choice(candidates)

In [16]:
# 2. NCF Model
class NCF(nn.Module):
    """Two-branch recommender combining an element-wise product with an MLP.

    The GMF branch multiplies user and item embeddings element-wise; the MLP
    branch feeds the concatenation of a second pair of embeddings through a
    stack of ``Linear`` + ``ReLU`` layers. Both branch outputs are
    concatenated and mapped to a single sigmoid-activated score.

    Args:
        num_users: Number of rows in the user embedding tables.
        num_items: Number of rows in the item embedding tables.
        embedding_size: Width of every embedding vector.
        mlp_layers: Output widths of the successive MLP layers. May be empty,
            in which case the concatenated MLP embeddings are passed straight
            to the fusion layer.
    """

    def __init__(self, num_users, num_items, embedding_size=32, mlp_layers=(64, 32, 16)):
        super().__init__()
        # Immutable default above avoids sharing one list object across calls;
        # copy so any iterable of ints is accepted.
        mlp_layers = list(mlp_layers)

        self.user_embedding_gmf = nn.Embedding(num_users, embedding_size)
        self.item_embedding_gmf = nn.Embedding(num_items, embedding_size)
        self.user_embedding_mlp = nn.Embedding(num_users, embedding_size)
        self.item_embedding_mlp = nn.Embedding(num_items, embedding_size)

        mlp_modules = []
        input_size = embedding_size * 2
        for output_size in mlp_layers:
            mlp_modules.append(nn.Linear(input_size, output_size))
            mlp_modules.append(nn.ReLU())
            input_size = output_size
        self.mlp = nn.Sequential(*mlp_modules)

        # ``input_size`` now holds the MLP branch's output width: the last
        # layer's size, or 2 * embedding_size when no layers were requested.
        self.fusion_layer = nn.Linear(embedding_size + input_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, user, item):
        """Score ``(user, item)`` index tensors.

        Args:
            user: Integer tensor of user indices.
            item: Integer tensor of item indices, same shape as ``user``.

        Returns:
            Tensor of sigmoid outputs with the input shape plus a trailing
            dimension of size 1.
        """
        # GMF branch: element-wise interaction of the embeddings.
        user_emb_gmf = self.user_embedding_gmf(user)
        item_emb_gmf = self.item_embedding_gmf(item)
        gmf_output = user_emb_gmf * item_emb_gmf

        # MLP branch: learned interaction over the concatenated embeddings.
        user_emb_mlp = self.user_embedding_mlp(user)
        item_emb_mlp = self.item_embedding_mlp(item)
        mlp_input = torch.cat([user_emb_mlp, item_emb_mlp], dim=-1)
        mlp_output = self.mlp(mlp_input)

        fusion_input = torch.cat([gmf_output, mlp_output], dim=-1)
        output = self.fusion_layer(fusion_input)
        return self.sigmoid(output)



In [17]:
# 3. Recall@10
def recall_at_k(model, test_loader, k=10):
    """Compute the mean per-batch recall of the top-``k`` scored samples.

    For every batch, the ``k`` highest-scoring samples are selected and the
    sum of their labels is divided by the sum of all labels in the batch.
    Batches whose labels sum to zero are skipped.

    Note:
        Recall is aggregated per batch of the loader, not per user.

    Args:
        model: Module called as ``model(user, item)`` that returns one score
            per sample.
        test_loader: Iterable yielding ``(user, item, label)`` tensor batches.
        k: Number of top-scored samples considered per batch; clamped to the
            batch size when the batch is smaller.

    Returns:
        The mean recall over the evaluated batches, or 0 when no batch
        contained a relevant sample.
    """
    model.eval()

    # Evaluate on whatever device the model lives on rather than assuming
    # CUDA; a parameter-free model leaves the batches where they are.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    recalls = []
    with torch.no_grad():
        for user, item, label in test_loader:
            if device is not None:
                user, item, label = user.to(device), item.to(device), label.to(device)

            # Flatten so top-k ranks the samples of the batch instead of the
            # trailing singleton dimension of the model output.
            scores = model(user, item).reshape(-1)
            labels = label.reshape(-1).to(scores.device)

            total_relevant = labels.sum()
            if total_relevant > 0:
                top_k = min(k, scores.numel())  # last batch may be smaller than k
                _, indices = torch.topk(scores, top_k)
                relevant = labels[indices].sum()
                # Store Python floats so np.mean never sees device tensors.
                recalls.append((relevant / total_relevant).item())
    return float(np.mean(recalls)) if recalls else 0

In [18]:
def main(batch_size=256):
    """Load the train/val/test splits and build their datasets and loaders.

    Reads ``train.csv``, ``val.csv`` and ``test.csv`` from the working
    directory, shifts ``userId`` and ``movie_idx`` down by one, and wraps each
    split in a ``MovieLensDataset`` and a ``DataLoader``.

    Args:
        batch_size: Batch size used for all three loaders.

    Returns:
        Tuple ``(train_loader, val_loader, test_loader)``.
    """
    train_df = pd.read_csv('train.csv')
    val_df = pd.read_csv('val.csv')
    test_df = pd.read_csv('test.csv')
    eval_dfs = (val_df, test_df)

    # 0-based indexing (assumes the ids stored in the CSVs are 1-based; TODO confirm)
    for df in (train_df, *eval_dfs):
        df['userId'] = df['userId'] - 1
        df['movie_idx'] = df['movie_idx'] - 1

    # Global item pool for negative sampling: every item seen in any split.
    all_movies = set(train_df['movie_idx'].unique())
    for df in eval_dfs:
        all_movies |= set(df['movie_idx'].unique())

    # Training negatives only need to avoid training interactions.
    train_pairs = set(zip(train_df['userId'].values, train_df['movie_idx'].values))

    # Evaluation negatives must also avoid held-out interactions; otherwise a
    # val/test positive could be re-drawn as a label-0 negative for its user.
    known_pairs = set(train_pairs)
    for df in eval_dfs:
        known_pairs.update(zip(df['userId'].values, df['movie_idx'].values))

    # Datasets
    train_dataset = MovieLensDataset(train_df, all_movies=all_movies, user_item_set=train_pairs)
    val_dataset = MovieLensDataset(val_df, all_movies=all_movies, user_item_set=known_pairs)
    test_dataset = MovieLensDataset(test_df, all_movies=all_movies, user_item_set=known_pairs)

    # Shuffle training batches: the dataset emits each positive immediately
    # followed by its negatives, in file order.
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    return train_loader, val_loader, test_loader

# Entry point: run main() only when this code executes as the top-level module.
if __name__ == "__main__":
    main()