In [None]:
import random
from copy import deepcopy

import numpy as np
import pandas as pd
from torch.utils.data import Dataset

In [None]:
# Path of the ratings file, relative to the notebook's working directory.
ml1m_dir = 'ml-1m/ratings.dat'
# The file has no header row and uses '::' between fields, so the column
# names are supplied here and the python parsing engine is requested.
ml1m_rating = pd.read_csv(
    ml1m_dir,
    sep='::',
    header=None,
    names=['uid', 'mid', 'rating', 'timestamp'],
    engine='python',
)

In [None]:
# Lookup table from each distinct raw 'uid' (in order of first appearance)
# to a dense 0-based 'userId'.
user_id = (
    ml1m_rating[['uid']]
    .drop_duplicates()
    .assign(userId=lambda frame: np.arange(len(frame)))
)

In [None]:
# Lookup table from each distinct raw 'mid' (in order of first appearance)
# to a dense 0-based 'itemId'.
item_id = (
    ml1m_rating[['mid']]
    .drop_duplicates()
    .assign(itemId=lambda frame: np.arange(len(frame)))
)

In [None]:
# Attach the dense ids to every rating row. Left joins keep each rating row
# and add the matching 'userId' / 'itemId' from the lookup tables.
ml1m_rating = (
    ml1m_rating
    .merge(user_id, on='uid', how='left')
    .merge(item_id, on='mid', how='left')
)

In [None]:
# Keep only the dense ids, the rating and its timestamp; the raw 'uid' and
# 'mid' columns are dropped from here on.
kept_columns = ['userId', 'itemId', 'rating', 'timestamp']
ml1m_rating = ml1m_rating[kept_columns]

In [None]:
# Sets of every distinct user id and item id present in the ratings.
user_pool, item_pool = (
    set(ml1m_rating[id_column].unique())
    for id_column in ('userId', 'itemId')
)

In [None]:
# Convert to binary (implicit-feedback) data: every rating above 0 becomes 1.
# Work on a copy so the `ml1m_rating` frame itself is left unchanged.
ratings = ml1m_rating.copy()
# A single .loc assignment replaces the former chained indexing
# (ratings['rating'][mask] = ...), which writes to a temporary object and is
# silently ignored when pandas Copy-on-Write is active.
ratings.loc[ratings['rating'] > 0, 'rating'] = 1.0

In [None]:
# Leave-one-out train/test split:
# for each user, the latest interaction is held out as the test set and the
# remaining interactions are used for training.
per_user_timestamps = ratings.groupby('userId')['timestamp']
# Rank 1 is the newest timestamp within a user; method='first' breaks ties by
# order of appearance, so ranks within a user are distinct.
ratings['rank_latest'] = per_user_timestamps.rank(method='first', ascending=False)

split_columns = ['userId', 'itemId', 'rating']
is_latest = ratings['rank_latest'] == 1
is_earlier = ratings['rank_latest'] > 1

test = ratings.loc[is_latest, split_columns]
train = ratings.loc[is_earlier, split_columns]

In [None]:
# One row per user with the set of items that user has interacted with.
interact_status = (
    ratings.groupby("userId")["itemId"]
    .apply(set)
    .reset_index()
    .rename(columns={"itemId": "interacted_items"})
)
# Negative items = every item in the pool the user has NOT interacted with.
interact_status['negative_items'] = interact_status['interacted_items'].apply(lambda x: item_pool - x)
# Uniformly sample 99 unobserved items per user.
# random.sample() requires a sequence: passing a set raises TypeError on
# Python 3.11+, so the set is sorted into a list first. Sorting also fixes the
# population order, so results are repeatable once the `random` module is seeded.
# NOTE(review): no seed is set in the visible cells -- consider random.seed(...)
# in a config cell if reproducible samples are needed.
interact_status['negative_samples'] = interact_status['negative_items'].apply(
    lambda x: random.sample(sorted(x), 99)
)

In [None]:
# Custom Dataset
# https://wikidocs.net/57165
class RatingDataset(Dataset):
    """
    Dataset of (user, item, target) triples; subclasses torch.utils.data.Dataset.

    The three tensors are parallel: position ``i`` of each one belongs to the
    same sample.

    Args:
        user_tensor: tensor of user ids, indexed along dimension 0.
        item_tensor: tensor of item ids, same length as ``user_tensor``.
        target_tensor: tensor of targets, same length as ``user_tensor``.

    Raises:
        ValueError: if the three tensors do not share the same size along
            dimension 0.
    """
    def __init__(self, user_tensor, item_tensor, target_tensor):
        # Fail early on misaligned inputs: __len__ only looks at user_tensor,
        # so a shorter item/target tensor would otherwise surface later as an
        # IndexError in the middle of iteration.
        n_samples = user_tensor.size(0)
        if item_tensor.size(0) != n_samples or target_tensor.size(0) != n_samples:
            raise ValueError(
                "user_tensor, item_tensor and target_tensor must have the same length, got "
                f"{n_samples}, {item_tensor.size(0)} and {target_tensor.size(0)}"
            )
        self.user_tensor = user_tensor
        self.item_tensor = item_tensor
        self.target_tensor = target_tensor

    def __len__(self):
        """Return the number of samples (size of ``user_tensor`` along dim 0)."""
        return self.user_tensor.size(0)

    def __getitem__(self, index):
        """Return the ``(user, item, target)`` triple stored at ``index``."""
        return self.user_tensor[index], self.item_tensor[index], self.target_tensor[index]

In [None]:
# Number of negative items sampled for every positive training interaction.
# Defined here because the loop that consumes `negatives` reads `num_negatives`.
num_negatives = 10

# Accumulators for the flattened training triples.
# NOTE: this rebinds `ratings` (previously the ratings DataFrame) to a list, so
# cells that use the DataFrame must not be re-run after this one.
users, items, ratings = [], [], []
train_ratings = pd.merge(train, interact_status, on='userId')

# random.sample() needs a sequence (a set raises TypeError on Python 3.11+).
# Each user's negative pool is sorted into a list once and reused for all of
# that user's rows, instead of converting the set again for every row.
negative_pools = dict(
    zip(interact_status['userId'], interact_status['negative_items'].map(sorted))
)
# sample `num_negatives` negative items for each training row
train_ratings["negatives"] = train_ratings["userId"].map(
    lambda uid: random.sample(negative_pools[uid], num_negatives)
)
# Preview only the first rows rather than the whole frame.
train_ratings.head()

In [None]:
for row in train_ratings.itertuples():
    users.append(int(row.userId))
    items.append(int(row.itemId))
    ratings.append(float(row.rating))
    for i in range(num_negatives):
        users.append(int(row.userId))
        items.append(int(row.negatives[i]))