In [21]:
import pandas as pd
import numpy as np
import random
from copy import deepcopy
import torch
from torch.utils.data import DataLoader, Dataset

In [2]:
# Load the ratings file (presumably MovieLens-1M, judging by the path).
# The file has no header row and uses '::' as field separator; a
# multi-character separator is only supported by pandas' Python parser,
# hence engine='python'.
ml1m_dir = 'ml-1m/ratings.dat'
ml1m_rating = pd.read_csv(
    ml1m_dir,
    sep='::',
    header=None,
    names=['uid', 'mid', 'rating', 'timestamp'],
    engine='python',
)

In [3]:
# Map every distinct raw user id ('uid') to a contiguous integer 'userId'
# (0 .. n_users-1), numbered in order of first appearance in the ratings.
user_id = (
    ml1m_rating[['uid']]
    .drop_duplicates()
    .assign(userId=lambda frame: np.arange(len(frame)))
)

In [4]:
# Map every distinct raw item id ('mid') to a contiguous integer 'itemId'
# (0 .. n_items-1), numbered in order of first appearance in the ratings.
item_id = (
    ml1m_rating[['mid']]
    .drop_duplicates()
    .assign(itemId=lambda frame: np.arange(len(frame)))
)

In [5]:
# Attach the contiguous 'userId' / 'itemId' columns to every rating row.
# Left joins keep every rating row, matched on the raw 'uid' / 'mid' keys.
ml1m_rating = (
    ml1m_rating
    .merge(user_id, on='uid', how='left')
    .merge(item_id, on='mid', how='left')
)

In [6]:
# Keep only the re-indexed ids plus rating and timestamp; the raw
# 'uid' / 'mid' columns are dropped here.
ml1m_rating = ml1m_rating.loc[:, ['userId', 'itemId', 'rating', 'timestamp']]

In [7]:
# Sets of all distinct re-indexed user ids and item ids present in the data.
user_pool = set(pd.unique(ml1m_rating['userId']))
item_pool = set(pd.unique(ml1m_rating['itemId']))

In [8]:
# Convert explicit ratings to binary (implicit) feedback: every rating > 0
# becomes 1. Work on a copy so `ml1m_rating` keeps the original values.
ratings = ml1m_rating.copy()
# Assign through a single .loc call instead of chained indexing
# (`ratings['rating'][mask] = ...`): the chained form raises
# SettingWithCopyWarning and, under pandas Copy-on-Write, leaves `ratings`
# unmodified.
ratings.loc[ratings['rating'] > 0, 'rating'] = 1.0

In [9]:
# Split into train / test using leave-one-out evaluation:
# for each user, the latest interaction is held out as the test set and the
# remaining interactions are used for training.
# rank(method='first', ascending=False) assigns rank 1 to the largest
# timestamp within each user and breaks ties by row order.
ratings['rank_latest'] = (
    ratings
    .groupby(['userId'])['timestamp']
    .rank(method='first', ascending=False)
)
test = ratings.loc[ratings['rank_latest'] == 1, ['userId', 'itemId', 'rating']]
train = ratings.loc[ratings['rank_latest'] > 1, ['userId', 'itemId', 'rating']]

In [10]:
# Per user: the set of items they interacted with ('interacted_items') and
# its complement within item_pool ('negative_items').
interact_status = (
    ratings.groupby("userId")["itemId"]
    .apply(set)
    .reset_index()
    .rename(columns={"itemId": "interacted_items"})
)
interact_status['negative_items'] = interact_status['interacted_items'].apply(lambda x: item_pool - x)
# Uniformly sample 99 unobserved items per user, once
# (presumably the negative candidates used at evaluation time - confirm).
# random.sample() requires a sequence: passing a set was deprecated in
# Python 3.9 and raises TypeError from 3.11 on, so materialise a tuple first.
interact_status['negative_samples'] = interact_status['negative_items'].apply(
    lambda x: random.sample(tuple(x), 99)
)

In [19]:
# Custom Dataset
# https://wikidocs.net/57165
class RatingDataset(Dataset):
    """Dataset of aligned (user, item, target) triples.

    Subclasses ``torch.utils.data.Dataset``. Element ``index`` of the dataset
    is the tuple ``(user_tensor[index], item_tensor[index],
    target_tensor[index])``.

    Args:
        user_tensor: tensor of user ids, one entry per example.
        item_tensor: tensor of item ids, aligned with ``user_tensor``.
        target_tensor: tensor of targets, aligned with ``user_tensor``.

    Raises:
        ValueError: if the three tensors differ in size along dimension 0.
    """
    def __init__(self, user_tensor, item_tensor, target_tensor):
        # __len__ only looks at user_tensor, so a length mismatch would
        # otherwise surface later as an IndexError or as misaligned triples.
        sizes = (user_tensor.size(0), item_tensor.size(0), target_tensor.size(0))
        if len(set(sizes)) != 1:
            raise ValueError(
                "user_tensor, item_tensor and target_tensor must have the same "
                f"length, got {sizes[0]}, {sizes[1]} and {sizes[2]}"
            )
        self.user_tensor = user_tensor
        self.item_tensor = item_tensor
        self.target_tensor = target_tensor

    def __len__(self):
        """Return the number of examples (size of dimension 0)."""
        return self.user_tensor.size(0)

    def __getitem__(self, index):
        """Return the (user, item, target) triple stored at ``index``."""
        return self.user_tensor[index], self.item_tensor[index], self.target_tensor[index]

In [12]:
# Flat accumulators for the training triples; they are filled by the loop
# over train_ratings in a later cell.
# NOTE: this rebinds `ratings`, which the earlier cells use for the ratings
# DataFrame. Re-running the split / interact_status cells after this one
# fails unless `ratings` is rebuilt first.
users, items, ratings = [], [], []
# Attach each user's interacted / negative item sets to their training rows.
# 'userId' is the only shared column, so name it explicitly as the join key.
train_ratings = pd.merge(train, interact_status, on='userId')
# sample 10 negative items for every positive training row
# (random.sample() needs a sequence; a set raises TypeError on Python >= 3.11)
train_ratings["negatives"] = train_ratings["negative_items"].apply(lambda x: random.sample(tuple(x), 10))
train_ratings

Unnamed: 0,userId,itemId,rating,interacted_items,negative_items,negative_samples,negatives
0,0,0,1,"{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","{53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 6...","[1102, 2500, 3569, 1120, 2035, 510, 2781, 3365...","[3091, 623, 176, 2474, 1838, 754, 3281, 529, 3..."
1,0,1,1,"{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","{53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 6...","[1102, 2500, 3569, 1120, 2035, 510, 2781, 3365...","[1355, 3702, 2006, 2206, 448, 1618, 3584, 617,..."
2,0,2,1,"{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","{53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 6...","[1102, 2500, 3569, 1120, 2035, 510, 2781, 3365...","[543, 705, 3534, 2100, 222, 647, 2232, 2003, 2..."
3,0,3,1,"{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","{53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 6...","[1102, 2500, 3569, 1120, 2035, 510, 2781, 3365...","[3513, 1113, 1029, 2172, 3109, 112, 310, 3244,..."
4,0,4,1,"{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","{53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 6...","[1102, 2500, 3569, 1120, 2035, 510, 2781, 3365...","[3308, 403, 1368, 1863, 1738, 810, 2085, 286, ..."
...,...,...,...,...,...,...,...
994164,6039,772,1,"{0, 9, 15, 21, 22, 23, 26, 38, 39, 40, 41, 42,...","{1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 1...","[1776, 2219, 123, 1842, 2242, 1149, 2301, 1214...","[2267, 3208, 523, 3586, 2022, 1363, 667, 1768,..."
994165,6039,1106,1,"{0, 9, 15, 21, 22, 23, 26, 38, 39, 40, 41, 42,...","{1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 1...","[1776, 2219, 123, 1842, 2242, 1149, 2301, 1214...","[3559, 2306, 1107, 1810, 2127, 2987, 916, 2388..."
994166,6039,365,1,"{0, 9, 15, 21, 22, 23, 26, 38, 39, 40, 41, 42,...","{1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 1...","[1776, 2219, 123, 1842, 2242, 1149, 2301, 1214...","[1262, 2171, 1920, 624, 1656, 1200, 2520, 2341..."
994167,6039,152,1,"{0, 9, 15, 21, 22, 23, 26, 38, 39, 40, 41, 42,...","{1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 1...","[1776, 2219, 123, 1842, 2242, 1149, 2301, 1214...","[1553, 2279, 1213, 908, 1623, 3334, 2063, 1356..."


In [14]:
# Flatten train_ratings into three parallel lists: each positive interaction
# is followed by one (user, negative_item, 0.0) entry per sampled negative.
# NOTE: this cell appends to the existing lists, so running it twice
# duplicates every example.
for row in train_ratings.itertuples():
    user = int(row.userId)
    users.append(user)
    items.append(int(row.itemId))
    ratings.append(float(row.rating))
    # Iterate over the sampled negatives directly instead of a hard-coded
    # range(10), so the loop stays correct if the sample size changes.
    for negative_item in row.negatives:
        users.append(user)
        items.append(int(negative_item))
        ratings.append(0.0)

In [22]:
# Wrap the flattened training triples in a RatingDataset:
# user / item ids as int64 tensors, targets as float32.
dataset = RatingDataset(
    user_tensor=torch.tensor(users, dtype=torch.long),
    item_tensor=torch.tensor(items, dtype=torch.long),
    target_tensor=torch.tensor(ratings, dtype=torch.float32),
)