In [54]:
import os
import warnings
warnings.filterwarnings("ignore")
import torch
import torch.nn as nn
import pandas as pd
import numpy as np

In [None]:
# MovieLens 100k: directory holding the raw files, and the column names
# applied to the rating records when u.data is loaded.
data_dir = '../../data/MovieLens/ml-100k'
names = [
    'user_id',
    'item_id',
    'rating',
    'timestamp',
]

In [None]:
# Load the tab-separated rating records. The separator is passed by keyword:
# recent pandas versions accept only the path positionally and raise a
# TypeError when the delimiter is given as a second positional argument.
data = pd.read_csv(
    os.path.join(data_dir, 'u.data'),
    sep='\t',
    names=names,
    engine='python',
)

In [None]:
# Number of distinct users and distinct items appearing in the rating records.
num_users = len(data['user_id'].unique())
num_items = len(data['item_id'].unique())

In [None]:
# Sparsity: one minus the ratio of rating rows to all possible
# (user, item) combinations.
sparsity = 1.0 - data.shape[0] / (num_users * num_items)
sparsity

In [None]:
# Leave-one-out split: each user's most recent record becomes the test
# record; all of that user's other records go to the training set.
train_items = {}  # uid -> [(uid, iid, rating, time), ...] for every record of that user
test_items = {}   # uid -> (iid, rating, time) of that user's latest record
train_list = []   # all records (uid, iid, rating, time), ordered by uid, then by time ascending
for line in data.itertuples():
    # line[0] is the DataFrame index; the record fields start at position 1.
    u, i, rating, time = line[1], line[2], line[3], line[4]
    train_items.setdefault(u, []).append((u, i, rating, time))
    # Strict '<' means that among records with the same timestamp the first
    # one encountered stays as the test record.
    if u not in test_items or test_items[u][-1] < time:
        test_items[u] = (i, rating, time)
# Iterate over the user ids actually present (ascending) rather than assuming
# they are exactly 1..num_users, which would raise KeyError on any gap.
for u in sorted(train_items):
    train_list.extend(sorted(train_items[u], key=lambda k: k[3]))
# Flatten test_items into [(uid, iid, rating, time), ...].
test_data = [(key, *value) for key, value in test_items.items()]
# Set membership keeps the filter linear instead of rescanning the whole
# test list for every training record.
held_out = set(test_data)
train_data = [item for item in train_list if item not in held_out]
train_data = pd.DataFrame(train_data)
test_data = pd.DataFrame(test_data)

In [None]:
# Boolean mask over the rows of `data`: True where a uniform draw in [0, 1)
# falls below 1 - test_ratio.
test_ratio = 0.1
mask = np.random.uniform(low=0, high=1, size=len(data)) < (1 - test_ratio)

In [None]:
# Random hold-out split: roughly `test_ratio` of all records are drawn at
# random as the test set and the rest form the training set.
# Note: this rebinds train_data / test_data, replacing any earlier split.
test_ratio = 0.1
# A dedicated, seeded generator makes the split identical on every run
# without touching NumPy's global random state.
split_rng = np.random.default_rng(42)
mask = split_rng.uniform(0, 1, len(data)) < 1 - test_ratio
neg_mask = ~mask  # element-wise negation: rows NOT selected for training
train_data = data[mask]
test_data = data[neg_mask]

In [None]:
# Preview the first rows of the current test split.
test_data.head()

In [None]:
# A pandas DataFrame cannot be handed to a DataLoader directly, so the records
# are first converted to tensors (to be wrapped in a dataset).
# user_id / item_id are replaced by user_idx = user_id - 1 and
# item_idx = item_id - 1 so they can be used directly as matrix indices.
feedback = 'explicit'
if feedback not in ('explicit', 'implicit'):
    # Fail loudly instead of silently leaving `interactions` undefined or stale.
    raise ValueError(f"unknown feedback type: {feedback!r}")

# user index, item index and score of every record in `data`
users, items, scores = [], [], []
if feedback == 'explicit':
    # 2D array: one row per item, one column per user, entries are scores.
    # Entry [i, j] is the score given to iid = i + 1 by uid = j + 1.
    interactions = np.zeros((num_items, num_users))
else:
    # dict: user_index -> [item_index, ...], the items that user interacted with
    interactions = {}

for line in data.itertuples():
    # line[0] is the DataFrame index; line[1] is the user id, line[2] the item id.
    user_index, item_index = int(line[1] - 1), int(line[2] - 1)
    if feedback == 'explicit':
        score = int(line[3])
        interactions[item_index, user_index] = score
    else:
        # Implicit feedback: every observed interaction counts as 1.
        score = 1
        interactions.setdefault(user_index, []).append(item_index)
    users.append(user_index)
    items.append(item_index)
    scores.append(score)

user_tensor = torch.tensor(users)
item_tensor = torch.tensor(items)
score_tensor = torch.tensor(scores)

In [None]:
# Wrap the three aligned tensors in a dataset and keep a reference to it, so
# it can be passed to a DataLoader later instead of being discarded after
# display.
dataset = torch.utils.data.TensorDataset(user_tensor, item_tensor, score_tensor)
dataset

In [55]:
# Parameter wrapping a 3x2 tensor of standard-normal samples.
P = nn.Parameter(torch.randn((3, 2)), requires_grad=True)

In [56]:
# Display the parameter created in the previous cell.
P

Parameter containing:
tensor([[ 1.7630,  0.1453],
        [-1.4067,  1.4468],
        [ 1.5111, -0.7922]], requires_grad=True)