In [1]:
import torch
from torch import nn, optim
from torch.utils.data import Dataset, TensorDataset, DataLoader
import tqdm

In [None]:
import pandas as pd
from sklearn import model_selection
# Read the ratings CSV into a DataFrame; the train/test split happens
# in a later cell.
ratings_csv = "ml-20m/ml-20m/ratings.csv"
df = pd.read_csv(ratings_csv)

In [None]:
# Inputs are (userId, movieId) pairs; targets are the ratings, kept as a
# single-column 2-D array.
id_columns = ["userId", "movieId"]
X = df[id_columns].to_numpy()
Y = df[["rating"]].to_numpy()

In [None]:
# Hold out 10% of the ratings for testing (a 9:1 train/test split).
# A fixed random_state keeps the shuffle, and therefore every score
# reported below, reproducible across kernel restarts.
train_X, test_X, train_Y, test_Y = model_selection.train_test_split(
    X, Y, test_size=0.1, random_state=0
)

In [None]:
# Wrap the NumPy splits as tensors: X holds integer IDs (int64, the index
# dtype used for the embedding lookups) and Y holds real-valued ratings
# (float32).
train_dataset = TensorDataset(
    torch.tensor(train_X, dtype=torch.int64),
    torch.tensor(train_Y, dtype=torch.float32),
)

test_dataset = TensorDataset(
    torch.tensor(test_X, dtype=torch.int64),
    torch.tensor(test_Y, dtype=torch.float32),
)

# Training batches are reshuffled every epoch.
train_loader = DataLoader(
    train_dataset, batch_size=1024, num_workers=4, shuffle=True
)

# Evaluation must iterate over the held-out split. Building this loader
# from train_dataset would report the training error as the test score.
# The batch size only affects throughput here, not the computed score.
test_loader = DataLoader(
    test_dataset, batch_size=1024, num_workers=4
)

In [None]:
class MatrixFactorization(nn.Module):
    """Matrix-factorisation rating model.

    Each user and each item gets a learned ``k``-dimensional embedding; the
    predicted rating is the dot product of the two embeddings squashed into
    the open interval (0, 5) with a sigmoid.

    Args:
        max_user: Number of rows in the user embedding table. Must be larger
            than the largest user index passed to ``forward``.
        max_item: Number of rows in the item embedding table. Must be larger
            than the largest item index passed to ``forward``.
        k: Dimension of the latent feature vectors.
    """

    def __init__(self, max_user, max_item, k=20):
        super().__init__()
        self.max_user = max_user
        self.max_item = max_item
        # padding_idx=0: row 0 is initialised to zeros and receives no
        # gradient updates.
        self.user_emb = nn.Embedding(max_user, k, padding_idx=0)
        self.item_emb = nn.Embedding(max_item, k, padding_idx=0)

    def forward(self, x):
        """Predict ratings for a batch of (user index, item index) pairs.

        Args:
            x: Integer tensor of shape ``(batch_size, 2)``; column 0 is the
                user index and column 1 is the item index.

        Returns:
            Tensor of shape ``(batch_size,)`` with predictions in (0, 5).
        """
        user_idx = x[:, 0]
        item_idx = x[:, 1]
        user_feature = self.user_emb(user_idx)
        item_feature = self.item_emb(item_idx)

        # The element-wise product has shape (batch_size, k); summing over
        # the k axis gives the per-sample inner product.
        out = torch.sum(user_feature * item_feature, dim=1)

        # Squash into the rating range. torch.sigmoid replaces the
        # deprecated nn.functional.sigmoid.
        return torch.sigmoid(out) * 5

In [None]:
# Largest user / movie ID per column, converted from np.int64 to the
# built-in int type.
max_user, max_item = (int(largest_id) for largest_id in X.max(0))
# IDs are used directly as embedding indices, so each table needs
# (largest ID + 1) rows.
net = MatrixFactorization(max_user + 1, max_item + 1)

In [None]:
def eval_net(net, loader, score_fn=nn.functional.l1_loss, device="cpu"):
    """Score ``net`` over every batch yielded by ``loader``.

    Predictions and targets from all batches are flattened, concatenated on
    the CPU, and passed to ``score_fn`` in a single call.

    Args:
        net: Model (or any callable) mapping an input batch to predictions.
        loader: Iterable of ``(x, y)`` tensor pairs.
        score_fn: Called as ``score_fn(targets, predictions)`` and expected
            to return a scalar tensor. Defaults to mean absolute error.
        device: Device the input batches are moved to before the forward
            pass.

    Returns:
        The score as a Python float.
    """
    # Only nn.Module instances have a train/eval mode; plain callables are
    # left untouched.
    is_module = isinstance(net, nn.Module)
    was_training = is_module and net.training
    if is_module:
        net.eval()

    ys = []
    ypreds = []
    try:
        # No gradients are needed for scoring.
        with torch.no_grad():
            for x, y in loader:
                x = x.to(device)
                # reshape(-1) rather than squeeze(): squeeze() would collapse
                # a single-sample target to a 0-dim tensor that no longer
                # matches the shape of the predictions.
                ys.append(y.reshape(-1))
                ypreds.append(net(x).to("cpu").reshape(-1))
    finally:
        # Restore the caller's mode so training can resume unchanged.
        if was_training:
            net.train()

    score = score_fn(torch.cat(ys), torch.cat(ypreds))
    return score.item()

In [None]:
from statistics import mean

# Train on the first GPU when one is available; otherwise fall back to the
# CPU instead of failing on a hard-coded "cuda:0".
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

net.to(device)
opt = optim.Adam(net.parameters(), lr=0.01)
loss_f = nn.MSELoss()

for epoch in range(5):
    # Make the training mode explicit at the start of every epoch.
    net.train()
    loss_log = []
    for x, y in tqdm.tqdm(train_loader):
        x = x.to(device)
        y = y.to(device)
        o = net(x)
        # Flatten the targets to match the 1-D predictions.
        loss = loss_f(o, y.view(-1))
        opt.zero_grad()
        loss.backward()
        opt.step()
        loss_log.append(loss.item())
    # After each epoch, report the mean training loss and the score
    # returned by eval_net on test_loader.
    test_score = eval_net(net, test_loader, device=device)
    print(epoch, mean(loss_log), test_score, flush=True)

In [None]:
# Move the trained model back to the CPU.
net.to("cpu")

# Predict user 1's rating of movie 10. Wrapping the (userId, movieId) pair
# in a list yields an int64 tensor that already has the batch dimension.
query = torch.tensor([(1, 10)], dtype=torch.int64)

# Pass the one-row batch to the network; the cell displays the prediction.
net(query)