In [91]:
import torch
from torch import nn, optim
from torch.utils.data import Dataset, TensorDataset, DataLoader
import tqdm

In [92]:
import pandas as pd
from sklearn import model_selection
# Apply and divide Train-Data and Test-Data.
df = pd.read_csv("ml-20m/ml-20m/ratings.csv")

In [95]:
# Variable 'X' is (userID, movieID) pair.
X = df[["userId", "movieId"]].values
Y = df[["rating"]].values
df[["rating"]] = df[["rating"]].astype(int)
# Y = df[['rating']].astype(int)
print(Y)
print(df)

[[3]
 [3]
 [3]
 ...
 [3]
 [5]
 [2]]
          userId  movieId  rating   timestamp
0              1        2       3  1112486027
1              1       29       3  1112484676
2              1       32       3  1112484819
3              1       47       3  1112484727
4              1       50       3  1112484580
...          ...      ...     ...         ...
20000258  138493    68954       4  1258126920
20000259  138493    69526       4  1259865108
20000260  138493    69644       3  1260209457
20000261  138493    70286       5  1258126944
20000262  138493    71619       2  1255811136

[20000263 rows x 4 columns]


In [96]:
# Divide Train-Data and Test-Data as 9 to 1.
train_X, test_X, train_Y, test_Y\
    = model_selection.train_test_split(X, Y, test_size=0.1)

In [97]:
# Convert float32 of tensor that the 'X' is ID, and Integer,
# and the 'Y' is Real-Number.
train_dataset = TensorDataset(
    torch.tensor(train_X, dtype=torch.int64),
    torch.tensor(train_Y, dtype=torch.float32),
)

test_dataset = TensorDataset(
    torch.tensor(test_X, dtype=torch.int64),
    torch.tensor(test_Y, dtype=torch.float32),
)

In [98]:
train_loader = DataLoader(
    train_dataset, batch_size=1024, num_workers=4, shuffle=True
)

test_loader = DataLoader(
    train_dataset, batch_size=1024, num_workers=4
)

In [99]:
class MatrixFactorization(nn.Module):
    def __init__(self, max_user, max_item, k=20):
        super().__init__()
        self.max_user = max_user
        self.max_item = max_item
        self.user_emb = nn.Embedding(max_user, k, 0)
        self.item_emb = nn.Embedding(max_item, k, 0)
        
    def forward(self, x):
        user_idx = x[:, 0]
        item_idx = x[:, 1]
        user_feature = self.user_emb(user_idx)
        item_feature = self.item_emb(item_idx)
        
        # 'user_feature * item_feature' is the dimension (batch_size, k),
        # so it becames each of internal-Samples when you get the value 
        # of 'sum' of variable 'k'.
        out = torch.sum(user_feature * item_feature, 1)
        
        # Adjust range [0, 5].
        out = torch.sigmoid(out) * 5
        # out = nn.functional.sigmoid(out) * 5
        return out

In [100]:
max_user, max_item = X.max(0)
# Cast 'np.int64' types to the Python standard type, 'int'.
max_user = int(max_user)
max_item = int(max_item)
net = MatrixFactorization(max_user + 1, max_item + 1)

In [101]:
def eval_net(net, loader, score_fn=nn.functional.l1_loss, device="cpu"):
    ys = []
    ypreds = []
    
    for x, y in loader:
        x = x.to(device)
        ys.append(y)
        
        with torch.no_grad():
            ypred = net(x).to("cpu").view(-1)
        ypreds.append(ypred)
    
    score = score_fn(torch.cat(ys).squeeze(), torch.cat(ypreds))
    return score.item()

In [102]:
df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3,1112486027
1,1,29,3,1112484676
2,1,32,3,1112484819
3,1,47,3,1112484727
4,1,50,3,1112484580
...,...,...,...,...
20000258,138493,68954,4,1258126920
20000259,138493,69526,4,1259865108
20000260,138493,69644,3,1260209457
20000261,138493,70286,5,1258126944


In [103]:
net.to("cuda:0")
opt = optim.Adam(net.parameters(), lr=0.01)
loss_f = nn.MSELoss()

In [105]:
from statistics import mean
for epoch in range(5):
    loss_log = []
    for x, y in tqdm.tqdm(train_loader):
        x = x.to("cuda:0")
        y = y.to("cuda:0")
        o = net(x)
        
        loss = loss_f(o, y.view(-1))
        net.zero_grad()
        loss.backward()
        opt.step()
        loss_log.append(loss.item())
        
    test_score = eval_net(net, test_loader, device="cuda:0")
    print(epoch, mean(loss_log), test_score, flush=True)

100%|██████████| 17579/17579 [04:43<00:00, 61.94it/s]


0 0.9477123191613129 0.7142005562782288


100%|██████████| 17579/17579 [05:25<00:00, 54.07it/s]


1 0.9012071292950599 0.6995598077774048


100%|██████████| 17579/17579 [05:32<00:00, 52.95it/s]


2 0.8782200102303187 0.6920842528343201


100%|██████████| 17579/17579 [05:38<00:00, 51.87it/s]


3 0.8654022910562525 0.6881285309791565


100%|██████████| 17579/17579 [05:52<00:00, 49.93it/s]


4 0.8575298043571985 0.6837207674980164


In [106]:
# Move Trained-Model to 'CPU'.
# Thankfully, I don't use VM(Virtual-Machine), but use CUDA,
# so it works smoothly.
net.to("cpu")

# Calculate evaluation of User1 of Movie10.
query = (1, 10)
print(query)

# df[['rating']] = df[['rating']].astype(int)


# Convert as 'int64' Tensor and add batch-dimension.
query = torch.tensor(query, dtype=torch.float).view(1, -1)

# Send to 'net'.
net(query)
# print(net(query))

(1, 10)


RuntimeError: Expected tensor for argument #1 'indices' to have scalar type Long; but got torch.FloatTensor instead (while checking arguments for embedding)

In [None]:
# Select User1 of Top-5-movies.
query = query.float()
query = torch.stack([
    torch.zeros(max_item).fill_(1),
    torch.arange(1, max_item+1)
], 1).long()

# 'scores' is a number of Top-k scores.
# 'indices' is a number of Top-k position,
# it means movieId.
scores, indices = torch.topk(net(query), 5)