In [1]:
import torch
from torch import nn, optim
from torch.utils.data import Dataset, TensorDataset, DataLoader
import tqdm

In [2]:
import pandas as pd
from sklearn import model_selection
# Apply and divide Train-Data and Test-Data.
df = pd.read_csv("ml-20m/ratings.csv")

In [3]:
# Variable 'X' is (userID, movieID) pair.
X = df[["userId", "movieId"]].values
Y = df[['rating']].astype(int)

Y = df[["rating"]].values

# df[["rating"]] = df[["rating"]].astype(int)
print("Format of Y : ", Y)
print("df : ", df)

Format of Y :  [[3.5]
 [3.5]
 [3.5]
 ...
 [3. ]
 [5. ]
 [2.5]]
df :            userId  movieId  rating   timestamp
0              1        2     3.5  1112486027
1              1       29     3.5  1112484676
2              1       32     3.5  1112484819
3              1       47     3.5  1112484727
4              1       50     3.5  1112484580
...          ...      ...     ...         ...
20000258  138493    68954     4.5  1258126920
20000259  138493    69526     4.5  1259865108
20000260  138493    69644     3.0  1260209457
20000261  138493    70286     5.0  1258126944
20000262  138493    71619     2.5  1255811136

[20000263 rows x 4 columns]


In [4]:
# Divide Train-Data and Test-Data as 9 to 1.
train_X, test_X, train_Y, test_Y\
    = model_selection.train_test_split(X, Y, test_size=0.1)
# dtype = torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor

In [13]:
import pandas as pd
from sklearn import model_selection
# Apply and divide Train-Data and Test-Data.
df = pd.read_csv("ml-20m/ratings.csv")

In [5]:
# Convert float32 of tensor that the 'X' is ID, and Integer,
# and the 'Y' is Real-Number.
train_dataset = TensorDataset(
    torch.tensor(train_X, dtype=torch.int64),
    torch.tensor(train_Y, dtype=torch.float32)
)

test_dataset = TensorDataset(
    torch.tensor(test_X, dtype=torch.int64),
    torch.tensor(test_Y, dtype=torch.float32)
)

In [6]:
train_loader = DataLoader(
    train_dataset, batch_size=1024, num_workers=4, shuffle=True
)

test_loader = DataLoader(
    train_dataset, batch_size=1024, num_workers=4
)

In [7]:
class MatrixFactorization(nn.Module):
    def __init__(self, max_user, max_item, k=20):
        super().__init__()
        self.max_user = max_user
        self.max_item = max_item
        self.user_emb = nn.Embedding(max_user, k, 0)
        self.item_emb = nn.Embedding(max_item, k, 0)
        
    def forward(self, x):
        user_idx = x[:, 0]
        item_idx = x[:, 1]
        user_feature = self.user_emb(user_idx)
        item_feature = self.item_emb(item_idx)
        
        # 'user_feature * item_feature' is the dimension (batch_size, k),
        # so it becames each of internal-Samples when you get the value 
        # of 'sum' of variable 'k'.
        out = torch.sum(user_feature * item_feature, 1)
        
        # Adjust range [0, 5].
        out = torch.sigmoid(out) * 5
        # out = nn.functional.sigmoid(out) * 5
        return out

In [8]:
max_user, max_item = X.max(0)
# Cast 'np.int64' types to the Python standard type, 'int'.
# max_user = max_user.type(dtype)
# max_item = max_item.type(dtype)
# It occurs error when you 
max_user = int(max_user)
max_item = int(max_item)
print(max_user)
print(max_item)
net = MatrixFactorization(max_user + 1, max_item + 1)

138493
131262


In [9]:
def eval_net(net, loader, score_fn=nn.functional.l1_loss, device="cpu"):
    ys = []
    ypreds = []
    
    for x, y in loader:
        x = x.to(device)
        ys.append(y)
        
        with torch.no_grad():
            ypred = net(x).to("cpu").view(-1)
        ypreds.append(ypred)
    
    score = score_fn(torch.cat(ys).squeeze(), torch.cat(ypreds))
    return score.item()

In [10]:
from statistics import mean
net.to("cuda:0")
opt = optim.Adam(net.parameters(), lr=0.01)
loss_f = nn.MSELoss()

In [11]:
for epoch in range(5):
    loss_log = []
    for x, y in tqdm.tqdm(train_loader):
        x = x.to("cuda:0")
        y = y.to("cuda:0")
        o = net(x)
        
        loss = loss_f(o, y.view(-1))
        net.zero_grad()
        loss.backward()
        opt.step()
        loss_log.append(loss.item())
        
    test_score = eval_net(net, test_loader, device="cuda:0")
    print(epoch, mean(loss_log), test_score, flush=True)

100%|██████████| 17579/17579 [03:42<00:00, 78.98it/s]


0 1.6112729562645147 0.7146379351615906


100%|██████████| 17579/17579 [04:24<00:00, 66.58it/s]


1 0.8811834963137423 0.6807683110237122


100%|██████████| 17579/17579 [05:01<00:00, 58.27it/s]


2 0.8319680422766569 0.6667460799217224


100%|██████████| 17579/17579 [04:58<00:00, 58.84it/s]


3 0.8098843764191405 0.6586387157440186


100%|██████████| 17579/17579 [04:57<00:00, 59.05it/s]


4 0.7981393395840589 0.6553979516029358


In [12]:
# Move Trained-Model to 'CPU'.
# Thankfully, I don't use VM(Virtual-Machine), but use CUDA,
# so it works smoothly.
net.to("cpu")

MatrixFactorization(
  (user_emb): Embedding(138494, 20, padding_idx=0)
  (item_emb): Embedding(131263, 20, padding_idx=0)
)

In [13]:
# Calculate evaluation of User1 of Movie10.
query = (1, 10)

# Convert as 'int64' Tensor and add batch-dimension.
query = torch.tensor(query, dtype=torch.int64).view(1, -1)

# Send to 'net'.
net(query)
print(net(query))

tensor([3.6396], grad_fn=<MulBackward0>)


In [14]:
# Select User1 of Top-5-movies.
# max_item = max_item.type(float)
# query = query.astype(float)  --> doesn't work.
# query = query.float(query)  --> doesn't work either.
# query = torch.stack([
#     torch.zeros(max_item).fill_(1),
#     torch.arange(1, max_item+1)
# ], 1).long()

# 'scores' is a number of Top-k scores.
# 'indices' is a number of Top-k position,
# it means movieId.
scores, indices = torch.topk(net(query), 1)