<a href="https://colab.research.google.com/github/JHyunjun/SNU/blob/main/Deep%20Matrix%20Factorization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [23]:
import numpy as np

In [24]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [25]:
# MovieLens의 1MB raw data만 활용
in_path = '/content/drive/MyDrive/Colab Notebooks/snu/w4/실습자료/deepmf-lab/data/ml-1m-raw/'
rating_file = in_path + 'ratings.dat'

In [26]:
raw = []
with open(rating_file, 'r') as f_read:
    for line in f_read.readlines():
      #user id :: item id :: rating :: time stamp인데 ::를 분기점으로 4개로 쪼갬
        line_list = line.split('::')
        raw.append(line_list)
raw = np.array(raw, dtype=np.int)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  import sys


In [27]:
raw #Time stamp : 조사한 시점

array([[        1,      1193,         5, 978300760],
       [        1,       661,         3, 978302109],
       [        1,       914,         3, 978301968],
       ...,
       [     6040,       562,         5, 956704746],
       [     6040,      1096,         4, 956715648],
       [     6040,      1097,         4, 956715569]])

In [28]:
user_ids = list()
item_ids = list()
user_map = dict() # raw -> new
item_map = dict() # raw -> new

user_ids = np.unique(raw[:, 0]) # raw[:,0] : 기존의 user id array
item_ids = np.unique(raw[:, 1]) # raw[:,1] : 기존의 item id array

#item이랑 user가 0부터 있는게 아니라서 이걸 새로 매핑시킴 0부터~
user_map = {v: i for (i, v) in enumerate(user_ids)} 
item_map = {v: i for (i, v) in enumerate(item_ids)}

# 여기서 id만 반환해서 재정렬함 ex) 첫번째 rank -> 0,0,5 ... 순서대로 재정렬
new = [[user_map[u], item_map[i], r] for (u, i, r)
      in zip(raw[:, 0], raw[:, 1], raw[:, 2])] # new array
new = np.array(new)
print(new.shape)

(1000209, 3)


In [29]:
from sklearn.model_selection import train_test_split

In [30]:
train, test = train_test_split(new, test_size=0.2, shuffle=True, random_state=42)

In [31]:
print(train.shape)
print(test.shape)

(800167, 3)
(200042, 3)


In [32]:
# Dataset : 여러 데이터를 묶어주는 역할
# Data Loader : shuffle하거나 batch_size대로 정리해주는 역할
from torch.utils.data import Dataset, DataLoader

In [33]:
class MovieLensDataset(Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y
    
    def __len__(self):
        return len(self.x)
    
    def __getitem__(self, idx):
        x = torch.LongTensor(self.x[idx, :])
        y = torch.FloatTensor(self.y[idx, :])
        return x, y

In [34]:
train_dataset = MovieLensDataset(train[:, :-1], np.expand_dims(train[:, -1], axis=1))
train_dataloader = DataLoader(train_dataset, batch_size=512, shuffle=True)

In [35]:
test_dataset = MovieLensDataset(test[:, :-1], np.expand_dims(test[:, -1], axis=1))
test_dataloader = DataLoader(test_dataset, batch_size=len(test_dataset), shuffle=False)

In [36]:
# User와 Item의 갯수 연산
num_users = max(max(train[:, 0]), max(test[:, 0])) + 1
num_items = max(max(train[:, 1]), max(test[:, 1])) + 1

In [37]:
K = 10
# [Users],10] * [10,Movies]
lr = 1e-3
decay = 0
epochs = 20

In [38]:
#GPU Available시 이용하도록 설정
import torch
import time
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [39]:
class MF(torch.nn.Module):
    def __init__(self, num_users, num_items, K):
        super().__init__()
        self.user_emb = torch.nn.Embedding(num_users, K)
        self.item_emb = torch.nn.Embedding(num_items, K)
    
    def forward(self, user_idx, item_idx):
        out = (self.user_emb(user_idx) * self.item_emb(item_idx)).sum(1, keepdim=True) #Element wise
        return out

In [40]:
class DeepMF(torch.nn.Module):
    def __init__(self, num_users, num_items, K, hidden_dim1, hidden_dim2):
        super().__init__()
        self.user_emb = torch.nn.Embedding(num_users, K) #[user,10] 
        self.item_emb = torch.nn.Embedding(num_items, K) #[10, movies] Matrix Declaration
        self.layer1 = torch.nn.Linear(2*K, hidden_dim1)
        self.layer2 = torch.nn.Linear(hidden_dim1, hidden_dim2)
        self.out = torch.nn.Linear(hidden_dim2, 1)
        self.activation = torch.nn.ReLU()
    
    def forward(self, user_idx, item_idx):
        out = torch.cat((self.user_emb(user_idx), self.item_emb(item_idx)), dim=1)
        out = self.activation(self.layer1(out))
        out = self.activation(self.layer2(out))
        out = self.out(out)
        return out

In [41]:
model = MF(num_users, num_items, K)
model.to(DEVICE) #사용자의 실행환경이 cuda를 실행할 수 있으면 GPU 사용

MF(
  (user_emb): Embedding(6040, 10)
  (item_emb): Embedding(3706, 10)
)

In [42]:
criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=decay)

In [43]:
for epoch in range(epochs):
    start_time = time.time()
    train_mse = 0.
    test_mse = 0.
    
    # train the model, 모델을 학습모드로 변경
    model.train()
    for batch_idx, (x, r) in enumerate(train_dataloader):
        # get data
        x, r = x.to(DEVICE), r.to(DEVICE) #x : item id, r : user가 item에 매긴 rating
        i, j = x[:, 0], x[:, 1] # i : x에서 첫번째열은 user index, j : x에서 두번째열은 item index
        
        # set gradients to zero
        optimizer.zero_grad()
        
        # predict ratings
        pred = model(i, j)
        
        # get loss
        loss = criterion(pred, r)
        train_mse += loss.item()
        
        # backpropagation
        loss.backward()
        
        # update the parameters
        optimizer.step()
    
    train_rmse = (train_mse/(batch_idx+1))**.5
    
    # test the model
    model.eval()
    for batch_idx, (x, r) in enumerate(test_dataloader):
        # get data
        x, r = x.to(DEVICE), r.to(DEVICE)
        i, j = x[:, 0], x[:, 1]
        
        # predict ratings
        pred = model(i, j)
        
        # get loss
        loss = criterion(pred, r)
        test_mse += loss.item()
    
    test_rmse = (test_mse/(batch_idx+1))**.5
    
    end_time = time.time()
    print(f'[{end_time-start_time:.2f}] Epoch: {epoch+1:3d}, '
          f'TrnRMSE: {train_rmse:.4f}, TestRMSE: {test_rmse:.4f}')

[14.21] Epoch:   1, TrnRMSE: 4.6678, TestRMSE: 4.4659
[14.40] Epoch:   2, TrnRMSE: 4.3060, TestRMSE: 4.2140
[14.46] Epoch:   3, TrnRMSE: 4.0713, TestRMSE: 4.0291
[14.74] Epoch:   4, TrnRMSE: 3.8508, TestRMSE: 3.7717
[16.04] Epoch:   5, TrnRMSE: 3.4192, TestRMSE: 3.1265
[18.31] Epoch:   6, TrnRMSE: 2.5780, TestRMSE: 2.2210
[15.33] Epoch:   7, TrnRMSE: 1.8222, TestRMSE: 1.6708
[15.76] Epoch:   8, TrnRMSE: 1.4135, TestRMSE: 1.3792
[14.46] Epoch:   9, TrnRMSE: 1.1974, TestRMSE: 1.2165
[15.29] Epoch:  10, TrnRMSE: 1.0775, TestRMSE: 1.1214
[14.41] Epoch:  11, TrnRMSE: 1.0088, TestRMSE: 1.0650
[15.43] Epoch:  12, TrnRMSE: 0.9678, TestRMSE: 1.0304
[13.84] Epoch:  13, TrnRMSE: 0.9423, TestRMSE: 1.0075
[14.77] Epoch:  14, TrnRMSE: 0.9251, TestRMSE: 0.9923
[15.61] Epoch:  15, TrnRMSE: 0.9128, TestRMSE: 0.9813
[17.45] Epoch:  16, TrnRMSE: 0.9031, TestRMSE: 0.9727
[14.80] Epoch:  17, TrnRMSE: 0.8949, TestRMSE: 0.9651
[14.59] Epoch:  18, TrnRMSE: 0.8876, TestRMSE: 0.9595
[16.34] Epoch:  19, TrnRMSE:

In [55]:
# Deep Matrix Factorization
K = 10
# [Users],10] * [10,Movies]
hidden_dim1 = 10
hidden_dim2 = 5
lr = 1e-3
decay = 0 # Regularizor의 Lambda
epochs = 20

model_dmf = DeepMF(num_users, num_items, K, hidden_dim1, hidden_dim2)
model_dmf.to(DEVICE)
criterion_dmf = torch.nn.MSELoss()
optimizer_dmf = torch.optim.Adam(model_dmf.parameters(), lr=lr, weight_decay=decay)

In [56]:
for epoch in range(epochs):
    start_time = time.time()
    train_mse = 0.
    test_mse = 0.
    
    # train the model
    model_dmf.train()
    for batch_idx, (x, r) in enumerate(train_dataloader):
        # get data
        x, r = x.to(DEVICE), r.to(DEVICE)
        i, j = x[:, 0], x[:, 1]
        
        # set gradients to zero
        optimizer_dmf.zero_grad() #이렇게 zero gradient로 하면 매 epoch마다 momentum이 사라지지 않는가??
        
        # predict ratings
        pred = model_dmf(i, j)
        
        # get loss
        loss = criterion_dmf(pred, r)
        train_mse += loss.item()
        
        # backpropagation
        loss.backward()
        
        # update the parameters
        optimizer_dmf.step()
    
    train_rmse = (train_mse/(batch_idx+1))**.5
    
    # test the model
    model_dmf.eval()
    for batch_idx, (x, r) in enumerate(test_dataloader):
        # get data
        x, r = x.to(DEVICE), r.to(DEVICE)
        i, j = x[:, 0], x[:, 1]
        
        # predict ratings
        pred = model_dmf(i, j)
        
        # get loss
        loss = criterion_dmf(pred, r)
        test_mse += loss.item()
    
    test_rmse = (test_mse/(batch_idx+1))**.5
    
    end_time = time.time()
    print(f'[{end_time-start_time:.2f}] Epoch: {epoch+1:3d}, '
          f'TrnRMSE: {train_rmse:.4f}, TestRMSE: {test_rmse:.4f}')

[21.07] Epoch:   1, TrnRMSE: 1.8357, TestRMSE: 1.0878
[19.46] Epoch:   2, TrnRMSE: 1.0364, TestRMSE: 1.0068
[19.73] Epoch:   3, TrnRMSE: 0.9777, TestRMSE: 0.9675
[19.72] Epoch:   4, TrnRMSE: 0.9425, TestRMSE: 0.9412
[20.19] Epoch:   5, TrnRMSE: 0.9210, TestRMSE: 0.9274
[20.34] Epoch:   6, TrnRMSE: 0.9103, TestRMSE: 0.9210
[21.65] Epoch:   7, TrnRMSE: 0.9044, TestRMSE: 0.9179
[20.10] Epoch:   8, TrnRMSE: 0.9010, TestRMSE: 0.9155
[20.05] Epoch:   9, TrnRMSE: 0.8984, TestRMSE: 0.9149
[20.14] Epoch:  10, TrnRMSE: 0.8966, TestRMSE: 0.9135
[20.03] Epoch:  11, TrnRMSE: 0.8951, TestRMSE: 0.9135
[21.24] Epoch:  12, TrnRMSE: 0.8938, TestRMSE: 0.9137
[20.85] Epoch:  13, TrnRMSE: 0.8927, TestRMSE: 0.9120
[20.51] Epoch:  14, TrnRMSE: 0.8916, TestRMSE: 0.9115
[19.37] Epoch:  15, TrnRMSE: 0.8906, TestRMSE: 0.9109
[19.63] Epoch:  16, TrnRMSE: 0.8896, TestRMSE: 0.9106
[19.91] Epoch:  17, TrnRMSE: 0.8887, TestRMSE: 0.9105
[21.44] Epoch:  18, TrnRMSE: 0.8879, TestRMSE: 0.9101
[19.54] Epoch:  19, TrnRMSE: