# 목적
MovieLens 100K 데이터를 토대로 Matrix Factorization(MF)을 PyTorch로 구현해 봅시다.

In [1]:
import pandas as pd
# Load the tab-separated ratings file, supplying the column names explicitly.
data = pd.read_csv(
    "../ml-100k/u.data",
    sep="\t",
    names=["user_id", "item_id", "rating", "timestamp"],
)
# Shift ids and ratings down by one; later cells use the ids as embedding
# indices and the ratings as class labels for CrossEntropyLoss.
for column in ("user_id", "item_id", "rating"):
    data[column] -= 1

In [2]:
# Display the loaded ratings table (the notebook renders the last expression).
data

Unnamed: 0,user_id,item_id,rating,timestamp
0,195,241,2,881250949
1,185,301,2,891717742
2,21,376,0,878887116
3,243,50,1,880606923
4,165,345,0,886397596
...,...,...,...,...
99995,879,475,2,880175444
99996,715,203,4,879795543
99997,275,1089,0,874795795
99998,12,224,1,882399156


# 사용자·아이템 수 세기

In [3]:
# Count the distinct users and items present in the ratings table.
# NOTE(review): these counts are used below as embedding table sizes, which
# assumes the ids are contiguous 0..n-1 -- TODO confirm for other datasets.
num_u = data["user_id"].unique().size
num_i = data["item_id"].unique().size
print(f"num of user = {num_u}, num of item = {num_i}")

num of user = 943, num of item = 1682


# Embedding layer 만들기(예제)

In [4]:
import torch
# Toy corpus used to demonstrate how an embedding layer is indexed.
train_data = "you need to know how to code"
word_set = set(train_data.split())

# word -> integer id mapping.
# Ids 0 and 1 are reserved for the special tokens, so real words start at 2;
# starting at 1 would give one word the same id as "<pad>" and leave the
# last embedding row unused.
# Sorting makes the assignment reproducible: iteration order of a set of
# strings can differ between interpreter runs.
vocab = {tkn: i for i, tkn in enumerate(sorted(word_set), start=2)}
vocab["<unk>"] = 0
vocab["<pad>"] = 1

In [5]:
import torch.nn as nn
# One 3-dimensional embedding vector per vocabulary entry.
vocab_size = len(vocab)
embedding_layer = nn.Embedding(vocab_size, 3)

In [6]:
# Show the embedding layer's weight matrix (one row per vocabulary entry).
print(embedding_layer.weight)

Parameter containing:
tensor([[-2.0507, -0.5085, -0.3273],
        [-0.0980,  0.9515, -1.7041],
        [ 0.4137,  1.7659,  1.2630],
        [-0.6755,  1.0496,  0.8582],
        [ 1.3402,  1.0862,  0.8682],
        [ 0.4944, -0.2391,  0.0389],
        [ 0.2993,  1.0379,  1.0367],
        [ 0.3308, -0.2118, -0.4719]], requires_grad=True)


# Embedding layer (추천용)

In [7]:
# Latent dimension shared by the user and item embedding tables.
dim = 10
# One embedding row per user and one per item.
user_em = nn.Embedding(num_u, dim)
item_em = nn.Embedding(num_i, dim)
# Printing the module shows its (num_embeddings, embedding_dim) configuration.
print(f"Shape of user_em = {user_em}")
print(f"Shape of item_em = {item_em}")

# Shape of a single user's embedding vector.
user_em.weight[0].shape

Shape of user_em = Embedding(943, 10)
Shape of item_em = Embedding(1682, 10)


torch.Size([10])

In [8]:
# Inspect the newly created user embedding matrix.
user_em.weight

Parameter containing:
tensor([[-0.3210, -1.9865, -2.7789,  ..., -0.4105, -1.2386, -0.3254],
        [ 0.7660, -0.1116,  0.3011,  ...,  0.0061,  0.8234,  0.0473],
        [-0.9845, -1.4264,  0.5728,  ..., -1.4190, -0.5739, -0.4392],
        ...,
        [ 0.2876, -0.2003, -0.5114,  ...,  0.4709, -0.1488, -1.1763],
        [-1.5660,  1.1462, -0.2695,  ..., -0.4137, -0.7866,  0.0564],
        [-0.4814,  0.1979,  0.9047,  ..., -1.0386, -0.5263,  0.6636]],
       requires_grad=True)

# Stochastic Gradient Descent

In [9]:
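# Open question: how do the parameters get updated? Note that freezing a
# parameter (requires_grad=False) stops it from being updated; the optimizer
# updates the parameters it is given via model.parameters().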

class MF(nn.Module):
    """Matrix-factorisation model with a small linear head.

    The dot product of a user embedding and an item embedding gives one
    scalar per (user, item) pair, which a ``Linear(1, 5)`` layer expands
    into 5 outputs. In this notebook those outputs are fed to
    ``CrossEntropyLoss`` against the 0-based ratings.

    Args:
        num_u: Number of rows in the user embedding table.
        num_i: Number of rows in the item embedding table.
        dim: Size of each embedding vector.
    """

    def __init__(self, num_u: int, num_i: int, dim: int):
        super().__init__()
        self.user_em = nn.Embedding(num_u, dim)
        self.item_em = nn.Embedding(num_i, dim)
        self.fc = nn.Linear(in_features=1, out_features=5)

        # Re-initialise the weights with a small normal distribution
        # (the bias of ``fc`` keeps its default initialisation).
        # NOTE(review): with std=0.01 everywhere the initial logits are
        # close to the bias alone; the flat loss in the recorded training
        # output may be related -- worth confirming.
        for weight in (self.user_em.weight, self.item_em.weight, self.fc.weight):
            nn.init.normal_(weight, mean=0, std=0.01)

    def forward(self, user, item):
        """Return a ``(batch, 5)`` tensor of outputs for index batches.

        Args:
            user: Batch of user indices (1-D, so the lookup is (batch, dim)).
            item: Batch of item indices, aligned element-wise with ``user``.
        """
        # Element-wise product summed over the embedding axis = dot product.
        interaction = self.user_em(user) * self.item_em(item)
        dots = torch.sum(interaction, dim=1)
        # Turn (batch,) into (batch, 1) so it matches the linear layer's input.
        dots = dots.view(dots.shape[0], 1)
        return self.fc(dots)
# Build the model from the user/item counts and embedding size set in earlier cells.
model = MF(num_u, num_i, dim)

In [10]:
# Feature columns (user, item) and target column (rating) of the full table.
# NOTE: the training loop further down rebinds the names X and Y to batches.
feature_columns = ["user_id", "item_id"]
X, Y = data[feature_columns], data["rating"]

In [11]:
import random
# Hold out 20% of the rows (chosen at random, by position) as the test set.
random.seed(42)
test_idx = random.sample(range(len(data)), int(len(data) * 0.2))

test = data.iloc[test_idx, :]
# test_idx holds row positions, while DataFrame.drop removes rows by index
# label. Translate positions to labels so the split is still correct when
# the index is not the default 0..n-1 range.
# random.seed() does not seed pandas, so pass random_state explicitly to make
# the shuffle reproducible as well.
train = data.drop(index=data.index[test_idx]).sample(frac=1, random_state=42)  # shuffle

# Convert to plain Python lists: features as [user_id, item_id] pairs, labels as ratings.
X_test, Y_test = test[["user_id","item_id"]].values.tolist(), test["rating"].tolist()
X_train, Y_train = train[["user_id","item_id"]].values.tolist(), train["rating"].tolist()

In [12]:
# Sanity check: features and labels should have the same length in each split.
train_sizes = (len(X_train), len(Y_train))
test_sizes = (len(X_test), len(Y_test))
print(f"Train = {train_sizes}")
print(f"Test = {test_sizes}")

Train = (80000, 80000)
Test = (20000, 20000)


# 한번 돌려보기

In [13]:
# Training hyperparameters.
# NOTE(review): `learing_rate` is a misspelling of "learning_rate"; the name
# is kept as-is in case other cells reference it.
learing_rate = 0.1
training_epoch = 5

# Classification loss over the model's 5 outputs.
# loss = nn.MSELoss()  # regression alternative, currently disabled
loss = nn.CrossEntropyLoss()
# SGD with momentum over every parameter of the model.
optimizer = torch.optim.SGD(
    params=model.parameters(),
    lr=learing_rate,
    momentum=0.9,
)

In [14]:
import copy
# Take independent copies of both embedding weight matrices before training
# (presumably to compare against the trained weights later).
before_user_em, before_item_em = (
    copy.deepcopy(table.weight) for table in (model.user_em, model.item_em)
)
print(before_item_em)

Parameter containing:
tensor([[-8.8933e-03, -3.1267e-06,  8.8873e-03,  ..., -1.2132e-02,
         -1.8467e-03, -9.3277e-03],
        [-1.3323e-02,  6.5160e-03, -5.3129e-03,  ...,  2.3132e-03,
         -3.6649e-03, -6.5952e-03],
        [ 2.4375e-03,  8.4508e-04,  9.3000e-04,  ..., -2.9175e-05,
          4.5125e-03, -9.7918e-03],
        ...,
        [ 7.2346e-04,  4.6420e-03, -1.3461e-02,  ...,  2.4302e-03,
         -1.9118e-02,  1.0981e-02],
        [ 1.2318e-02, -9.5923e-03, -5.1666e-03,  ...,  2.7828e-02,
          6.5069e-04, -1.0249e-02],
        [-8.9261e-03, -1.0839e-02,  5.1655e-04,  ...,  3.3896e-03,
         -2.0417e-03, -9.8651e-03]], requires_grad=True)


In [15]:
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    """Map-style dataset pairing each feature row with its integer label.

    Args:
        X: Indexable collection of feature rows (in this notebook, a list of
            ``[user_id, item_id]`` pairs).
        Y: Indexable collection of labels, one per row of ``X``.

    Raises:
        ValueError: If ``X`` and ``Y`` do not have the same length.
    """

    def __init__(self, X, Y):
        # Fail early: a mismatch would otherwise silently ignore extra labels
        # or raise an IndexError in the middle of an epoch.
        if len(X) != len(Y):
            raise ValueError(
                f"X and Y must have the same length, got {len(X)} and {len(Y)}"
            )
        self.X = X
        self.Y = Y

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for sample ``idx``.

        The features are returned unchanged; the label is converted to a
        0-dim int64 tensor.
        """
        x = self.X[idx]
        y = torch.tensor(self.Y[idx], dtype=torch.long)
        return x, y


In [16]:
# Wrap both splits as datasets and build the mini-batch loaders.
train_dataset = CustomDataset(X_train, Y_train)
test_dataset = CustomDataset(X_test, Y_test)

Batch_size = 128
# Training loader: shuffled, and the final incomplete batch is dropped.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=Batch_size,
    shuffle=True,
    drop_last=True,
)
# Test loader: original order, every sample kept.
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=Batch_size,
    shuffle=False,
    drop_last=False,
)

In [17]:
# Smoke test: push a single training batch through the model.
x, y = next(iter(train_loader))
user_batch, item_batch = x[0], x[1]
pred = model(user_batch, item_batch)

In [18]:
# Check the output shape of the smoke-test forward pass.
pred.shape

torch.Size([128, 5])

In [24]:
# Main training run.
# NOTE(review): the recorded output shows the epoch loss staying at ~1.467,
# so it is worth confirming that the model is actually learning.
total_cost = []  # loss of every mini-batch, across all epochs
model.train()
num_batches = len(train_loader)
for epoch in range(training_epoch):
    epoch_cost = 0
    for idx, (X, Y) in enumerate(train_loader):
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()
        pred = model(X[0], X[1])  # X[0]: users, X[1]: items
        cost = loss(pred, Y)
        cost.backward()
        optimizer.step()

        # Bookkeeping: read the scalar loss once and record it.
        batch_cost = cost.item()
        total_cost.append(batch_cost)
        epoch_cost += batch_cost

    # Mean batch loss for this epoch.
    epoch_cost /= num_batches
    print(f"[Epoch:{epoch+1}], cost = {epoch_cost}")

[Epoch:1], cost = 1.4671899019241332
[Epoch:2], cost = 1.4671734085083008
[Epoch:3], cost = 1.4669626609802247
[Epoch:4], cost = 1.4673110651016235
[Epoch:5], cost = 1.466767928314209
