<a href="https://colab.research.google.com/github/mmvv11/recommender-colab/blob/main/3_NCF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import time
import numpy as np
import pandas as pd
import pickle
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.backends.cudnn as cudnn

하이퍼파라미터

In [None]:
# Hyperparameters and run configuration for the notebook.

# Compute device: use the GPU when CUDA is available, otherwise fall back to
# the CPU so the notebook still runs on CPU-only runtimes.
device = "cuda" if torch.cuda.is_available() else "cpu"
n_neg = 4  # number of negative samples
n_layers = 3  # number of MLP layers
dropout = 0.0  # dropout rate
data_path = "./ml-100k_splited.pkl"  # dataset path
batch_size = 1024  # training batch size
emb_size = 8  # MF embedding size
lr = 1e-3  # learning rate handed to the optimizer
top_k = 20  # length of the recommendation list used for the metrics
n_epoch = 10  # number of training epochs

## 데이터 로딩

In [None]:
# Read the pre-split dataset bundle from disk.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# open dataset files that come from a trusted source.
with open(data_path, mode="rb") as f:
    data = pickle.load(f)

# The bundle is unpacked by position, so this relies on the order in which
# `data.values()` yields its entries (presumably a dict's insertion order --
# confirm against the script that wrote the pickle).
(
    train,
    val,
    test,
    all_items,
    user2id,
    id2user,
    item2id,
    id2item,
) = data.values()

In [None]:
class MLDataset(Dataset):
    """Pointwise training set of (user, item, label) triples with sampled negatives.

    Every distinct (user, item) pair found in ``df`` becomes one positive
    example (label 1) followed by ``n_neg`` negative examples (label 0).
    Negative items are drawn uniformly from ``all_items`` with
    ``np.random.choice`` and re-drawn while they collide with a positive
    pair. Sampling happens once, when the dataset is constructed.

    Args:
        df: DataFrame with ``user`` and ``item`` columns.
        all_items: 1-D sequence of candidate item ids for negative sampling.
        n_neg: number of negatives generated per positive pair.
        target_device: device the resulting tensors are moved to. ``None``
            (the default) uses the module-level ``device`` setting.
    """

    def __init__(self, df, all_items, n_neg=4, target_device=None):
        super().__init__()
        self.n_neg = n_neg
        # Fall back to the notebook-level `device` when no explicit target is given.
        self.target_device = device if target_device is None else target_device
        self.users, self.items, self.labels = self.get_data(df, all_items)

    def __len__(self):
        """Return the total number of (positive + negative) examples."""
        return len(self.users)

    def __getitem__(self, idx):
        """Return the ``(user, item, label)`` tensors stored at position ``idx``."""
        return self.users[idx], self.items[idx], self.labels[idx]

    def get_data(self, df, all_items):
        """Build the user, item and label tensors.

        Users whose positives already cover every entry of ``all_items`` have
        no valid negative; they contribute their positives only, because
        rejection sampling could never terminate for them.

        Returns:
            Tuple ``(users, items, labels)`` of 1-D tensors on
            ``self.target_device``; ``labels`` is float32.
        """
        users, items, labels = [], [], []
        user_item_set = set(zip(df['user'], df['item']))

        # Convert once so the sequence is not re-converted on every draw.
        item_pool = np.asarray(all_items)

        # Find the users for whom no negative item exists.
        seen_by_user = {}
        for u, i in user_item_set:
            seen_by_user.setdefault(u, set()).add(i)
        candidates = set(item_pool.tolist())
        exhausted_users = {
            u for u, seen in seen_by_user.items() if candidates <= seen
        }

        for u, i in user_item_set:
            users.append(u)
            items.append(i)
            labels.append(1)
            if u in exhausted_users:
                continue
            for _ in range(self.n_neg):
                neg_item = np.random.choice(item_pool)
                # Reject draws that are actually positives for this user.
                while (u, neg_item) in user_item_set:
                    neg_item = np.random.choice(item_pool)
                users.append(u)
                items.append(neg_item)
                labels.append(0)

        target = self.target_device
        return (
            torch.tensor(users).to(target),
            torch.tensor(items).to(target),
            torch.tensor(labels, dtype=torch.float32).to(target),
        )

In [None]:
# Build the training set with sampled negatives. The `n_neg` hyperparameter is
# passed explicitly so that changing it actually changes the sampling ratio
# (previously the dataset silently used its own default).
train_dataset = MLDataset(train, all_items, n_neg=n_neg)
# Reshuffle the examples every epoch and serve them in mini-batches.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [None]:
# For each split, map every `user` value to the list of `item` values on its rows.
# Items each user has in the training split.
user_consumed = train.groupby("user")['item'].apply(list).to_dict()
# Held-out items per user in the validation split.
val_true = val.groupby("user")['item'].apply(list).to_dict()
# Held-out items per user in the test split.
test_true = test.groupby("user")['item'].apply(list).to_dict()

# 모델링

In [None]:
class NCF(nn.Module):
    """Neural collaborative filtering model with a GMF branch and an MLP branch.

    The GMF branch multiplies user and item embeddings element-wise. The MLP
    branch concatenates a second pair of embeddings and passes them through
    ``n_layers`` blocks of Dropout -> Linear -> ReLU, each halving the width.
    Both branch outputs (each ``emb_size`` wide) are concatenated and mapped
    to a single score by a linear prediction layer.

    Args:
        n_users: number of rows in the user embedding tables.
        n_items: number of rows in the item embedding tables.
        emb_size: width of the GMF embeddings; the MLP embeddings are
            ``emb_size * 2 ** (n_layers - 1)`` wide.
        n_layers: number of Linear layers in the MLP branch.
        dropout_rate: dropout probability used in the MLP branch. ``None``
            (the default) uses the module-level ``dropout`` setting.
    """

    def __init__(self, n_users, n_items, emb_size, n_layers, dropout_rate=None):
        super().__init__()
        # Fall back to the notebook-level `dropout` when no rate is given.
        p_drop = dropout if dropout_rate is None else dropout_rate

        # GMF layer
        self.emb_user = nn.Embedding(n_users, emb_size)
        self.emb_item = nn.Embedding(n_items, emb_size)

        # MLP layer
        self.emb_user_MLP = nn.Embedding(n_users, emb_size*(2**(n_layers-1)))
        self.emb_item_MLP = nn.Embedding(n_items, emb_size*(2**(n_layers-1)))
        MLP = []
        for i in range(n_layers):
            # The first block consumes the concatenated user+item embedding;
            # every block halves the width, so the last one outputs emb_size.
            input_size = emb_size*(2**(n_layers-i))
            MLP.append(nn.Dropout(p=p_drop))
            MLP.append(nn.Linear(input_size, input_size//2))
            MLP.append(nn.ReLU())
        self.MLP_layer = nn.Sequential(*MLP)

        # prediction layer: takes the GMF output and the MLP output side by side
        self.predict_layer = nn.Linear(emb_size*2, 1)

        self._init_weight()

    def _init_weight(self):
        """Initialise embeddings, MLP weights, the prediction layer and all biases."""
        nn.init.normal_(self.emb_user.weight, std=1e-2)
        nn.init.normal_(self.emb_item.weight, std=1e-2)
        nn.init.normal_(self.emb_user_MLP.weight, std=1e-2)
        nn.init.normal_(self.emb_item_MLP.weight, std=1e-2)

        for m in self.MLP_layer:
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
        nn.init.kaiming_uniform_(self.predict_layer.weight, a=1, nonlinearity="sigmoid")

        # Zero the bias of every Linear layer, including the prediction layer.
        for m in self.modules():
            if isinstance(m, nn.Linear) and m.bias is not None:
                m.bias.data.zero_()

    def forward(self, user, item):
        """Score (user, item) pairs.

        Args:
            user: 1-D tensor of user indices.
            item: 1-D tensor of item indices, aligned with ``user``.

        Returns:
            1-D tensor with one raw score per pair (no sigmoid is applied).
        """
        emb_user = self.emb_user(user)
        emb_item = self.emb_item(item)
        output_GMF = emb_user*emb_item

        emb_user_MLP = self.emb_user_MLP(user)
        emb_item_MLP = self.emb_item_MLP(item)
        concat_emb_MLP = torch.cat((emb_user_MLP, emb_item_MLP), dim=1)
        output_MLP = self.MLP_layer(concat_emb_MLP)

        concat_output = torch.cat((output_GMF, output_MLP), dim=1)

        prediction = self.predict_layer(concat_output)
        return prediction.view(-1)

모델, 손실 함수, 옵티마이저 정의

In [None]:
# Sizes of the embedding tables: one row per entry of the id mappings.
n_users = len(user2id)
n_items = len(item2id)

In [None]:
# Instantiate the model and move its parameters to the configured device.
model = NCF(n_users, n_items, emb_size, n_layers).to(device)

# Binary cross-entropy computed directly on the model's raw scores.
loss_function = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(params=model.parameters(), lr=lr)

메트릭
* precision
* recall
* nDCG

In [None]:
def get_precision(pred, true, k=20):
    """Return precision@k: the share of the top-``k`` predictions that are relevant.

    Args:
        pred: ranked sequence of recommended item ids, best first.
        true: collection of relevant (held-out) item ids.
        k: cut-off; only the first ``k`` entries of ``pred`` are considered.

    Returns:
        Number of relevant items among the first ``k`` predictions divided by ``k``.
    """
    # Cut the list at k so a longer `pred` cannot push the score above 1.
    top_k_pred = list(pred)[:k]
    hits = set(top_k_pred) & set(true)
    return len(hits) / k

def get_recall(pred, true, k=20):
    """Return recall@k: the share of relevant items found in the top-``k`` predictions.

    Args:
        pred: ranked sequence of recommended item ids, best first.
        true: collection of relevant (held-out) item ids.
        k: cut-off; only the first ``k`` entries of ``pred`` are considered.

    Returns:
        Number of distinct relevant items among the first ``k`` predictions
        divided by the number of distinct relevant items, or ``0.0`` when
        ``true`` is empty.
    """
    relevant = set(true)
    # Without relevant items recall is undefined; report 0 instead of dividing by zero.
    if not relevant:
        return 0.0
    hits = set(list(pred)[:k]) & relevant
    return len(hits) / len(relevant)

def get_nDCG(pred, true, k=20):
    """Return nDCG@k with binary relevance.

    DCG sums ``1 / log2(rank + 1)`` over the ranks (1-based) of the relevant
    items among the first ``k`` predictions. It is normalised by the ideal
    DCG, i.e. the DCG of a list whose first ``min(len(set(true)), k)``
    positions are all relevant, so missing relevant items lowers the score.

    Args:
        pred: ranked sequence of recommended item ids, best first.
        true: collection of relevant (held-out) item ids.
        k: cut-off; only the first ``k`` entries of ``pred`` are considered.

    Returns:
        A float in ``[0, 1]``; ``0.0`` when there is no hit, or when ``pred``
        or ``true`` is empty.
    """
    top_k_pred = list(pred)[:k]
    relevant = set(true)
    if not top_k_pred or not relevant:
        return 0.0

    # 1.0 where the prediction at that rank is relevant, else 0.0.
    gains = np.array([item in relevant for item in top_k_pred], dtype=np.float64)
    if not gains.any():
        return 0.0

    discounts = 1.0 / np.log2(np.arange(2, len(top_k_pred) + 2))
    dcg = float(np.sum(gains * discounts))

    # Ideal ranking: every relevant item (at most k of them) at the top.
    n_ideal = min(len(relevant), k)
    idcg = float(np.sum(1.0 / np.log2(np.arange(2, n_ideal + 2))))
    return dcg / idcg

train process 정의

In [None]:
def _evaluate(model, true_by_user, desc):
    """Return mean recall, precision and nDCG at ``top_k`` for ``model``.

    For every user in ``true_by_user`` the items the user did not consume in
    the training split are scored, the ``top_k`` best-scoring ones are taken
    as the recommendation list, and that list is compared with the user's
    held-out items. Reads the notebook-level ``all_items``, ``user_consumed``,
    ``device`` and ``top_k``.

    Args:
        model: the scoring model; it is switched to eval mode.
        true_by_user: dict mapping a user id to its list of held-out items.
        desc: label shown on the progress bar.

    Returns:
        Tuple ``(recall, precision, ndcg)`` averaged over the users.
    """
    model.eval()
    item_universe = set(all_items)  # built once instead of once per user
    recalls, precisions, ndcgs = [], [], []

    # Inference only: skip autograd bookkeeping to save time and memory.
    with torch.no_grad():
        for u, true in tqdm(true_by_user.items(), desc=desc):
            # Items the user has not consumed; a user without training
            # interactions is ranked against the full catalogue.
            unconsumed_items = list(item_universe - set(user_consumed.get(u, ())))
            unconsumed_items = torch.tensor(unconsumed_items).to(device)
            uu = torch.tensor([u] * len(unconsumed_items)).to(device)

            # Inference
            pred = model(uu, unconsumed_items)
            # torch.topk raises if k exceeds the number of candidates.
            n_rec = min(top_k, len(unconsumed_items))
            _, pred_idx = torch.topk(pred, k=n_rec)
            top_k_items = unconsumed_items[pred_idx].tolist()

            # Metrics
            recalls.append(get_recall(top_k_items, true, k=top_k))
            precisions.append(get_precision(top_k_items, true, k=top_k))
            ndcgs.append(get_nDCG(top_k_items, true, k=top_k))

    return np.mean(recalls), np.mean(precisions), np.mean(ndcgs)


for epoch in range(1, n_epoch+1):
    model.train()
    batch_losses = []
    for users, items, labels in tqdm(train_loader, desc="train"):
        pred = model(users, items)
        loss = loss_function(pred, labels)
        batch_losses.append(loss.item())

        # backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # The reported value is the mean of the per-batch losses of this epoch.
    print(f"epoch: {epoch}, total_loss: {np.mean(batch_losses):.4f}")

    # validation
    recall, precision, ndcg = _evaluate(model, val_true, desc="eval")
    print(f"recall:{recall:.4f}\nprecision:{precision:.4f}\nndcg:{ndcg:.4f}")

# final evaluation on the test split
recall, precision, ndcg = _evaluate(model, test_true, desc="test")
print(f"recall:{recall:.4f}\nprecision:{precision:.4f}\nndcg:{ndcg:.4f}")

train: 100%|██████████| 391/391 [00:07<00:00, 52.43it/s]


epoch: 1, total_loss: 0.4316


eval: 100%|██████████| 943/943 [00:02<00:00, 329.27it/s]


recall:0.2012
precision:0.0978
ndcg:0.3891


train: 100%|██████████| 391/391 [00:05<00:00, 66.05it/s]


epoch: 2, total_loss: 0.3549


eval: 100%|██████████| 943/943 [00:01<00:00, 503.08it/s]


recall:0.2630
precision:0.1101
ndcg:0.4295


train: 100%|██████████| 391/391 [00:06<00:00, 59.86it/s]


epoch: 3, total_loss: 0.3259


eval: 100%|██████████| 943/943 [00:01<00:00, 490.24it/s]


recall:0.2748
precision:0.1155
ndcg:0.4395


train: 100%|██████████| 391/391 [00:06<00:00, 62.88it/s]


epoch: 4, total_loss: 0.3084


eval: 100%|██████████| 943/943 [00:02<00:00, 415.01it/s]


recall:0.2788
precision:0.1183
ndcg:0.4522


train: 100%|██████████| 391/391 [00:06<00:00, 62.07it/s]


epoch: 5, total_loss: 0.2977


eval: 100%|██████████| 943/943 [00:01<00:00, 519.19it/s]


recall:0.2860
precision:0.1203
ndcg:0.4566


train: 100%|██████████| 391/391 [00:07<00:00, 55.20it/s]


epoch: 6, total_loss: 0.2894


eval: 100%|██████████| 943/943 [00:01<00:00, 516.82it/s]


recall:0.2867
precision:0.1204
ndcg:0.4592


train: 100%|██████████| 391/391 [00:06<00:00, 58.23it/s]


epoch: 7, total_loss: 0.2826


eval: 100%|██████████| 943/943 [00:01<00:00, 517.08it/s]


recall:0.2893
precision:0.1215
ndcg:0.4568


train: 100%|██████████| 391/391 [00:05<00:00, 66.62it/s]


epoch: 8, total_loss: 0.2764


eval: 100%|██████████| 943/943 [00:02<00:00, 425.90it/s]


recall:0.2899
precision:0.1208
ndcg:0.4537


train: 100%|██████████| 391/391 [00:06<00:00, 63.48it/s]


epoch: 9, total_loss: 0.2709


eval: 100%|██████████| 943/943 [00:01<00:00, 517.23it/s]


recall:0.2915
precision:0.1221
ndcg:0.4577


train: 100%|██████████| 391/391 [00:07<00:00, 50.97it/s]


epoch: 10, total_loss: 0.2658


eval: 100%|██████████| 943/943 [00:01<00:00, 504.24it/s]


recall:0.2949
precision:0.1224
ndcg:0.4545


test: 100%|██████████| 943/943 [00:01<00:00, 510.58it/s]

recall:0.2989
precision:0.1237
ndcg:0.4592



