In [1]:
import torch
import numpy as np
import pandas as pd
import os
import random
from pathlib import Path

import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from torchmetrics import AUROC, Accuracy, Precision, Recall
from torchmetrics.classification import BinaryAUROC, BinaryF1Score

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, average_precision_score

import pickle
from load_msi_data import LoadData
from model import CombNet, CombNetSupCon
from dataset import CombinationDataset
from loss import SupConLoss

In [2]:
SEED = 42


def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's legacy global RNG and PyTorch
    (CPU and all CUDA devices), exports ``PYTHONHASHSEED`` and configures
    cuDNN for deterministic behaviour.

    Args:
        seed (int): Seed value applied to every generator.
    """
    random.seed(seed)
    # Only affects hash randomization of interpreters started after this point
    # (e.g. DataLoader worker processes), not the already-running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one; safe without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN autotune and pick possibly different kernels
    # on each run, which defeats the deterministic setting above.
    torch.backends.cudnn.benchmark = False


seed_everything(SEED)

## Prepare dataset

In [3]:
'''
argument of CombinationDataset
- database: str, default='C_DCDB' ['C_DCDB', 'DCDB', 'DC_combined']
- neg_ratio: int, default=1
- duplicate: bool, default=False (if True, duplicate each samples) -> [a, b] & [b, a]
- use_ddi: bool, default=False (if True, use ddi dataset)
- ddi_dataset: str, default=None (if use_ddi is True, choose ddi dataset) ['DB', 'TWOSIDES']
- seed: int, default=42
'''
# Default configuration: drug-combination data only, no DDI samples mixed in.
dataset = CombinationDataset(
    database='C_DCDB',
    neg_ratio=1,
    duplicate=False,
    seed=SEED,
)
# NOTE(review): the recorded run prints 3 here, presumably the number of
# splits rather than the number of samples; confirm against CombinationDataset.
print(len(dataset))

# Alternative configuration that also uses a DDI dataset (kept disabled):
# dataset = CombinationDataset(
#     database='C_DCDB', neg_ratio=1, duplicate=False,
#     use_ddi=True, ddi_dataset='DB', seed=SEED,
# )
# print(len(dataset))

Processing dataset...
Saving dataset...
Loading dataset...data/processed/C_DCDB_neg1_dup0_ddi0_None_seed42.pt
Dictionary of {train, valid, test} dataset is loaded.
3


In [4]:
# Pull the three splits out of the processed dataset dictionary.
train_dataset, valid_dataset, test_dataset = (
    dataset.data[split] for split in ('train', 'valid', 'test')
)

# Single source of truth for the mini-batch size used by every loader.
BATCH_SIZE = 32

# Training batches are reshuffled each epoch; the incomplete tail batch is dropped.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
# Evaluation loaders keep a fixed order: the loss is averaged per batch, so
# shuffling (with a ragged last batch) would make the validation loss used for
# checkpoint selection drift between epochs for reasons unrelated to the model.
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

## Training

In [5]:
def train_cross_entropy(model, device, train_loader, criterion, optimizer, metric_list=None):
    """Train ``model`` for one pass over ``train_loader`` and score its predictions.

    The model output is flattened to 1-D logits and passed to ``criterion``
    together with the float targets. Sigmoid probabilities collected during
    the pass are scored with every metric in ``metric_list``.

    Args:
        model: Module called as ``model(data)``; its output is flattened with ``view(-1)``.
        device: Device the batches are moved to.
        train_loader: Iterable yielding ``(data, target)`` batches.
        criterion: Loss called as ``criterion(logits, target)``.
        optimizer: Optimizer stepping the parameters of ``model``.
        metric_list: Callables with signature ``metric(y_true, y_pred)``.
            ``roc_auc_score`` and ``average_precision_score`` receive the
            probabilities; every other metric receives the probabilities
            rounded to 0/1. Defaults to ``[accuracy_score]``.

    Returns:
        tuple: ``(mean loss per batch, list of scores in metric_list order)``.

    Raises:
        ValueError: If ``train_loader`` yields no batches.
    """
    if metric_list is None:
        metric_list = [accuracy_score]

    model.train()
    train_loss = 0.0
    num_batches = 0

    target_list = []
    pred_list = []
    for data, target in train_loader:
        data, target = data.to(device), target.float().to(device)
        optimizer.zero_grad()
        output = model(data).view(-1)  # raw logits z
        loss = criterion(output, target)  # criterion(z, y)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        num_batches += 1
        pred_list.append(torch.sigmoid(output).detach().cpu().numpy())
        target_list.append(target.long().detach().cpu().numpy())

    if num_batches == 0:
        # E.g. drop_last=True with fewer samples than one batch.
        raise ValueError('train_loader yielded no batches; cannot compute loss or metrics')

    # Concatenate once instead of once per metric.
    targets = np.concatenate(target_list)
    probabilities = np.concatenate(pred_list)
    hard_predictions = probabilities.round()

    scores = []
    for metric in metric_list:
        if metric in (roc_auc_score, average_precision_score):
            # Ranking metrics need the continuous scores.
            scores.append(metric(targets, probabilities))
        else:  # accuracy_score, f1_score, precision_score, recall_score
            scores.append(metric(targets, hard_predictions))

    return train_loss / num_batches, scores

In [6]:
def evaluate(model, device, loader, criterion, metric_list=None, checkpoint=None):
    """Evaluate ``model`` on ``loader`` without updating its parameters.

    Args:
        model: Module called as ``model(data)``; its output is flattened with ``view(-1)``.
        device: Device the batches (and an optional checkpoint) are mapped to.
        loader: Iterable yielding ``(data, target)`` batches.
        criterion: Loss called as ``criterion(logits, target)``.
        metric_list: Callables with signature ``metric(y_true, y_pred)``.
            ``roc_auc_score`` and ``average_precision_score`` receive the
            sigmoid probabilities; every other metric receives the
            probabilities rounded to 0/1. Defaults to ``[accuracy_score]``.
        checkpoint: Optional path to a saved ``state_dict``; when given it is
            loaded into ``model`` before evaluating.

    Returns:
        tuple: ``(mean loss per batch, list of scores in metric_list order)``.

    Raises:
        ValueError: If ``loader`` yields no batches.
    """
    if metric_list is None:
        metric_list = [accuracy_score]

    if checkpoint is not None:
        # map_location lets a checkpoint written on GPU load on a CPU-only host.
        model.load_state_dict(torch.load(checkpoint, map_location=device))
    model.eval()
    eval_loss = 0.0
    num_batches = 0

    target_list = []
    pred_list = []
    with torch.no_grad():
        for data, target in loader:
            data, target = data.to(device), target.float().to(device)
            output = model(data).view(-1)
            eval_loss += criterion(output, target).item()
            num_batches += 1
            pred_list.append(torch.sigmoid(output).detach().cpu().numpy())
            target_list.append(target.long().detach().cpu().numpy())

    if num_batches == 0:
        raise ValueError('loader yielded no batches; cannot compute loss or metrics')

    # Concatenate once instead of once per metric.
    targets = np.concatenate(target_list)
    probabilities = np.concatenate(pred_list)
    hard_predictions = probabilities.round()

    scores = []
    for metric in metric_list:
        if metric in (roc_auc_score, average_precision_score):
            # Ranking metrics need the continuous scores.
            scores.append(metric(targets, probabilities))
        else:  # accuracy_score, f1_score, precision_score, recall_score
            scores.append(metric(targets, hard_predictions))
    return eval_loss / num_batches, scores

In [7]:
def main(epochs=100, lr=0.001, weight_decay=1e-5, checkpoint_path='checkpoint.pt'):
    """Train a ``CombNet`` classifier, keep the best checkpoint and report test scores.

    Relies on the notebook-level ``train_dataset``, ``train_loader``,
    ``valid_loader`` and ``test_loader``. After every epoch the model is
    evaluated on the validation loader; whenever the validation loss improves,
    the weights are written to ``checkpoint_path``. The final test evaluation
    reloads that checkpoint, so the returned model carries the best weights.

    Args:
        epochs (int): Number of training epochs.
        lr (float): Adam learning rate.
        weight_decay (float): Adam weight decay.
        checkpoint_path (str): File the best ``state_dict`` is saved to.

    Returns:
        The trained model with the best-validation-loss weights loaded.
    """
    input_dim = train_dataset[0][0].shape[0]
    hidden_dim = input_dim
    output_dim = 1
    print('input_dim: {}, hidden_dim: {}, output_dim: {}'.format(input_dim, hidden_dim, output_dim))
    model = CombNet(input_dim, hidden_dim, output_dim, comb_type='cat')

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    # Order matters: the report strings below index the scores positionally.
    metrics = [accuracy_score, roc_auc_score, f1_score, average_precision_score]

    best_valid_loss = float('inf')
    for epoch in range(epochs):
        train_loss, train_scores = train_cross_entropy(model, device, train_loader, criterion, optimizer, metric_list=metrics)
        valid_loss, valid_scores = evaluate(model, device, valid_loader, criterion, metric_list=metrics)
        if valid_loss < best_valid_loss:
            # Keep only the weights with the lowest validation loss seen so far.
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), checkpoint_path)
        print(f'Epoch {epoch+1:03d}: | Train Loss: {train_loss:.4f} | Train Acc: {train_scores[0]*100:.2f}% | Train AUROC: {train_scores[1]:.2f} | Train F1: {train_scores[2]:.4f} | Train AUPRC: {train_scores[3]:.2f} || Val. Loss: {valid_loss:.4f} | Val. Acc: {valid_scores[0]*100:.2f}% | Val. AUROC: {valid_scores[1]:.2f} | Val. F1: {valid_scores[2]:.4f} | Val. AUPRC: {valid_scores[3]:.2f}')

    # Passing the checkpoint makes evaluate() reload the best weights first.
    test_loss, test_scores = evaluate(model, device, test_loader, criterion, metric_list=metrics, checkpoint=checkpoint_path)
    print(f'Test Loss: {test_loss:.4f} | Test Acc: {test_scores[0]*100:.2f}% | Test AUROC: {test_scores[1]:.2f} | Test F1: {test_scores[2]:.4f} | Test AUPRC: {test_scores[3]:.2f}')
    return model

In [8]:
# Run the train / validate / test pipeline defined in main() and keep the model it returns.
model = main()

input_dim: 256, hidden_dim: 256, output_dim: 1
Epoch 001: | Train Loss: 0.5806 | Train Acc: 69.96% | Train AUROC: 0.76 | Train F1: 0.6926 | Train AUPRC: 0.77 || Val. Loss: 0.5001 | Val. Acc: 75.47% | Val. AUROC: 0.83 | Val. F1: 0.7640 | Val. AUPRC: 0.85
Epoch 002: | Train Loss: 0.4655 | Train Acc: 78.54% | Train AUROC: 0.86 | Train F1: 0.7829 | Train AUPRC: 0.87 || Val. Loss: 0.4736 | Val. Acc: 76.54% | Val. AUROC: 0.86 | Val. F1: 0.7775 | Val. AUPRC: 0.87
Epoch 003: | Train Loss: 0.4133 | Train Acc: 81.07% | Train AUROC: 0.89 | Train F1: 0.8078 | Train AUPRC: 0.89 || Val. Loss: 0.4878 | Val. Acc: 76.78% | Val. AUROC: 0.85 | Val. F1: 0.7747 | Val. AUPRC: 0.85
Epoch 004: | Train Loss: 0.3820 | Train Acc: 82.76% | Train AUROC: 0.91 | Train F1: 0.8261 | Train AUPRC: 0.91 || Val. Loss: 0.4736 | Val. Acc: 79.15% | Val. AUROC: 0.87 | Val. F1: 0.7934 | Val. AUPRC: 0.87
Epoch 005: | Train Loss: 0.3493 | Train Acc: 84.33% | Train AUROC: 0.93 | Train F1: 0.8423 | Train AUPRC: 0.93 || Val. Loss: 

|Positive : Negative ratio|F1 score|AUPRC score|
|-----|-----|-----|
|1:1|0.79|0.87| 
|1:2|0.74|0.83|
|1:3|0.68|0.77|
|1:6|0.66|0.74|

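**TODO:** F1 and AUPRC drop as the negative ratio grows (see the table above). If we can find a method that mitigates this class-imbalance problem, consider adding it as a novelty of this work.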