In [1]:
import torch
import pandas as pd
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import AdamW, BertTokenizer, BertModel
from transformers.optimization import get_cosine_schedule_with_warmup
from tqdm.notebook import tqdm
from kobert_tokenizer import KoBERTTokenizer

In [2]:
##GPU 사용 시
device = torch.device("cuda:0")

In [None]:
# 파일 로드
train_df = pd.read_csv('/home/maroco/dataset/model/train_set.csv', sep='\t')
test_df = pd.read_csv('/home/maroco/dataset/model/test_set.csv', sep='\t')

In [4]:
# KoBERT Dataset
class KoBERTDataset(Dataset):
    def __init__(self, dataset, tokenizer, max_len):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        text = self.dataset.iloc[idx]['data']  # 텍스트 열 이름
        label = self.dataset.iloc[idx]['spam']  # 라벨 열 이름 (0 또는 1)

        # Tokenization
        inputs = self.tokenizer(
            text,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=self.max_len
        )

        input_ids = inputs['input_ids'].squeeze(0)
        attention_mask = inputs['attention_mask'].squeeze(0)
        token_type_ids = inputs['token_type_ids'].squeeze(0)

        return input_ids, attention_mask, token_type_ids, torch.tensor(label, dtype=torch.long)

In [5]:
# KoBERT 모델
class BERTClassifier(nn.Module):
    def __init__(self, bert, hidden_size=768, num_classes=2, dr_rate=0.5):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
        self.classifier = nn.Linear(hidden_size, num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def forward(self, input_ids, attention_mask, token_type_ids):
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        pooled_output = outputs[1]  # CLS token output
        if self.dr_rate:
            pooled_output = self.dropout(pooled_output)
        return self.classifier(pooled_output)

In [6]:
# Hyperparameters
max_len = 128
batch_size = 64
learning_rate = 5e-5
num_epochs = 3

In [7]:
# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained("monologg/kobert")
bert_model = BertModel.from_pretrained("monologg/kobert")
model = BERTClassifier(bert_model, num_classes=2, dr_rate=0.5).to(device)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'KoBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.


In [8]:
# Dataset and DataLoader
train_dataset = KoBERTDataset(train_df, tokenizer, max_len)
test_dataset = KoBERTDataset(test_df, tokenizer, max_len)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [9]:
# Optimizer and Scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate)
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=200,
    num_training_steps=len(train_dataloader) * num_epochs
)



In [10]:
# Loss function
loss_fn = nn.CrossEntropyLoss()

In [11]:
# Accuracy calculation
def calc_accuracy(preds, labels):
    _, pred_classes = preds.max(dim=1)
    return (pred_classes == labels).sum().item()

In [12]:
# Training and Validation loop
for epoch in range(num_epochs):
    model.train()
    total_acc, total_loss = 0, 0

    # Training phase
    for input_ids, attention_mask, token_type_ids, labels in tqdm(train_dataloader):
        optimizer.zero_grad()
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        token_type_ids = token_type_ids.to(device)
        labels = labels.to(device)

        outputs = model(input_ids, attention_mask, token_type_ids)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        total_acc += calc_accuracy(outputs, labels)

    print(f"Epoch {epoch+1} | Train Loss: {total_loss / len(train_dataloader):.4f} | Train Accuracy: {total_acc / len(train_dataset):.4f}")

    # Validation phase
    model.eval()
    total_acc, total_loss = 0, 0
    with torch.no_grad():
        for input_ids, attention_mask, token_type_ids, labels in tqdm(test_dataloader):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            token_type_ids = token_type_ids.to(device)
            labels = labels.to(device)

            outputs = model(input_ids, attention_mask, token_type_ids)
            loss = loss_fn(outputs, labels)

            total_loss += loss.item()
            total_acc += calc_accuracy(outputs, labels)

    print(f"Epoch {epoch+1} | Val Loss: {total_loss / len(test_dataloader):.4f} | Val Accuracy: {total_acc / len(test_dataset):.4f}")

  0%|          | 0/3189 [00:00<?, ?it/s]

Epoch 1 | Train Loss: 0.1078 | Train Accuracy: 0.9506


  0%|          | 0/798 [00:00<?, ?it/s]

Epoch 1 | Val Loss: 0.0854 | Val Accuracy: 0.9619


  0%|          | 0/3189 [00:00<?, ?it/s]

Epoch 2 | Train Loss: 0.0857 | Train Accuracy: 0.9601


  0%|          | 0/798 [00:00<?, ?it/s]

Epoch 2 | Val Loss: 0.0818 | Val Accuracy: 0.9622


  0%|          | 0/3189 [00:00<?, ?it/s]

Epoch 3 | Train Loss: 0.0792 | Train Accuracy: 0.9625


  0%|          | 0/798 [00:00<?, ?it/s]

Epoch 3 | Val Loss: 0.0803 | Val Accuracy: 0.9627


In [13]:
# 가중치만 저장
torch.save(model.state_dict(), "/home/maroco/dataset/model/sentiment_model.pt")