In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm

import torch
import re
from torch.utils.data import Dataset, DataLoader
from transformers import ElectraTokenizer, ElectraForSequenceClassification
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import AdamW
import torch.optim as optim

from sklearn.model_selection import train_test_split
from torch.optim.lr_scheduler import StepLR
import torch.nn as nn

import warnings
warnings.filterwarnings("ignore")

In [3]:
def set_seed(seed=42):
    np.random.seed(seed)  # 이 부분이 pandas의 sample 함수에도 영향을 줍니다.
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed()

In [4]:
# GPU 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [5]:
# 데이터 로드 및 분할
df = pd.read_csv("/content/gdrive/MyDrive/BOAZ/mini_pj/df_emotion.csv")
df = df[['wav_id', 'text', 'label']]
df_train, df_test = train_test_split(df, test_size=0.2, shuffle=True, random_state=32)

print(f"Train set size: {len(df_train)}, Test set size: {len(df_test)}")

Train set size: 15499, Test set size: 3875


In [6]:
df_train.head()

Unnamed: 0,wav_id,text,label
15207,5fb9e2f0576e9378b67ac63b,조금 더 기다려볼까?,0
855,5f5f042d54b23616212849af,해피랑 산책하다가 목줄이 끊겨버렸어. 그래서 붙잡느라고 혼이 났어.,2
4204,5f681a309e04b149046cb7e1,오랜만에 가족들이랑 동해 바다에 가서 놀다 오기로 했어. 근데 비가 와서 망했어.,3
8567,5f8256f7d338b948c4e6914c,어 그것도 너무 좋을 것 같아.,1
17337,5fbc83b344697678c497b9ea,나 요즘 주식 시작했어.,3


In [7]:
df['label'].unique()

array([1, 0, 3, 4, 2, 5, 6])

In [8]:
def set_seed(seed=42):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

# Custom Dataset 정의
class KoreanTextDataset(Dataset):
    def __init__(self, df, tokenizer, max_len, with_labels=True):
        self.tokenizer = tokenizer
        self.data = df
        self.sentences = df["text"].values
        self.labels = df["label"].values if with_labels else None
        self.max_len = max_len
        self.with_labels = with_labels

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, index):
        sentence = self.sentences[index]

        inputs = self.tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_attention_mask=True,
            truncation=True
        )

        ids = torch.tensor(inputs['input_ids'], dtype=torch.long)
        mask = torch.tensor(inputs['attention_mask'], dtype=torch.long)

        if self.with_labels:
            label = self.labels[index]
            return {
                'ids': ids,
                'mask': mask,
                'labels': torch.tensor(label, dtype=torch.long)
            }
        else:
            return {
                'ids': ids,
                'mask': mask
            }


In [9]:
7# 모델 및 토크나이저 불러오기
koelectra_tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-base-discriminator")
koelectra_model = ElectraForSequenceClassification.from_pretrained("monologg/koelectra-base-discriminator", num_labels=7).to(device)

tokenizer_config.json:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/279k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/467 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# 모델 학습 함수
def train_model(model, tokenizer, train_dataframe, epochs, epochs_stop):
    BATCH_SIZE = 16
    MAX_LEN = 256

    # Train-validation split
    train_df, val_df = train_test_split(train_dataframe, test_size=0.2, random_state=42)

    # Datasets and DataLoaders
    train_data = KoreanTextDataset(train_df, tokenizer, MAX_LEN)
    val_data = KoreanTextDataset(val_df, tokenizer, MAX_LEN)
    train_dataloader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
    val_dataloader = DataLoader(val_data, batch_size=BATCH_SIZE)

    # Optimizer, loss function, and learning rate scheduler
    optimizer = AdamW(model.parameters(), lr=2e-5)
    criterion = nn.CrossEntropyLoss()
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.9, patience=5, verbose=True)

    model.train()
    no_improve = 0
    prev_loss = float('inf')

    for epoch in range(epochs):
        running_loss = 0.0
        running_acc = 0.0  # Accuracy tracking
        model.train()

        for step, data in enumerate(train_dataloader):
            ids = data['ids'].to(device)
            mask = data['mask'].to(device)
            labels = data['labels'].to(device)

            outputs = model(ids, attention_mask=mask)
            loss = criterion(outputs.logits, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            running_acc += calc_accuracy(outputs.logits, labels)  # Accuracy 계산

        # Validation
        model.eval()
        val_loss = 0.0
        val_acc = 0.0  # Validation accuracy tracking

        with torch.no_grad():
            for data in val_dataloader:
                ids = data['ids'].to(device)
                mask = data['mask'].to(device)
                labels = data['labels'].to(device)

                outputs = model(ids, attention_mask=mask)
                loss = criterion(outputs.logits, labels)
                val_loss += loss.item()
                val_acc += calc_accuracy(outputs.logits, labels)  # Validation accuracy 계산

        avg_train_loss = running_loss / len(train_dataloader)
        avg_val_loss = val_loss / len(val_dataloader)
        avg_train_acc = running_acc / len(train_dataloader)
        avg_val_acc = val_acc / len(val_dataloader)

        print(f"EPOCH {epoch+1} completed: Training Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}, Training Accuracy: {avg_train_acc:.4f}, Validation Accuracy: {avg_val_acc:.4f}")

        # Learning rate scheduler step
        scheduler.step(avg_val_loss)

        # Early stopping check
        if prev_loss - avg_val_loss <= 0.0001:
            no_improve += 1
        else:
            no_improve = 0

        prev_loss = avg_val_loss

        if no_improve == epochs_stop:
            print("Early stopping due to no improvement in validation loss.")
            break

    return model

In [14]:
# Accuracy 계산 함수
def calc_accuracy(preds, labels):
    _, max_indices = torch.max(preds, dim=1)
    accuracy = (max_indices == labels).sum().item() / max_indices.size(0)
    return accuracy

In [15]:
# 모델 학습 실행
koelectra_model_trained = train_model(koelectra_model, koelectra_tokenizer, df_train, epochs=10, epochs_stop=5)

EPOCH 1 completed: Training Loss: 0.5370, Validation Loss: 1.5478, Training Accuracy: 0.8138, Validation Accuracy: 0.5446
EPOCH 2 completed: Training Loss: 0.4719, Validation Loss: 1.6894, Training Accuracy: 0.8380, Validation Accuracy: 0.5415
EPOCH 3 completed: Training Loss: 0.4225, Validation Loss: 1.7641, Training Accuracy: 0.8532, Validation Accuracy: 0.5359
EPOCH 4 completed: Training Loss: 0.3944, Validation Loss: 1.9416, Training Accuracy: 0.8616, Validation Accuracy: 0.5347
EPOCH 5 completed: Training Loss: 0.3766, Validation Loss: 1.8909, Training Accuracy: 0.8680, Validation Accuracy: 0.5351
EPOCH 6 completed: Training Loss: 0.3588, Validation Loss: 1.9332, Training Accuracy: 0.8722, Validation Accuracy: 0.5350
EPOCH 7 completed: Training Loss: 0.3336, Validation Loss: 1.9932, Training Accuracy: 0.8827, Validation Accuracy: 0.5421
EPOCH 8 completed: Training Loss: 0.3111, Validation Loss: 2.0144, Training Accuracy: 0.8889, Validation Accuracy: 0.5344
EPOCH 9 completed: Train

In [16]:
def evaluate_model(model, tokenizer, test_dataframe, batch_size=16, max_len=256):
    model.eval()
    test_data = KoreanTextDataset(test_dataframe, tokenizer, max_len)
    test_dataloader = DataLoader(test_data, batch_size=batch_size)

    test_acc = 0.0
    with torch.no_grad():
        for data in test_dataloader:
            ids = data['ids'].to(device)
            mask = data['mask'].to(device)
            labels = data['labels'].to(device)

            outputs = model(ids, attention_mask=mask)
            test_acc += calc_accuracy(outputs.logits, labels)

    avg_test_acc = test_acc / len(test_dataloader)
    print(f"Test Accuracy: {avg_test_acc:.4f}")
    return avg_test_acc

In [17]:
# Test 데이터 평가
evaluate_model(koelectra_model_trained, koelectra_tokenizer, df_test)

Test Accuracy: 0.5291


0.5291495198902607