In [None]:
# from google.colab import drive
# drive.mount('/content/gdrive')

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm

import torch
import re
from torch.utils.data import Dataset, DataLoader
from transformers import ElectraTokenizer, ElectraForSequenceClassification
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import AdamW
import torch.optim as optim

from sklearn.model_selection import train_test_split
from torch.optim.lr_scheduler import StepLR
import torch.nn as nn

import warnings
warnings.filterwarnings("ignore")

In [2]:
def set_seed(seed=42):
    np.random.seed(seed)  # 이 부분이 pandas의 sample 함수에도 영향을 줍니다.
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed()

In [3]:
# GPU 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [4]:
# 데이터 로드 및 분할
df = pd.read_csv("/content/df_calculation.csv")

# df = pd.read_csv("/content/gdrive/MyDrive/BOAZ_MiniProject1/data/df_emotion.csv")
# df.rename(columns={"emo_num": "label"}, inplace=True)
df = df[['wav_id', 'text', 'label']]
df_train, df_test = train_test_split(df, test_size=0.2, shuffle=True, random_state=32)

print(f"Train set size: {len(df_train)}, Test set size: {len(df_test)}")

Train set size: 12370, Test set size: 3093


In [5]:
df_train.head()

Unnamed: 0,wav_id,text,label
7290,5f9380c7d338b948c4e6aa17,나 너무 우울해.,2
10301,5fb4cdcb576e9378b67ac08a,어. 우울하고 답답해.,2
1107,5f5c5d8b2e23c7161accd00c,정말 너무 기분이 좋더라고. 내가 이 맛을 보려고 열심히 최선을 다한 것 같아.,1
10921,5fb87c634c55eb78bd7cdc77,나 오늘 짭새 구경하다왔어.,3
9033,5f9228099e04b149046cdc09,단톡방에서 아는 지인이 주식을 추천해줬어.,2


In [6]:
df['label'].unique()

array([1, 2, 0, 3])

In [7]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
import torch.nn as nn

def set_seed(seed=42):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

# Custom Dataset 정의
class KoreanTextDataset(Dataset):
    def __init__(self, df, tokenizer, max_len, with_labels=True):
        self.tokenizer = tokenizer
        self.data = df
        self.sentences = df["text"].values
        self.labels = df["label"].values if with_labels else None
        self.max_len = max_len
        self.with_labels = with_labels

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, index):
        sentence = self.sentences[index]

        inputs = self.tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_attention_mask=True,
            truncation=True
        )

        ids = torch.tensor(inputs['input_ids'], dtype=torch.long)
        mask = torch.tensor(inputs['attention_mask'], dtype=torch.long)

        if self.with_labels:
            label = self.labels[index]
            return {
                'ids': ids,
                'mask': mask,
                'labels': torch.tensor(label, dtype=torch.long)
            }
        else:
            return {
                'ids': ids,
                'mask': mask
            }

In [24]:
# 모델 및 토크나이저 불러오기
tokenizer = AutoTokenizer.from_pretrained("klue/roberta-base")
model = AutoModelForSequenceClassification.from_pretrained("klue/roberta-base", num_labels=4).to(device)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
# 모델 학습 함수
def train_model(model, tokenizer, train_dataframe, epochs, epochs_stop):
    BATCH_SIZE = 16
    MAX_LEN = 256

    # Train-validation split
    train_df, val_df = train_test_split(train_dataframe, test_size=0.2, random_state=42)

    # Datasets and DataLoaders
    train_data = KoreanTextDataset(train_df, tokenizer, MAX_LEN)
    val_data = KoreanTextDataset(val_df, tokenizer, MAX_LEN)
    train_dataloader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
    val_dataloader = DataLoader(val_data, batch_size=BATCH_SIZE)

    # Optimizer, loss function, and learning rate scheduler
    optimizer = AdamW(model.parameters(), lr=2e-5)
    criterion = nn.CrossEntropyLoss()
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.9, patience=10, verbose=True)

    model.train()
    no_improve = 0
    prev_loss = float('inf')

    for epoch in range(epochs):
        running_loss = 0.0
        running_acc = 0.0  # Accuracy tracking
        model.train()

        for step, data in enumerate(train_dataloader):
            ids = data['ids'].to(device)
            mask = data['mask'].to(device)
            labels = data['labels'].to(device)

            outputs = model(ids, attention_mask=mask)
            loss = criterion(outputs.logits, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            running_acc += calc_accuracy(outputs.logits, labels)  # Accuracy 계산

        # Validation
        model.eval()
        val_loss = 0.0
        val_acc = 0.0  # Validation accuracy tracking

        with torch.no_grad():
            for data in val_dataloader:
                ids = data['ids'].to(device)
                mask = data['mask'].to(device)
                labels = data['labels'].to(device)

                outputs = model(ids, attention_mask=mask)
                loss = criterion(outputs.logits, labels)
                val_loss += loss.item()
                val_acc += calc_accuracy(outputs.logits, labels)  # Validation accuracy 계산

        avg_train_loss = running_loss / len(train_dataloader)
        avg_val_loss = val_loss / len(val_dataloader)
        avg_train_acc = running_acc / len(train_dataloader)
        avg_val_acc = val_acc / len(val_dataloader)

        print(f"EPOCH {epoch+1} completed: Training Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}, Training Accuracy: {avg_train_acc:.4f}, Validation Accuracy: {avg_val_acc:.4f}")

        # Learning rate scheduler step
        scheduler.step(avg_val_loss)

        # Early stopping check
        if prev_loss - avg_val_loss <= 0.0001:
            no_improve += 1
        else:
            no_improve = 0

        prev_loss = avg_val_loss

        if no_improve == epochs_stop:
            print("Early stopping due to no improvement in validation loss.")
            break

    return model


In [26]:
# Accuracy 계산 함수
def calc_accuracy(preds, labels):
    _, max_indices = torch.max(preds, dim=1)
    accuracy = (max_indices == labels).sum().item() / max_indices.size(0)
    return accuracy

In [27]:
# 모델 학습 실행 -> 1시간 소요
klueroberta_model_trained = train_model(model, tokenizer, df_train, epochs=10, epochs_stop=5)

EPOCH 1 completed: Training Loss: 0.8587, Validation Loss: 0.7508, Training Accuracy: 0.6521, Validation Accuracy: 0.7105
EPOCH 2 completed: Training Loss: 0.6725, Validation Loss: 0.7119, Training Accuracy: 0.7406, Validation Accuracy: 0.7226
EPOCH 3 completed: Training Loss: 0.5858, Validation Loss: 0.7376, Training Accuracy: 0.7807, Validation Accuracy: 0.7115
EPOCH 4 completed: Training Loss: 0.4974, Validation Loss: 0.7967, Training Accuracy: 0.8162, Validation Accuracy: 0.7190
EPOCH 5 completed: Training Loss: 0.4267, Validation Loss: 0.8393, Training Accuracy: 0.8442, Validation Accuracy: 0.7079
EPOCH 6 completed: Training Loss: 0.3611, Validation Loss: 0.9296, Training Accuracy: 0.8708, Validation Accuracy: 0.7126
EPOCH 7 completed: Training Loss: 0.3254, Validation Loss: 1.0369, Training Accuracy: 0.8831, Validation Accuracy: 0.7073
Early stopping due to no improvement in validation loss.


In [12]:
# def evaluate_model(model, tokenizer, test_dataframe, batch_size=16, max_len=256):
#     model.eval()
#     test_data = KoreanTextDataset(test_dataframe, tokenizer, max_len)
#     test_dataloader = DataLoader(test_data, batch_size=batch_size)

#     test_acc = 0.0
#     with torch.no_grad():
#         for data in test_dataloader:
#             ids = data['ids'].to(device)
#             mask = data['mask'].to(device)
#             labels = data['labels'].to(device)

#             outputs = model(ids, attention_mask=mask)
#             test_acc += calc_accuracy(outputs.logits, labels)

#     avg_test_acc = test_acc / len(test_dataloader)
#     print(f"Test Accuracy: {avg_test_acc:.4f}")
#     return avg_test_acc


# # Test 데이터 평가
# evaluate_model(klueroberta_model_trained, tokenizer, df_test)

Test Accuracy: 0.6990


0.6989690721649484

In [28]:
# prompt: 해당 모델의 f1 score도 보여줘

from sklearn.metrics import f1_score

def evaluate_model_with_f1(model, tokenizer, test_dataframe, batch_size=16, max_len=256):
    model.eval()
    test_data = KoreanTextDataset(test_dataframe, tokenizer, max_len)
    test_dataloader = DataLoader(test_data, batch_size=batch_size)

    test_acc = 0.0
    all_labels = []
    all_preds = []
    with torch.no_grad():
        for data in test_dataloader:
            ids = data['ids'].to(device)
            mask = data['mask'].to(device)
            labels = data['labels'].to(device)

            outputs = model(ids, attention_mask=mask)
            _, preds = torch.max(outputs.logits, dim=1)

            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())

            test_acc += calc_accuracy(outputs.logits, labels)

    avg_test_acc = test_acc / len(test_dataloader)
    print(f"Test Accuracy: {avg_test_acc:.4f}")

    f1 = f1_score(all_labels, all_preds, average='weighted')
    print(f"F1 Score: {f1:.4f}")
    return avg_test_acc, f1


# Test 데이터 평가 및 F1 Score 출력
evaluate_model_with_f1(klueroberta_model_trained, tokenizer, df_test)


Test Accuracy: 0.7026
F1 Score: 0.6928


(0.7025773195876289, 0.6928095574033654)

In [29]:
# npy 저장 코드
import numpy as np

def predict_and_save_results(model, tokenizer, test_dataframe, batch_size=16, max_len=256):
    model.eval()
    test_data = KoreanTextDataset(test_dataframe, tokenizer, max_len, with_labels=True)
    test_dataloader = DataLoader(test_data, batch_size=batch_size)

    true_labels = []
    predicted_classes = []
    probabilities = []

    with torch.no_grad():
        for data in test_dataloader:
            ids = data['ids'].to(device)
            mask = data['mask'].to(device)
            labels = data['labels'].to(device)

            outputs = model(ids, attention_mask=mask)
            logits = outputs.logits
            probs = torch.softmax(logits, dim=1)

            true_labels.extend(labels.cpu().numpy())
            predicted_classes.extend(torch.argmax(logits, dim=1).cpu().numpy())
            probabilities.extend(probs.cpu().numpy())

    # Save results to npy files
    np.save('true_labels.npy', np.array(true_labels))
    np.save('predicted_classes.npy', np.array(predicted_classes))
    np.save('probabilities.npy', np.array(probabilities))

# Call the function to save the results
predict_and_save_results(klueroberta_model_trained, tokenizer, df_test)


In [30]:
import numpy as np

true = np.load('/content/true_labels.npy')
probabilities = np.load('/content/probabilities.npy')
predicted_classes = np.load('/content/predicted_classes.npy')
print(true.shape)
print(predicted_classes.shape)
print(probabilities.shape)


(3093,)
(3093,)
(3093, 4)
