## 개인 프로젝트 : 악성 댓글 탐지
### 모델링 부분

### 0. 필요한 모듈 불러오기

In [18]:
import random
import json
from ast import literal_eval

import numpy as np
import pandas as pd

from tqdm.auto import tqdm

import torch
from torch.nn import LSTM, Module, Linear
from torch.utils.data import DataLoader, Dataset

from gensim.models import FastText

from torchmetrics.classification import BinaryAccuracy, BinaryF1Score

### 1. Padding + FastText 임베딩

In [26]:
# literal_eval을 사용해서 list를 문자열로 불러오지 않고 그대로 불러온다

train = pd.read_csv('../datasets/train.csv', converters={'content': literal_eval})
val = pd.read_csv('../datasets/val.csv', converters={'content': literal_eval})
test = pd.read_csv('../datasets/test.csv', converters={'content': literal_eval})

In [27]:
train_text = [sent for sent in train['content']]
train_label = train['label'].tolist()

val_text = [sent for sent in val['content']]
val_label = val['label'].tolist()

test_text = [sent for sent in test['content']]
test_label = test['label'].tolist()

In [31]:
# token 길이가 30 이하인 문장이 96% 정도이므로 최대 sequence 길이를 30으로 설정
proportion = sum([1 if len(text) <= 30 else 0 for text in train_text]) / len(train_text)
proportion

0.9529557412565769

In [33]:
max_len = 30

In [35]:
# 사전에 https://fasttext.cc/docs/en/crawl-vectors.html에서 한국어 파일을 받아
# fastText_pretrained.model로 저장해두었음

fastText = FastText.load('../../utils/fastText_pretrained.model')

In [36]:
def padding(text_list):
    max_len = 30

    pad_text_list = []

    for text_tokens in text_list:
        if len(text_tokens) >= max_len:
            pad_text = text_tokens[len(text_tokens) - max_len:]
        else:
            pad_text = ['<pad>' for _ in range((max_len - len(text_tokens)))] + text_tokens
        pad_text_list.append(pad_text)

    return pad_text_list

In [37]:
def vectorization(fastText, text_list):
    vec_size = fastText.vector_size
    text_vec_list = []

    for text_tokens in text_list:
        text_vec = []
        for token in text_tokens:
            if token == '<pad>':
                text_vec.append(np.zeros(vec_size))
            else:
                text_vec.append(fastText.wv.get_vector(token))
        text_vec_list.append(text_vec)

    return text_vec_list

In [38]:
X_train_pad = padding(train_text)
X_val_pad = padding(val_text)
X_test_pad = padding(test_text)

In [41]:
X_train = vectorization(fastText, X_train_pad)
X_val = vectorization(fastText, X_val_pad)
X_test = vectorization(fastText, X_test_pad)

In [44]:
X_train = torch.Tensor(X_train)
X_val = torch.Tensor(X_val)
X_test = torch.Tensor(X_test)

  X_train = torch.Tensor(X_train)


In [45]:
# 메모리 문제로 인해 패딩과 임베딩을 완료한 모델을 저장 후 불러오기

torch.save(X_train, '../datasets/X_train.pt')
torch.save(X_val, '../datasets/X_val.pt')
torch.save(X_test, '../datasets/X_test.pt')

In [46]:
X_train = torch.load('../datasets/X_train.pt')
X_val = torch.load('../datasets/X_val.pt')
X_test = torch.load('../datasets/X_test.pt')

### 2. 모델링

In [129]:
var_utils_dict = {'max_len': max_len}

# 변수 저장
with open('../../utils/var_utils.json', 'w') as f:
    json.dump(var_utils_dict, f)

In [130]:
class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {}
        item['encodings'] = torch.tensor(self.encodings[idx])
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [177]:
train_dataset = CustomDataset(X_train, train_label)
val_dataset = CustomDataset(X_val, val_label)
test_dataset = CustomDataset(X_test, test_label)

batch_size = 16
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [178]:
class CustomModel(Module):
    def __init__(self, embed_dim, hidden_dim, output_dim, device, num_layers, bidirectional):
        super(CustomModel, self).__init__()
        self.device = device

        self.mul = (2 if bidirectional else 1)

        self.lstm = LSTM(input_size=embed_dim,
                         hidden_size=hidden_dim,
                         num_layers=num_layers,
                         bidirectional=bidirectional,
                         batch_first=True)
        self.fc = Linear(hidden_dim*self.mul, output_dim)

        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

    def forward(self, x):
        hidden_0 = torch.zeros(self.num_layers*self.mul, x.size(0), self.hidden_dim).to(self.device)
        cell_0 = torch.zeros(self.num_layers*self.mul, x.size(0), self.hidden_dim).to(self.device)
        # (num layers * bidirectional, batch size, hidden dim)

        out_lstm, _ = self.lstm(x, (hidden_0, cell_0))

        output = self.fc(out_lstm[:, -1, :])

        return output

In [179]:
EMBED_DIM = 300 # FastText 벡터 사이즈와 동일
device = 'cuda' if torch.cuda.is_available() else 'cpu'
hidden_dim=400
output_dim=2
num_layers=1
bidirectional=True

model = CustomModel(embed_dim=EMBED_DIM, hidden_dim=hidden_dim,
                    output_dim=output_dim, device=device,
                    num_layers=num_layers, bidirectional=bidirectional)

In [180]:
print(model)

CustomModel(
  (lstm): LSTM(300, 400, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=800, out_features=2, bias=True)
)


In [181]:
var_models_dict = {'embed_dim': EMBED_DIM,
                   'hidden_dim': hidden_dim,
                   'output_dim': output_dim,
                   'device': device,
                   'num_layers': num_layers,
                   'bidirectional': bidirectional}

import json

# 변수 저장
with open('../../models/var_models.json', 'w') as f:
    json.dump(var_models_dict, f)

### 3. 훈련, 평가

In [182]:
# 시드 고정
seed = 0
deterministic = True

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

In [183]:
# Early Stopping 모듈을 다음과 같이 불러왔다.
# Reference: https://github.com/Bjarten/early-stopping-pytorch/blob/main/early_stopping_pytorch/early_stopping.py

class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience."""
    def __init__(self, patience=7, verbose=False, delta=0, path='checkpoint.pt', trace_func=print):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, prints a message for each validation loss improvement.
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
            path (str): Path for the checkpoint to be saved to.
                            Default: 'checkpoint.pt'
            trace_func (function): trace print function.
                            Default: print
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_val_loss = None
        self.early_stop = False
        self.val_loss_min = np.inf
        self.delta = delta
        self.path = path
        self.trace_func = trace_func

    def __call__(self, val_loss, model):
        # Check if validation loss is nan
        if np.isnan(val_loss):
            self.trace_func("Validation loss is NaN. Ignoring this epoch.")
            return

        if self.best_val_loss is None:
            self.best_val_loss = val_loss
            self.save_checkpoint(val_loss, model)
        elif val_loss < self.best_val_loss - self.delta:
            # Significant improvement detected
            self.best_val_loss = val_loss
            self.save_checkpoint(val_loss, model)
            self.counter = 0  # Reset counter since improvement occurred
        else:
            # No significant improvement
            self.counter += 1
            self.trace_func(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True

    def save_checkpoint(self, val_loss, model):
        '''Saves model when validation loss decreases.'''
        if self.verbose:
            self.trace_func(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

In [184]:
from tqdm.auto import tqdm

num_epochs = 30
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
criterion = torch.nn.CrossEntropyLoss()
model.to(device)

es = EarlyStopping(patience=3, verbose=True, delta=0.001, path='../../models/checkpoint.pt')

for epoch in range(num_epochs):

    metric_acc = BinaryAccuracy().to(device)
    metric_f1 = BinaryF1Score().to(device)

    model.train()
    cost = 0

    for batch in tqdm(train_loader):
        inputs = batch['encodings'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)

        loss.backward()

        optimizer.step()

        cost += loss.item()

        # 정확도, F1 측정
        _, predict = torch.max(outputs.data, -1) # 행별로 최댓값과 index 출력

        acc = metric_acc(predict, labels)
        f1 = metric_f1(predict, labels)

    epoch_acc = metric_acc.compute()
    epoch_f1 = metric_f1.compute()

    avg_loss = cost / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs} - Average Loss: {avg_loss:.4f} - Accuracy: {epoch_acc:.4f} - F1: {epoch_f1:.4f}")

    val_cost = 0

    metric_val_acc = BinaryAccuracy().to(device)
    metric_val_f1 = BinaryF1Score().to(device)

    model.eval()

    with torch.no_grad():
        for batch in tqdm(val_loader):
            val_inputs = batch['encodings'].to(device)
            val_labels = batch['labels'].to(device)

            val_outputs = model(val_inputs)
            val_loss = criterion(val_outputs, val_labels)

            val_cost += val_loss.item()

            # 정확도, F1 측정
            _, val_predict = torch.max(val_outputs.data, -1) # 행별로 최댓값과 index 출력

            val_acc = metric_val_acc(val_predict, val_labels)
            val_f1 = metric_val_f1(val_predict, val_labels)

        epoch_val_acc = metric_val_acc.compute()
        epoch_val_f1 = metric_val_f1.compute()

        avg_val_loss = val_cost / len(val_loader)
        print(f"Epoch {epoch+1}/{num_epochs} - Average Val Loss: {avg_val_loss:.4f} - Val Accuracy: {epoch_val_acc:.4f} - Val F1: {epoch_val_f1:.4f}")

    es(avg_val_loss, model)
    if es.early_stop:
        break

  0%|          | 0/202 [00:00<?, ?it/s]

  item['encodings'] = torch.tensor(self.encodings[idx])


Epoch 1/30 - Average Loss: 0.6545 - Accuracy: 0.6589 - F1: 0.6506


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 1/30 - Average Val Loss: 0.5843 - Val Accuracy: 0.7417 - Val F1: 0.7224
Validation loss decreased (inf --> 0.584320).  Saving model ...


  0%|          | 0/202 [00:00<?, ?it/s]

Epoch 2/30 - Average Loss: 0.5318 - Accuracy: 0.7518 - F1: 0.7362


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 2/30 - Average Val Loss: 0.4956 - Val Accuracy: 0.7667 - Val F1: 0.7572
Validation loss decreased (0.584320 --> 0.495555).  Saving model ...


  0%|          | 0/202 [00:00<?, ?it/s]

Epoch 3/30 - Average Loss: 0.4810 - Accuracy: 0.7809 - F1: 0.7737


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 3/30 - Average Val Loss: 0.4861 - Val Accuracy: 0.7472 - Val F1: 0.7331
Validation loss decreased (0.495555 --> 0.486122).  Saving model ...


  0%|          | 0/202 [00:00<?, ?it/s]

Epoch 4/30 - Average Loss: 0.4477 - Accuracy: 0.7948 - F1: 0.7872


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 4/30 - Average Val Loss: 0.4716 - Val Accuracy: 0.7917 - Val F1: 0.8031
Validation loss decreased (0.486122 --> 0.471555).  Saving model ...


  0%|          | 0/202 [00:00<?, ?it/s]

Epoch 5/30 - Average Loss: 0.4261 - Accuracy: 0.8087 - F1: 0.8028


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 5/30 - Average Val Loss: 0.4879 - Val Accuracy: 0.7639 - Val F1: 0.7709
EarlyStopping counter: 1 out of 3


  0%|          | 0/202 [00:00<?, ?it/s]

Epoch 6/30 - Average Loss: 0.4143 - Accuracy: 0.8149 - F1: 0.8081


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 6/30 - Average Val Loss: 0.4456 - Val Accuracy: 0.7944 - Val F1: 0.7886
Validation loss decreased (0.471555 --> 0.445624).  Saving model ...


  0%|          | 0/202 [00:00<?, ?it/s]

Epoch 7/30 - Average Loss: 0.3964 - Accuracy: 0.8205 - F1: 0.8143


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 7/30 - Average Val Loss: 0.4587 - Val Accuracy: 0.7694 - Val F1: 0.7522
EarlyStopping counter: 1 out of 3


  0%|          | 0/202 [00:00<?, ?it/s]

Epoch 8/30 - Average Loss: 0.3918 - Accuracy: 0.8254 - F1: 0.8196


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 8/30 - Average Val Loss: 0.4482 - Val Accuracy: 0.7833 - Val F1: 0.7809
EarlyStopping counter: 2 out of 3


  0%|          | 0/202 [00:00<?, ?it/s]

Epoch 9/30 - Average Loss: 0.3815 - Accuracy: 0.8270 - F1: 0.8217


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 9/30 - Average Val Loss: 0.4381 - Val Accuracy: 0.8000 - Val F1: 0.7943
Validation loss decreased (0.445624 --> 0.438064).  Saving model ...


  0%|          | 0/202 [00:00<?, ?it/s]

Epoch 10/30 - Average Loss: 0.3690 - Accuracy: 0.8341 - F1: 0.8291


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 10/30 - Average Val Loss: 0.4391 - Val Accuracy: 0.7972 - Val F1: 0.8011
EarlyStopping counter: 1 out of 3


  0%|          | 0/202 [00:00<?, ?it/s]

Epoch 11/30 - Average Loss: 0.3547 - Accuracy: 0.8394 - F1: 0.8349


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 11/30 - Average Val Loss: 0.4518 - Val Accuracy: 0.7917 - Val F1: 0.7956
EarlyStopping counter: 2 out of 3


  0%|          | 0/202 [00:00<?, ?it/s]

Epoch 12/30 - Average Loss: 0.3463 - Accuracy: 0.8521 - F1: 0.8476


  0%|          | 0/23 [00:00<?, ?it/s]

Epoch 12/30 - Average Val Loss: 0.4940 - Val Accuracy: 0.7972 - Val F1: 0.8053
EarlyStopping counter: 3 out of 3


In [185]:
# 모델 checkpoint 불러오기

ckp_model = model
ckp_model.load_state_dict(torch.load('../../models/checkpoint.pt'))

<All keys matched successfully>

In [186]:
ckp_model.eval()

criterion = torch.nn.CrossEntropyLoss()
ckp_model.to(device)

with torch.no_grad():
    metric_test_acc = BinaryAccuracy().to(device)
    metric_test_f1 = BinaryF1Score().to(device)

    test_cost = 0

    for batch in tqdm(test_loader):
        test_inputs = batch['encodings'].to(device)
        test_labels = batch['labels'].to(device)

        test_outputs = ckp_model(test_inputs)
        test_loss = criterion(test_outputs, test_labels)

        test_cost += test_loss.item()

        # 정확도, F1 측정
        _, test_predict = torch.max(test_outputs.data, 1) # 행별로 최댓값과 index 출력

        test_acc = metric_test_acc(test_predict, test_labels)
        test_f1 = metric_test_f1(test_predict, test_labels)

    epoch_test_acc = metric_test_acc.compute()
    epoch_test_f1 = metric_test_f1.compute()

    avg_test_loss = test_cost / len(test_loader)

    print(f"Average Test Loss: {avg_test_loss:.4f} - Test Accuracy: {epoch_test_acc:.4f} - Test F1: {epoch_test_f1:.4f}")

  0%|          | 0/25 [00:00<?, ?it/s]

  item['encodings'] = torch.tensor(self.encodings[idx])


Average Test Loss: 0.4387 - Test Accuracy: 0.7794 - Test F1: 0.7609
