In [1]:
import csv
import sys
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from konlpy.tag import Komoran
import random
from torchtext.data import TabularDataset
from torchtext import data

In [2]:
#GPU와 CPU 사용 설정
USE_CUDA = torch.cuda.is_available()
DEVICE = torch.device("cuda" if USE_CUDA else "cpu")
print("cpu와 cuda 중 다음 기기로 학습함:", DEVICE)

cpu와 cuda 중 다음 기기로 학습함: cuda


In [3]:
#konlpy의 Komoran을 사용하여 텍스트와 레이블 토크나이즈(패딩도 자동으로 진행)
tokenizer = Komoran()
TEXT = data.Field(sequential=True,
                  use_vocab=True,
                  tokenize=tokenizer.morphs,
                  lower=True,
                  batch_first=True,
                  fix_length=60)

LABEL = data.Field(sequential=False,
                   use_vocab=False,
                   batch_first=False,
                   is_target=True,
                  dtype = torch.float)

In [4]:
#train data와 test data 분할
train_data, test_data = TabularDataset.splits(path = '.', train = '악플_train.csv', test = '악플_test.csv', format = 'csv', fields = [('text', TEXT), ('label', LABEL)], skip_header = True)

In [5]:
len(train_data)

70537

In [6]:
len(test_data)

17633

In [7]:
#단어 집합(중복을 제거한 총 단어들의 집합) 만들기
TEXT.build_vocab(train_data, min_freq = 5, max_size = 20000) # min_freq = 최소 5번 이상 나온 단어만 단어 집합에 담는다. max_size = 단어 집합의 최대 사이즈 = 20000

In [8]:
len(TEXT.vocab)

13773

In [9]:
#하이퍼 파라미터 변수 정의
batch_size = 128
lr = 0.001
epochs = 10

In [10]:
#훈련 데이터와 검증 데이터 분할
train_data, val_data = train_data.split(split_ratio = 0.8)

train_iter, valid_iter, test_iter = data.BucketIterator.splits(
        (train_data, val_data, test_data), batch_size=batch_size,
        shuffle=True, repeat=False, device = DEVICE, sort=False)

In [11]:
print('훈련 데이터의 미니 배치의 개수 : {}'.format(len(train_iter)))
print('테스트 데이터의 미니 배치의 개수 : {}'.format(len(test_iter)))
print('검증 데이터의 미니 배치의 개수 : {}'.format(len(valid_iter)))

훈련 데이터의 미니 배치의 개수 : 441
테스트 데이터의 미니 배치의 개수 : 138
검증 데이터의 미니 배치의 개수 : 111


In [12]:
class LSTM(nn.Module):
    def __init__(self, **model_config):
        super(LSTM, self).__init__()
        
        if model_config['emb_type'] == 'glove' or 'fasttext':
            self.emb = nn.Embedding(model_config['vocab_size'],
                                   model_config['emb_dim'],
                                   _weight = TEXT.vocab.vectors)
        else:
            self.emb = nn.Embedding(model_config['vocab_size'],
                                   model_config['emb_dim'])
            
        self.bidirectional = model_config['bidirectional']
        self.num_direction = 2 if model_config['bidirectional'] else 1
        self.model_type = model_config['model_type']
        
        self.LSTM = nn.LSTM(input_size = model_config['emb_dim'],
                           hidden_size = model_config['hidden_dim'],
                           dropout = model_config['dropout'],
                           bidirectional = model_config['bidirectional'],
                           batch_first = model_config['batch_first'])
        
        self.fc = nn.Linear(model_config['hidden_dim'] * self.num_direction,
                           model_config['output_dim'])
        self.drop = nn.Dropout(model_config['dropout'])
        
    def forward(self, x):
        emb = self.emb(x)
        output, (hidden, cell) = self.LSTM(emb)
        last_output = output[:,-1,:]
        
        return self.fc(self.drop(last_output))

In [13]:
sample_for_check = next(iter(train_iter))
print(sample_for_check)
print(sample_for_check.text)
print(sample_for_check.label)


[torchtext.data.batch.Batch of size 128]
	[.text]:[torch.cuda.LongTensor of size 128x60 (GPU 0)]
	[.label]:[torch.cuda.FloatTensor of size 128 (GPU 0)]
tensor([[ 211,    2,   52,  ...,    1,    1,    1],
        [   3,   20, 1248,  ...,    1,    1,    1],
        [   0,  886,    1,  ...,    1,    1,    1],
        ...,
        [  78,   12, 5418,  ...,    1,    1,    1],
        [9875,    5,   73,  ...,    1,    1,    1],
        [1334,  162,  225,  ...,    1,    1,    1]], device='cuda:0')
tensor([0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0.,
        0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 1., 0., 0.,
        0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 1., 0.,
        0., 0., 1., 0., 1., 1., 0., 1., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        1., 0., 0., 1., 1., 1., 0., 0., 1., 0., 0., 1., 0., 1., 1., 0., 0., 0.,
        0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 1., 1., 0., 1.,
        0., 0., 1., 0., 

In [28]:
model_config = dict(batch_first = True,
                        model_type = 'LSTM',
                        bidirectional = True,
                        hidden_dim = 128,
                        output_dim = 1,
                        dropout = 0.8, #드롭아웃 비율 설정
                   emb_type = '',
                   vocab_size = len(TEXT.vocab),
                   emb_dim = 300,
                   batch_size = batch_size
                   )

In [29]:
model = LSTM(**model_config).to(DEVICE)

In [30]:
loss_fn = nn.BCEWithLogitsLoss().to(DEVICE)

In [42]:
optimizer = torch.optim.Adam(model.parameters(), lr, weight_decay=1e-5) #Adam(가중치 규제)

In [43]:
def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    acc = correct.sum()/len(correct)
    return acc

In [44]:
predictions = model.forward(sample_for_check.text).squeeze()

In [45]:
loss = loss_fn(predictions, sample_for_check.label)

In [46]:
acc = binary_accuracy(predictions, sample_for_check.label)

In [47]:
print(loss)

tensor(0.1522, device='cuda:0', grad_fn=<BinaryCrossEntropyWithLogitsBackward>)


In [48]:
def train(model, iterator, optimizer, loss_fn, idx_Epoch, **model_params):
    
    Epoch_loss = 0
    Epoch_acc = 0
    model.train()
    batch_size = model_params['batch_size']
    
    for idx, batch in enumerate(iterator):
        optimizer.zero_grad()
        
        predictions = model(next(iter(batch))).squeeze()
        loss = loss_fn(predictions, batch.label)
        acc = binary_accuracy(predictions, batch.label)
        
        sys.stdout.write(
        "\r" + f"[Train] Epoch: {idx_Epoch:^3}"\
            f"[{(idx + 1) * batch_size} / {len(iterator) * batch_size}({100. *(idx + 1) / len(iterator) :.4}%)]"\
            f"    Loss: {loss.item()}"\
            f"    Acc: {acc.item()}"\
        )
        
        # Backward
        loss.backward()
        optimizer.step()
        
        Epoch_loss += loss.item()
        Epoch_acc += acc.item()
        
    return Epoch_loss/len(iterator), Epoch_acc/len(iterator)

In [49]:
def evaluate(model, iterator, loss_fn):
    Epoch_loss = 0
    Epoch_acc = 0
    
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.text).squeeze(1)
            loss = loss_fn(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)
            
            Epoch_loss += loss.item()
            Epoch_acc += acc.item()
    
    return Epoch_loss/len(iterator), Epoch_acc/len(iterator)

In [50]:
N_EPOCH = epochs
best_valid_loss = float('inf')
model_name = f"{'bi' if model_config['bidirectional'] else ''}{model_config['model_type']}_{model_config['emb_type']}"

In [51]:
print('-'*20)
print(f'Model name: {model_name}')
print('-'*20)

for Epoch in range(N_EPOCH):
    train_loss, train_acc = train(model, train_iter, optimizer, loss_fn, Epoch, **model_config)
    valid_loss, valid_acc = evaluate(model, valid_iter, loss_fn)
    print('')
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), f'./{model_name}.pt')
        print(f'\t Saved at {Epoch}-Epoch')
    
    print(f'\t Epoch: {Epoch} | Train Loss: {train_loss:.4} | Train Acc: {train_acc:.4}')
    print(f'\t Epoch: {Epoch} | Valid Loss: {valid_loss:.4} | Valid Acc: {valid_acc:.4}')

--------------------
Model name: biLSTM_
--------------------
	 Saved at 0-Epoch
	 Epoch: 0 | Train Loss: -0.2424 | Train Acc: 0.9354
	 Epoch: 0 | Valid Loss: 0.7992 | Valid Acc: 0.8174
	 Saved at 1-Epoch
	 Epoch: 1 | Train Loss: -0.3146 | Train Acc: 0.9402
	 Epoch: 1 | Valid Loss: 0.7942 | Valid Acc: 0.8132
	 Epoch: 2 | Train Loss: -0.7835 | Train Acc: 0.9457
	 Epoch: 2 | Valid Loss: 0.8692 | Valid Acc: 0.819
	 Epoch: 3 | Train Loss: -0.6433 | Train Acc: 0.9507
	 Epoch: 3 | Valid Loss: 0.9246 | Valid Acc: 0.8063
	 Epoch: 4 | Train Loss: -0.7768 | Train Acc: 0.9538
	 Epoch: 4 | Valid Loss: 0.8869 | Valid Acc: 0.813
	 Epoch: 5 | Train Loss: -0.6221 | Train Acc: 0.9575
	 Epoch: 5 | Valid Loss: 0.9257 | Valid Acc: 0.8077
	 Epoch: 6 | Train Loss: -1.023 | Train Acc: 0.9605
	 Epoch: 6 | Valid Loss: 0.9269 | Valid Acc: 0.8139
	 Epoch: 7 | Train Loss: -1.094 | Train Acc: 0.9629
	 Epoch: 7 | Valid Loss: 0.9489 | Valid Acc: 0.8137
	 Epoch: 8 | Train Loss: -0.8338 | Train Acc: 0.9641
	 Epoch: 8 

In [52]:
test_loss, test_acc = evaluate(model, test_iter, loss_fn)
print('')
print(f'Test Loss: {test_loss:.4} | Test Acc: {test_acc:.4}')


Test Loss: 4.041 | Test Acc: 0.8115
