In [1]:
!pip install torchtext==0.6.0
!python -m spacy download en_core_web_sm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torchtext==0.6.0
  Downloading torchtext-0.6.0-py3-none-any.whl (64 kB)
[K     |████████████████████████████████| 64 kB 388 kB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 34.8 MB/s 
Installing collected packages: sentencepiece, torchtext
  Attempting uninstall: torchtext
    Found existing installation: torchtext 0.13.1
    Uninstalling torchtext-0.13.1:
      Successfully uninstalled torchtext-0.13.1
Successfully installed sentencepiece-0.1.97 torchtext-0.6.0
2022-11-30 03:06:56.604220: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchtext import data
from torchtext import datasets
import numpy as np
import time
import random

In [3]:
seed = 42

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True

if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# <b>1. Torchtext로 전처리하기</b>

## <b>(1) Field 정의하기</b>

In [5]:
TEXT = data.Field(lower=True)
UD_TAGS = data.Field(unk_token=None)
PTB_TAGS = data.Field(unk_token=None)

fields = (('text', TEXT), ('udtags',UD_TAGS), ('ptbtags', PTB_TAGS))

## <b>(2) dataset 생성하기</b>

In [6]:
datasets.UDPOS.splits?

In [8]:
trn_data, val_data, tst_data = datasets.UDPOS.splits(fields)
print(len(trn_data))
print(len(val_data))
print(len(tst_data))
print(vars(trn_data.examples[0]).keys())

print(vars(trn_data.examples[0])['text'])
print(len(vars(trn_data.examples[0])['text']))

print(vars(trn_data.examples[0])['udtags']) # 우리가 사용할 label
print(len(vars(trn_data.examples[0])['udtags']))

print(vars(trn_data.examples[0])['ptbtags'])
print(len(vars(trn_data.examples[0])['ptbtags']))

12543
2002
2077
dict_keys(['text', 'udtags', 'ptbtags'])
['al', '-', 'zaman', ':', 'american', 'forces', 'killed', 'shaikh', 'abdullah', 'al', '-', 'ani', ',', 'the', 'preacher', 'at', 'the', 'mosque', 'in', 'the', 'town', 'of', 'qaim', ',', 'near', 'the', 'syrian', 'border', '.']
29
['PROPN', 'PUNCT', 'PROPN', 'PUNCT', 'ADJ', 'NOUN', 'VERB', 'PROPN', 'PROPN', 'PROPN', 'PUNCT', 'PROPN', 'PUNCT', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADP', 'DET', 'NOUN', 'ADP', 'PROPN', 'PUNCT', 'ADP', 'DET', 'ADJ', 'NOUN', 'PUNCT']
29
['NNP', 'HYPH', 'NNP', ':', 'JJ', 'NNS', 'VBD', 'NNP', 'NNP', 'NNP', 'HYPH', 'NNP', ',', 'DT', 'NN', 'IN', 'DT', 'NN', 'IN', 'DT', 'NN', 'IN', 'NNP', ',', 'IN', 'DT', 'JJ', 'NN', '.']
29


## <b>(3) 단어 집합(vocab) 만들기</b>
- 사전 학습된 word의 embedding vector불러오기 (by glove method)

In [9]:
MIN_FREQ = 5

TEXT.build_vocab(trn_data, min_freq = MIN_FREQ, 
                 vectors='glove.6B.100d')
UD_TAGS.build_vocab(trn_data)
PTB_TAGS.build_vocab(trn_data)

.vector_cache/glove.6B.zip: 862MB [02:39, 5.41MB/s]                           
100%|█████████▉| 399999/400000 [00:17<00:00, 22447.59it/s]


In [None]:
# 상위 빈도수 20개 단어
from pprint import pprint
a

[('the', 9076),
 ('.', 8640),
 (',', 7021),
 ('to', 5137),
 ('and', 5002),
 ('a', 3782),
 ('of', 3622),
 ('i', 3379),
 ('in', 3112),
 ('is', 2239),
 ('you', 2156),
 ('that', 2036),
 ('it', 1850),
 ('for', 1842),
 ('-', 1426),
 ('have', 1359),
 ('"', 1296),
 ('on', 1273),
 ('was', 1244),
 ('with', 1216)]


In [None]:
# 상위 빈도순으로 udtags출력
pprint(UD_TAGS.vocab.freqs.most_common())

[('NOUN', 34781),
 ('PUNCT', 23679),
 ('VERB', 23081),
 ('PRON', 18577),
 ('ADP', 17638),
 ('DET', 16285),
 ('PROPN', 12946),
 ('ADJ', 12477),
 ('AUX', 12343),
 ('ADV', 10548),
 ('CCONJ', 6707),
 ('PART', 5567),
 ('NUM', 3999),
 ('SCONJ', 3843),
 ('X', 847),
 ('INTJ', 688),
 ('SYM', 599)]


In [None]:
UD_TAGS.vocab.itos

['<pad>',
 'NOUN',
 'PUNCT',
 'VERB',
 'PRON',
 'ADP',
 'DET',
 'PROPN',
 'ADJ',
 'AUX',
 'ADV',
 'CCONJ',
 'PART',
 'NUM',
 'SCONJ',
 'X',
 'INTJ',
 'SYM']

In [None]:
def tag_percentage(tag_cnts):
    total_cnt = sum([cnt for tag, cnt in tag_cnts])
    tag_cnt_ratio = [(tag, cnt, cnt/total_cnt) for tag, cnt in tag_cnts]
    return tag_cnt_ratio

In [None]:
for tag, cnt, percent in tag_percentage(UD_TAGS.vocab.freqs.most_common()):
    print(f'{tag}\t{cnt}\t{percent*100:.1f}%')

NOUN	34781	17.0%
PUNCT	23679	11.6%
VERB	23081	11.3%
PRON	18577	9.1%
ADP	17638	8.6%
DET	16285	8.0%
PROPN	12946	6.3%
ADJ	12477	6.1%
AUX	12343	6.0%
ADV	10548	5.2%
CCONJ	6707	3.3%
PART	5567	2.7%
NUM	3999	2.0%
SCONJ	3843	1.9%
X	847	0.4%
INTJ	688	0.3%
SYM	599	0.3%


## <b>(4) data를 불러오기 위한 iterator 생성하기</b>
- torchtext.data.BucketIterator

In [None]:
BATCH_SIZE = 64
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT_RATIO = 0.25

VOCAB_SIZE = len(TEXT.vocab)
EMBED_DIM = 100
OUTPUT_DIM = len(UD_TAGS.vocab)
HIDDEN_DIM = 128

LR = 0.001
EPOCHS = 10

In [None]:
trn_iter, val_iter, tst_iter = data.BucketIterator.splits(datasets = (trn_data, val_data, tst_data), 
                                                          batch_size = BATCH_SIZE, device=device)
trn_batch = next(iter(trn_iter))
val_batch = next(iter(val_iter))
tst_batch = next(iter(tst_iter))

In [None]:
# batch_size: [seq길이, batch_size]
# 
print(trn_batch.text.shape)
print(trn_batch.udtags.shape)
print(trn_batch.ptbtags.shape)
print(val_batch.text.shape)
print(val_batch.udtags.shape)
print(val_batch.ptbtags.shape)
print(tst_batch.text.shape)
print(tst_batch.udtags.shape)
print(tst_batch.ptbtags.shape)

torch.Size([95, 64])
torch.Size([95, 64])
torch.Size([95, 64])
torch.Size([1, 64])
torch.Size([1, 64])
torch.Size([1, 64])
torch.Size([1, 64])
torch.Size([1, 64])
torch.Size([1, 64])


In [None]:
val_batch.text

tensor([[   0,    0, 1906,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0, 1906,    0,  127,  141,    0,    0, 1906,    0,    0,
            0,    0,    0,    0,  812, 1494,    0,    0,  812,    0,  812,  812,
          439,  812,  127,    0,    0,    0,    0,    0,    0,  322,  678,    0,
            0,  581,    0,    0,    0,    0,   37,    0,   37,   37,   37,    0,
            0,  732,    0, 2355]], device='cuda:0')

In [None]:
val_batch.udtags

tensor([[ 2,  7,  3, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,  3,  7,  1,
          7,  1, 17,  3,  7,  7,  7,  7,  7,  7,  7, 10,  2,  2,  7,  7,  7,  7,
          7,  7,  1,  7,  1,  7,  7,  7,  1,  7,  7,  1, 17,  7, 16,  7,  7,  7,
          2, 17,  2,  2,  2, 17, 17,  7,  7,  2]], device='cuda:0')

### <b>위 과정에서 소모해버린 batch를 다시 포함시키기 위해 iter를 다시 선언할게요</b>

In [None]:
trn_iter, val_iter, tst_iter = data.BucketIterator.splits(datasets = (trn_data, val_data, tst_data), 
                                                          batch_size = BATCH_SIZE, 
                                                          shuffle=True, 
                                                          repeat=False)

# <b> 2. RNN model 구현하기(LSTM)</b>

In [None]:
nn.LSTM?

## <b>(1) RNN기반 POSTagger정의하기</b>

In [None]:
class POSTagger(nn.Module):
    def __init__(self, n_layers, hidden_dim, n_vocab, 
                 embed_dim, output_dim, bidirectional, dropout_ratio):
        super(POSTagger, self).__init__()
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.bidirectional = bidirectional

        self.embed = nn.Embedding(n_vocab, embed_dim)
        self.dropout = nn.Dropout(dropout_ratio)
        self.lstm = nn.LSTM(input_size=embed_dim, 
                            hidden_size=hidden_dim, 
                            num_layers=n_layers, 
                            bidirectional=bidirectional, 
                            batch_first=False)
        if bidirectional==True:
            self.fc = nn.Linear(self.hidden_dim*2, output_dim)
        else:
            self.fc = nn.Linear(self.hidden_dim, output_dim)

    def forward(self, x):
        # x.shape: [seq_len, batch_size]
        x = self.embed(x)
        # x = self.dropout(self.embed(x))

        # x.shape: [seq_len, batch_size, embed_dim]
        # h0/c0.shape: [n_direction * n_layers, batch_size, hidden_dim]
        if self.bidirectional:
            h0 = torch.zeros(self.n_layers*2, x.shape[1], self.hidden_dim)
            c0 = torch.zeros(self.n_layers*2, x.shape[1], self.hidden_dim)
        else:
            h0 = torch.zeros(self.n_layers, x.shape[1], self.hidden_dim)
            c0 = torch.zeros(self.n_layers, x.shape[1], self.hidden_dim)
        outputs, (hidden, cell) = self.lstm(x, (h0, c0))

        # outputs.shape: [seq_len, batch_size, hidden_dim*n_direction]
        # hidden/cell.shape: [n_direction * n_layers, batch_size, hidden_dim]
        pred = self.fc(self.dropout(outputs))
        return pred        


## <b>(2) POSTagger클래스의 객체 생성하기</b>

In [None]:
model = POSTagger(n_layers=N_LAYERS, 
                  hidden_dim=HIDDEN_DIM, 
                  n_vocab=VOCAB_SIZE, 
                  embed_dim=EMBED_DIM, 
                  output_dim=OUTPUT_DIM, 
                  bidirectional=BIDIRECTIONAL, 
                  dropout_ratio=DROPOUT_RATIO,
                  device=device).to(device)

In [None]:
def count_params(model):
    return (sum(p.numel() for p in model.parameters() if p.requires_grad))

In [None]:
count_params(model)

1027510

## <b>(3) pretrained embeddings를 불러와서 model의 embedding vector에 대입</b>

In [None]:
model.embed.weight.data

tensor([[ 0.0365,  1.4937, -1.0886,  ...,  1.1405, -1.0644,  0.2250],
        [ 1.5411, -0.1954,  0.9268,  ...,  0.9268, -0.1336, -0.0992],
        [ 0.7603, -0.3772,  1.6935,  ...,  0.0609, -0.4518, -0.7856],
        ...,
        [ 0.1608, -0.1501,  0.5520,  ..., -0.7621, -0.3459,  0.2111],
        [ 0.3467, -2.3064,  0.3042,  ..., -2.2237, -1.0584, -0.6246],
        [-0.1133,  0.2989, -1.2837,  ..., -0.9017,  0.7505, -0.9861]],
       device='cuda:0')

In [None]:
# pretrained vectors로 기존 모델의 embedding vectors를 덮어씌운 결과
model.embed.weight.data.copy_(TEXT.vocab.vectors)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.1020,  0.7700,  0.1169,  ..., -0.1416, -0.1932, -0.4225],
        [-0.0263,  0.0179, -0.5016,  ..., -0.8688,  0.9409, -0.2882],
        [ 0.1519,  0.4712,  0.0895,  ..., -0.4702, -0.3127,  0.1078]],
       device='cuda:0')

In [None]:
model.embed.weight.data

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.1020,  0.7700,  0.1169,  ..., -0.1416, -0.1932, -0.4225],
        [-0.0263,  0.0179, -0.5016,  ..., -0.8688,  0.9409, -0.2882],
        [ 0.1519,  0.4712,  0.0895,  ..., -0.4702, -0.3127,  0.1078]],
       device='cuda:0')

## (4) <b> \<unk\>과 \<pad\>토큰의 인덱스 지정 및 zero vector로 초기화

In [None]:
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
print(UNK_IDX)
print(PAD_IDX)
 # 0번 임베딩 벡터에는 0값을 채운다.
model.embed.weight.data[UNK_IDX] = torch.zeros(EMBED_DIM)
 # 1번 임베딩 벡터에는 0값을 채운다.
model.embed.weight.data[PAD_IDX] = torch.zeros(EMBED_DIM)

0
1


## <b>(5) optimizer 생성하기</b>

In [None]:
optimizer = torch.optim.Adam(model.parameters(), lr = LR)

# <b>3. 모델 학습 및 평가 함수 생성하기</b>

In [None]:
# Padding에 대해서는 loss를 구하지 않도록
loss_func = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

- 위에서 미리 만들어두었던 trn_batch로 결과를 미리 확인해봅시다.

In [None]:
pred = model(trn_batch.text)
print(trn_batch.text.shape) # shape: [seq_len, batch_size]
print(pred.shape) # shape: [seq_len, batch_size, output_dim(tag개수)]

torch.Size([95, 64])
torch.Size([95, 64, 18])


In [None]:
print(trn_batch.udtags)
print(trn_batch.udtags.shape)

tensor([[ 2, 15,  2,  ...,  3,  4,  4],
        [14,  2,  7,  ...,  4,  8,  3],
        [ 2,  7,  0,  ...,  3,  3,  6],
        ...,
        [ 0,  0,  0,  ...,  0,  0,  0],
        [ 0,  0,  0,  ...,  0,  0,  0],
        [ 0,  0,  0,  ...,  0,  0,  0]], device='cuda:0')
torch.Size([95, 64])


- 아래와 같이 바로 pred와 label을 넣으면 loss가 구해지지 않습니다
    - cross entropy를 구할 때, pred의 label로 반드시 class수가 와야합니다.
    - seq_len도 매 배치마다 달라질 수 있습니다.
- loss값을 쉽게 구해주기 위해 약간의 처리를 해주겠습니다.

In [None]:
loss_func(pred, trn_batch.udtags)

RuntimeError: ignored

In [None]:
pred = torch.reshape(pred, shape=(-1, pred.shape[-1]))
print(pred.shape)
label = torch.reshape(trn_batch.udtags, shape=(-1,))
print(label.shape)
loss_func(pred, label)

torch.Size([6080, 18])
torch.Size([6080])


tensor(2.8900, device='cuda:0', grad_fn=<NllLossBackward0>)

In [None]:
def categorical_accuracy(preds, y, tag_pad_idx):
    """
    미니 배치에 대한 정확도 출력
    """

    # get the index of the max probability
    max_preds = preds.argmax(dim = 1, keepdim = True)
    non_pad_elements = (y != tag_pad_idx).nonzero()
    correct = max_preds[non_pad_elements].squeeze(1).eq(y[non_pad_elements])
    return correct.sum().item() / non_pad_elements.shape[0]

In [None]:
def train(model, iterator, optimizer, criterion, pad_idx, device):

    epoch_loss = 0
    epoch_acc = 0

    model.train()
    for batch in iterator:
        #text.shape: [seq_len, batch_size]
        #label.shape: [seq_len, batch_size]
        text = batch.text.to(device)
        label = batch.udtags.to(device)
        optimizer.zero_grad()

        #preds.shape: [seq_len, batch_size, output_dim(n_tags)]
        preds = model(text)

        # preds.shape: [seq_len * batch_size, output_dim]
        # label.shape: [seq_len * batch_size, ]
        preds = preds.view(-1, preds.shape[-1]) 
        label = label.view(-1) 
        
        loss = criterion(preds, label)
        loss.backward()
        optimizer.step()

        acc = categorical_accuracy(preds, label, pad_idx)
        epoch_loss += loss.item()
        epoch_acc += acc

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
def evaluate(model, iterator, criterion, pad_idx, device):

    epoch_loss = 0
    epoch_acc = 0

    model.eval()
    with torch.no_grad():
        for batch in iterator:
            #text.shape: [seq_len, batch_size]
            #label.shape [seq_len, batch_size]
            text = batch.text.to(device)
            label = batch.udtags.to(device)

            #preds.shape: [seq_len, batch_size, output_dim(n_tags)]
            preds = model(text)

            # preds.shape [seq_len * batch_size, output_dim]
            # label.shape [seq_len * batch_size]
            preds = preds.view(-1, preds.shape[-1])
            label = label.view(-1)

            loss = criterion(preds, label)

            acc = categorical_accuracy(preds, label, pad_idx)
            epoch_loss += loss.item()
            epoch_acc += acc

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
best_val_loss = float('inf')

for epoch in range(EPOCHS):

    trn_loss, trn_acc = train(model=model, iterator=trn_iter, 
                              optimizer=optimizer, criterion=loss_func, 
                              pad_idx=PAD_IDX, device=device)
    val_loss, val_acc = evaluate(model=model, iterator=val_iter, 
                                 criterion=loss_func, pad_idx=PAD_IDX, 
                                 device=device)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'tut1-model.pt')

    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {trn_loss:.4f} | Train Acc: {trn_acc*100:.2f}%')
    print(f'\t Val. Loss: {val_loss:.4f} |  Val. Acc: {val_acc*100:.2f}%')

Epoch: 01
	Train Loss: 0.4575 | Train Acc: 86.56%
	 Val. Loss: 0.8055 |  Val. Acc: 79.27%
Epoch: 02
	Train Loss: 0.0901 | Train Acc: 97.43%
	 Val. Loss: 0.5760 |  Val. Acc: 85.23%
Epoch: 03
	Train Loss: 0.0595 | Train Acc: 98.19%
	 Val. Loss: 0.5168 |  Val. Acc: 86.67%
Epoch: 04
	Train Loss: 0.0489 | Train Acc: 98.48%
	 Val. Loss: 0.5038 |  Val. Acc: 86.45%
Epoch: 05
	Train Loss: 0.0429 | Train Acc: 98.67%
	 Val. Loss: 0.4721 |  Val. Acc: 87.46%
Epoch: 06
	Train Loss: 0.0381 | Train Acc: 98.79%
	 Val. Loss: 0.4786 |  Val. Acc: 87.30%
Epoch: 07
	Train Loss: 0.0350 | Train Acc: 98.89%
	 Val. Loss: 0.4614 |  Val. Acc: 87.77%
Epoch: 08
	Train Loss: 0.0302 | Train Acc: 99.05%
	 Val. Loss: 0.4415 |  Val. Acc: 88.33%
Epoch: 09
	Train Loss: 0.0278 | Train Acc: 99.12%
	 Val. Loss: 0.4444 |  Val. Acc: 88.29%
Epoch: 10
	Train Loss: 0.0246 | Train Acc: 99.23%
	 Val. Loss: 0.4387 |  Val. Acc: 88.35%
