# <b>1. 환경설정하기</b>
- <b>이 코드는 torchtext==0.6.0 이전 버전에서 동작합니다</b>




In [33]:
!pip install torchtext==0.6.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [34]:
!python -m spacy download en_core_web_sm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 33.2 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [35]:
import os
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchtext import data, datasets


In [36]:
seed = 42

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True

if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# <b> 2. torchtext로 전처리하기</b>

## <b>(1) data 로드하기(by torchtext.data)</b>
- data는 IMDB dataset을 사용하겠습니다

In [1]:
TEXT = data.Field(tokenize='spacy', 
                  tokenizer_language = 'en_core_web_sm', 
                  sequential=True, 
                  batch_first=True, 
                  lower=True)
LABEL = data.LabelField(sequential=False, 
                        batch_first=True, 
                        dtype=torch.long)

NameError: name 'data' is not defined

## <b>(2) data로드 및 분할하기</b>
- torchtext.datasets으로 IMDB dataset을 다운로드 받고, train, test로 분할

In [67]:
trn_dset, tst_dset = datasets.IMDB.splits(TEXT, LABEL)
trn_dset, val_dset = trn_dset.split(random_state = random.seed(seed), 
                                    split_ratio = 0.8)

In [68]:
print(len(trn_dset))
print(len(val_dset))
print(len(tst_dset))

20000
5000
25000


In [69]:
print(vars(trn_dset[0]))

{'text': ['the', 'ballad', 'of', 'django', 'is', 'a', 'meandering', 'mess', 'of', 'a', 'movie', '!', 'this', 'spaghetti', 'western', 'is', 'simply', 'a', 'collection', 'of', 'scenes', 'from', 'other', '(', 'and', 'much', 'better', '!', ')', 'films', 'supposedly', 'tied', 'together', 'by', '"', 'django', '"', 'telling', 'how', 'he', 'brought', 'in', 'different', 'outlaws', '.', 'hunt', 'powers', '(', 'john', 'cameron', ')', 'brings', 'nothing', 'to', 'the', 'role', 'of', 'django', '.', 'skip', 'this', 'one', 'unless', 'you', 'just', 'have', 'to', 'have', 'every', 'django', 'movie', 'made', 'and', 'even', 'that', 'may', 'not', 'be', 'a', 'good', 'enough', 'excuse', 'to', 'see', 'this', 'one', '!', '!'], 'label': 'neg'}


## <b>(3) 단어 집합(vocab) 만들기</b>

In [70]:
TEXT.build_vocab?

In [71]:
TEXT.build_vocab(trn_dset, min_freq = 5)
LABEL.build_vocab(trn_dset)

In [72]:
VOCAB_SIZE = len(TEXT.vocab)
n_classes = 2

### TEXT.vocab으로 단어 집합에 접근
- TEXT.vocab.stoi로 DefaultDict형식의 집합을 볼 수 있습니다

In [73]:
TEXT.vocab.stoi

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x7f2499a237d0>>,
            {'<unk>': 0,
             '<pad>': 1,
             'the': 2,
             ',': 3,
             '.': 4,
             'and': 5,
             'a': 6,
             'of': 7,
             'to': 8,
             'is': 9,
             'it': 10,
             'in': 11,
             'i': 12,
             'this': 13,
             'that': 14,
             '"': 15,
             "'s": 16,
             '-': 17,
             '/><br': 18,
             'was': 19,
             'as': 20,
             'for': 21,
             'with': 22,
             'movie': 23,
             'but': 24,
             'film': 25,
             'on': 26,
             'you': 27,
             '(': 28,
             "n't": 29,
             ')': 30,
             'not': 31,
             'are': 32,
             'he': 33,
             'his': 34,
             'have': 35,
             'be': 36,
             'one': 37,
      

## <b>(4) data를 불러오기 위한 iterator 생성하기</b>
- torchtext.data.BucketIterator

In [74]:
BATCH_SIZE = 64
N_LAYERS = 1
HIDDEN_DIM = 256
N_CLASSES = 2
DROPOUT_RATIO = 0.3
VOCAB_SIZE = len(TEXT.vocab)
EMBED_DIM = 128
LR = 0.001
EPOCHS = 10

In [75]:
data.BucketIterator.splits?

In [76]:
trn_iter, val_iter, tst_iter = data.BucketIterator.splits(datasets = (trn_dset, val_dset, tst_dset), 
                                                          batch_size = BATCH_SIZE, 
                                                          shuffle=True, 
                                                          repeat=False)

In [77]:
trn_iter

<torchtext.data.iterator.BucketIterator at 0x7f25885c74d0>

- 반복자 (iter)와 반복자에서 next data를 추출하는 next()로 미니배치를 생성

In [78]:
trn_batch = next(iter(trn_iter))
val_batch = next(iter(val_iter))
tst_batch = next(iter(tst_iter))

In [79]:
trn_batch.text

tensor([[  12, 1905, 6508,  ...,    1,    1,    1],
        [  12, 4830,   11,  ...,    1,    1,    1],
        [  50,   78,    8,  ...,    1,    1,    1],
        ...,
        [ 842,   14,   27,  ...,    1,    1,    1],
        [ 114,  450,  844,  ...,    1,    1,    1],
        [ 664,    4,   21,  ...,    1,    1,    1]])

In [80]:
trn_batch.label

tensor([1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1,
        0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1,
        1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0])

### <b>batch로 추출된 문장 길이는 매번 다릅니다</b>

- fix_length를 정해주지 않았기 때문

In [81]:
print(trn_batch.text.shape)
print(val_batch.text.shape)
print(tst_batch.text.shape)

torch.Size([64, 935])
torch.Size([64, 50])
torch.Size([64, 36])


In [82]:
trn_batch = next(iter(trn_iter))
val_batch = next(iter(val_iter))
tst_batch = next(iter(tst_iter))
print(trn_batch.text.shape)
print(val_batch.text.shape)
print(tst_batch.text.shape)

torch.Size([64, 1186])
torch.Size([64, 50])
torch.Size([64, 36])


In [83]:
val_batch.text

tensor([[  13,    9,   41,  ...,    9,  837,   42],
        [  13,    9,    2,  ...,  140,   79,    4],
        [   6,   64, 1706,  ...,   72, 2925,    4],
        ...,
        [  13,    9,  226,  ...,    1,    1,    1],
        [ 558,  419,   21,  ...,    1,    1,    1],
        [ 214,    3,  375,  ...,    1,    1,    1]])

### <b>위 과정에서 소모해버린 batch를 다시 포함시키기 위해 iter를 다시 선언할게요</b>

In [84]:
my_list=['a','b','c']
my_iter = iter(my_list)
print(next(my_iter))
print(next(my_iter))
print(next(my_iter))
# print(next(my_iter))

a
b
c


In [85]:
trn_iter, val_iter, tst_iter = data.BucketIterator.splits(datasets = (trn_dset, val_dset, tst_dset), 
                                                          batch_size = BATCH_SIZE, 
                                                          shuffle=True, 
                                                          repeat=False)

# <b> 3. RNN model 구현하기(GRU)</b>

In [86]:
class GRU(nn.Module):
    def __init__(self, n_layers, hidden_dim, n_vocab, embed_dim, n_classes, dropout_ratio=0.2):
        super(GRU, self).__init__()
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        self.embed = nn.Embedding(n_vocab, embed_dim)
        self.dropout = nn.Dropout(dropout_ratio)
        self.gru = nn.GRU(input_size = embed_dim, 
                          hidden_size = hidden_dim, 
                          num_layers = n_layers, 
                          batch_first=True)
        self.out = nn.Linear(self.hidden_dim, n_classes)

    def forward(self, x):
        x = self.embed(x)
        h0 = self._init_state(batch_size = x.shape[0])
        gru_outputs, _ = self.gru(x, h0)
        final_gru_output = gru_outputs[:,-1,:]
        self.dropout(final_gru_output)
        logit = self.out(final_gru_output)
        return logit

    def _init_state(self, batch_size=1):
        weight = next(self.parameters()).data
        return weight.new(self.n_layers, batch_size, self.hidden_dim).zero_()
        

In [87]:
model = GRU(n_layers = N_LAYERS, 
            hidden_dim=HIDDEN_DIM, 
            n_vocab = VOCAB_SIZE, 
            embed_dim = EMBED_DIM, 
            n_classes = N_CLASSES, 
            dropout_ratio = DROPOUT_RATIO).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr = LR)

# <b>4. 모델 학습 및 평가 함수 생성하기</b>

In [88]:
def train(model, opt, data_iterator):
    model.train()
    for i, batch in enumerate(data_iterator):
        x, y = batch.text.to(device), batch.label.to(device)
        opt.zero_grad()

        output = model(x)
        # print(output)
        # print(y)
        loss = F.cross_entropy(output, y)
        
        
        loss.backward()
        opt.step()

In [89]:
def evaluate(model, data_iterator):
    model.eval()
    corrects, total_loss = 0, 0
    for _, batch in enumerate(data_iterator):
        x, y = batch.text.to(device), batch.label.to(device)
        output = model(x)
        loss = F.cross_entropy(output, y, reduction='sum')
        total_loss += loss.item()
        corrects += (output.max(1)[1].view(y.size()).data == y.data).sum()
    size = len(data_iterator.dataset)
    total_loss /= size
    accuracy = 100*corrects / size
    return total_loss, accuracy

In [90]:
best_val_loss = None
for epoch in range(1, EPOCHS+1):
    train(model, optimizer, trn_iter)
    val_loss, val_accuracy = evaluate(model, val_iter)

    print(f"[Epoch: {epoch}] val loss: {val_loss:.2f} | val accuracy: {val_accuracy:.2f}")

    # 검증 오차가 가장 적은 최적의 모델을 저장
    if not best_val_loss or val_loss < best_val_loss:
        if not os.path.isdir("snapshot"):
            os.makedirs("snapshot")
        torch.save(model.state_dict(), './snapshot/txtclassification.pt')
        best_val_loss = val_loss

[Epoch: 1] val loss: 0.69 | val accuracy: 49.96
[Epoch: 2] val loss: 0.69 | val accuracy: 50.44
[Epoch: 3] val loss: 0.69 | val accuracy: 51.26
[Epoch: 4] val loss: 0.69 | val accuracy: 52.44
[Epoch: 5] val loss: 0.59 | val accuracy: 71.76
[Epoch: 6] val loss: 0.36 | val accuracy: 84.06
[Epoch: 7] val loss: 0.28 | val accuracy: 88.02
[Epoch: 8] val loss: 0.28 | val accuracy: 88.74
[Epoch: 9] val loss: 0.33 | val accuracy: 88.36
[Epoch: 10] val loss: 0.42 | val accuracy: 87.70
