## 20200602 Tuesday

In [1]:
import torch.nn as nn
import torch
import torch.nn.utils.rnn as rnn
import statistics
import nltk
import random
import collections
import time


In [2]:
 # Set cuda if available
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

cuda


In [3]:
## Dictionary class 선언

class Dictionary(object):
    def __init__(self, dataset, size):
        ## init vocab ##
        self.word2idx = {'<pad>':0, '<sos>': 1, '<eos>': 2, '<unk>': 3}
        self.idx2word = ['<pad>', '<sos>', '<eos>', '<unk>'] #idx를 넣어주면 이 친구들이 나온다

        self.build_dict(dataset, size)

    def __call__(self, word):
        return self.word2idx.get(word, 3) # if word does not exist in vocab then return unk idx

    def add_word(self, word):
        if word not in self.word2idx:
            self.idx2word.append(word)
            self.word2idx[word] = len(self.idx2word) - 1
        return self.word2idx[word]

    def build_dict(self, dataset, dict_size): #사전을 만들어주는 역할
        """Tokenizes a text file."""
        total_words = (word for sent in dataset for word in sent)
        word_freq = collections.Counter(total_words) # count the number of each word {'the':100, 'a':1000, 'NLP':50...} / 빈도수 정렬
        vocab = sorted(word_freq.keys(), key=lambda word: (-word_freq[word], word)) # sort by frequency [the, a , am, we, ...]
        vocab = vocab[:dict_size]
        for word in vocab:
            self.add_word(word)

    def __len__(self):
        return len(self.idx2word)


In [4]:
## Brown dataset Preprocessing (NLTK) 전처리 단계
def brown_dataset(min=5, max=30):
    nltk.download('brown') #학습말뭉치 다운로드

    # get sentences with the length between min and max
    # convert all words into lower-case // 소문자로 변환
    all_seq = [[token.lower() for token in seq] for seq in nltk.corpus.brown.sents() if min <= len(seq) <= max]

    random.shuffle(all_seq) # shuffle
    return all_seq

In [5]:
## Download Brown dataset
dataset = brown_dataset()

## print some part
print(dataset[0])
print(dataset[1])
print(dataset[2])

[nltk_data] Downloading package brown to /home/pirl/nltk_data...
[nltk_data]   Package brown is already up-to-date!


['the', 'inner', 'panels', 'do', 'not', 'have', 'to', 'be', 'weatherproof', ',', 'and', 'the', 'choice', 'will', 'depend', 'on', 'the', 'quality', 'of', 'finish', 'desired', '.']
['somebody', 'with', 'no', 'vices', '.']
['from', 'childhood', 'he', 'had', 'known', 'all', 'about', 'knives', '.']


In [6]:
## Data Handling class 선언

class Corpus(object): # 위에 데이터셋, CPU or GPU, 사전 크기, 트레이닝 비율 (나머지는 테스트)
    def __init__(self, dataset, device, dict_size=20000, train_ratio=0.95):
        train_size = int(len(dataset) * train_ratio)
        valid_size = len(dataset) - train_size
        self.device = device
        self.dictionary = Dictionary(dataset, dict_size)
        self.train = dataset[:train_size]
        self.valid = dataset[:valid_size]

    def indexing(self, dat): # 매칭되는 인덱스 만들기
        src_idxes = []
        tgt_idxes = []
        for sent in dat:
            src_idx = [self.dictionary('<sos>')] + [self.dictionary(word) for word in sent] #시작
            tgt_idx = [self.dictionary(word) for word in sent] + [self.dictionary('<eos>')] #끝
            src_idxes.append(torch.tensor(src_idx).type(torch.int64)) # append 해서 matrix로 만드는 역할
            tgt_idxes.append(torch.tensor(tgt_idx).type(torch.int64)) # append 해서 matrix로 만드는 역할

        src_idxes = rnn.pad_sequence(src_idxes, batch_first=True).to(self.device) # shape = [B, L]
        tgt_idxes = rnn.pad_sequence(tgt_idxes, batch_first=True).to(self.device).view(-1) # flatten shape = [B * L]

        return src_idxes, tgt_idxes

    def batch_iter(self, batch_size, isTrain=True): # False면 학습 X
        dat = self.train if isTrain else self.valid 
        if isTrain: # 훈련이면 셔플
            random.shuffle(dat)

        for i in range(len(dat) // batch_size):
            batch = dat[i * batch_size: (i+1) * batch_size]
            src, tgt = self.indexing(batch)
            yield {'src': src, 'tgt': tgt}

In [7]:
corpus = Corpus(dataset, device)

In [8]:
# Dictionary 확인

for i, (key, val) in enumerate(corpus.dictionary.word2idx.items()):
    print('word:  {:10s} | index: {:5d} '.format(key, val))
    if i == 20:
        break


word:  <pad>      | index:     0 
word:  <sos>      | index:     1 
word:  <eos>      | index:     2 
word:  <unk>      | index:     3 
word:  the        | index:     4 
word:  .          | index:     5 
word:  ,          | index:     6 
word:  of         | index:     7 
word:  and        | index:     8 
word:  to         | index:     9 
word:  a          | index:    10 
word:  in         | index:    11 
word:  was        | index:    12 
word:  he         | index:    13 
word:  is         | index:    14 
word:  ''         | index:    15 
word:  ``         | index:    16 
word:  it         | index:    17 
word:  that       | index:    18 
word:  for        | index:    19 
word:  ;          | index:    20 


In [9]:
## indexing 함수 결과 확인

# case : 단일 문장 입력 시. 
sent = [dataset[1]]
idx_src, idx_tgt = corpus.indexing(sent)

print(sent) 
print(idx_src) # <SOS> index로 시작
print(idx_tgt) # <EOS> index로 종료

print('-' * 90)
## case : 복수 문장 입력 시 (batching)
batch = [dataset[0], dataset[1]]
idx_src, idx_tgt = corpus.indexing(batch)

print(batch) # 입력된 값
print(idx_src) # 가장 길이가 긴 문장 (dataset[0]) 보다 짧은 문장 (dataset[1]) 의 경우 남는 길이만큼 padding=0 삽입 확인.
print(idx_tgt)

[['somebody', 'with', 'no', 'vices', '.']]
tensor([[    1,  1841,    22,    56, 12178,     5]], device='cuda:0')
tensor([ 1841,    22,    56, 12178,     5,     2], device='cuda:0')
------------------------------------------------------------------------------------------
[['the', 'inner', 'panels', 'do', 'not', 'have', 'to', 'be', 'weatherproof', ',', 'and', 'the', 'choice', 'will', 'depend', 'on', 'the', 'quality', 'of', 'finish', 'desired', '.'], ['somebody', 'with', 'no', 'vices', '.']]
tensor([[    1,     4,  1720,  2128,    81,    32,    37,     9,    26, 15212,
             6,     8,     4,   817,    57,  2875,    24,     4,   892,     7,
          2489,  2399,     5],
        [    1,  1841,    22,    56, 12178,     5,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0]], device='cuda:0')
tensor([    4,  1720,  2128,    81,    32,    37,     9,    26, 15212,     6,
            8,     4,   817,   

In [10]:
## RNN Language model 선언

# Define network
class RNNModel(nn.Module):
    def __init__(self, ntoken, hidden_size, nlayers, dropout=0.1): # ntoken = dict_size , hidden_size = embedding vector 차원 크기
        super(RNNModel, self).__init__()
        self.drop = nn.Dropout(dropout)
        self.embeddings = nn.Embedding(ntoken, hidden_size, padding_idx=0)
        self.rnn = nn.LSTM(hidden_size, hidden_size, nlayers, dropout=dropout, batch_first=True) # 이전 히든 사이즈, RNN을 거친 히든 사이즈 ...
        self.output_layer = nn.Linear(hidden_size, ntoken)
        self.sm = nn.LogSoftmax(dim=1)

        self.ntoken = ntoken
        self.hidden_size = hidden_size
        self.nlayers = nlayers

        self.init_weights()

    def init_weights(self):
        initrange = 0.1
        self.embeddings.weight.data.uniform_(-initrange, initrange)
        self.output_layer.weight.data.uniform_(-initrange, initrange)
        self.output_layer.bias.data.zero_()

    def forward(self, input, hidden): # input_shape = (batch, length)
        emb = self.embeddings(input) # emb = (batch, length, dim) 임베딩을 거치며 dimention이 하나 생김
        output, hidden = self.rnn(emb, hidden) # output = (batch. length. dim)
        output = self.drop(output)
        output = self.output_layer(output) #output = (batch, length, vocab_size)
        output = output.view(-1, self.ntoken) # output = (batch * length, dim) // flatten 과정

        return self.sm(output), hidden

    def init_hidden(self, bsz):
        weight = next(self.parameters()) # to set init tensor with the same torch.dtype and torch.device
        return (weight.new_zeros(self.nlayers, bsz, self.hidden_size),
                weight.new_zeros(self.nlayers, bsz, self.hidden_size))


In [11]:
# Hyperparameters
batch_size = 40
hidden_size = 256
dropout = 0.2
max_epoch = 30

# build model (모델 생성)
ntokens = len(corpus.dictionary) # 사전 크기 전달 받음
model = RNNModel(ntokens, hidden_size, 2, dropout).to(device)

# set loss func and optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
criterion = nn.NLLLoss(ignore_index=0, reduction='mean')

In [12]:
##### Training / Evaluation Parts #######

In [13]:
# accuracy
def cal_acc(scores, target):
    pred = scores.max(-1)[1]
    non_pad = target.ne(0)
    num_correct = pred.eq(target).masked_select(non_pad).sum().item() 
    num_non_pad = non_pad.sum().item()
    return 100 * (num_correct / num_non_pad)

In [14]:
# train func(가장 중요.)
def train():
    model.train() # Turn on training mode which enables dropout.
    mean_loss = []
    mean_acc = []
    start_time = time.time()

    # batch['src'], batch['tgt']
    for batch in corpus.batch_iter(batch_size): # 학습데이터가 batch에 담긴다.
        hidden = model.init_hidden(batch_size) # zero vectors for init hidden RNN의 첫 번째 hidden vector를 0으로 초기화
        target = batch['tgt'] # flattened target 
        optimizer.zero_grad()
        output, hidden = model(batch['src'], hidden) # output = flatten output = [Batch_size * Length, vocab_size]

        loss = criterion(output, target) # compare between vocab_prob and answer_prob(one-hot converted)
        loss.backward()
        optimizer.step()

        mean_loss.append(loss.item())
        mean_acc.append(cal_acc(output, target))

    total_time = time.time() - start_time
    mean_acc = statistics.mean(mean_acc)
    mean_loss = statistics.mean(mean_loss)

    return mean_loss, total_time, mean_acc

In [15]:
# evaluation func.
def evaluate():
    model.eval() # Turn off dropout
    mean_loss = []
    mean_acc = []

    for batch in corpus.batch_iter(batch_size, isTrain=False):
        with torch.no_grad():
            hidden = model.init_hidden(batch_size)
            target = batch['tgt']
            output, hidden = model(batch['src'], hidden)
            loss = criterion(output, target)
            mean_loss.append(loss.item())
            mean_acc.append(cal_acc(output, target))

    mean_acc = statistics.mean(mean_acc)
    mean_loss = statistics.mean(mean_loss)

    return mean_loss, mean_acc

In [16]:
isTrain = False # Flag variable -> 학습 한번 하고 False로 바꾸기

if isTrain: # set False if you don't need to train model
    start_time = time.time()

    for epoch in range(1, max_epoch+1):
        loss, epoch_time, accuracy = train()
        print('epoch {:4d} | times {:3.3f} |  loss: {:3.3f} | accuracy: {:3.2f}'.format(epoch+1, epoch_time, loss, accuracy))

        if epoch % 10 == 0:
            accuracy = evaluate()
            print('=' * 60)
            print('Evaluation | loss: {:3.3f} | accuracy: {:3.2f}'.format(epoch+1, epoch_time, loss, accuracy))
            print('=' * 60)

    with open('model.pt', 'wb') as f:
        print('save model at: ./model.pt')
        torch.save(model, f)

epoch    2 | times 23.597 |  loss: 6.277 | accuracy: 16.10
epoch    3 | times 23.767 |  loss: 5.556 | accuracy: 20.82
epoch    4 | times 23.906 |  loss: 5.299 | accuracy: 22.19
epoch    5 | times 23.873 |  loss: 5.105 | accuracy: 23.06
epoch    6 | times 23.887 |  loss: 4.946 | accuracy: 23.72
epoch    7 | times 23.965 |  loss: 4.808 | accuracy: 24.24
epoch    8 | times 23.740 |  loss: 4.684 | accuracy: 24.65
epoch    9 | times 24.049 |  loss: 4.571 | accuracy: 25.01
epoch   10 | times 24.067 |  loss: 4.465 | accuracy: 25.37
epoch   11 | times 24.057 |  loss: 4.368 | accuracy: 25.70
Evaluation | loss: 11.000 | accuracy: 24.06
epoch   12 | times 18.774 |  loss: 4.281 | accuracy: 26.07
epoch   13 | times 17.340 |  loss: 4.205 | accuracy: 26.40
epoch   14 | times 22.175 |  loss: 4.134 | accuracy: 26.72
epoch   15 | times 18.202 |  loss: 4.075 | accuracy: 27.07
epoch   16 | times 17.330 |  loss: 4.022 | accuracy: 27.46
epoch   17 | times 17.323 |  loss: 3.970 | accuracy: 27.75
epoch   18 |

  "type " + obj.__name__ + ". It won't be checked "


In [22]:
# 실습 과제
# train 함수를 참고하면 쉽게 짤 수 있음.
def pred_sent_prob(sent):
    import numpy as np
    model.eval()
    with torch.no_grad():
        # 1. construct input for the model // input을 만든다 -> corpus.indexing 이용
        idx_src, idx_tgt = corpus.indexing(sent)
        # 2. set init hidden
        hidden = model.init_hidden(len(sent))
        # 3. get model output 적절히 모델에 전달 log softmax 값이 output으로 출력됨(즉, 전체 단어에 대한 log 확률 출력됨.)
        output, hidden = model(idx_src, hidden) # output shape = (idx * length, vocab_size)
        # 4. get word log_prob output 각각에 대해 입력 단어의 확률 찾는다
        log_prob = []
        for i in range(len(output)) :
            log_prob.append(output[i][idx_src[0][i]])
        # 5. comput sentence log_prob by summing each of word log_prob --> 각각 단어 확률을 모두 더한다.
        sent_prob = np.sum(log_prob)
        return sent_prob

In [23]:
# load saved model
with open('./model.pt', 'rb') as f: # 모델을 불러옴
    print('load model from: ./model.pt')
    model = torch.load(f).to(device)

    print('log prob of [the dog bark .]: {:3.3f}'.format(pred_sent_prob([['the', 'dog', 'bark', '.']])))
    print('log prob of [the cat bark .]: {:3.3f}'.format(pred_sent_prob([['the', 'cat', 'bark', '.']])))

    print('log prob of [boy am a i .]: {:3.3f}'.format(pred_sent_prob([['boy', 'am', 'a', 'i', '.']])))
    print('log prob of [i am a boy .]: {:3.3f}'.format(pred_sent_prob([['i', 'am', 'a', 'boy', '.']])))


load model from: ./model.pt
log prob of [the dog bark .]: -94.374
log prob of [the cat bark .]: -115.751
log prob of [boy am a i .]: -100.460
log prob of [i am a boy .]: -97.180
