In [1]:
# 데이터 다운로드 링크 : http://www.manythings.org/anki
# Base Code : https://wikidocs.net/24996
# 참고 : https://discuss.pytorch.org/t/understanding-lstm-input/31110
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
lines = pd.read_csv('dataset/fra.txt', names=['src', 'tar','drop'], sep='\t')
lines.drop(['drop'], inplace=True, axis=1)
len(lines)


170651

In [2]:
lines = lines[0:10000]
lines.sample(10)

Unnamed: 0,src,tar
4603,You are good.,Vous êtes bon.
4432,Try it again.,Essaye de nouveau.
9133,Tom loves Mary.,Tom adore Marie.
7876,I like talking.,Je me plais à causer.
3114,You're thin.,Vous êtes minces.
1121,Dig faster.,Creusez plus vite.
8272,I'm not racist.,Je ne suis pas raciste.
659,I see Tom.,Je vois Tom.
3330,Get a doctor.,Va chercher un médecin !
3964,I've seen it.,Je l'ai vu.


In [3]:
lines['tar'] = lines['tar'].apply(lambda x : '<SOS> ' + x + ' <EOS>')
lines.sample(5)
# 시작을 의미하는 심볼과 종료를 의미하는 심볼을 각각 \t 와 \n으로 표현.

Unnamed: 0,src,tar
4625,You go first.,<SOS> Vous en premier. <EOS>
804,It's ours.,<SOS> C'est le nôtre. <EOS>
2395,I'll try it.,<SOS> Je le tenterai. <EOS>
2907,We can't go.,<SOS> Nous ne pouvons pas partir. <EOS>
8165,I'll risk that.,<SOS> Je tenterai le coup. <EOS>


In [4]:
# 글자 집합 구축
# src_vocab = set()
# for line in lines['src']:
#     for char in line:
#         src_vocab.add(char)

# tar_vocab=set()
# for line in lines['tar']:
#     for char in line:
#         tar_vocab.add(char)

# src_vocab = set([a for b in lines['src'] for a in b]) # 위 코드와 결과는 같음
# tar_vocab = set([a for b in lines['tar'] for a in b])
# 단어 집합이 아니라 글자 집합이라고 하는 이유는 토큰 단위가 단어가 아니라 글자이기 때문



################################################################
src_vocab = set([a for b in lines['src'] for a in b.split(' ')])
tar_vocab = set([a for b in lines['tar'] for a in b.split(' ')])
# 단어 집합 구축

In [5]:
src_vocab_size = len(src_vocab)+1
tar_vocab_size = len(tar_vocab)+1
print(src_vocab_size)
print(tar_vocab_size)

3102
5999


In [6]:
src_vocab = sorted(list(src_vocab))
tar_vocab = sorted(list(tar_vocab))
print(src_vocab[0:5])
print(tar_vocab[0:5])

# 사전 확인

['$100.', '$5.', '&', '17,', '19.']
['!', '$100.', '19', '2:30.', '50']


In [7]:
src_to_index = dict([(word, i+1) for i, word in enumerate(src_vocab)])
tar_to_index = dict([(word, i+1) for i, word in enumerate(tar_vocab)])
# print(src_to_index)
# print(tar_to_index)
# 정수 인코딩을 위한 과정


In [8]:
encoder_input = []
for line in lines['src']:
    temp_X = []
    for w in line.split(' '):
        temp_X.append(src_to_index[w])
    encoder_input.append(temp_X)
print('영어 정수 인코딩 : ',encoder_input[:5])


decoder_input = []
for line in lines['tar']:
    temp_X = []
    for W in line.split(' '):
        temp_X.append(tar_to_index[W])
    decoder_input.append(temp_X)
print('프랑스어 정수 인코딩 : ', decoder_input[:5])
# 정수 인코딩 결과 5개 샘플
# 프랑스어는 <sos> 때문에 모두 앞에 1이 붙음.


영어 정수 인코딩 :  [[164], [190], [190], [316], [316]]
프랑스어 정수 인코딩 :  [[10, 1064, 1, 9], [10, 936, 1, 9], [10, 938, 9], [10, 219, 9], [10, 218, 9]]


In [9]:
# decoder_target = []
# for line in lines.tar:
#     t=0
#     temp_X = []
#     for w in line:
#       if t>0:
#         temp_X.append(tar_to_index[w])
#       t=t+1
#     decoder_target.append(temp_X)
# print(decoder_target[:5])
print(decoder_input[:5])
decoder_target = [k[1:] for k in decoder_input] # 위와 같은 코드
print(decoder_target[:5])

# 실제값에는 시작 심볼에 해당하는 <sos>가 필요 없으므로 지워준다.

[[10, 1064, 1, 9], [10, 936, 1, 9], [10, 938, 9], [10, 219, 9], [10, 218, 9]]
[[1064, 1, 9], [936, 1, 9], [938, 9], [219, 9], [218, 9]]


In [10]:
max_src_len = max([len(line) for line in lines['src']])
max_tar_len = max([len(line) for line in lines['tar']])
print(max_src_len)
print(max_tar_len)

# 패딩 작업을 위해 가장 긴 길이를 가진 문장의 길이를 구함.

16
69


In [11]:

from keras.preprocessing.sequence import pad_sequences
encoder_input = pad_sequences(encoder_input, maxlen=max_src_len, padding='post')
decoder_input = pad_sequences(decoder_input, maxlen=max_tar_len, padding='post')
decoder_target = pad_sequences(decoder_target, maxlen=max_tar_len, padding='post')
# 영어와 프랑스의 길이는 하나의 쌍이라고 하더라도 전부 다르므로 패딩 할 때도 동일하게 맞춰줄 필요가 없다.
# 영어 데이터는 영어 샘플끼리 프랑스어는 프랑스어 끼리 맞추어 패딩

Using TensorFlow backend.


In [12]:
# from keras.utils import to_categorical
# encoder_input = to_categorical(encoder_input)
# decoder_input = to_categorical(decoder_input)
# decoder_target = to_categorical(decoder_target)

# # 글자단위 번역기므로 워드 임베딩은 별도로 사용 안함.
# # 단어 단위로 바꾸어 워드임베딩을 활용한 seq2seq도 시도예정

In [12]:
import torch
import torch.nn as nn
from torch import optim

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

encoder_tensor = torch.LongTensor(encoder_input[:10000]).to(device)
decoder_tensor = torch.LongTensor(decoder_input[:10000]).to(device)
decoder_target = torch.LongTensor(decoder_input[:10000]).to(device)

In [14]:
print('encoder_input  :  ',encoder_tensor.shape)
print('decoder_input  :  ',decoder_tensor.shape)
print('decoder_target :  ',decoder_target.shape)

encoder_input  :   torch.Size([10000, 16])
decoder_input  :   torch.Size([10000, 69])
decoder_target :   torch.Size([10000, 69])


In [15]:
class Encoder(nn.Module):
    def __init__(self):
        super(Encoder, self).__init__()
        hidden_size=32
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(src_vocab_size, self.hidden_size)                       # dictionary size, max(sentence length)
        self.lstm = nn.LSTM(self.hidden_size, self.hidden_size, 1)
        
    def forward(self, X,h,c):
        X = self.embedding(X.view(-1))
        X, (hn, cn) = self.lstm(X.view(1,1,-1), (h, c))
        return X, (hn, cn)

In [16]:
first_hn = torch.zeros(1,1,32).to(device)
first_cn = torch.zeros(1,1,32).to(device)
enc = Encoder().to(device)
a, (hf, cf) = enc(encoder_tensor[0][0], first_hn, first_cn)

In [33]:
class Decoder(nn.Module):
    def __init__(self):
        super(Decoder, self).__init__()
        self.hidden_size = 32
        self.embedding = nn.Embedding(tar_vocab_size, self.hidden_size)
        self.lstm = nn.LSTM(self.hidden_size, self.hidden_size, 1)
        self.softmax = nn.LogSoftmax(dim=1)
        self.out = nn.Linear(self.hidden_size, tar_vocab_size)
        
    def forward(self, X, h, c):
        X = self.embedding(X)
        X, (hn, cn) = self.lstm(X.view(1,1, -1), (h, c))
        X = self.softmax(self.out(X[0]))
        return X, (hn, cn)
        
        

In [34]:
decoder_tensor[0][0].view(-1)

tensor([10], device='cuda:0')

In [35]:
dec = Decoder().to(device)
a, (b,c) = dec(decoder_tensor[0][0].view(-1), hf, cf)

In [57]:
import random
batch=100
random_choice = random.choice(range(batch))
random_choice

80

In [61]:
%%time
import random
encoder = Encoder().to(device)
decoder = Decoder().to(device)
learning_rate=0.001
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
criterion = nn.NLLLoss()
iterations = 1000
loss_total = 0
batch=100

for iteration in range(1, iterations+1):
    
    loss = 0
    
    random_choice = random.choice(range(batch))
    
    h = torch.zeros([1,1,32]).to(device)
    c = torch.zeros([1,1,32]).to(device)
    
    encoder_length = encoder_tensor.size(1)
    decoder_length = decoder_tensor.size(1)
    
    for token in range(encoder_length):
        _, (h, c) = encoder(encoder_tensor[random_choice][token], h, c)
    # 한 문장의 encoder 끝
        
        
        
    decoder_input = torch.Tensor([[tar_to_index['<SOS>']]]).long().to(device)
    for token in range(decoder_length-1):
        dec_out, (h, c) = decoder(decoder_input, h, c)
        loss += criterion(dec_out, decoder_target[random_choice][token+1].view(-1))
        decoder_input = decoder_target[random_choice][token+1].view(-1)
    
    loss.backward()  
    encoder_optimizer.step()
    decoder_optimizer.step()

    loss_iter = loss.item() / decoder_length
    loss_total += loss_iter
#     print(decoder_target[0][0])
#     print(X.argmax())
    if iteration % 100 == 0:
        loss_avg = loss_total / 100
        loss_total = 0
        print("[{} - {}%] loss = {:05.4f}".format(iteration, iteration/iterations * 100, loss_avg))

[100 - 10.0%] loss = 5.2897
[200 - 20.0%] loss = 0.4445
[300 - 30.0%] loss = 0.3731
[400 - 40.0%] loss = 0.2715
[500 - 50.0%] loss = 0.2085
[600 - 60.0%] loss = 0.1864
[700 - 70.0%] loss = 0.1823
[800 - 80.0%] loss = 0.1568
[900 - 90.0%] loss = 0.1472
[1000 - 100.0%] loss = 0.1450
Wall time: 2min 12s
