In [1]:
# 데이터 다운로드 링크 : http://www.manythings.org/anki
# Base Code : https://wikidocs.net/24996
# 참고 : https://discuss.pytorch.org/t/understanding-lstm-input/31110
import pandas as pd
lines = pd.read_csv('dataset/fra.txt', names=['src', 'tar','drop'], sep='\t')
lines.drop(['drop'], inplace=True, axis=1)
len(lines)


170651

In [2]:
lines = lines[0:60000]
lines.sample(10)

Unnamed: 0,src,tar
1230,I can swim.,Je sais nager.
14625,I ran out of gas.,Je suis tombé en panne d'essence.
924,That's OK.,Il n'y a pas de problème.
11075,I was surprised.,Je fus surpris.
34161,I don't need to know.,Je n'ai pas besoin de le savoir.
42549,So what're you saying?,Alors qu'es-tu en train de dire ?
52249,He is rather optimistic.,Il est plutôt optimiste.
50841,You don't have to come.,Vous n'êtes pas obligée de venir.
12230,They're animals.,Ce sont des animaux.
31707,Well? Will you come?,Alors ? Viendrez-vous ?


In [3]:
lines['tar'] = lines['tar'].apply(lambda x : '\t ' + x + ' \n')
lines.sample(5)
# 시작을 의미하는 심볼과 종료를 의미하는 심볼을 각각 \t 와 \n으로 표현.

Unnamed: 0,src,tar
18529,I dream in French.,\t Je rêve en français. \n
20725,This is true love.,\t C'est l'amour vrai. \n
23811,I sliced the apple.,\t J'ai tranché la pomme. \n
9270,We never voted.,\t Nous n'avons jamais voté. \n
18922,I opened a window.,\t J'ai ouvert la fenêtre. \n


In [4]:
# 글자 집합 구축
# src_vocab = set()
# for line in lines['src']:
#     for char in line:
#         src_vocab.add(char)

# tar_vocab=set()
# for line in lines['tar']:
#     for char in line:
#         tar_vocab.add(char)

src_vocab = set([a for b in lines['src'] for a in b]) # 위 코드와 결과는 같음
tar_vocab = set([a for b in lines['tar'] for a in b])

# 단어 집합이 아니라 글자 집합이라고 하는 이유는 토큰 단위가 단어가 아니라 글자이기 때문


In [5]:
src_vocab_size = len(src_vocab)+1
tar_vocab_size = len(tar_vocab)+1
print(src_vocab_size)
print(tar_vocab_size)

79
106


In [6]:
src_vocab = sorted(list(src_vocab))
tar_vocab = sorted(list(tar_vocab))
print(src_vocab[45:75])
print(tar_vocab[45:75])

# 글자 집합에 글자 단위로 저장 된 것을 확인할 수 있다.

['W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
['T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w']


In [7]:
src_to_index = dict([(word, i+1) for i, word in enumerate(src_vocab)])
tar_to_index = dict([(word, i+1) for i, word in enumerate(tar_vocab)])
print(src_to_index)
print(tar_to_index)
# 정수 인코딩을 위한 과정


{' ': 1, '!': 2, '"': 3, '$': 4, '%': 5, '&': 6, "'": 7, ',': 8, '-': 9, '.': 10, '/': 11, '0': 12, '1': 13, '2': 14, '3': 15, '4': 16, '5': 17, '6': 18, '7': 19, '8': 20, '9': 21, ':': 22, '?': 23, 'A': 24, 'B': 25, 'C': 26, 'D': 27, 'E': 28, 'F': 29, 'G': 30, 'H': 31, 'I': 32, 'J': 33, 'K': 34, 'L': 35, 'M': 36, 'N': 37, 'O': 38, 'P': 39, 'Q': 40, 'R': 41, 'S': 42, 'T': 43, 'U': 44, 'V': 45, 'W': 46, 'X': 47, 'Y': 48, 'Z': 49, 'a': 50, 'b': 51, 'c': 52, 'd': 53, 'e': 54, 'f': 55, 'g': 56, 'h': 57, 'i': 58, 'j': 59, 'k': 60, 'l': 61, 'm': 62, 'n': 63, 'o': 64, 'p': 65, 'q': 66, 'r': 67, 's': 68, 't': 69, 'u': 70, 'v': 71, 'w': 72, 'x': 73, 'y': 74, 'z': 75, 'é': 76, '’': 77, '€': 78}
{'\t': 1, '\n': 2, ' ': 3, '!': 4, '"': 5, '$': 6, '%': 7, '&': 8, "'": 9, '(': 10, ')': 11, ',': 12, '-': 13, '.': 14, '0': 15, '1': 16, '2': 17, '3': 18, '4': 19, '5': 20, '6': 21, '7': 22, '8': 23, '9': 24, ':': 25, '?': 26, 'A': 27, 'B': 28, 'C': 29, 'D': 30, 'E': 31, 'F': 32, 'G': 33, 'H': 34, 'I': 3

In [8]:
encoder_input = []
for line in lines['src']:
    temp_X = []
    for W in line:
        temp_X.append(src_to_index[W])
    encoder_input.append(temp_X)
print('영어 정수 인코딩 : ',encoder_input[:5])


decoder_input = []
for line in lines['tar']:
    temp_X = []
    for W in line:
        temp_X.append(tar_to_index[W])
    decoder_input.append(temp_X)
print('프랑스어 정수 인코딩 : ', decoder_input[:5])
# 정수 인코딩 결과 5개 샘플
# 프랑스어는 <sos> 때문에 모두 앞에 1이 붙음.


영어 정수 인코딩 :  [[30, 64, 10], [31, 58, 10], [31, 58, 10], [41, 70, 63, 2], [41, 70, 63, 2]]
프랑스어 정수 인코딩 :  [[1, 3, 48, 53, 3, 4, 3, 2], [1, 3, 45, 53, 64, 73, 72, 3, 4, 3, 2], [1, 3, 45, 53, 64, 73, 72, 14, 3, 2], [1, 3, 29, 67, 73, 70, 71, 105, 4, 3, 2], [1, 3, 29, 67, 73, 70, 57, 78, 105, 4, 3, 2]]


In [9]:
# decoder_target = []
# for line in lines.tar:
#     t=0
#     temp_X = []
#     for w in line:
#       if t>0:
#         temp_X.append(tar_to_index[w])
#       t=t+1
#     decoder_target.append(temp_X)
# print(decoder_target[:5])

decoder_target = [k[1:] for k in decoder_input] # 위와 같은 코드
decoder_target[:5]

# 실제값에는 시작 심볼에 해당하는 <sos>가 필요 없으므로 1을 지워준다.

[[3, 48, 53, 3, 4, 3, 2],
 [3, 45, 53, 64, 73, 72, 3, 4, 3, 2],
 [3, 45, 53, 64, 73, 72, 14, 3, 2],
 [3, 29, 67, 73, 70, 71, 105, 4, 3, 2],
 [3, 29, 67, 73, 70, 57, 78, 105, 4, 3, 2]]

In [10]:
max_src_len = max([len(line) for line in lines['src']])
max_tar_len = max([len(line) for line in lines['tar']])
print(max_src_len)
print(max_tar_len)

# 패딩 작업을 위해 가장 긴 길이를 가진 문장의 길이를 구함.

25
76


In [18]:
from keras.preprocessing.sequence import pad_sequences
encoder_input = pad_sequences(encoder_input, maxlen=max_src_len, padding='post')
decoder_input = pad_sequences(decoder_input, maxlen=max_tar_len, padding='post')
decoder_target = pad_sequences(decoder_target, maxlen=max_tar_len, padding='post')
# 영어와 프랑스의 길이는 하나의 쌍이라고 하더라도 전부 다르므로 패딩 할 때도 동일하게 맞춰줄 필요가 없다.
# 영어 데이터는 영어 샘플끼리 프랑스어는 프랑스어 끼리 맞추어 패딩

In [19]:
from keras.utils import to_categorical
encoder_input = to_categorical(encoder_input)
decoder_input = to_categorical(decoder_input)
decoder_target = to_categorical(decoder_target)

# 글자단위 번역기므로 워드 임베딩은 별도로 사용 안함.
# 단어 단위로 바꾸어 워드임베딩을 활용한 seq2seq도 시도예정

In [24]:
import torch
import torch.nn as nn
from torch import optim

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

encoder_input = torch.LongTensor(encoder_input[:10000]).to(device)
decoder_input = torch.LongTensor(decoder_input[:10000]).to(device)
decoder_target = torch.LongTensor(decoder_target[:10000]).to(device)

In [25]:
encoder_input.shape

torch.Size([10000, 25, 79])

In [26]:
decoder_input.shape

torch.Size([10000, 76, 106])

In [27]:
decoder_target.shape

torch.Size([10000, 76, 106])

In [28]:
encoder_input.shape

torch.Size([10000, 25, 79])

In [61]:
class Encoder(nn.Module):
    def __init__(self):
        super(Encoder, self).__init__()
        self.embedding = nn.Embedding(26, 16)
        self.lstm = nn.LSTM(16, 16)
        self.linear = nn.Linear(632, 8)
        self.gru = nn.GRU(16, 16,2)
        
    def forward(self, x, h):
        x = self.embedding(x)
        x = self.linear(x.view(2,-1))
#         print(x.shape)
        x = self.gru(x.view(1,1,-1), h)
        return x


class Decoder(nn.Module):
    def __init__(self):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(76,16)
        self.gru = nn.GRU(16, 16, 2)
        self.out = nn.Linear(16, 106)
        self.linear = nn.Linear(848, 8)
        self.softmax = nn.LogSoftmax(dim=1)
        
    def forward(self, x, h):
        x = self.embedding(x)
        x = self.linear(x.view(2,-1))
        x, hidden = self.gru(x.view(1,1,-1), h)
        x = self.softmax(self.out(x[0]))
        return x

In [62]:

H = torch.randn(2,1,16).to(device)
decoder = Decoder().to(device)
decoder(decoder_input[0][0],H)

tensor([[-5.0838, -5.5578, -4.9453, -5.5748, -5.2558, -4.3325, -4.2342, -3.7796,
         -4.9519, -4.4071, -5.5929, -5.0879, -4.9947, -5.1230, -5.5979, -4.6040,
         -4.3605, -4.9744, -4.5965, -5.2337, -4.4453, -4.6218, -4.5530, -4.6715,
         -4.9833, -4.8089, -5.2214, -4.6931, -5.0656, -3.9251, -3.3829, -5.5055,
         -5.0639, -3.7011, -4.3006, -5.2313, -4.9048, -4.6376, -4.4123, -4.4073,
         -4.9252, -4.5101, -5.1682, -4.4916, -4.9180, -4.5391, -5.7064, -5.4400,
         -5.3979, -5.2050, -5.0735, -3.9831, -5.0178, -4.9550, -4.5609, -4.7451,
         -4.3711, -4.5722, -4.5201, -3.9729, -5.3674, -4.5205, -4.7363, -4.2087,
         -4.4568, -4.2360, -4.6294, -4.6890, -5.1006, -4.3818, -4.8047, -5.6181,
         -5.3187, -4.8214, -4.8933, -4.8147, -3.7917, -4.9250, -4.9842, -4.8344,
         -5.0466, -5.0996, -4.1009, -4.8487, -4.8503, -4.8383, -4.8206, -5.4701,
         -5.0396, -4.9047, -5.3011, -5.4768, -4.6477, -5.4777, -4.9042, -4.8383,
         -4.5329, -4.9407, -

In [71]:
encoder = Encoder().to(device)
decoder = Decoder().to(device)
encoder_optimizer = optim.RMSprop(encoder.parameters(), lr = 0.01)
decoder_optimizer = optim.RMSprop(decoder.parameters(), lr = 0.01)
criterion = nn.NLLLoss()

epochs=10
loss_total=0
for epoch in range(epochs):
    loss = 0
    
#     encoder_hidden = torch.zeros([1, 1, 79]).to(device)
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    
    encoder_hidden = torch.zeros([2, 1, 16]).to(device)
    
    encoder_length = encoder_input.size(1)
    decoder_length = encoder_input.size(1)
    
    for enc_input in range(encoder_length):
        x, encoder_hidden = encoder(encoder_input[epoch][enc_input], encoder_hidden)

    for dec_input in range(decoder_length):
        decoder_output = decoder(decoder_input[epoch][dec_input], encoder_hidden)
        print(decoder_target[epoch][dec_input].view(1,-1).shape, decoder_output.shape)
        loss += criterion(decoder_output, decoder_target[epoch][dec_input].view(1,-1))
    break
#         decoder_input = decoder_target[dec_input]
#     loss.backward()
#     encoder_optimizer.step()
#     decoder_optimizer.step()
    
#     loss_iter = loss.item() / decoder_length
#     loss_total += loss_iter
        
#     if i % print_every == 0:
#         loss_avg = loss_total / print_every
#         loss_total = 0
#         print("[{} - {}%] loss = {:05.4f}".format(i, i/n_iter * 100, loss_avg))

torch.Size([1, 106]) torch.Size([1, 106])


RuntimeError: multi-target not supported at C:/w/1/s/tmp_conda_3.7_055457/conda/conda-bld/pytorch_1565416617654/work/aten/src\THCUNN/generic/ClassNLLCriterion.cu:15

In [535]:
import torch.nn as nn
import torch

gru = nn.GRU(input_size = 8, hidden_size = 50, num_layers = 2, batch_first = True)

In [536]:
inp = torch.randn(1, 1, 8)
out, hn = gru(inp)

In [537]:
out, hn

(tensor([[[ 0.0266, -0.0701,  0.0305, -0.0472,  0.0093, -0.1129,  0.0224,
           -0.0507, -0.0683,  0.0489, -0.0713, -0.0239, -0.0886,  0.0294,
            0.0022, -0.1088, -0.0650,  0.1388, -0.0304, -0.0134, -0.0108,
           -0.0487,  0.0861,  0.0106,  0.0197,  0.0317,  0.0215, -0.0140,
           -0.0582, -0.0122, -0.0061, -0.0332,  0.0734, -0.0026,  0.0439,
           -0.0010,  0.0230,  0.0389, -0.0838, -0.0278, -0.0857, -0.0370,
            0.0023, -0.0450, -0.0105,  0.0245, -0.0498, -0.1098, -0.0059,
           -0.0021]]], grad_fn=<TransposeBackward1>),
 tensor([[[-0.0644, -0.2389, -0.0406, -0.0137, -0.0669,  0.0608, -0.1522,
           -0.1155, -0.0742,  0.1120,  0.0899, -0.0191, -0.2450,  0.1627,
           -0.0429,  0.0686,  0.1121, -0.0253,  0.0224, -0.0148, -0.1014,
           -0.0990, -0.0176, -0.0057,  0.1014, -0.0936,  0.0217, -0.0960,
            0.1097,  0.1364,  0.0418,  0.0141,  0.1343, -0.1115,  0.1690,
           -0.1486, -0.0833,  0.0828, -0.1309,  0.0491,  0