# seq2seq

### Embedding Vector 준비
1. 영어 glove 임베딩 (사전 학습) 사용 / 6B tokens, 400K vocab, uncased, 100d
2. 한국어 임베딩 (초기 훈련)

In [28]:
# !gdown 1qk-14tgVHPXT5jfRUE4Ua2ji4EXwS022

### 학습 데이터 준비

http://www.manythings.org/anki/

eng-kor 짝으로 된 학습 데이터

1. Encoder 입력 데이터 eng
    - encoder_input_eng 준비
2. Decoder 입출력 데이터 kor
    - 학습용 teacher-forcing 모델
        - decoder_input_kor `<sos> 난 널 사랑해`
        - decoder_output_kor `난 널 사랑해 <eos>`
    - 추론용 모델


In [29]:
# !gdown 17X1AF5lusy-FP-Zadm-DDpbeKIZ1cWmH -O eng_kor.txt

In [None]:
# Parallel lists built from the tab-separated corpus file; only the first two
# columns (English, Korean) are used, anything after them is ignored.
eng_inputs = []   # encoder inputs (English sentences)
kor_inputs = []   # decoder inputs for teacher forcing: '<sos> ' + Korean
kor_targets = []  # decoder targets: Korean + ' <eos>'

with open('eng_kor.txt', 'r', encoding='UTF-8') as f:
    for line in f:
        # A strict 3-way unpack raises ValueError on blank lines or rows with
        # a missing/extra column, so take the first two fields explicitly and
        # skip rows that do not contain a sentence pair.
        fields = line.rstrip('\n').split('\t')
        if len(fields) < 2:
            continue
        eng, kor = fields[0], fields[1]

        kor_input = '<sos> ' + kor
        kor_target = kor + ' <eos>'

        eng_inputs.append(eng)
        kor_inputs.append(kor_input)
        kor_targets.append(kor_target)

len(eng_inputs), len(kor_inputs), len(kor_targets)

(5890, 5890, 5890)

In [31]:
# Print the same five-sample window from each of the three parallel lists.
sample_window = slice(2500, 2505)
print(eng_inputs[sample_window])
print(kor_inputs[sample_window])
print(kor_targets[sample_window])

['I speak French a little.', 'I take back what I said.', 'I tried to make friends.', 'I use this all the time.', 'I use this all the time.']
['<sos> 저는 프랑스어를 조금 합니다.', '<sos> 아까 한 말 취소야.', '<sos> 난 친구를 만드려고 했어.', '<sos> 나는 항상 이걸 쓴다.', '<sos> 매번 이걸 쓴다.']
['저는 프랑스어를 조금 합니다. <eos>', '아까 한 말 취소야. <eos>', '난 친구를 만드려고 했어. <eos>', '나는 항상 이걸 쓴다. <eos>', '매번 이걸 쓴다. <eos>']


### 토큰화
- 인코더(영어) : 영문 토크나이저
- 디코더(한글) : 국문 토크나이저

In [32]:
# Vocabulary cap passed as `num_words` to the English and Korean tokenizers.
VOCAB_SIZE = 10000

### 영문 토크나이저

In [33]:
from tensorflow.keras.preprocessing.text import Tokenizer

# English (encoder-side) tokenizer: '<OOV>' is the out-of-vocabulary token.
eng_tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token='<OOV>')
# Build the vocabulary from the English sentences, then encode each sentence
# as a list of integer word indices.
eng_tokenizer.fit_on_texts(eng_inputs)
eng_inputs_seq = eng_tokenizer.texts_to_sequences(eng_inputs)

In [34]:
# Decode a few encoded sentences back into words to sanity-check the tokenizer.
for token_ids in eng_inputs_seq[2500:2505]:
    decoded_words = []
    for token_id in token_ids:
        decoded_words.append(eng_tokenizer.index_word[token_id])
    print(decoded_words)

['i', 'speak', 'french', 'a', 'little']
['i', 'take', 'back', 'what', 'i', 'said']
['i', 'tried', 'to', 'make', 'friends']
['i', 'use', 'this', 'all', 'the', 'time']
['i', 'use', 'this', 'all', 'the', 'time']


In [35]:
# English vocabulary size (capped at VOCAB_SIZE) and the longest encoded sentence.
eng_vocab_count = len(eng_tokenizer.word_index)
eng_num_words = min(VOCAB_SIZE, eng_vocab_count)
eng_max_len = max(map(len, eng_inputs_seq))

### 국문 토큰화

In [36]:
# Default value of `filters` = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
# `filters` is left empty so that the '<sos>' / '<eos>' markers are not stripped.
kor_tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token='<OOV>', filters='')
# Fit on decoder inputs and targets together so that both the '<sos>' and
# '<eos>' markers end up in the vocabulary.
kor_tokenizer.fit_on_texts(kor_inputs + kor_targets)

# Encode decoder inputs and decoder targets with the shared Korean vocabulary.
kor_inputs_seq = kor_tokenizer.texts_to_sequences(kor_inputs)
kor_targets_seq = kor_tokenizer.texts_to_sequences(kor_targets)

# Display one encoded input/target pair.
kor_inputs_seq[1000:1001], kor_targets_seq[1000:1001]

([[2, 81, 1655]], [[81, 1655, 3]])

In [38]:
# Korean vocabulary size (capped at VOCAB_SIZE) and the longest decoder-input sequence.
kor_vocab_count = len(kor_tokenizer.word_index)
kor_num_words = min(VOCAB_SIZE, kor_vocab_count)
kor_max_len = max(map(len, kor_inputs_seq))

### 패딩처리
- 인코더 padding (pre)
- 디코더 padding (post)

In [39]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Encoder inputs are padded at the front ('pre').
eng_inputs_padded = pad_sequences(eng_inputs_seq, maxlen=eng_max_len, padding='pre')
# Decoder inputs and targets must be padded on the SAME side ('post'):
# with teacher forcing, input step t ('<sos> w1 w2 ...') is trained against
# target step t ('w1 w2 ... <eos>'). Padding the inputs with 'pre' while the
# targets use 'post' shifts the two sequences against each other.
kor_inputs_padded = pad_sequences(kor_inputs_seq, maxlen=kor_max_len, padding='post')
kor_targets_padded = pad_sequences(kor_targets_seq, maxlen=kor_max_len, padding='post')

eng_inputs_padded.shape, kor_inputs_padded.shape, kor_targets_padded.shape

((5890, 101), (5890, 90), (5890, 90))

### 모델 학습

`encoder + decoder(teacher_forcing)` 구조의 모델 생성 및 학습

##### Embedding Layer

In [40]:
# Peek at the first six lines of the GloVe file to see its layout.
with open('./glove.6B.100d.txt', 'r', encoding='UTF-8') as glove_file:
    # zip stops once range(6) is exhausted, or earlier if the file is shorter.
    for _, raw_line in zip(range(6), glove_file):
        print(raw_line)

the -0.038194 -0.24487 0.72812 -0.39961 0.083172 0.043953 -0.39141 0.3344 -0.57545 0.087459 0.28787 -0.06731 0.30906 -0.26384 -0.13231 -0.20757 0.33395 -0.33848 -0.31743 -0.48336 0.1464 -0.37304 0.34577 0.052041 0.44946 -0.46971 0.02628 -0.54155 -0.15518 -0.14107 -0.039722 0.28277 0.14393 0.23464 -0.31021 0.086173 0.20397 0.52624 0.17164 -0.082378 -0.71787 -0.41531 0.20335 -0.12763 0.41367 0.55187 0.57908 -0.33477 -0.36559 -0.54857 -0.062892 0.26584 0.30205 0.99775 -0.80481 -3.0243 0.01254 -0.36942 2.2167 0.72201 -0.24978 0.92136 0.034514 0.46745 1.1079 -0.19358 -0.074575 0.23353 -0.052062 -0.22044 0.057162 -0.15806 -0.30798 -0.41625 0.37972 0.15006 -0.53212 -0.2055 -1.2526 0.071624 0.70565 0.49744 -0.42063 0.26148 -1.538 -0.30223 -0.073438 -0.28312 0.37104 -0.25217 0.016215 -0.017099 -0.38984 0.87424 -0.72569 -0.51058 -0.52028 -0.1459 0.8278 0.27062

, -0.10767 0.11053 0.59812 -0.54361 0.67396 0.10663 0.038867 0.35481 0.06351 -0.094189 0.15786 -0.81665 0.14172 0.21939 0.58505 -0.52158

In [None]:
import numpy as np

def make_embedding_matrix(num_words, embedding_dim, tokenizer, file_path):
    """Build an embedding matrix from a pre-trained word-vector text file.

    Each non-blank line of the file is expected to hold a word followed by
    its whitespace-separated vector components.

    Args:
        num_words: Largest word index that receives a row; the matrix has
            ``num_words + 1`` rows.
        embedding_dim: Number of components per vector (matrix columns).
        tokenizer: Object exposing ``word_index``, a mapping from word to
            integer index.
        file_path: Path to the UTF-8 encoded word-vector file.

    Returns:
        A NumPy array of shape ``(num_words + 1, embedding_dim)``. Rows for
        indices whose word is not found in the file stay all-zero.
    """
    embedding_matrix = np.zeros((num_words + 1, embedding_dim))
    word_index = tokenizer.word_index

    # Stream the file and keep only vectors for words in the vocabulary,
    # instead of materialising every pre-trained vector in memory first.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.split()
            if not parts:
                continue  # tolerate blank lines
            word, *vects = parts

            index = word_index.get(word)
            # word_index can list more words than the matrix has rows;
            # indices beyond num_words would otherwise raise IndexError.
            if index is None or index > num_words:
                continue

            embedding_matrix[index] = np.array(vects, dtype=np.float32)

    # Return only after the whole file has been processed (returning from
    # inside the loop would leave the matrix almost entirely zero).
    return embedding_matrix