<a href="https://colab.research.google.com/github/KimRass/Programming/blob/master/Data%20Science/Machine%20Learning/NLP/fra-eng%20%26%20Character-Level%20seq2seq%20(NMT).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Training" data-toc-modified-id="Training-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Training</a></span></li><li><span><a href="#Inference" data-toc-modified-id="Inference-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Inference</a></span></li></ul></div>

In [1]:
from google.colab import drive
import os
drive.mount("/content/drive")
os.chdir("/content/drive/MyDrive/Libraries")

import tensorflow as tf
from tensorflow.keras import Input, Model, Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing import image_dataset_from_directory
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.image import load_img, img_to_array, ImageDataGenerator
from tensorflow.keras.layers import Layer, Dense, Flatten, Dropout, Concatenate, Add, Dot, Multiply, Reshape, Activation, BatchNormalization, SimpleRNNCell, RNN, SimpleRNN, LSTM, Embedding, Bidirectional, TimeDistributed, Conv1D, Conv2D, MaxPool1D, MaxPool2D, GlobalMaxPool1D, GlobalMaxPool2D, AveragePooling1D, AveragePooling2D, GlobalAveragePooling1D, GlobalAveragePooling2D, ZeroPadding2D
from tensorflow.keras.optimizers import SGD, Adam, Adagrad
from tensorflow.keras.metrics import MeanSquaredError, RootMeanSquaredError, MeanAbsoluteError, MeanAbsolutePercentageError, BinaryCrossentropy, CategoricalCrossentropy, SparseCategoricalCrossentropy, CosineSimilarity
from tensorflow.keras.layers.experimental.preprocessing import Rescaling
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.activations import linear, sigmoid, relu
from tensorflow.keras.initializers import RandomNormal, glorot_uniform, he_uniform, Constant
from tensorflow.keras.models import load_model
import matplotlib.pyplot as plt
import seaborn as sb
import pandas as pd
import numpy as np
import re
import zipfile
import json
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
os.chdir("/content/drive/MyDrive/NLP")
raw_data = pd.read_table("./Datasets/fra-eng/fra.txt", usecols=[0, 1], names=["tar", "src"])

raw_data = raw_data.sample(len(raw_data)//3, random_state=777)

In [4]:
# `lower`: Whether to convert the texts to lowercase.
# `char_level`: If `True`, every character will be treated as a token.
tokenizer_src = Tokenizer(char_level=True)
tokenizer_src.fit_on_texts(raw_data["src"])
char2idx_src = tokenizer_src.word_index
vocab_size_src = len(char2idx_src)
enc_input = tokenizer_src.texts_to_sequences(raw_data["src"])

tokenizer_tar = Tokenizer(char_level=True)
tokenizer_tar.fit_on_texts("시" + raw_data["tar"] + "종")
char2idx_tar = tokenizer_tar.word_index
vocab_size_tar = len(char2idx_tar)
dec_input = tokenizer_tar.texts_to_sequences("시" + raw_data["tar"])
dec_gt = tokenizer_tar.texts_to_sequences(raw_data["tar"] + "종")

In [5]:
ratio = 0.99

lens_enc = sorted([len(doc) for doc in enc_input])
max_len_enc = int(np.quantile(lens_enc, ratio))
print(f"길이가 가장 긴 문장의 길이는 {np.max(lens_enc)}이고 길이가 {max_len_enc} 이하인 문장이 전체의 {ratio:.0%}를 차지합니다.")

lens_dec = sorted([len(doc) for doc in dec_input])
max_len_dec = int(np.quantile(lens_dec, ratio))
print(f"길이가 가장 긴 문장의 길이는 {np.max(lens_dec)}이고 길이가 {max_len_dec} 이하인 문장이 전체의 {ratio:.0%}를 차지합니다.")

길이가 가장 긴 문장의 길이는 305이고 길이가 86 이하인 문장이 전체의 99%를 차지합니다.
길이가 가장 긴 문장의 길이는 240이고 길이가 72 이하인 문장이 전체의 99%를 차지합니다.


In [6]:
enc_input = pad_sequences(enc_input, padding="post", maxlen=max_len_enc)
dec_input = pad_sequences(dec_input, padding="post", maxlen=max_len_dec)
dec_gt = pad_sequences(dec_gt, padding="post", maxlen=max_len_dec)

enc_input = to_categorical(enc_input)
dec_input = to_categorical(dec_input)
dec_gt = to_categorical(dec_gt)

# Training

In [None]:
name = "./fra_eng_char-level_seq2seq"
model_path = f"{name}.h5"
hist_path = f"{name}_hist.npy"
if os.path.exists(model_path):
    model = load_model(model_path)
    hist = np.load(hist_path, allow_pickle="TRUE").item()
else:
    inputs_enc = Input(shape=(max_len_enc, vocab_size_src + 1), name="Input_enc")
    inputs_dec = Input(shape=(max_len_dec, vocab_size_tar + 1), name="Input_dec")
    
    _, h_state, c_state = LSTM(units=256, return_state=True, name="LSTM_enc")(inputs_enc)
    z, _, _ = LSTM(units=256, return_sequences=True, return_state=True, name="LSTM_dec")(inputs_dec, initial_state=[h_state, c_state])
    outputs = Dense(units=vocab_size_tar + 1, activation="softmax", name="Dense_dec")(z)

    model = Model(inputs=[inputs_enc, inputs_dec], outputs=outputs)
    
    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["acc"])
    model.summary()

    es = EarlyStopping(monitor="val_loss", mode="auto", verbose=1, patience=1)
    mc = ModelCheckpoint(filepath=model_path, monitor="val_acc", mode="auto", verbose=1, save_best_only=True)
    hist = model.fit(x=[enc_input, dec_input], y=dec_gt, batch_size=2048, epochs=32, validation_split=0.3, callbacks=[es, mc])
    
    np.save(hist_path, hitst.history)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 Input_enc (InputLayer)         [(None, 86, 76)]     0           []                               
                                                                                                  
 Input_dec (InputLayer)         [(None, 72, 61)]     0           []                               
                                                                                                  
 LSTM_enc (LSTM)                [(None, 256),        340992      ['Input_enc[0][0]']              
                                 (None, 256),                                                     
                                 (None, 256)]                                                     
                                                                                              

In [None]:
# 학습이 맞게 됐는지 확인
i = 110
pred = model.predict([tf.expand_dims(enc_input[i], axis=0), tf.expand_dims(dec_input[i], axis=0)])

sent = ""
for idx in tf.argmax(dec_gt[i], axis=1).numpy():
    if idx != 0:
        sent += idx2char_tar[idx]
print(sent)

sent = ""
for idx in tf.argmax(pred[0], axis=1).numpy():
    if idx != 0:
        sent += idx2char_tar[idx]
print(sent)

# Inference

- 우선 인코더를 정의합니다. enc_inputs와 encoder_states는 훈련 과정에서 이미 정의한 것들을 재사용하는 것입니다. 이제 디코더를 설계해보겠습니다.

In [None]:
model.layers

[<tensorflow.python.keras.engine.input_layer.InputLayer at 0x7f97529d7438>,
 <tensorflow.python.keras.engine.input_layer.InputLayer at 0x7f9752d01f60>,
 <tensorflow.python.keras.layers.recurrent_v2.LSTM at 0x7f97529d7908>,
 <tensorflow.python.keras.layers.recurrent_v2.LSTM at 0x7f9753533278>,
 <tensorflow.python.keras.layers.core.Dense at 0x7f9753533cc0>]

In [None]:
inputs_enc = model.layers[0].output
_, h_state, c_state = model.layers[2].output

enc_model = tf.keras.Model(inputs=inputs_enc, outputs=[h_state, c_state])

inputs_dec = model.layers[1].output
h_state_bef = Input(shape=(256,))
c_state_bef = Input(shape=(256,))
# 문장의 다음 단어를 예측하기 위해서 initial_state를 이전 시점의 상태로 사용합니다.
lstm_dec_layer = model.layers[3]
lstm_dec, h_state_aft, c_state_aft = lstm_dec_layer(inputs_dec, initial_state=[h_state_bef, c_state_bef])
dense_dec_layer = model.layers[4]
dense_dec = dense_dec_layer(lstm_dec)

dec_model = tf.keras.Model(inputs=[inputs_dec]+[h_state_bef, c_state_bef], outputs=[dense_dec]+[h_state_aft, c_state_aft])

In [None]:
def decode_seq(input_seq):
# seq = enc_input[i:i+1]
    enc_states = enc_model.predict(input_seq)

    # <SOS>에 해당하는 OHE를 생성합니다.
    seq = np.zeros((1, 1, len(char2idx_tar)+1))
    seq[0, 0, char2idx_tar["<SOS>"]] = 1

    stop_cond = False
    decoded_sent = ""
    # stop_cond이 True가 될 때까지 반복합니다.
    while not stop_cond:
        # 이점 시점의 states를 현재 시점의 states로 사용합니다.
        output_tokens, h_state, c_state = dec_model.predict([seq] + enc_states)
        argmax = np.argmax(output_tokens[0, -1, :])
    #     argmax = np.argmax(output_tokens[0, 0])
        char = idx2char_tar[argmax]
        decoded_sent += char
        # 현재 시점의 예측 결과를 다음 시점의 입력으로 사용하기 위해 저장합니다.
        seq = np.zeros((1, 1, len(char2idx_tar)+1))
        seq[0, 0, argmax] = 1
        enc_states = [h_state, c_state]
        
        # "<EOS>"에 도달하거나 최대 길이를 넘으면 stop_cond=True를 저장합니다.
        if char == "<EOS>" or len(decoded_sent) == max_len_dec:
            stop_cond = True
    return decoded_sent

In [None]:
actual, pred = list(), list()
for seq_index in range(231, 236):
    input_seq = enc_input[seq_index:seq_index+1]
    decoded_sent = decode_seq(input_seq)
    
    actual.append([data["tar"][seq_index][1:len(data["tar"][seq_index])-1].split()])
    pred.append(decoded_sent[:len(decoded_sent)-1].split())
                  
    print(35 * "-")
    print(f"입력 문장 : {data['src'][seq_index]}")
    print(f"정답 문장 : {data['tar'][seq_index][1:len(data['tar'][seq_index])-1]}")
    print(f"번역 문장 : {decoded_sent[:len(decoded_sent)-1]}")
    sf = SmoothingFunction()
    print(f"BLEU-1 : {corpus_bleu(actual, pred, weights=(1, 0, 0, 0),\
                                  smoothing_function=sf.method1)}")
    print(f"BLEU-2 : {corpus_bleu(actual, pred, weights=(1/2, 1/2, 0, 0),\
                                  smoothing_function=sf.method1)}")
    print(f"BLEU-3 : {corpus_bleu(actual, pred, weights=(1/3, 1/3, 1/3, 0),\
                                  smoothing_function=sf.method1)}")
    print(f"BLEU-4 : {corpus_bleu(actual, pred, weights=(1/4, 1/4, 1/4, 1/4),\
                                  smoothing_function=sf.method1)}")

-----------------------------------
입력 문장 : why do you need change?
정답 문장 : ourquoi as-tu besoin de changement 
번역 문장 : jz3lw(++1yj1…w+l1+jl…zp3+lœ1ljz… <EOS
BLEU-1 : 0
BLEU-2 : 0
BLEU-3 : 0
BLEU-4 : 0
-----------------------------------
입력 문장 : why did you change your mind?
정답 문장 : ourquoi as-tu changé d'avis 
번역 문장 : j(l+z0êp+lœ;(s1êjl3;(lw(+l8j8ljê(s(p//8 <EOS
BLEU-1 : 0
BLEU-2 : 0
BLEU-3 : 0
BLEU-4 : 0
-----------------------------------
입력 문장 : we don't want to lose you.
정답 문장 : ous ne voulons pas vous perdre
번역 문장 : sz0+l(s1)ljz0jlœ1l+0pj1lùl/(l…(p+z3 <EOS
BLEU-1 : 0
BLEU-2 : 0
BLEU-3 : 0
BLEU-4 : 0
-----------------------------------
입력 문장 : see that this never happens again.
정답 문장 : aites en sorte que ça ne se produise plus
번역 문장 : èj1+ysz0+lwêèj1lùlc(pê1lh(l&<EOS
BLEU-1 : 0
BLEU-2 : 0
BLEU-3 : 0
BLEU-4 : 0
-----------------------------------
입력 문장 : you must not lose sight of your main object.
정답 문장 : l ne faut pas que tu perdes de vue ton objectif principal
번역 문장 : /1+lwêp4l+