## 1. 셋업

In [1]:
# 패키지 불러오기
import os
import shutil
import zipfile
import pandas as pd
import tensorflow as tf
import requests
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

2023-04-21 05:19:15.145971: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-21 05:19:15.966727: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/local/cuda/extras/CUPTI/lib64
2023-04-21 05:19:15.966867: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/

### 데이터 다운로드

# Download the English-French sentence-pair archive and unpack it into the
# current working directory.
url = "http://www.manythings.org/anki/fra-eng.zip"
# An explicit User-Agent header is sent with the request; the timeout keeps
# the cell from hanging indefinitely if the server never answers.
r = requests.get(url, headers={"User-Agent": "cglee"}, timeout=60)
# Fail here on an HTTP error instead of saving an error page as the archive.
r.raise_for_status()
# The local file name is the last path component of the URL.
filename = url.split("/")[-1]

with open(filename, "wb") as output_file:
    output_file.write(r.content)

# The handle is named `archive` so the builtin `zip` is not shadowed for the
# rest of the session.
with zipfile.ZipFile(filename, "r") as archive:
    archive.extractall()

### 데이터 불러오기

In [2]:
# Read the tab-separated sentence pairs; the three column names are supplied
# explicitly through `names`.
lines = pd.read_csv("fra.txt", sep="\t", names=["src", "tar", "lic"])
# Keep the source and target columns only (the "lic" column is dropped).
lines = lines[["src", "tar"]]

print('전체 샘플의 개수 :', len(lines))

전체 샘플의 개수 : 217975


### 전체 샘플에서 최초 6만 개의 표본 추출

In [3]:
# Keep only the first 60,000 sentence pairs. The explicit copy makes `lines`
# an independent DataFrame, so assigning to one of its columns afterwards does
# not write into a slice of the full table (the situation pandas reports with
# SettingWithCopyWarning).
lines = lines.iloc[:60000].copy()

# Notebook display: preview ten randomly chosen rows.
lines.sample(10)

Unnamed: 0,src,tar
19133,Finish the story.,Termine l'histoire.
11341,I want details.,Je souhaite avoir des détails.
43883,Why are you cursing?,Pourquoi jurez-vous ?
20961,I'm very serious.,Je suis très sérieuse.
50754,Tom wasn't born rich.,Tom n'est pas né riche.
19092,Everybody's dead.,Tout le monde a crevé.
18495,You're to blame.,Vous êtes responsables.
14438,Go to the store.,Va au magasin !
39776,I reviewed the file.,J'ai examiné le fichier.
23856,You deserve that.,Vous le méritez.


### 번역 문장에 문장 시작 심볼 ‘\t’, 종료 심볼 ‘\n’ 삽입

In [4]:
# Surround every target sentence with the start symbol "\t" and the end
# symbol "\n", each separated from the sentence text by a single space.
lines["tar"] = lines["tar"].map(lambda sentence: "\t " + sentence + " \n")

# Notebook display: preview ten randomly chosen rows.
lines.sample(10)

Unnamed: 0,src,tar
17109,This is so easy.,\t C'est si facile. \n
39916,I took this picture.,\t J'ai pris cette photo. \n
2030,I like R&B.,\t J'aime le R&B. \n
26439,I truly loved her.,\t Je l'aimais sincèrement. \n
19034,Don't pick it up.,\t Ne le ramassez pas. \n
45852,He may well be right.,\t Il pourrait bien avoir raison. \n
45569,Has Tom become crazy?,\t Tom est-il devenu fou ? \n
49813,There's no more salt.,\t Il n'y a plus de sel. \n
6705,You can come.,\t Vous pouvez venir. \n
9933,You're a star.,\t Tu es une célébrité. \n


### 문자 집합 생성

In [5]:
# Collect the distinct characters that occur on each side of the corpus.
src_vocab = {char for line in lines.src for char in line}
tar_vocab = {char for line in lines.tar for char in line}

# One slot more than the number of distinct characters: character indices are
# assigned starting at 1 further down, which leaves 0 free as the padding value.
src_vocab_size = 1 + len(src_vocab)
tar_vocab_size = 1 + len(tar_vocab)

print("source 문장의 char 집합: ", src_vocab_size)
print("target 문장의 char 집합: ", tar_vocab_size)

source 문장의 char 집합:  80
target 문장의 char 집합:  103


### 문자 집합을 리스트로 변환, 정렬, 인덱스 부여

In [6]:
# Turn each character set into a sorted list so that indices are reproducible.
src_vocab = sorted(src_vocab)
tar_vocab = sorted(tar_vocab)

# Map every character to a 1-based index.
src_to_index = {char: idx for idx, char in enumerate(src_vocab, start=1)}
tar_to_index = {char: idx for idx, char in enumerate(tar_vocab, start=1)}

print(src_to_index)
print(tar_to_index)

{' ': 1, '!': 2, '"': 3, '$': 4, '%': 5, '&': 6, "'": 7, ',': 8, '-': 9, '.': 10, '/': 11, '0': 12, '1': 13, '2': 14, '3': 15, '4': 16, '5': 17, '6': 18, '7': 19, '8': 20, '9': 21, ':': 22, '?': 23, 'A': 24, 'B': 25, 'C': 26, 'D': 27, 'E': 28, 'F': 29, 'G': 30, 'H': 31, 'I': 32, 'J': 33, 'K': 34, 'L': 35, 'M': 36, 'N': 37, 'O': 38, 'P': 39, 'Q': 40, 'R': 41, 'S': 42, 'T': 43, 'U': 44, 'V': 45, 'W': 46, 'X': 47, 'Y': 48, 'Z': 49, 'a': 50, 'b': 51, 'c': 52, 'd': 53, 'e': 54, 'f': 55, 'g': 56, 'h': 57, 'i': 58, 'j': 59, 'k': 60, 'l': 61, 'm': 62, 'n': 63, 'o': 64, 'p': 65, 'q': 66, 'r': 67, 's': 68, 't': 69, 'u': 70, 'v': 71, 'w': 72, 'x': 73, 'y': 74, 'z': 75, '°': 76, 'é': 77, '’': 78, '€': 79}
{'\t': 1, '\n': 2, ' ': 3, '!': 4, '"': 5, '$': 6, '%': 7, '&': 8, "'": 9, '(': 10, ')': 11, ',': 12, '-': 13, '.': 14, '0': 15, '1': 16, '2': 17, '3': 18, '4': 19, '5': 20, '6': 21, '7': 22, '8': 23, '9': 24, ':': 25, '?': 26, 'A': 27, 'B': 28, 'C': 29, 'D': 30, 'E': 31, 'F': 32, 'G': 33, 'H': 3

### 인코더 입력 데이터 정수 인코딩

In [7]:
# Integer-encode every source sentence character by character.
encoder_input = [
    [src_to_index[char] for char in line]
    for line in lines.src
]

print("인코더 입력 데이터 정수 인코딩 결과: ", encoder_input[:5])

인코더 입력 데이터 정수 인코딩 결과:  [[30, 64, 10], [30, 64, 10], [30, 64, 10], [30, 64, 10], [31, 58, 10]]


### 디코더 입력 데이터 정수 인코딩

In [8]:
# Integer-encode every target sentence (start and end symbols included)
# character by character.
decoder_input = [
    [tar_to_index[char] for char in line]
    for line in lines.tar
]

print("디코더 입력 데이터 정수 인코딩 결과: ", decoder_input[:5])

디코더 입력 데이터 정수 인코딩 결과:  [[1, 3, 48, 52, 3, 4, 3, 2], [1, 3, 39, 52, 69, 54, 59, 56, 14, 3, 2], [1, 3, 31, 65, 3, 69, 66, 72, 71, 56, 3, 4, 3, 2], [1, 3, 28, 66, 72, 58, 56, 3, 4, 3, 2], [1, 3, 45, 52, 63, 72, 71, 3, 4, 3, 2]]


### 정답 데이터 정수 인코딩

In [9]:
# Integer-encode the expected decoder outputs: each target sentence with its
# first character (time step 0) skipped, so the sequence is the decoder input
# shifted one step ahead.
decoder_target = [
    [tar_to_index[char] for char in line[1:]]
    for line in lines.tar
]

print("정답 데이터 정수 인코딩 결과: ", decoder_target[:5])

정답 데이터 정수 인코딩 결과:  [[3, 48, 52, 3, 4, 3, 2], [3, 39, 52, 69, 54, 59, 56, 14, 3, 2], [3, 31, 65, 3, 69, 66, 72, 71, 56, 3, 4, 3, 2], [3, 28, 66, 72, 58, 56, 3, 4, 3, 2], [3, 45, 52, 63, 72, 71, 3, 4, 3, 2]]


### 패딩

In [10]:
# Longest source / target sentence, measured in characters.
max_src_len = max(len(line) for line in lines.src)
max_tar_len = max(len(line) for line in lines.tar)

print("Source 최대 길이: ", max_src_len)
print("Target 최대 길이: ", max_tar_len)

# Right-pad every sequence with zeros up to the maximum length. The sequences
# hold plain integer indices, so the default integer dtype is used instead of
# `object`, which would store each value as a separate boxed Python object.
# decoder_target is padded to max_tar_len as well, so that it has the same
# number of time steps as decoder_input.
encoder_input = pad_sequences(encoder_input, maxlen=max_src_len, padding="post")
decoder_input = pad_sequences(decoder_input, maxlen=max_tar_len, padding="post")
decoder_target = pad_sequences(decoder_target, maxlen=max_tar_len, padding="post")

Source 최대 길이:  22
Target 최대 길이:  76


### 원-핫 인코딩

In [11]:
# One-hot encode the padded index sequences. The number of classes is passed
# explicitly so that the last axis always equals the vocabulary size (which
# already counts the padding index 0) instead of being inferred from the
# largest index that happens to be present in the data.
encoder_input = to_categorical(encoder_input, num_classes=src_vocab_size)
decoder_input = to_categorical(decoder_input, num_classes=tar_vocab_size)
decoder_target = to_categorical(decoder_target, num_classes=tar_vocab_size)

## 2. 번역기 설계

### 인코더 설계

In [12]:
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.models import Model
import numpy as np

# Training-time encoder: consumes one-hot encoded source characters of any
# sequence length.
encoder_inputs = Input(shape=(None, src_vocab_size))
encoder_lstm = LSTM(units=256, return_state=True)

# With return_state=True the layer call yields the output followed by the
# hidden state and the cell state; the two states are gathered into a list.
encoder_outputs, *encoder_states = encoder_lstm(encoder_inputs)
state_h, state_c = encoder_states

2023-04-21 05:19:22.461810: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-04-21 05:19:22.472786: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-04-21 05:19:22.474428: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-04-21 05:19:22.476927: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild

### 디코더 설계

In [13]:
# Training-time decoder: consumes one-hot encoded target characters of any
# sequence length.
decoder_inputs = Input(shape = (None, tar_vocab_size))
# return_sequences = True makes the LSTM emit an output at every time step;
# return_state = True additionally returns its final hidden and cell states.
decoder_lstm = LSTM(units = 256, return_sequences = True, return_state = True)

# Hand the encoder's final states to the decoder as its initial state.
# The decoder's own returned states are not needed during training.
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state = encoder_states)

# Per-time-step softmax over the target vocabulary.
decoder_softmax_layer = Dense(tar_vocab_size, activation = "softmax")
decoder_outputs = decoder_softmax_layer(decoder_outputs)

# Full training model: (encoder input, decoder input) -> decoder output.
model = Model(
    inputs = [encoder_inputs, decoder_inputs],
    outputs = decoder_outputs
)
model.compile(optimizer = "rmsprop", loss = "categorical_crossentropy")

### 번역기 학습

In [14]:
# Train the model: the decoder is fed the target sequence and is scored
# against decoder_target, the same sequence shifted one step ahead. The last
# 20% of the samples are held out for validation.
model.fit(
    [encoder_input, decoder_input],
    decoder_target,
    epochs=40,
    batch_size=64,
    validation_split=0.2,
)

Epoch 1/40


2023-04-21 05:19:30.073158: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:428] Loaded cuDNN version 8200
2023-04-21 05:19:30.977484: I tensorflow/compiler/xla/service/service.cc:173] XLA service 0x7f55d00cc2e0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-04-21 05:19:30.977531: I tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (0): Tesla T4, Compute Capability 7.5
2023-04-21 05:19:30.983811: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-04-21 05:19:31.110094: I tensorflow/compiler/jit/xla_compilation_cache.cc:477] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x7f577fbce9d0>

## 3. 번역기 실행

### 추론용 인코더 설계

In [15]:
# Inference-time encoder: maps an encoded source sentence to the LSTM's final
# [hidden state, cell state] pair, reusing the tensors of the trained graph.
encoder_model = Model(encoder_inputs, encoder_states)

### 추론용 디코더 설계

In [16]:
# Inference-time decoder. The states are fed in from outside as explicit
# inputs; their size (256) matches the number of units of decoder_lstm.
decoder_state_input_h = Input(shape = (256, ))
decoder_state_input_c = Input(shape = (256, ))
decoder_state_inputs = [decoder_state_input_h, decoder_state_input_c]

# Reuse the trained decoder LSTM, this time keeping the states it returns so
# they can be passed back in on the next decoding step.
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs,
    initial_state = decoder_state_inputs
)

decoder_states = [state_h, state_c]
# Reuse the trained softmax layer to turn LSTM outputs into character
# probabilities.
decoder_outputs = decoder_softmax_layer(decoder_outputs)
# Inputs: [decoder input, hidden state, cell state].
# Outputs: [character probabilities, new hidden state, new cell state].
decoder_model = Model(
    inputs = [decoder_inputs] + decoder_state_inputs,
    outputs = [decoder_outputs] + decoder_states
)

### 인덱스를 문자로 변환하는 딕셔너리 정의

In [17]:
# Reverse lookups: integer index -> character, for the source and target sides.
index_to_src = {idx: char for char, idx in src_to_index.items()}
index_to_tar = {idx: char for char, idx in tar_to_index.items()}

In [18]:
### 번역을 수행하는 함수 정의

In [23]:
def translate_sequence(input_seq):
    """Greedily decode a translation for an encoded source sentence.

    The encoder's final states seed the decoder, which is then run one
    character at a time; at each step the most probable character is taken
    and fed back in as the next decoder input.

    Args:
        input_seq: Encoder input, passed unchanged to
            ``encoder_model.predict``. The decoding loop works on a batch of
            one, so this is presumably a single one-hot encoded sentence --
            confirm against the caller.

    Returns:
        The decoded characters joined into one string. If decoding ended on
        the newline end symbol, that symbol is the last character of the
        result. Decoding also ends once more than ``max_tar_len`` characters
        have been produced, or when the decoder selects an index that has no
        character assigned (index 0, the padding value).
    """
    # Run the encoder once to obtain the initial decoder states.
    # verbose=0 suppresses the per-call progress output of predict(), which
    # would otherwise be printed for every decoded character.
    states_value = encoder_model.predict(input_seq, verbose=0)

    # One-hot decoder input of shape (batch, time steps, vocabulary size)
    # = (1, 1, tar_vocab_size), initialised to the start symbol "\t".
    target_seq = np.zeros((1, 1, tar_vocab_size))
    target_seq[0, 0, tar_to_index["\t"]] = 1

    decoded_chars = []

    # At most max_tar_len + 1 characters are produced.
    while len(decoded_chars) <= max_tar_len:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value, verbose=0
        )

        # Greedy choice: the most probable character of the single time step.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))

        # Character indices start at 1; index 0 is the padding class and has
        # no entry in index_to_tar. Treat it as the end of the sentence
        # instead of failing with a KeyError.
        sampled_char = index_to_tar.get(sampled_token_index)
        if sampled_char is None:
            break

        decoded_chars.append(sampled_char)

        if sampled_char == "\n":
            break

        # Feed the chosen character and the updated states into the next step.
        target_seq = np.zeros((1, 1, tar_vocab_size))
        target_seq[0, 0, sampled_token_index] = 1
        states_value = [h, c]

    return "".join(decoded_chars)

In [24]:
# Translate a handful of training sentences (rows 2, 4, 8, ..., 1024) and
# print each result next to its source sentence and reference translation.
for seq_index in [2 ** exponent for exponent in range(1, 11)]:
    # Slice instead of plain indexing so the leading batch axis of size 1 is kept.
    input_seq = encoder_input[seq_index:seq_index + 1]
    translated_sentence = translate_sequence(input_seq)

    print("-" * 30)
    print("입력 문장: ", lines.src[seq_index])
    print("정답 문장: ", lines.tar[seq_index])
    # Print without the first and the last character (the last character is
    # the end symbol '\n' when decoding terminated on it).
    print("번역 문장: ", translated_sentence[1:-1])

------------------------------
입력 문장:  Go.
정답 문장:  	 En route ! 

번역 문장:  Décampe ! 
------------------------------
입력 문장:  Hi.
정답 문장:  	 Salut ! 

번역 문장:  Fais-le ! 
------------------------------
입력 문장:  Run!
정답 문장:  	 Prenez vos jambes à vos cous ! 

번역 문장:  Cours ! 
------------------------------
입력 문장:  Run.
정답 문장:  	 Prenez vos jambes à vos cous ! 

번역 문장:  Courrez ! 
------------------------------
입력 문장:  Hide.
정답 문장:  	 Cachez-vous. 

번역 문장:  Cale ! 
------------------------------
입력 문장:  Relax.
정답 문장:  	 Détends-toi ! 

번역 문장:  Commence à l'aide. 
------------------------------
입력 문장:  No way!
정답 문장:  	 C'est pas possible ! 

번역 문장:  Aucun ! 
------------------------------
입력 문장:  Get out.
정답 문장:  	 Sortez ! 

번역 문장:  Décampe ! 
------------------------------
입력 문장:  Get away!
정답 문장:  	 Déguerpissez. 

번역 문장:  Décampe ! 
------------------------------
입력 문장:  Hold this.
정답 문장:  	 Tenez ceci ! 

번역 문장:  Faites attention. 
