# 1. 단어의 토큰화

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
# One-sentence corpus used to demonstrate word-level tokenization.
paper = ['많은 것을 바꾸고 싶다면 많은 것을 받아들여라.']
tknz = Tokenizer()
tknz.fit_on_texts(paper)      # fit the tokenizer on the list of sentences
print(tknz.word_index)        # show the integer index assigned to each token
print(tknz.word_counts)       # show how many times each token occurred

{'많은': 1, '것을': 2, '바꾸고': 3, '싶다면': 4, '받아들여라': 5}
OrderedDict([('많은', 2), ('것을', 2), ('바꾸고', 1), ('싶다면', 1), ('받아들여라', 1)])


# 2. 단어를 벡터로 변환

In [3]:
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
# One-sentence corpus used to demonstrate one-hot encoding of words.
paper = ['많은 것을 바꾸고 싶다면 많은 것을 받아들여라.']
tknz = Tokenizer()
tknz.fit_on_texts(paper)      # fit the tokenizer on the list of sentences

# Replace every word of each sentence with its integer index.
idx_paper = tknz.texts_to_sequences(paper)
print(idx_paper)
# Number of one-hot classes: the word indices start at 1, so one extra
# slot is reserved for index 0.
n = len(tknz.word_index) + 1
print(n)
# Turn every word index into a one-hot vector of length n.
idx_onehot = to_categorical(idx_paper, num_classes=n)
print(idx_onehot)

[[1, 2, 3, 4, 1, 2, 5]]
6
[[[0. 1. 0. 0. 0. 0.]
  [0. 0. 1. 0. 0. 0.]
  [0. 0. 0. 1. 0. 0.]
  [0. 0. 0. 0. 1. 0.]
  [0. 1. 0. 0. 0. 0.]
  [0. 0. 1. 0. 0. 0.]
  [0. 0. 0. 0. 0. 1.]]]


# 3. 단어 임베딩

In [4]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding

import numpy as np

# Minimal model made of a single Embedding layer that maps each of the `n`
# word indices to a 3-dimensional dense vector.
model = Sequential()
model.add(Embedding(input_dim=n, output_dim=3))
model.compile(optimizer='rmsprop', loss='mse')
# The model is never fitted, so the printed vectors are just the layer's
# initial weights looked up for each word index.
# Pass an explicit integer array instead of a nested Python list: a
# top-level list can be interpreted as a list of separate model inputs,
# and newer Keras releases may not accept plain lists as input data.
embedding = model.predict(np.array(idx_paper))
print(embedding)

[[[-0.02841144  0.01940265 -0.02723807]
  [-0.02251455  0.01084784 -0.02967387]
  [ 0.00619539 -0.0328768  -0.04546695]
  [-0.00840687 -0.03625091  0.00746967]
  [-0.02841144  0.01940265 -0.02723807]
  [-0.02251455  0.01084784 -0.02967387]
  [-0.00701772  0.01069533 -0.04733161]]]


# 4. 자연어 처리 실습

### 랜덤 시드 설정

In [1]:
import numpy as np
import tensorflow as tf
# Seed both the NumPy and the TensorFlow random number generators with 0.
np.random.seed(0)
tf.random.set_seed(0)

### 모형 변수 설정

In [2]:
# Hyperparameters and data location shared by the cells below.
n_batch = 64           # batch size passed to model.fit
epochs = 100           # number of training epochs passed to model.fit
latent_dim = 256       # size of the LSTM output / state vectors
n_max_sample = 10000   # maximum number of data samples to use
data_path = './data/eng-fra/fra.txt'

### 전체 데이터 불러오기

In [3]:
# Read the whole corpus file as UTF-8 text and split it into one string per
# line (no line is stripped or filtered here).
with open(data_path, 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')

### 데이터 확인

In [4]:
# Display the first ten raw lines to inspect the tab-separated layout.
lines[:10]

['Go.\tVa !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)',
 'Hi.\tSalut !\tCC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #509819 (Aiji)',
 'Hi.\tSalut.\tCC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #4320462 (gillux)',
 'Run!\tCours\u202f!\tCC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #906331 (sacredceltic)',
 'Run!\tCourez\u202f!\tCC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #906332 (sacredceltic)',
 'Who?\tQui ?\tCC-BY 2.0 (France) Attribution: tatoeba.org #2083030 (CK) & #4366796 (gillux)',
 'Wow!\tÇa alors\u202f!\tCC-BY 2.0 (France) Attribution: tatoeba.org #52027 (Zifre) & #374631 (zmoo)',
 'Fire!\tAu feu !\tCC-BY 2.0 (France) Attribution: tatoeba.org #1829639 (Spamster) & #4627939 (sacredceltic)',
 "Help!\tÀ l'aide\u202f!\tCC-BY 2.0 (France) Attribution: tatoeba.org #435084 (lukaszpp) & #128430 (sysko)",
 'Jump.\tSaute.\tCC-BY 2.0 (France) Attribution: tatoeba.org #631038 (Shishi

### 인풋, 타깃 텍스트 데이터 정리

In [5]:
# Build the parallel lists of input and target sentences and collect the set
# of distinct characters that occur on each side.
x_txts = []
y_txts = []
x_chars_uni = set()
y_chars_uni = set()
# Cap the number of samples; `len(lines) - 1` leaves out the last element of
# `lines` (presumably the empty string after the file's trailing newline).
n_sample = min(n_max_sample, len(lines) - 1)

for line in lines[:n_sample]:
    # Only the first two tab-separated fields are needed (input sentence and
    # target sentence). Slicing instead of unpacking exactly three fields
    # keeps this working for lines with or without trailing extra columns.
    x_txt, y_txt = line.split('\t')[:2]
    # Wrap the target in '\t' ... '\n' (presumably start- and end-of-sequence
    # markers for the decoder).
    y_txt = '\t' + y_txt + '\n'
    x_txts.append(x_txt)
    y_txts.append(y_txt)

    # A set ignores duplicates, so every character can be added directly.
    x_chars_uni.update(x_txt)
    y_chars_uni.update(y_txt)

In [6]:
# Number of sentence pairs that were actually kept.
print(n_sample)

10000


### 인풋 데이터, 타깃 데이터 및 토큰 확인

In [7]:
# Inspect a few input/target sentences and the character sets gathered so far.
print(x_txts[:5])
print(y_txts[:3])
print(x_chars_uni)
print(y_chars_uni)

['Go.', 'Hi.', 'Hi.', 'Run!', 'Run!']
['\tVa !\n', '\tSalut !\n', '\tSalut.\n']
{'B', '!', 'P', 'O', 'l', 'J', '3', 'D', '1', '0', 'F', 'w', 'G', 'q', 'v', ':', 'C', 'i', '5', 'S', 'W', 'm', '$', 'k', '8', 'E', 'M', 'é', 'V', '-', 'L', 'a', 'H', 's', "'", 'p', '?', ' ', 'y', 'N', '%', '.', 'g', 'T', 'K', 'Q', 'b', 'c', '&', 'u', 'o', 'n', 'e', 'R', 'f', 'z', 'Y', 'A', 'U', '2', 'r', 'j', ',', 'h', 'x', 'd', 't', 'I', '7', '9', '6'}
{'0', ':', '8', 'k', 's', 'p', 'g', 'T', 'Q', '\n', 'u', 'n', 'e', 'U', 'ï', 'h', 'Ç', 'É', '9', 'œ', 'O', 'l', 'J', 'ë', 'D', '1', 'G', 'v', '$', 'M', 'a', '?', 'N', 'K', 'o', 'â', 'f', 'Y', 'è', 'r', 'x', 'd', 't', 'B', '(', 'À', 'û', 'F', 'q', 'i', '5', 'S', 'm', 'é', 'V', 'y', 'î', 'ô', 'A', '2', 'j', 'ù', 'I', '!', 'P', '’', '3', 'ç', ')', '»', 'ê', 'C', '\u202f', 'E', '-', 'H', "'", ' ', '%', '.', '\u2009', 'b', 'c', '&', 'Ê', 'z', 'R', '«', ',', '\t', 'L', 'à'}


### 토큰 단위 정리

In [8]:
# Turn each character set into a sorted list so that every character gets a
# stable position, then record the vocabulary size of each side.
x_chars_uni = sorted(x_chars_uni)
y_chars_uni = sorted(y_chars_uni)
n_encoder_tokens, n_decoder_tokens = len(x_chars_uni), len(y_chars_uni)

# Length of the longest sentence on each side (0 when there are no sentences).
max_encoder_seq_len = max((len(sentence) for sentence in x_txts), default=0)
max_decoder_seq_len = max((len(sentence) for sentence in y_txts), default=0)

In [9]:
# Report the vocabulary sizes and the maximum sequence lengths
# (encoder side first, then decoder side).
print("유니크 인코더 토큰 글자 수: ", n_encoder_tokens)
print("유니크 디코더 토큰 글자 수: ", n_decoder_tokens)
print("인코더 문장 내 최대 문자 수: ", max_encoder_seq_len)
print("디코더 문장 내 최대 문자 수: ", max_decoder_seq_len)

유니크 인코더 토큰 글자 수:  71
유니크 디코더 토큰 글자 수:  92
인코더 문장 내 최대 문자 수:  15
디코더 문장 내 최대 문자 수:  59


### 문자 토큰 별 인덱스

In [10]:
# Map every character to its position in the sorted character list,
# separately for the encoder (input) and decoder (target) vocabularies.
x_token_idx = {symbol: position for position, symbol in enumerate(x_chars_uni)}

y_token_idx = {symbol: position for position, symbol in enumerate(y_chars_uni)}

In [11]:
# Show the character -> index lookup tables for both vocabularies.
print(x_token_idx)
print(y_token_idx)

{' ': 0, '!': 1, '$': 2, '%': 3, '&': 4, "'": 5, ',': 6, '-': 7, '.': 8, '0': 9, '1': 10, '2': 11, '3': 12, '5': 13, '6': 14, '7': 15, '8': 16, '9': 17, ':': 18, '?': 19, 'A': 20, 'B': 21, 'C': 22, 'D': 23, 'E': 24, 'F': 25, 'G': 26, 'H': 27, 'I': 28, 'J': 29, 'K': 30, 'L': 31, 'M': 32, 'N': 33, 'O': 34, 'P': 35, 'Q': 36, 'R': 37, 'S': 38, 'T': 39, 'U': 40, 'V': 41, 'W': 42, 'Y': 43, 'a': 44, 'b': 45, 'c': 46, 'd': 47, 'e': 48, 'f': 49, 'g': 50, 'h': 51, 'i': 52, 'j': 53, 'k': 54, 'l': 55, 'm': 56, 'n': 57, 'o': 58, 'p': 59, 'q': 60, 'r': 61, 's': 62, 't': 63, 'u': 64, 'v': 65, 'w': 66, 'x': 67, 'y': 68, 'z': 69, 'é': 70}
{'\t': 0, '\n': 1, ' ': 2, '!': 3, '$': 4, '%': 5, '&': 6, "'": 7, '(': 8, ')': 9, ',': 10, '-': 11, '.': 12, '0': 13, '1': 14, '2': 15, '3': 16, '5': 17, '8': 18, '9': 19, ':': 20, '?': 21, 'A': 22, 'B': 23, 'C': 24, 'D': 25, 'E': 26, 'F': 27, 'G': 28, 'H': 29, 'I': 30, 'J': 31, 'K': 32, 'L': 33, 'M': 34, 'N': 35, 'O': 36, 'P': 37, 'Q': 38, 'R': 39, 'S': 40, 'T': 41,

### 데이터 영 행렬 만들기

In [12]:
# Allocate all-zero float32 tensors laid out as
# (sentence, time step, token index); they are filled with one-hot rows later.
encoder_x_data = np.zeros(
    shape=(len(x_txts), max_encoder_seq_len, n_encoder_tokens),
    dtype=np.float32,
)

decoder_x_data = np.zeros(
    shape=(len(x_txts), max_decoder_seq_len, n_decoder_tokens),
    dtype=np.float32,
)

# The decoder target has exactly the same shape and dtype as the decoder input.
decoder_y_data = np.zeros_like(decoder_x_data)

### 인풋 데이터 행렬

In [13]:
# One-hot encode every input sentence into encoder_x_data[i].
for i, x_txt in enumerate(x_txts):
    for t, char in enumerate(x_txt):
        encoder_x_data[i, t, x_token_idx[char]] = 1.
    # Pad the remaining time steps with the space character. The padding
    # starts at len(x_txt) rather than at the leftover loop variable `t`,
    # which would be stale (or undefined) for an empty sentence.
    encoder_x_data[i, len(x_txt):, x_token_idx[' ']] = 1.

### 타깃 데이터 행렬

In [14]:
# One-hot encode every target sentence twice:
#   decoder_x_data[i] - the decoder input, aligned with the sentence itself;
#   decoder_y_data[i] - the decoder target, the same sentence shifted one
#                       step ahead (it omits the first character).
for i, y_txt in enumerate(y_txts):
    for t, char in enumerate(y_txt):
        decoder_x_data[i, t, y_token_idx[char]] = 1.
        if t > 0:
            # Write exactly one time step. Using a slice (`t - 1:`) here
            # would also set this character on every later step, leaving
            # the targets multi-hot instead of one-hot.
            decoder_y_data[i, t - 1, y_token_idx[char]] = 1.
    # Pad the unused tail of both tensors with the space character; the
    # target is one step shorter, so its padding starts one step earlier.
    decoder_x_data[i, len(y_txt):, y_token_idx[' ']] = 1.
    decoder_y_data[i, len(y_txt) - 1:, y_token_idx[' ']] = 1.

### 인코더 모형 생성

In [15]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import TimeDistributed

# Encoder: takes sequences of one-hot vectors with n_encoder_tokens features;
# the time dimension is left unspecified (None).
encoder_inputs = Input(shape=(None, n_encoder_tokens))
# return_state=True makes the LSTM return its final hidden and cell states
# in addition to its output.
encoder = LSTM(latent_dim, return_state=True)
encoder_outs, state_h, state_c = encoder(encoder_inputs)
# Only the two final states are kept; they seed the decoder LSTM.
encoder_states = [state_h, state_c]

### 디코더 모형 생성

In [16]:
# Decoder: takes sequences of one-hot vectors with n_decoder_tokens features;
# the time dimension is left unspecified (None).
decoder_inputs = Input(shape=(None, n_decoder_tokens))
# return_sequences=True yields an output for every time step;
# return_state=True additionally exposes the final states (unused here,
# hence the two `_` below).
decoder = LSTM(latent_dim,
              return_sequences=True,
              return_state=True)
# The decoder starts from the encoder's final hidden/cell states.
decoder_outs, _, _ = decoder(decoder_inputs,
                            initial_state=encoder_states)
# Per-time-step softmax over the decoder vocabulary.
decoder_dense = TimeDistributed(Dense(n_decoder_tokens,
                                     activation='softmax'))
decoder_outputs = decoder_dense(decoder_outs)

### 인코더-디코더

In [17]:
# Training model: maps [encoder input, decoder input] to the decoder's
# per-time-step softmax output.
model = Model([encoder_inputs, decoder_inputs],
             decoder_outputs)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None, 71)]   0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, None, 92)]   0           []                               
                                                                                                  
 lstm (LSTM)                    [(None, 256),        335872      ['input_1[0][0]']                
                                 (None, 256),                                                     
                                 (None, 256)]                                                     
                                                                                              

### 모형 컴파일

In [18]:
# Categorical cross-entropy pairs with the softmax output layer and the
# one-hot encoded targets; accuracy is tracked as an extra metric.
model.compile(optimizer='rmsprop',
             loss='categorical_crossentropy',
             metrics=['accuracy'])

### 학습

In [1]:
# Train on (encoder input, decoder input) -> decoder target, with a
# validation split of 0.2.
# NOTE: this cell needs `model` and the data tensors from the earlier cells
# in the same kernel session; the recorded NameError output comes from
# running it in a fresh kernel.
model.fit([encoder_x_data, decoder_x_data], decoder_y_data,
         batch_size = n_batch,
         epochs=epochs,
         validation_split=0.2)

NameError: name 'model' is not defined

### 추론 모형 생성

In [40]:
# Inference-time encoder: maps an input sequence to the encoder's final
# [hidden, cell] states.
encoder_model = Model(encoder_inputs, encoder_states)
# Inference-time decoder: the LSTM states are explicit model inputs instead
# of being wired to the encoder.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h,
                        decoder_state_input_c]
# Reuses the existing `decoder` and `decoder_dense` layer objects.
# NOTE: this rebinds `decoder_outputs`, `state_h` and `state_c`, which were
# previously assigned while building the training graph.
decoder_outputs, state_h, state_c = decoder(decoder_inputs,
                                            initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
# Returns the per-step softmax output together with the updated states.
decoder_model = Model([decoder_inputs] + decoder_states_inputs,
                      [decoder_outputs] + decoder_states)

### 토큰 리버스 인덱스

In [42]:
# Invert the character -> index tables so that an index can be turned back
# into its character.
reverse_x_char_idx = {position: symbol for symbol, position in x_token_idx.items()}

reverse_y_char_idx = {position: symbol for symbol, position in y_token_idx.items()}

In [43]:
# Show the index -> character lookup tables for both vocabularies.
print(reverse_x_char_idx)
print(reverse_y_char_idx)

{0: ' ', 1: '!', 2: '$', 3: '%', 4: '&', 5: "'", 6: ',', 7: '-', 8: '.', 9: '0', 10: '1', 11: '2', 12: '3', 13: '5', 14: '6', 15: '7', 16: '8', 17: '9', 18: ':', 19: '?', 20: 'A', 21: 'B', 22: 'C', 23: 'D', 24: 'E', 25: 'F', 26: 'G', 27: 'H', 28: 'I', 29: 'J', 30: 'K', 31: 'L', 32: 'M', 33: 'N', 34: 'O', 35: 'P', 36: 'Q', 37: 'R', 38: 'S', 39: 'T', 40: 'U', 41: 'V', 42: 'W', 43: 'Y', 44: 'a', 45: 'b', 46: 'c', 47: 'd', 48: 'e', 49: 'f', 50: 'g', 51: 'h', 52: 'i', 53: 'j', 54: 'k', 55: 'l', 56: 'm', 57: 'n', 58: 'o', 59: 'p', 60: 'q', 61: 'r', 62: 's', 63: 't', 64: 'u', 65: 'v', 66: 'w', 67: 'x', 68: 'y', 69: 'z', 70: 'é'}
{0: '\t', 1: '\n', 2: ' ', 3: '!', 4: '$', 5: '%', 6: '&', 7: "'", 8: '(', 9: ')', 10: ',', 11: '-', 12: '.', 13: '0', 14: '1', 15: '2', 16: '3', 17: '5', 18: '8', 19: '9', 20: ':', 21: '?', 22: 'A', 23: 'B', 24: 'C', 25: 'D', 26: 'E', 27: 'F', 28: 'G', 29: 'H', 30: 'I', 31: 'J', 32: 'K', 33: 'L', 34: 'M', 35: 'N', 36: 'O', 37: 'P', 38: 'Q', 39: 'R', 40: 'S', 41: 'T',