In [1]:
# 코드 10-64 예제를 진행할 텍스트 생성
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

text = """과일 가게에 사과가 많이 진열되어 있다
그녀가 나에게 사과한 후, 우리는 친해졌다
애플은 사과 모양을 로고로 사용한다\n"""  # 텍스트 생성

2024-12-03 02:35:47.821736: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1733160947.831043  265465 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1733160947.834133  265465 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-03 02:35:47.845005: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# 코드 10-65 텍스트 토큰화
tok = Tokenizer()  # Tokenizer 객체 생성
tok.fit_on_texts([text])

vocSize = len(tok.word_index) + 1

seqs = list()
for word in text.split("\n"):
    encoded = tok.texts_to_sequences([word])[0] # 텍스트를 숫자로 변환
    for i in range(1, len(encoded)):
        seq = encoded[:i+1]
        seqs.append(seq)

maxLen = max(len(i) for i in seqs)

# 패딩을 이용하여 샘플 길이를 동일하게 지정
seqs = pad_sequences(seqs, maxlen=maxLen, padding='pre')

seqs

array([[ 0,  0,  0,  0,  1,  2],
       [ 0,  0,  0,  1,  2,  3],
       [ 0,  0,  1,  2,  3,  4],
       [ 0,  1,  2,  3,  4,  5],
       [ 1,  2,  3,  4,  5,  6],
       [ 0,  0,  0,  0,  7,  8],
       [ 0,  0,  0,  7,  8,  9],
       [ 0,  0,  7,  8,  9, 10],
       [ 0,  7,  8,  9, 10, 11],
       [ 7,  8,  9, 10, 11, 12],
       [ 0,  0,  0,  0, 13, 14],
       [ 0,  0,  0, 13, 14, 15],
       [ 0,  0, 13, 14, 15, 16],
       [ 0, 13, 14, 15, 16, 17]], dtype=int32)

In [3]:
# 코드 10-66 x 값에 대한 정의
import numpy as np

seqs = np.array(seqs)  # seqs를 배열로 변환한 후 seqs에 저장합니다.
x = seqs[:, :-1]  # 마지막 열을 제외한 모든 행과 열을 가져옵니다.

In [4]:
# 코드 10-67 y 값에 대한 정의
from tensorflow.keras.utils import to_categorical

y = seqs[:, -1]  # 모든 행과 마지막 열만 취합니다(-1은 마지막 열을 의미).
#  케라스에서 제공하는 to_categorical()을 사용하여 원-핫 인코딩을 적용합니다.
y = to_categorical(y, num_classes=vocSize)

In [5]:
# 코드 10-68 모델 생성 및 훈련
from tensorflow.keras.layers import LSTM
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Flatten, Dense
from tensorflow.keras.layers import Embedding

model = Sequential()  # 모델 생성
model.add(Embedding(vocSize, 10, input_length=maxLen - 1))
model.add(LSTM(32))
# 각 단어의 임베딩 벡터가 10차원
model.add(Dense(vocSize, activation="softmax"))
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.fit(x, y, epochs=200)

I0000 00:00:1733160949.589229  265465 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 6687 MB memory:  -> device: 0, name: NVIDIA GeForce GTX 1080, pci bus id: 0000:0a:00.0, compute capability: 6.1


Epoch 1/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step - accuracy: 0.0714 - loss: 2.8875
Epoch 2/200


I0000 00:00:1733160951.074129  265550 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - accuracy: 0.1429 - loss: 2.8858
Epoch 3/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - accuracy: 0.2143 - loss: 2.8839
Epoch 4/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - accuracy: 0.2143 - loss: 2.8821
Epoch 5/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 0.2857 - loss: 2.8803
Epoch 6/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.2143 - loss: 2.8784
Epoch 7/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.1429 - loss: 2.8764
Epoch 8/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - accuracy: 0.1429 - loss: 2.8744
Epoch 9/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - accuracy: 0.2143 - loss: 2.8724
Epoch 10/200
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

<keras.src.callbacks.history.History at 0x7f805f04d7b0>

In [6]:
# 코드 10-69 단어 예측
def sentGen(model, tok, word, n):
    sent = ""
    word2 = word
    for _ in range(n):
        encoded = tok.texts_to_sequences([])
        encoded = pad_sequences([encoded], maxlen=7, padding="pre")
        res = model.predict(encoded)
        res = np.argmax(res, axis=1)
        for w, i in tok.word_index.items():
            if i == res: # 예측 단어와 인덱스 단어가 동일한 겨우 if 문 수행
                break
        word = word + " " + w
        sent = sent + " " + w
    sent = word2 + sent
    return sent

In [7]:
# 코드 10-70 '과일' 이후의 예측 단어
print(sentGen(model, tok, "과일", 2))  # "과일" 뒤에 등장하는 단어 두 개를 예측

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
과일 나에게 나에게
