In [4]:
# 7-3
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = "2"
import tensorflow as tf
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

In [5]:
# 7-4
# Seed both TensorFlow and NumPy so repeated runs draw the same random numbers.
tf.random.set_seed(22)
np.random.seed(22)
# Fail fast when the installed TensorFlow is not a 2.x release.
assert tf.__version__.startswith('2.')

# Hyperparameters shared by the cells below.
# batch_size: rows per batch when the datasets are batched.
batch_size = 128
# total_words: vocabulary limit passed to imdb.load_data and the Embedding layer.
total_words = 10000
# max_review_len: length every review is padded/truncated to.
max_review_len = 80
# embedding_len: size of each word embedding vector.
embedding_len = 100

In [6]:
# 7-5
# Load the IMDB reviews as integer sequences, limited to `total_words` word ids.
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.imdb.load_data(
    num_words=total_words
)

# Bring every review to exactly `max_review_len` ids.
x_train = tf.keras.preprocessing.sequence.pad_sequences(x_train, maxlen=max_review_len)
x_test = tf.keras.preprocessing.sequence.pad_sequences(x_test, maxlen=max_review_len)

# Training pipeline: shuffle, then cut into full batches only (the last partial
# batch is dropped so every batch has exactly `batch_size` rows).
train_data = (
    tf.data.Dataset.from_tensor_slices((x_train, y_train))
    .shuffle(10000)
    .batch(batch_size, drop_remainder=True)
)
# Test pipeline: same batching, no shuffling.
test_data = (
    tf.data.Dataset.from_tensor_slices((x_test, y_test))
    .batch(batch_size, drop_remainder=True)
)

# Sanity checks: array shapes and the label range.
print('x_train_shape:', x_train.shape, tf.reduce_max(y_train), tf.reduce_min(y_train))
print('x_test_shape:', x_test.shape)

# Peek at the first test batch to confirm the batched input shape.
sample = next(iter(test_data))
print(sample[0].shape)

x_train_shape: (25000, 80) tf.Tensor(1, shape=(), dtype=int64) tf.Tensor(0, shape=(), dtype=int64)
x_test_shape: (25000, 80)
(128, 80)


In [7]:
# 7-6
class RNN_Build(tf.keras.Model):
    """Two stacked SimpleRNN cells followed by a single-unit sigmoid classifier.

    Word ids are embedded, the sequence is unrolled step by step through two
    ``SimpleRNNCell`` layers, and the second cell's output at the final time step
    is mapped to one probability per input row.

    The initial hidden states are zero tensors sized from the incoming batch on
    every call, so the model accepts any batch size (including a single review
    passed to ``predict``) instead of only batches of exactly ``batch_size`` rows.

    Args:
        units: Width of the hidden state of each recurrent cell.
    """

    def __init__(self, units):
        super(RNN_Build, self).__init__()

        # Remembered so call() can size the zero initial states.
        self.units = units
        self.embedding = tf.keras.layers.Embedding(total_words, embedding_len, input_length=max_review_len)

        self.RNNCell0 = tf.keras.layers.SimpleRNNCell(units, dropout=0.2)
        self.RNNCell1 = tf.keras.layers.SimpleRNNCell(units, dropout=0.2)
        self.outlayer = tf.keras.layers.Dense(1)

    def call(self, inputs, training=None):
        """Run the forward pass.

        Args:
            inputs: Integer word ids; the time axis (axis 1) must have a static
                length because the sequence is unrolled with ``tf.unstack``.
            training: Forwarded to both recurrent cells (controls their dropout).

        Returns:
            Sigmoid probabilities, one value per input row.
        """
        x = self.embedding(inputs)

        # Size the zero states from the actual batch dimension rather than a
        # fixed global, so partial batches and single-sample inference work.
        rows = tf.shape(x)[0]
        state0 = [tf.zeros([rows, self.units], dtype=x.dtype)]
        state1 = [tf.zeros([rows, self.units], dtype=x.dtype)]

        # Unroll over time: each step feeds cell 0, whose output feeds cell 1.
        for word in tf.unstack(x, axis=1):
            out0, state0 = self.RNNCell0(word, state0, training=training)
            out1, state1 = self.RNNCell1(out0, state1, training=training)

        # Only the last step's output of the top cell is classified.
        x = self.outlayer(out1)
        prob = tf.sigmoid(x)
        return prob

In [8]:
# 7-7
import time
# Hidden-state width handed to RNN_Build, and the number of training epochs.
units = 64
epochs = 4
# Start of the wall-clock measurement that is reported after evaluation.
t0 = time.time()

model = RNN_Build(units)
# BinaryCrossentropy is used with its defaults; RNN_Build.call already applies a
# sigmoid to its output.
# NOTE(review): experimental_run_tf_function is a legacy compile flag; newer
# Keras releases may ignore or reject it - confirm against the installed version.
model.compile(optimizer=tf.keras.optimizers.Adam(0.001),
              loss=tf.losses.BinaryCrossentropy(),
              metrics=['accuracy'],
              experimental_run_tf_function=False)

# Train on the batched training set; the test set doubles as validation data,
# with validation_freq=2.
model.fit(train_data, epochs=epochs, validation_data=test_data, validation_freq=2)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.src.callbacks.History at 0x7a0c519b5720>

In [9]:
# 7-8
# Report loss/accuracy on the training set (the printed Korean label reads
# "Evaluating training dataset...").
print("훈련 데이터셋 평가...")
(loss, accuracy) = model.evaluate(train_data, verbose=0)
print("loss={:.4f}, accuracy: {:.4f}%".format(loss,accuracy * 100))
# Same report for the test set ("Evaluating test dataset...").
print("테스트 데이터셋 평가...")
(loss, accuracy) = model.evaluate(test_data, verbose=0)
print("loss={:.4f}, accuracy: {:.4f}%".format(loss,accuracy * 100))
# Seconds elapsed since t0 was taken, i.e. model creation, training and both
# evaluations (the label reads "time:").
t1 = time.time()
print('시간:', t1-t0)

훈련 데이터셋 평가...
loss=0.0320, accuracy: 98.9423%
테스트 데이터셋 평가...
loss=0.7063, accuracy: 80.9936%
시간: 109.89191699028015


In [11]:
# Show the first training review as stored after padding: a row of integer word ids.
print(x_train[0])

[  15  256    4    2    7 3766    5  723   36   71   43  530  476   26
  400  317   46    7    4    2 1029   13  104   88    4  381   15  297
   98   32 2071   56   26  141    6  194 7486   18    4  226   22   21
  134  476   26  480    5  144   30 5535   18   51   36   28  224   92
   25  104    4  226   65   16   38 1334   88   12   16  283    5   16
 4472  113  103   32   15   16 5345   19  178   32]


In [18]:
import re
from keras.preprocessing.sequence import pad_sequences

def sentiment_predict(new_sentence):
    """Print the sentiment the trained model predicts for a raw review string.

    The text is reduced to ASCII letters, digits and spaces, lower-cased, split
    on whitespace and encoded with the IMDB word index. Each word's rank is
    shifted by 3 (the same offset ``decode_review`` undoes), and any word that
    is missing from the index or whose shifted id does not fit in the vocabulary
    is replaced by id 2, the unknown-word id. The encoded review is padded to
    ``max_review_len`` and fed to the global ``model``.

    Args:
        new_sentence: Review text to classify.

    Returns:
        None. The verdict and its confidence are printed.
    """
    # Strip everything except letters, digits and spaces, then lower-case.
    new_sentence = re.sub('[^0-9a-zA-Z ]', '', new_sentence).lower()

    # Whitespace tokenisation followed by integer encoding.
    word_to_index = tf.keras.datasets.imdb.get_word_index()
    encoded = []
    for word in new_sentence.split():
        rank = word_to_index.get(word)
        # The id fed to the model is rank + 3, and it must stay below
        # total_words: the Embedding table has exactly total_words rows and the
        # training data was loaded with num_words=total_words. Comparing the
        # unshifted rank against the limit would let out-of-range ids through.
        if rank is not None and rank + 3 < total_words:
            encoded.append(rank + 3)
        else:
            # Out-of-vocabulary or unseen word -> unknown-word id.
            encoded.append(2)

    pad_sequence = pad_sequences([encoded], maxlen=max_review_len)
    # The model emits one probability per row in a trailing axis of size 1;
    # index down to the scalar explicitly, since float() on a non-scalar array
    # is deprecated in recent NumPy releases.
    score = float(model.predict(pad_sequence)[0][0])  # prediction
    if score > 0.5:
        print("{:.2f}% 확률로 긍정 리뷰입니다.".format(score * 100))
    else:
        print("{:.2f}% 확률로 부정 리뷰입니다.".format((1 - score) * 100))

In [23]:
# Swap the keys and values of the dictionary returned by imdb.get_word_index()
# and keep the result in reverse_word_index, so that a review stored as indices
# can be turned back into text and handed to sentiment_predict().

word_index = tf.keras.datasets.imdb.get_word_index()  # fetch the word index
reverse_word_index = {value: key for key, value in word_index.items()}  # inverted: value -> key

def decode_review(index_list):
    """Turn a sequence of encoded word ids back into a space-separated string.

    Every id is shifted down by 3 before it is looked up in
    ``reverse_word_index``. Ids with no entry after the shift are rendered as
    ','; the original note names these as the padding, start-of-sentence and
    out-of-dictionary ids.
    """
    words = []
    for token_id in index_list:
        words.append(reverse_word_index.get(token_id - 3, ','))
    return ' '.join(words)

positive_index = 0 # index used as the positive review example
negative_index = 1 # index used as the negative review example

# Decode the reviews at those indices back into text
positive_review = decode_review(x_train[positive_index])
negative_review = decode_review(x_train[negative_index])


# Run both decoded reviews through the text prediction path.
sentiment_predict(positive_review)
sentiment_predict(negative_review)

99.62% 확률로 긍정 리뷰입니다.
97.71% 확률로 부정 리뷰입니다.
