In [1]:
from IPython.display import display, HTML
display(HTML("""
<style>
div.container{width:90% !important;}
div.cell.code_cell.rendered{width:100%;}
div.input_prompt{padding:2px;}
div.CodeMirror {font-family:Consolas; font-size:10pt;}
div.text_cell_render.rendered_html{font-size:10pt;}
div.output {font-size:10pt; font-weight:bold;}
div.input {font-family:Consolas; font-size:10pt;}
div.prompt {min-width:70px;}
div#toc-wrapper{padding-top:120px;}
div.text_cell_render ul li{font-size:11pt;padding:4px;}
table.dataframe{font-size:10px;}
</style>
"""))

**<font size='6' color='red'>ch9. Transformers</font>**
- 인코더 층만으로 구현(입력:자연어, 출력:긍정/부정)
## 1. 패키지

In [2]:
import numpy as np
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from time import time # 70.01.01 부터 현재까지의 밀리세컨

from sklearn.metrics import confusion_matrix,f1_score,precision_score, recall_score

## 2. 하이퍼 파라미터 설정(이 파라미터를 바꾸면 정확도나 학습 속도에 차이남)

In [3]:
MY_WORDS = 10000  # imdb 데이터의 단어수
MY_LENGTH = 80  # 영화평 단어수 80개만 독립변수
MY_EMBED = 32   # embading layer의 결과 차원
MY_HIDDEN = 64  # LSTM의 units 차원
MY_EPOCH = 10  # 학습 수
MY_BATCH = 200 # batch_size(fit시 매번 데이터를 가져오는 데이터)

## 3. 데이터 불러오기

In [4]:
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=MY_WORDS)

## 4. 문자단어 -> 정수

In [5]:
word_to_id = imdb.get_word_index()  # {'word':id}

# 정수 -> 문자 단어
id_to_word = {}  # {1:'the', 3:'a', 16816:'sonja'}
for word, value in word_to_id.items():
    id_to_word[value] = word
print(id_to_word[1])
print(id_to_word[3])
print(id_to_word[2])

the
a
and


## 5.숫자 영화평 -> 자연어 영화평 return 함수

In [6]:
def decoding(review_num):
    decoded = [id_to_word.get(num-3, '???') for num in review_num]
    return ' '.join(decoded)

In [10]:
print(decoding(x_train[1]), y_train[1])

??? big hair big boobs bad music and a giant safety pin these are the words to best describe this terrible movie i love cheesy horror movies and i've seen hundreds but this had got to be on of the worst ever made the plot is paper thin and ridiculous the acting is an abomination the script is completely laughable the best is the end showdown with the cop and how he worked out who the killer is it's just so damn terribly written the clothes are sickening and funny in equal ??? the hair is big lots of boobs ??? men wear those cut ??? shirts that show off their ??? sickening that men actually wore them and the music is just ??? trash that plays over and over again in almost every scene there is trashy music boobs and ??? taking away bodies and the gym still doesn't close for ??? all joking aside this is a truly bad film whose only charm is to look back on the disaster that was the 80's and have a good old laugh at how bad everything was back then 0


## 6. 영화평(입력변수)의 길이

In [7]:
def show_length(x_train):
    print('첫 20개 영화평 길이')
    print([len(x_data) for x_data in x_train[:20]])

In [8]:
# pad_sequence
show_length(x_train)

첫 20개 영화평 길이
[218, 189, 141, 550, 147, 43, 123, 562, 233, 130, 450, 99, 117, 238, 109, 129, 163, 752, 212, 177]


## 7. 모든 영화평 길이를 동일하게 (80)

In [9]:
X_train = pad_sequences(x_train,
                       padding='pre',
                       truncating='pre',  # 뒷부분을 짜르고 앞 부분을 남김 / 'pre' : 앞 부분을 짜르고 뒷 부분을 남김
                       maxlen=MY_LENGTH)
X_test = pad_sequences(x_test,
                      padding='pre',
                      truncating='pre',
                      maxlen=MY_LENGTH)
show_length(X_train), show_length(X_test)

첫 20개 영화평 길이
[80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80]
첫 20개 영화평 길이
[80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80]


(None, None)

## 8. 최종 데이터 shape확인

In [10]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((25000, 80), (25000,), (25000, 80), (25000,))

## 9. 모델  생성(LSTM)

In [18]:
model = Sequential()
model.add(Embedding(input_dim=MY_WORDS,  # 10000
                   output_dim=MY_EMBED,  # 32
                   input_length=MY_LENGTH,  # 80
                   ))
# RNN : 입력 단어의 길이 수가 너무 길면 파라미터 업데이트 안 됨
# 개선모델 1. LSTM | 개선모델 2. GRU
model.add(LSTM(units=MY_HIDDEN,
              input_shape=(MY_LENGTH, MY_EMBED)))
model.add(Dense(units=1, activation='sigmoid'))
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 80, 32)            320000    
                                                                 
 lstm (LSTM)                 (None, 64)                24832     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 344,897
Trainable params: 344,897
Non-trainable params: 0
_________________________________________________________________


## 9. 모델 생성(Transformers)

In [11]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential, Model

INPUTS = layers.Input(shape=(MY_LENGTH,)) # 80

INPUT_EMBEDDING = layers.Embedding(input_dim=MY_WORDS, # 10000
                                    output_dim=MY_EMBED)(INPUTS)# 32
# Positional Encoding
POSITIONS = tf.range(start=0,
                    limit=MY_LENGTH)
POS_ENCODING = layers.Embedding(input_dim=MY_LENGTH, output_dim=MY_EMBED)(POSITIONS)
POS_ENC_OUTPUT = POS_ENCODING + INPUT_EMBEDDING

ATTENTION_OUTPUT = layers.MultiHeadAttention(num_heads=3,
                                            key_dim=MY_EMBED)(POS_ENC_OUTPUT,
                                            POS_ENC_OUTPUT)
X = layers.add([POS_ENC_OUTPUT, ATTENTION_OUTPUT])
X = layers.BatchNormalization()(X)

# FeedForward Network
FFN = Sequential([layers.Dense(MY_HIDDEN, activation="relu"),
                                layers.Dense(MY_EMBED, activation="relu")])(X)
X = layers.add([FFN, X])
X = layers.BatchNormalization()(X)

# 하나의 벡터로 압축해서 Dense로 보내줌
X = layers.GlobalAveragePooling1D()(X)
X = layers.Dropout(0.1)(X)

X = layers.Dense(MY_HIDDEN, activation="relu")(X)
X = layers.Dropout(0.1)(X)

OUTPUTS = layers.Dense(2, activation="softmax")(X)
model = Model(inputs=INPUTS, outputs=OUTPUTS)

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 80)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 80, 32)       320000      ['input_1[0][0]']                
                                                                                                  
 tf.__operators__.add (TFOpLamb  (None, 80, 32)      0           ['embedding[0][0]']              
 da)                                                                                              
                                                                                                  
 multi_head_attention (MultiHea  (None, 80, 32)      12608       ['tf.__operators__.add[0][0]'

## 10. 학습환경 설정 및 학습하기

In [14]:
model.compile(#loss='binary_crossentropy',  # 이진분류시 손실함수
             loss='sparse_categorical_crossentropy',
             optimizer='adam',
             metrics=['acc'])

begin = time() # 70.1.1부터 현재까지의 세컨

hist = model.fit(X_train, y_train,
                epochs=MY_EPOCH,
                batch_size=MY_BATCH,
                validation_split=0.2,
                verbose=1)
end = time()
print('총 학습 시간 : ', (end-begin))

1750733893.0996554
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
총 학습 시간 :  156.19479894638062


In [20]:
hist.history.keys()

NameError: name 'hist' is not defined

In [None]:
import matplotlib.pyplot as plt
fig, loss_ax = plt.subplots(figsize=(12,6))
loss_ax.plot(hist.history['loss'], 'y', label='train loss')
loss_ax.plot(hist.history['val_loss'], 'r', label='val loss')
acc_ax = loss_ax.twinx()
acc_ax.plot(hist.history['acc'], 'g', label='train accuracy')
acc_ax.plot(hist.history['val_acc'], 'b', label='val accuracy')
loss_ax.set_xlabel('epochs')
loss_ax.set_ylabel('loss')
acc_ax.set_ylabel('accuracy')
loss_ax.legend(loc='center right')
acc_ax.legend(loc='upper left')
plt.show()

## 11. 모델 평가

In [15]:
# pad_sequences 'post' : 75.6%
# pad_sequences 'pre' : 80.1%
loss, acc = model.evaluate(X_test, y_test)
print('정확도 : ', acc)

정확도 :  0.7961999773979187


In [None]:
# 혼동행렬, recall, precision을 위한 yhat
yhat = (model.predict(X_test, verbose=0) > 0.5).astype(np.int16).reshape(-1)
yhat

In [None]:
# 혼동행렬
confusion_matrix(y_test, yhat)

In [None]:
# recall ( 실제 True인 것 중 True로 예측한 비율) 9853 /  ( 2647+9853)
recall_score(y_test, yhat)

In [None]:
# precision(True로 예측한 것 중 실제값이 True인 비율) 9853 / (2307+9853)
precision_score(y_test, yhat)

## 12. 모델 사용하기

In [39]:
review = """This was genuinely one of those rare films where I completely lost track of time in the theater. 
Initially, I expected it to follow a predictable storyline, 
but from the midpoint onward, the plot kept subverting my expectations, keeping me thoroughly engaged until the very end.
The performances by the lead actors were particularly impressive. 
They managed to convey even the most subtle emotional nuances with remarkable naturalness. 
The cinematography and soundtrack also complemented the story perfectly, significantly enhancing the overall immersive experience.
What I appreciated most was how the film lingered in my thoughts long after leaving the theater. 
It sparked plenty of conversations with friends, and I found myself wanting to watch it again – that's how satisfying the experience was.
It's the kind of movie that works both as light entertainment and as something that gives you plenty to think about. 
I'd definitely recommend it to a wide audience!~!@#$%^&
""".lower()
import re
review = re.sub('[^a-zA-Z\'\s]', '', review)
#print('영화평(특수문자 제외) : ', review)
review = review.split()  # 단어 list
review = [1] + [word_to_id.get(word, -1)+3 if word_to_id.get(word, -1) < 10000 else '???' for word in review]
print(review, len(review))

[1, 14, 16, 2070, 31, 7, 148, 1281, 108, 121, 13, 340, 416, 1406, 7, 58, 11, 4, 750, 2721, 13, 873, 12, 8, 794, 6, 727, 769, 21, 39, 4, '???', '???', 4, 114, 828, '???', 61, 1398, 1895, 72, 1562, 3953, 366, 4, 55, 130, 4, 354, 34, 4, 485, 156, 71, 572, 1159, 36, 1319, 8, 2833, 60, 4, 91, 1302, 921, 6902, 19, 1739, '???', 4, 627, 5, 816, 82, '???', 4, 65, 950, 8703, '???', 4, 444, '???', 585, 51, 13, 2525, 91, 16, 89, 4, 22, '???', 11, 61, 2334, 196, 103, 1200, 4, 750, 12, '???', 958, 7, 3958, 19, 369, 5, 13, 258, 546, 1786, 8, 106, 12, 174, 198, 89, 2349, 4, 585, 16, 45, 4, 243, 7, 20, 15, 495, 199, 17, 641, 722, 5, 17, 142, 15, 408, 25, 958, 8, 104, 44, 474, 407, 386, 12, 8, 6, 1876, 311] 152


In [40]:
input_data = pad_sequences([review],
                      padding='pre',
                      truncating='pre',
                      maxlen=MY_LENGTH)
input_data

ValueError: invalid literal for int() with base 10: '???'

In [41]:
result = (model.predict(input_data) > 0.5).astype('int8').reshape(-1)
result



array([0], dtype=int8)

In [36]:
review = """The movie was really exciting, 
and I watched it throughout the movie hoping it wouldn't end without 
watching the clock. I strongly recommend it. 
It was even more interesting thinking that it could happen in real life. 
The main character was handsome and acted well, so my eyes were happy, 
and the story was fun @_@ㅠ.ㅠ""".lower()
import re
review = re.sub('[^a-zA-Z\'\s]', '', review)
review = review.split() # 단어 list
review = [1] + [word_to_id.get(word, -1)+3 for word in review]
print(review, len(review))

[1, 4, 20, 16, 66, 1127, 5, 13, 296, 12, 469, 4, 20, 1383, 12, 586, 130, 209, 149, 4, 5431, 13, 2303, 386, 12, 12, 16, 60, 53, 221, 536, 15, 12, 100, 593, 11, 147, 113, 4, 293, 109, 16, 2252, 5, 917, 73, 38, 61, 523, 71, 654, 5, 4, 65, 16, 253] 56


In [37]:
input_data = pad_sequences([review],
                      padding='pre',
                      truncating='pre',
                      maxlen=MY_LENGTH)
input_data

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    1,    4,   20,   16,   66, 1127,    5,   13,  296,
          12,  469,    4,   20, 1383,   12,  586,  130,  209,  149,    4,
        5431,   13, 2303,  386,   12,   12,   16,   60,   53,  221,  536,
          15,   12,  100,  593,   11,  147,  113,    4,  293,  109,   16,
        2252,    5,  917,   73,   38,   61,  523,   71,  654,    5,    4,
          65,   16,  253]])

In [38]:
result = (model.predict(input_data)>0.5).astype('int8').reshape(-1)
result



array([0], dtype=int8)