# Sentiment Classification & Embedding II

* Embedding Layer
* Sequence Model

# 01. What data we use?

In [1]:
import numpy as np
import pandas as pd

from tensorflow.keras.datasets import imdb

In [2]:
max_words = 40000
(x_train, y_train), (x_test, y_test) = imdb.load_data( num_words = max_words)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [4]:
x_train.shape, y_train.shape

((25000,), (25000,))

In [5]:
x_train[:3]

array([list([1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 22665, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 21631, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 19193, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 10311, 8, 4, 107, 117, 5952, 15, 256, 4, 31050, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 12118, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]),
       list([1, 194, 1

In [6]:
word2idx = imdb.get_word_index()
idx2word = {val:key for key, val in word2idx.items()}

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


In [7]:
len(word2idx)

88584

In [8]:
" ".join(['a', 'bb'])

'a bb'

In [9]:
idx = 0
temp = x_train[idx]
## Sequence를 문장으로 복원하기.
print(" ".join([ idx2word[id] for id in temp   ]))

the as you with out themselves powerful lets loves their becomes reaching had journalist of lot from anyone to have after out atmosphere never more room titillate it so heart shows to years of every never going villaronga help moments or of every chest visual movie except her was several of enough more with is now current film as you of mine potentially unfortunately of you than him that with out themselves her get for was camp of you movie sometimes movie that with scary but pratfalls to story wonderful that in seeing in character to of 70s musicians with heart had shadows they of here that with her serious to have does when from why what have critics they is you that isn't one will very to as itself with other tricky in of seen over landed for anyone of gilmore's br show's to whether from than out themselves history he name half some br of 'n odd was two most of mean for 1 any an boat she he should is thought frog but of script you not while history he heart to real at barrel but whe

In [10]:
## 문장을 sequence로 바꿔보기
text = "this movie was so great"
text_tokenized = text.split()
seq = [ word2idx[word] for word in text_tokenized]
seq

[11, 17, 13, 35, 84]

In [17]:
y_train[:10]

array([1, 0, 0, 1, 0, 0, 1, 0, 1, 0])

# Q1. Tokenizing & Text to Sequences

* 상위 40000개 단어 사용하기로 했음.

In [18]:
from tensorflow.keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer(num_words=40000, lower=True)

In [None]:
# Text --> Sequence



# Padding Sequence

* 문장의 최대 길이는 500으로 한다.

In [19]:
max_words = 40000
embedding_dim = 128
max_len = 500

In [20]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
x_train = pad_sequences(x_train, maxlen = max_len)
x_test = pad_sequences(x_test, maxlen = max_len)

In [None]:
x_train = np.array(x_train)
x_test = np.array(x_test)

# 모델링

**모델 구조**
1. 임베딩 레이어 : 임베딩차원은 196
2. Conv1D 블록 : 필터수 128개, 윈도우 사이즈 5
2. Conv1D 블록 : 필터수 128개, 윈도우 사이즈 5
6. MaxPool1D 블록 : 필터사이즈2
3. Bidirectional layer :
    * 정방향 : LSTM, 히든스테이트 64 
    * 역방향 : LSTM, 히든스테이트 64
6. MaxPool1D 블록 : 필터사이즈2
4. Bidirectional layer :
    * 정방향 : GRU, 히든스테이트 32
    * 역방향 : GRU, 히든스테이트 32
4. Bidirectional layer :
    * 정방향 : GRU, 히든스테이트 32
    * 역방향 : GRU, 히든스테이트 32    
6. MaxPool1D 블록 : 필터사이즈2
7. 플래튼
8. FC Layer : 노드 1024개
9. 시그모이드 레이어

In [23]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Flatten, Conv1D, MaxPool1D, Input
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, SimpleRNN, GRU
from tensorflow.keras.models import Model

In [24]:
# session clear
tf.keras.backend.clear_session()
# model
model = Model()
il = Input(shape=max_len)
# embadding - dim=128
hl = Embedding( max_words, embedding_dim )(il)
# conv1d f : 64 ,w : 5
hl = Conv1D(64, 5, activation='swish')(hl)
# Bidirectional : lstm 32, lstm 32 / gru 32, rnn 16
hl = Bidirectional(LSTM(32, return_sequences=True))(hl)

forward_layer = GRU(32, return_sequences=True)
backward_layer = SimpleRNN(16, return_sequences=True, go_backwards=True)
hl = Bidirectional(forward_layer, backward_layer=backward_layer)(hl)
# conv1d f:32 w:5
hl = Conv1D(filters=32, kernel_size=5)(hl)
# maxpool 1d : f_size 2
hl = MaxPool1D(2)(hl)
# flatten
hl = Flatten()(hl)
# fc layer 1024
hl = Dense(1024, activation='swish')(hl)
# output - sigmoid
ol = Dense(1, activation='sigmoid')(hl)
# Model
model = Model(il, ol)
# compile
model.compile(loss='binary_crossentropy', metrics=['accuracy'], optimizer='adam')
# summary
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 500)]             0         
                                                                 
 embedding (Embedding)       (None, 500, 128)          5120000   
                                                                 
 conv1d (Conv1D)             (None, 496, 64)           41024     
                                                                 
 bidirectional (Bidirectiona  (None, 496, 64)          24832     
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 496, 48)          10704     
 nal)                                                            
                                                                 
 conv1d_1 (Conv1D)           (None, 492, 32)           7712

In [None]:
#####################
## your codes here ##
#####################






# EarlyStopping을 이용한 학습.

1. 20%는 벨리데이션 셋.
2. 4epochs전과 비교하여 early stopping할 것.

In [25]:
from tensorflow.keras.callbacks import EarlyStopping
es = EarlyStopping(monitor='val_loss',
                   min_delta=0,
                   patience=5,
                   verbose=1,
                   restore_best_weights=True)

In [None]:
history = model.fit(x_train, y_train, epochs=100, validation_split=0.2, verbose =1,callbacks=[es])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100

# Test셋 위에서 성능 관찰



In [None]:
import matplotlib.pyplot as plt
plt.figure(figsize=(8,6))
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.show()

In [None]:
model.evaluate(x_test, y_test)