## RNN 토큰화 처리 & 임베딩

### Tokenization 처리

In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

In [16]:
# Sample sentences used to fit the tokenizer's vocabulary.
texts = [
    'You are very handsome',
    'You are very beautiful',
]

# num_words=10 caps the index range used when encoding; oov_token is the
# placeholder entry reserved for out-of-vocabulary words.
tokenizer = Tokenizer(
    num_words=10,
    oov_token='<00V>',  # NOTE(review): spelled with digit zeros, not the letters "OOV"
)
tokenizer.fit_on_texts(texts)

In [17]:
# Convert each text into a list of integer word indices using the fitted tokenizer.
sequences = tokenizer.texts_to_sequences(texts)

In [18]:
# Display the integer-encoded sentences (one list of word indices per text).
sequences

[[2, 3, 4, 5], [2, 3, 4, 6]]

In [19]:
# Encode each sequence as a single fixed-width 0/1 row: column j is 1 when
# word index j occurs in that sequence. This is one multi-hot vector per
# sentence, not a separate one-hot vector per word.
binary = tokenizer.sequences_to_matrix(sequences, mode ='binary')
binary

array([[0., 0., 1., 1., 1., 1., 0., 0., 0., 0.],
       [0., 0., 1., 1., 1., 0., 1., 0., 0., 0.]])

In [20]:
# Check the matrix dimensions: one row per text, one column per index below num_words.
binary.shape

(2, 10)

In [21]:
# Inspect the word -> integer index mapping built by fit_on_texts.
tokenizer.word_index

{'<00V>': 1, 'you': 2, 'are': 3, 'very': 4, 'handsome': 5, 'beautiful': 6}

## IMDB 데이터셋 실습

### 데이터 다운받기

In [24]:
from tensorflow.keras.datasets import imdb

# Vocabulary size cap handed to imdb.load_data.
num_words = 10000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=num_words)

# Report the feature/label shapes of the train split, then of the test split.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, labels.shape)

(25000,) (25000,)
(25000,) (25000,)


### 빈도에 따른 단어를 추출

In [27]:
# Invert imdb.get_word_index() (word -> index) into an index -> word lookup.
imdb_get_word_index = {
    index: word for word, index in imdb.get_word_index().items()
}

# Print the words stored under indices 1 through 5; the message labels
# index i as the i-th most frequently used word.
for rank in range(1, 6):
    print('{} 번째로 사용 빈도가 많은 단어 = {}'.format(rank, imdb_get_word_index[rank]))
    

1 번째로 사용 빈도가 많은 단어 = the
2 번째로 사용 빈도가 많은 단어 = and
3 번째로 사용 빈도가 많은 단어 = a
4 번째로 사용 빈도가 많은 단어 = of
5 번째로 사용 빈도가 많은 단어 = to


### 데이터를 같은 길이로 맞추기

In [32]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Target length that every review sequence is brought to.
maxlen = 500
print('pad_sequences 사용 전:', len(X_train[0]))

# Apply the same front-padding (padding='pre') to the train and test splits.
pad_X_train, pad_X_test = (
    pad_sequences(split, maxlen=maxlen, padding='pre')
    for split in (X_train, X_test)
)

print('pad_sequence 사용 후:', len(pad_X_train[0]))

pad_sequences 사용 전: 218
pad_sequence 사용 후: 500


## Embedding 층으로 모델 만들기

In [35]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten

# Assemble the classifier layer by layer:
#   Embedding -> one 32-dimensional vector per token position
#   Flatten   -> a single feature vector per review
#   Dense     -> one sigmoid unit for the binary label
model = Sequential()
model.add(Embedding(input_dim=num_words, output_dim=32, input_length=maxlen))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

# Binary cross-entropy matches the single sigmoid output; accuracy is
# tracked under the metric name 'acc'.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 500, 32)           320000    
_________________________________________________________________
flatten (Flatten)            (None, 16000)             0         
_________________________________________________________________
dense (Dense)                (None, 1)                 16001     
Total params: 336,001
Trainable params: 336,001
Non-trainable params: 0
_________________________________________________________________


In [36]:
# Train for 10 epochs in mini-batches of 32, holding out 20% of the training
# data for validation. The value returned by fit is kept in `history`.
history = model.fit(pad_X_train, y_train,
                    batch_size=32,
                    epochs=10,
                    validation_split=0.2)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [38]:
# Score the trained model on the padded test set; the result lists the loss
# followed by the 'acc' metric configured in compile.
model.evaluate(pad_X_test, y_test, verbose=2)

25000/1 - 1s - loss: 0.3601 - acc: 0.8726


[0.4562104887187481, 0.8726]