# RNN 토큰화 처리 & 임베딩

## Tokenization 처리

In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

In [3]:
# Two toy sentences that differ only in their final word.
texts = [
    'you are very handsome',
    'you are very beautiful',
]

# num_words=10 bounds the vocabulary used when encoding (the binary matrix
# produced later has 10 columns); oov_token is the placeholder entry for
# out-of-vocabulary words.
# NOTE(review): '<00V' (digit zeros, no closing '>') looks like a typo for
# '<OOV>'; it is kept as-is because the recorded word_index output uses
# this exact key.
tokenizer = Tokenizer(num_words=10, oov_token='<00V')
tokenizer.fit_on_texts(texts)

In [4]:
# Echo the fitted Tokenizer object; the notebook shows its repr.
tokenizer

<keras_preprocessing.text.Tokenizer at 0x199bdceee08>

In [5]:
# Encode each text as a list of integer word indices from the fitted vocabulary.
sequences = tokenizer.texts_to_sequences(texts)

In [6]:
# Show the encoded sequences: one list of word indices per input text.
sequences

[[2, 3, 4, 5], [2, 3, 4, 6]]

In [7]:
# Binary ("one-hot" style) encoding. Each text becomes a single row with a 1
# at the index of every word it contains, so the rows are multi-hot
# bag-of-words vectors rather than one one-hot vector per token.

binary = tokenizer.sequences_to_matrix(sequences, mode = 'binary')

In [8]:
# Show the binary matrix: one row per text.
binary

array([[0., 0., 1., 1., 1., 1., 0., 0., 0., 0.],
       [0., 0., 1., 1., 1., 0., 1., 0., 0., 0.]])

In [9]:
# (number of texts, num_words) -- here (2, 10).
binary.shape

(2, 10)

In [10]:
# Word -> integer index mapping learned by fit_on_texts; the OOV token is index 1.
tokenizer.word_index

{'<00V': 1, 'you': 2, 'are': 3, 'very': 4, 'handsome': 5, 'beautiful': 6}

# IMDB 데이터 셋 실습

### 데이터 다운받기

In [11]:
from tensorflow.keras.datasets import imdb

# Vocabulary size passed to the loader (and reused later as the Embedding
# layer's input dimension).
num_words = 10_000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=num_words)

# Sanity-check the split sizes: features and labels must line up per split.
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(25000,) (25000,)
(25000,) (25000,)


### 데이터 셋 확인하기

In [12]:
# Inspect the first training review: a list of integer word indices.
X_train[0]

[1,
 14,
 22,
 16,
 43,
 530,
 973,
 1622,
 1385,
 65,
 458,
 4468,
 66,
 3941,
 4,
 173,
 36,
 256,
 5,
 25,
 100,
 43,
 838,
 112,
 50,
 670,
 2,
 9,
 35,
 480,
 284,
 5,
 150,
 4,
 172,
 112,
 167,
 2,
 336,
 385,
 39,
 4,
 172,
 4536,
 1111,
 17,
 546,
 38,
 13,
 447,
 4,
 192,
 50,
 16,
 6,
 147,
 2025,
 19,
 14,
 22,
 4,
 1920,
 4613,
 469,
 4,
 22,
 71,
 87,
 12,
 16,
 43,
 530,
 38,
 76,
 15,
 13,
 1247,
 4,
 22,
 17,
 515,
 17,
 12,
 16,
 626,
 18,
 2,
 5,
 62,
 386,
 12,
 8,
 316,
 8,
 106,
 5,
 4,
 2223,
 5244,
 16,
 480,
 66,
 3785,
 33,
 4,
 130,
 12,
 16,
 38,
 619,
 5,
 25,
 124,
 51,
 36,
 135,
 48,
 25,
 1415,
 33,
 6,
 22,
 12,
 215,
 28,
 77,
 52,
 5,
 14,
 407,
 16,
 82,
 2,
 8,
 4,
 107,
 117,
 5952,
 15,
 256,
 4,
 2,
 7,
 3766,
 5,
 723,
 36,
 71,
 43,
 530,
 476,
 26,
 400,
 317,
 46,
 7,
 4,
 2,
 1029,
 13,
 104,
 88,
 4,
 381,
 15,
 297,
 98,
 32,
 2071,
 56,
 26,
 141,
 6,
 194,
 7486,
 18,
 4,
 226,
 22,
 21,
 134,
 476,
 26,
 480,
 5,
 144,
 30,
 5535,
 18,
 ...]

### 빈도에 따른 단어를 추출

In [13]:
# Invert the IMDB vocabulary (word -> frequency rank) so that a word can be
# looked up by its rank.
imdb_get_word_index = {
    rank: word for word, rank in imdb.get_word_index().items()
}

# Print the five most frequently used words (rank 1 is the most frequent).
for nth in range(1, 6):
    print('{} 번째로 사용 빈도가 많은 단어 = {}'.format(nth, imdb_get_word_index[nth]))
                                            

1 번째로 사용 빈도가 많은 단어 = the
2 번째로 사용 빈도가 많은 단어 = and
3 번째로 사용 빈도가 많은 단어 = a
4 번째로 사용 빈도가 많은 단어 = of
5 번째로 사용 빈도가 많은 단어 = to


### 데이터 같은 길이로 맞추기

In [16]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Common length for every review after padding.
maxlen = 500

# Length of the first review before padding.
print('pad_sequence 사용전:', len(X_train[0]))

# padding='pre' places the fill values in front of each sequence.
pad_X_train = pad_sequences(X_train, maxlen=maxlen, padding='pre')
pad_X_test = pad_sequences(X_test, maxlen=maxlen, padding='pre')

# Length of the same review after padding; it now equals maxlen.
print('pad_sequence 사용후:', len(pad_X_train[0]))

pad_sequence 사용전: 218
pad_sequence 사용후: 500


### Embedding 층으로 모델 만들기

In [18]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten

# Baseline binary classifier: embed every token into 32 dimensions, flatten
# the (maxlen, 32) embeddings into one vector, and predict the label with a
# single sigmoid unit.
model = Sequential()
model.add(Embedding(input_dim=num_words, output_dim=32, input_length=maxlen))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

# Binary cross-entropy matches the single sigmoid output; accuracy is
# reported under the key 'acc'.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc'],
)

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 500, 32)           320000    
_________________________________________________________________
flatten (Flatten)            (None, 16000)             0         
_________________________________________________________________
dense (Dense)                (None, 1)                 16001     
Total params: 336,001
Trainable params: 336,001
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train for 10 epochs in mini-batches of 32, holding out 20% of the training
# samples for validation (20000 train / 5000 validation in the recorded run).
# The return value of fit() is kept in `history`.
history = model.fit(pad_X_train, y_train,
                   batch_size=32,
                   epochs=10,
                   validation_split=0.2)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10