### Import modules

In [11]:
# TensorFlow and tf.keras
import tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Helper libraries
import numpy as np
import matplotlib.pyplot as plt

### load data 
- #### dataset (x: imdb article -> y: text type)
- #### word2idx dict (key: word; value: index)

In [12]:
vocab_size = 10000
## deciding the number of input nodes from the beginning
(x_train, y_train), (x_test, y_test) = tensorflow.keras.datasets.imdb.load_data(num_words=vocab_size) 
word2idx = tensorflow.keras.datasets.imdb.get_word_index()

In [17]:
x_train.shape

(25000,)

In [19]:
x_train[0]

[1,
 14,
 22,
 16,
 43,
 530,
 973,
 1622,
 1385,
 65,
 458,
 4468,
 66,
 3941,
 4,
 173,
 36,
 256,
 5,
 25,
 100,
 43,
 838,
 112,
 50,
 670,
 2,
 9,
 35,
 480,
 284,
 5,
 150,
 4,
 172,
 112,
 167,
 2,
 336,
 385,
 39,
 4,
 172,
 4536,
 1111,
 17,
 546,
 38,
 13,
 447,
 4,
 192,
 50,
 16,
 6,
 147,
 2025,
 19,
 14,
 22,
 4,
 1920,
 4613,
 469,
 4,
 22,
 71,
 87,
 12,
 16,
 43,
 530,
 38,
 76,
 15,
 13,
 1247,
 4,
 22,
 17,
 515,
 17,
 12,
 16,
 626,
 18,
 2,
 5,
 62,
 386,
 12,
 8,
 316,
 8,
 106,
 5,
 4,
 2223,
 5244,
 16,
 480,
 66,
 3785,
 33,
 4,
 130,
 12,
 16,
 38,
 619,
 5,
 25,
 124,
 51,
 36,
 135,
 48,
 25,
 1415,
 33,
 6,
 22,
 12,
 215,
 28,
 77,
 52,
 5,
 14,
 407,
 16,
 82,
 2,
 8,
 4,
 107,
 117,
 5952,
 15,
 256,
 4,
 2,
 7,
 3766,
 5,
 723,
 36,
 71,
 43,
 530,
 476,
 26,
 400,
 317,
 46,
 7,
 4,
 2,
 1029,
 13,
 104,
 88,
 4,
 381,
 15,
 297,
 98,
 32,
 2071,
 56,
 26,
 141,
 6,
 194,
 7486,
 18,
 4,
 226,
 22,
 21,
 134,
 476,
 26,
 480,
 5,
 144,
 30,
 5535,
 18,

In [20]:
y_train[0]

1

In [29]:
word2idx

{'fawn': 34707,
 'tsukino': 52012,
 'nunnery': 52013,
 'sonja': 16822,
 'vani': 63957,
 'woods': 1414,
 'spiders': 16121,
 'hanging': 2351,
 'woody': 2295,
 'trawling': 52014,
 "hold's": 52015,
 'comically': 11313,
 'localized': 40836,
 'disobeying': 30574,
 "'royale": 52016,
 "harpo's": 40837,
 'canet': 52017,
 'aileen': 19319,
 'acurately': 52018,
 "diplomat's": 52019,
 'rickman': 25248,
 'arranged': 6752,
 'rumbustious': 52020,
 'familiarness': 52021,
 "spider'": 52022,
 'hahahah': 68810,
 "wood'": 52023,
 'transvestism': 40839,
 "hangin'": 34708,
 'bringing': 2344,
 'seamier': 40840,
 'wooded': 34709,
 'bravora': 52024,
 'grueling': 16823,
 'wooden': 1642,
 'wednesday': 16824,
 "'prix": 52025,
 'altagracia': 34710,
 'circuitry': 52026,
 'crotch': 11591,
 'busybody': 57772,
 "tart'n'tangy": 52027,
 'burgade': 14135,
 'thrace': 52029,
 "tom's": 11044,
 'snuggles': 52031,
 'francesco': 29120,
 'complainers': 52033,
 'templarios': 52131,
 '272': 40841,
 '273': 52034,
 'zaniacs': 52136,

In [30]:
len(word2idx)

88588

In [31]:
# word2iid.items()

### modifiy word2idx dict (to reflect x_train, x_text)

In [37]:
word2idx = {k:v+3 for k, v in word2idx.items()} # think as slight update of the dict
word2idx['<PAD>'] = 0
word2idx['<START>'] = 1
word2idx['<UNK>'] = 2
word2idx['<UNUSED>'] = 3

### create idx2word: inverse of word2idx

In [40]:
idx2word = {v:k for k, v in word2idx.items()}

In [41]:
x_train[0]

[1,
 14,
 22,
 16,
 43,
 530,
 973,
 1622,
 1385,
 65,
 458,
 4468,
 66,
 3941,
 4,
 173,
 36,
 256,
 5,
 25,
 100,
 43,
 838,
 112,
 50,
 670,
 2,
 9,
 35,
 480,
 284,
 5,
 150,
 4,
 172,
 112,
 167,
 2,
 336,
 385,
 39,
 4,
 172,
 4536,
 1111,
 17,
 546,
 38,
 13,
 447,
 4,
 192,
 50,
 16,
 6,
 147,
 2025,
 19,
 14,
 22,
 4,
 1920,
 4613,
 469,
 4,
 22,
 71,
 87,
 12,
 16,
 43,
 530,
 38,
 76,
 15,
 13,
 1247,
 4,
 22,
 17,
 515,
 17,
 12,
 16,
 626,
 18,
 2,
 5,
 62,
 386,
 12,
 8,
 316,
 8,
 106,
 5,
 4,
 2223,
 5244,
 16,
 480,
 66,
 3785,
 33,
 4,
 130,
 12,
 16,
 38,
 619,
 5,
 25,
 124,
 51,
 36,
 135,
 48,
 25,
 1415,
 33,
 6,
 22,
 12,
 215,
 28,
 77,
 52,
 5,
 14,
 407,
 16,
 82,
 2,
 8,
 4,
 107,
 117,
 5952,
 15,
 256,
 4,
 2,
 7,
 3766,
 5,
 723,
 36,
 71,
 43,
 530,
 476,
 26,
 400,
 317,
 46,
 7,
 4,
 2,
 1029,
 13,
 104,
 88,
 4,
 381,
 15,
 297,
 98,
 32,
 2071,
 56,
 26,
 141,
 6,
 194,
 7486,
 18,
 4,
 226,
 22,
 21,
 134,
 476,
 26,
 480,
 5,
 144,
 30,
 5535,
 18,

In [42]:
len(x_train[0])

218

### padding x_train, x_test

In [43]:
x_train = pad_sequences(x_train, value=word2idx['<PAD>'], padding='post', maxlen=256)
x_test = pad_sequences(x_test, value=word2idx['<PAD>'], padding='post', maxlen=256)

In [44]:
x_train[0]
# total 256 *1 numbers into 256 * 10000 is embedding (1 0 0 0 0, 0 0 0 0 0 0 0 0 0 1 0 0 0.... , )
# 1이 256개, 0이 256 * 10000 - 256

array([   1,   14,   22,   16,   43,  530,  973, 1622, 1385,   65,  458,
       4468,   66, 3941,    4,  173,   36,  256,    5,   25,  100,   43,
        838,  112,   50,  670,    2,    9,   35,  480,  284,    5,  150,
          4,  172,  112,  167,    2,  336,  385,   39,    4,  172, 4536,
       1111,   17,  546,   38,   13,  447,    4,  192,   50,   16,    6,
        147, 2025,   19,   14,   22,    4, 1920, 4613,  469,    4,   22,
         71,   87,   12,   16,   43,  530,   38,   76,   15,   13, 1247,
          4,   22,   17,  515,   17,   12,   16,  626,   18,    2,    5,
         62,  386,   12,    8,  316,    8,  106,    5,    4, 2223, 5244,
         16,  480,   66, 3785,   33,    4,  130,   12,   16,   38,  619,
          5,   25,  124,   51,   36,  135,   48,   25, 1415,   33,    6,
         22,   12,  215,   28,   77,   52,    5,   14,  407,   16,   82,
          2,    8,    4,  107,  117, 5952,   15,  256,    4,    2,    7,
       3766,    5,  723,   36,   71,   43,  530,  4

### create model

In [45]:
model = Sequential()
model.add(Embedding(vocab_size, 128))
model.add(GlobalAveragePooling1D())
model.add(Dense(1, activation='sigmoid'))

#### Explanation

10000 nodes, 128 hidden nodes (128 = arbitrary하게 넣은 것), 1 output
(10000개의 단어 처리)
but must be over 10000
if over 10000, then unknown
0 .... 9999 index, make a dictionary
0 about 
1 aim ...
one hot encoding
1 0 0 0 0 0 0 0
"review = text"
if word, just train with this one hot encoding
word, and like/dislike about it 

but review, sentence, more than two words

"I love" ...
1 * 10000 vector, weight 10000 * 128, 

none * 10000 || 10000 * 128 || 128 * 1
if two words, none = 2
then result matrix = 2 * 1
(simultaenous computation)
if 100 words,
100 * 10000 || 10000 * 128 || 100 * 1
(but the end should not be 100 * 1, but be 1 * 1)
thus, collapse the hidden layer, making it into one vector

In [46]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 128)         1280000   
_________________________________________________________________
global_average_pooling1d_2 ( (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 129       
Total params: 1,280,129
Trainable params: 1,280,129
Non-trainable params: 0
_________________________________________________________________


In [47]:
testID = 100
print(' '.join([idx2word[idx] for idx in x_test[testID]]))

out = model.predict(x_test[testID].reshape(1,256))
print(y_test[testID])
print(out)

KeyError: 6

### need to decide how many words in a text, not too long or short... around 256 words!
if shorter than 256, "pad"