# Get data

In [1]:
import tensorflow_datasets as tfds 

# Load the subword-encoded IMDB reviews as (text, label) pairs, together with
# the dataset metadata object that the following cells read the encoder from.
data, info = tfds.load(
    'imdb_reviews/subwords8k',
    with_info=True,
    as_supervised=True,
)

08/50000 [01:01<00:19, 714.70 examples/s][A
Generating unsupervised examples...:  73%|███████▎  | 36481/50000 [01:01<00:18, 718.88 examples/s][A
Generating unsupervised examples...:  73%|███████▎  | 36564/50000 [01:01<00:17, 750.91 examples/s][A
Generating unsupervised examples...:  73%|███████▎  | 36648/50000 [01:01<00:17, 777.33 examples/s][A
Generating unsupervised examples...:  73%|███████▎  | 36729/50000 [01:01<00:16, 786.45 examples/s][A
Generating unsupervised examples...:  74%|███████▎  | 36813/50000 [01:02<00:16, 800.26 examples/s][A
Generating unsupervised examples...:  74%|███████▍  | 36894/50000 [01:02<00:16, 775.41 examples/s][A
Generating unsupervised examples...:  74%|███████▍  | 36980/50000 [01:02<00:16, 799.31 examples/s][A
Generating unsupervised examples...:  74%|███████▍  | 37061/50000 [01:02<00:16, 797.84 examples/s][A
Generating unsupervised examples...:  74%|███████▍  | 37144/50000 [01:02<00:15, 807.08 examples/s][A
Generating unsupervised examples...: 

In [11]:
# The text feature of this dataset carries the encoder used to turn strings
# into token ids (and back).
text_feature = info.features["text"]
tokenizer = text_feature.encoder

In [12]:
# Pull the two labelled splits out of the dict returned by tfds.load.
train = data["train"]
test = data["test"]

In [13]:
# Report the number of examples in each split (train first, then test).
for split in (train, test):
    print(len(split))

25000
25000


In [47]:
import tensorflow as tf 

buffer_size = 5000  # size of the shuffle buffer, in examples
batch_size = 64

# padded_batch pads every example in a batch up to the longest one in that
# batch; the padded shapes are taken from the dataset's own element shapes.
train_data = train.shuffle(buffer_size).padded_batch(
    batch_size, tf.compat.v1.data.get_output_shapes(train)
)

# The test split is deliberately NOT shuffled: loss/accuracy do not depend on
# example order, so shuffling only costs time filling the buffer on every
# evaluation pass and makes the evaluation order non-deterministic.
test_data = test.padded_batch(
    batch_size, tf.compat.v1.data.get_output_shapes(test)
)

# Playing around with tokenizer

In [45]:
# Two sentences sharing a prefix but ending in different out-of-vocabulary words.
string = "You are crazy hvgkuj"
string1 = "You are crazy adsvdav"

# Encode progressively shorter variants to see how trailing whitespace and
# unknown words change the resulting token ids.
probes = [
    string,
    string + 'tr',
    string1,
    'You are crazy',
    'You are',
    'You ',
    'You',
]
for probe in probes:
    print(tokenizer.encode(probe))

[298, 29, 2653, 8033, 8047, 8032, 3702, 8035]
[298, 29, 2653, 8033, 8047, 8032, 3702, 8035, 8045, 8043]
[298, 29, 2653, 1077, 8044, 8047, 1059, 8047]
[298, 29, 3863, 8050]
[298, 762]
[298]
[1162]


In [44]:
sample_strings = [
    'You are crazy hvgkuj',
    'You are not crazy hvgkuj',
    'You are crazy',
    'You are',
    'You ',
    'You',
]

# For every sample, show each token id next to the text it decodes back to.
for string in sample_strings:
    print(string)
    for token_id in tokenizer.encode(string):
        piece = tokenizer.decode([token_id])
        print('\t{} ----> {}'.format(token_id, piece))

You are crazy hvgkuj
	298 ----> You 
	29 ----> are 
	2653 ----> crazy 
	8033 ----> h
	8047 ----> v
	8032 ----> g
	3702 ----> ku
	8035 ----> j
You are not crazy hvgkuj
	298 ----> You 
	29 ----> are 
	33 ----> not 
	2653 ----> crazy 
	8033 ----> h
	8047 ----> v
	8032 ----> g
	3702 ----> ku
	8035 ----> j
You are crazy
	298 ----> You 
	29 ----> are 
	3863 ----> craz
	8050 ----> y
You are
	298 ----> You 
	762 ----> are
You 
	298 ----> You 
You
	1162 ----> You


# Model

In [56]:
# Hyperparameters used by the model cells below.
# vocab_size    -> first argument of the Embedding layer (number of token ids)
# embedding_dim -> second argument of the Embedding layer (vector length)
vocab_size, embedding_dim = 10000, 16

# Kept at a single pass because training is very slow.
epochs = 1

## Model with single LSTM layer

In [52]:
# Embedding -> one bidirectional LSTM -> small dense head with a sigmoid
# output for the binary sentiment label.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 16)          160000    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 128)               41472     
_________________________________________________________________
dense_4 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 65        
Total params: 209,793
Trainable params: 209,793
Non-trainable params: 0
_________________________________________________________________


## Model with multiple LSTM layers

In [None]:
# Two stacked bidirectional LSTMs followed by a dense classifier head.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    # The first LSTM must emit its output at every timestep so that the second
    # LSTM receives a sequence. The keyword is `return_sequences` (plural);
    # the misspelt `return_sequence` is rejected as an unknown argument.
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.summary()

## Model with Conv1D

In [None]:
# 1-D convolution over the embedded token sequence, pooled over time and fed
# to a dense classifier head.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    tf.keras.layers.Conv1D(64, 5, activation='relu'),
    # The sequence length is not fixed (batches are padded per batch), so
    # Flatten() would produce an undefined last dimension and the Dense layer
    # below could not be built. Pooling over the time axis yields a
    # fixed-size (batch, 64) tensor regardless of sequence length.
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.summary()

In [53]:
# Binary classification setup: cross-entropy loss, Adam optimizer, and
# accuracy reported during training and validation.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=["accuracy"])

In [55]:
# Train on the batched training set and evaluate on the batched test set
# after each epoch; the returned History object is kept for later inspection.
history = model.fit(train_data, validation_data=test_data, epochs=epochs)

Epoch 1/10

KeyboardInterrupt: 