In [None]:
import tensorflow_datasets as tfds


In [None]:
# Fetch the IMDB reviews dataset along with its metadata object.
imdb, info = tfds.load(
    "imdb_reviews",
    as_supervised=True,
    with_info=True,
)

In [None]:
import numpy as np

In [None]:
# Pull the two splits out of the loaded dataset mapping.
train_data = imdb['train']
test_data = imdb['test']

In [None]:
def _collect_examples(dataset):
  """Return (sentences, labels) lists gathered from one dataset split.

  Each element is unpacked as (sentence, label). The sentence tensor is
  converted with .numpy() and decoded from UTF-8 bytes to str; the label is
  converted with .numpy().
  """
  sentences = []
  labels = []
  for s, l in dataset:
    sentences.append(s.numpy().decode('utf8'))
    labels.append(l.numpy())
  return sentences, labels


# One pass over each split; the same helper is used for both so the two
# extractions cannot drift apart.
training_sentences, training_labels = _collect_examples(train_data)
testing_sentences, testing_labels = _collect_examples(test_data)

# Label lists as NumPy arrays, which is what the later fit/evaluate cells use.
training_labels_final = np.array(training_labels)
testing_labels_final = np.array(testing_labels)

In [None]:
# Convert the Python label lists into NumPy arrays.
training_labels_final, testing_labels_final = (
    np.array(training_labels),
    np.array(testing_labels),
)

In [None]:
# Preprocessing / model hyperparameters shared by the cells below.
vocab_size = 10_000    # tokenizer num_words and Embedding input size
embedding_dim = 16     # width of each learned word vector
max_length = 120       # sequences are padded/truncated to this many tokens
trunc_type = "post"    # passed as pad_sequences(truncating=...)
oov_tok = "<oov>"      # token substituted for out-of-vocabulary words

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Tokenizer limited to the configured vocabulary size, with an explicit
# out-of-vocabulary token.
tokenizer = Tokenizer(
    oov_token=oov_tok,
    num_words=vocab_size,
)


In [None]:
# Build the vocabulary from the training text only, then keep a handle on
# the resulting word -> id mapping.
tokenizer.fit_on_texts(texts=training_sentences)
word_index = tokenizer.word_index

In [None]:
# Encode the training reviews as id sequences and force them to a fixed length.
sequences = tokenizer.texts_to_sequences(training_sentences)
padded = pad_sequences(
    sequences,
    truncating=trunc_type,
    maxlen=max_length,
)

In [None]:
# Encode the test reviews with the tokenizer fitted on the training text and
# apply the same length normalisation as for the training sequences.
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(
    testing_sequences,
    truncating=trunc_type,
    maxlen=max_length,
)

In [None]:
# Invert the tokenizer vocabulary so integer ids map back to words.
reverse_word_index = {index: word for word, index in word_index.items()}

def decode_review(text):
    """Turn a sequence of token ids back into a space-separated string.

    Ids that are not present in ``reverse_word_index`` are rendered as '?'.
    """
    words = []
    for token_id in text:
        words.append(reverse_word_index.get(token_id, '?'))
    return ' '.join(words)

# Sanity check: compare one encoded-then-decoded review with its original text.
print(decode_review(padded[3]))
print(training_sentences[3])

? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? this is the kind of film for a snowy sunday afternoon when the rest of the world can go ahead with its own business as you <oov> into a big arm chair and <oov> for a couple of hours wonderful performances from cher and nicolas cage as always gently row the plot along there are no <oov> to cross no dangerous waters just a warm and witty <oov> through new york life at its best a family film in every sense and one that deserves the praise it received
This is the kind of film for a snowy Sunday afternoon when the rest of the world can go ahead with its own business as you descend into a big arm-chair and mellow for a couple of hours. Wonderful performances from Cher and Nicolas Cage (as always) gently row the plot along. There are no rapids to cross, no dangerous waters, just a warm and witty paddle through New York life at its best. A family film in every sense and one that deserves the praise it received.


In [None]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, Flatten, GlobalAvgPool1D

In [None]:
# Create an empty Sequential model.
model = Sequential()

In [None]:
# Build the whole network in one expression so that re-running this cell
# recreates the model instead of appending a second copy of every layer to
# an already-populated one.
model = Sequential([
    # Learn an embedding_dim-wide vector for each of the vocab_size token ids.
    Embedding(vocab_size,embedding_dim,input_length=max_length),
    # Flatten the per-token vectors into one long feature vector per review.
    # GlobalAvgPool1D() could be used here instead to average over tokens.
    Flatten(),
    Dense(6, activation='relu'),
    # Single sigmoid unit for the binary label.
    Dense(1, activation='sigmoid'),
])

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 120, 16)           160000    
_________________________________________________________________
flatten_2 (Flatten)          (None, 1920)              0         
_________________________________________________________________
dense_10 (Dense)             (None, 6)                 11526     
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 7         
Total params: 171,533
Trainable params: 171,533
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Configure training: binary cross-entropy loss, Adam optimizer, and accuracy
# reported alongside the loss.
model.compile(
    optimizer='Adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for 10 epochs. The held-out reviews are also scored after every epoch
# so that over-fitting shows up in the training log rather than only in the
# final evaluate() call.
model.fit(
    padded,
    training_labels_final,
    epochs=10,
    validation_data=(testing_padded, testing_labels_final),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f6bf04b76d0>

In [None]:
# Display the padded training sequences.
padded

array([[   0,    0,    0, ...,  867,  141,   10],
       [   0,    0,    0, ...,   20,   31,   30],
       [4383, 6109,    2, ...,  550,    5,  735],
       ...,
       [ 861,   36,   11, ...,    8,    8,    2],
       [  52,   11,  217, ...,  251,   94,   42],
       [  84,    5,   30, ...,  131,    7,    6]], dtype=int32)

In [None]:
# Display the training labels array.
training_labels_final

array([0, 0, 0, ..., 0, 0, 1])

In [None]:
# Display the padded test sequences.
testing_padded

array([[  48,   24,  106, ...,    2, 5619,    1],
       [   4,    1,  696, ...,    1,  188,    7],
       [ 627,   18,  298, ...,    4,   50,    1],
       ...,
       [  89,    1,   18, ...,  731,  100,   19],
       [   0,    0,    0, ...,   53,   71,  222],
       [   0,    0,    0, ...,  448, 8608, 2127]], dtype=int32)

In [None]:
# Display the test labels array.
testing_labels_final

array([1, 1, 0, ..., 0, 1, 1])

In [None]:
# Score the trained model on the test split; the displayed values are the
# loss followed by the accuracy metric requested in compile().
model.evaluate(
    x=testing_padded,
    y=testing_labels_final,
)



[0.9575713872909546, 0.8088799715042114]

In [None]:
# The first layer of the model is the Embedding; its first weight array is
# the embedding matrix.
e = model.layers[0]
layer_weight_arrays = e.get_weights()
weights = layer_weight_arrays[0]
print(weights.shape)

(10000, 16)


In [None]:
import io

# Export the learned embeddings as two TSV files: vecs.tsv holds one
# tab-separated vector per line and meta.tsv holds the matching word per line.
# `with` guarantees both files are closed even if a write fails part-way.
with io.open('vecs.tsv','w',encoding='utf-8') as out_v, \
     io.open('meta.tsv','w',encoding='utf-8') as out_m:
  # Row 0 is skipped: the loop starts at id 1.
  for word_num in range(1,vocab_size):
    word = reverse_word_index.get(word_num)
    if word is None:
      # The tokenizer assigned no word to this id (vocabulary smaller than
      # vocab_size); skip it so the two files stay line-aligned.
      continue
    embeddings = weights[word_num]
    out_m.write(word+"\n")
    out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")

In [None]:
# When running inside Google Colab, offer the exported TSV files for download;
# elsewhere the import fails and nothing happens.
try:
  from google.colab import files
except ImportError:
  pass
else:
  for tsv_name in ('vecs.tsv', 'meta.tsv'):
    files.download(tsv_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>