In [None]:
import tensorflow as tf

In [None]:
!pip install -q tensorflow_datasets

In [None]:
import tensorflow_datasets as tfds
# Load the IMDB reviews dataset through TensorFlow Datasets.
# `as_supervised=True` makes every example a (text, label) pair and
# `with_info=True` additionally returns the dataset metadata object.
imdb, info = tfds.load(
    'imdb_reviews',
    as_supervised=True,
    with_info=True,
)

In [None]:
import numpy as np

# Pull the two splits out of the loaded dataset mapping.
train_data = imdb['train']
test_data = imdb['test']

In [None]:
def _create_sentences(str_: str):
    """Collect the texts and labels of one dataset split into Python lists.

    The split is looked up by name among the notebook's module-level
    variables: ``str_='train'`` reads ``train_data`` and ``str_='test'``
    reads ``test_data``.

    Args:
        str_: Prefix of the module-level ``<str_>_data`` variable to read.
            Iterating that object must yield ``(text, label)`` pairs whose
            members expose a ``.numpy()`` method.

    Returns:
        A ``(sentences, labels)`` tuple of two parallel lists: the review
        texts as ``str`` and the result of ``label.numpy()`` for each example.

    Raises:
        NameError: If no module-level ``<str_>_data`` variable exists.
    """
    data_name = f'{str_}_data'
    # Plain namespace lookup instead of exec()/eval(): creating function
    # locals through exec() relies on an implementation quirk of locals()
    # and evaluates arbitrary text passed in `str_`.
    try:
        dataset = globals()[data_name]
    except KeyError:
        raise NameError(f"name '{data_name}' is not defined") from None

    sentences, labels = [], []
    for text, label in dataset:
        raw = text.numpy()
        # String tensors come back as bytes; str(bytes) would store the
        # "b'...'" repr (prefix and escape sequences) instead of the text.
        if isinstance(raw, bytes):
            sentences.append(raw.decode('utf-8'))
        else:
            sentences.append(str(raw))
        labels.append(label.numpy())
    return (sentences, labels)

# Materialise both splits as plain Python lists of texts and labels.
(train_sentences, train_labels) = _create_sentences('train')
(test_sentences, test_labels) = _create_sentences('test')

In [None]:
# Convert the label lists into NumPy arrays; these are what model.fit
# receives as targets / validation targets further down.
train_labels_final, test_labels_final = (
    np.array(split_labels) for split_labels in (train_labels, test_labels)
)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# --- Preprocessing / model hyperparameters ---------------------------------
vocab_size = 10_000    # tokenizer `num_words` and number of embedding rows
embedding_dim = 16     # width of each learned word vector
max_length = 120       # every review is padded / truncated to this many tokens
trunc_type = 'post'    # `truncating` mode handed to pad_sequences
oov_tok = '<OOV>'      # placeholder token for out-of-vocabulary words

In [None]:
# Build the vocabulary from the training reviews only, so the test split
# is encoded with exactly what the model saw during training.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index

# Training split: texts -> integer sequences -> fixed-length matrix.
sequences = tokenizer.texts_to_sequences(train_sentences)
padded = pad_sequences(sequences, maxlen=max_length, truncating=trunc_type)

# Test split: identical preprocessing. `truncating` is passed explicitly
# here as well; without it pad_sequences falls back to its default ('pre'),
# so long test reviews would be cut at the opposite end from the training
# reviews.
test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(test_sequences, maxlen=max_length, truncating=trunc_type)

In [None]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense, GlobalAveragePooling1D
# Small sentiment classifier, assembled layer by layer.
model = Sequential()
# Learned word vectors -- where the magic happens.
model.add(Embedding(vocab_size, embedding_dim, input_length=max_length))
# Pool over the sequence axis (used here instead of Flatten()).
model.add(GlobalAveragePooling1D())
model.add(Dense(6, activation='relu'))
# Single sigmoid unit for the binary positive/negative output.
model.add(Dense(1, activation='sigmoid'))
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 120, 16)           160000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 6)                 102       
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 7         
Total params: 160,109
Trainable params: 160,109
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, and
# accuracy reported as the metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
num_epochs = 30

# Train on the padded training reviews; the padded test split is supplied
# as the validation data for each epoch.
model.fit(
    x=padded,
    y=train_labels_final,
    epochs=num_epochs,
    validation_data=(test_padded, test_labels_final),
    verbose=2,
)

<tensorflow.python.keras.callbacks.History at 0x7ffb732cdb00>

In [None]:
# The first layer of the model is the Embedding layer.
e = model.layers[0]
# Its first weight array is the embedding matrix: one row per token id.
weights = e.get_weights()[0]

# Sanity check: expected to be (vocab_size, embedding_dim).
print(weights.shape)

(10000, 16)


In [None]:
# Invert the tokenizer vocabulary so a token id maps back to its word.
reverse_word_index = {index: word for word, index in word_index.items()}

In [None]:
import io

# Export the learned embeddings as two parallel TSV files:
#   vecs.tsv - one tab-separated embedding vector per line
#   meta.tsv - the word belonging to the vector on the same line
# The `with` block guarantees both files are flushed and closed even if the
# loop raises (e.g. a KeyError for an index missing from reverse_word_index).
with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('meta.tsv', 'w', encoding='utf-8') as out_m:
    # The range starts at 1, so row 0 of the embedding matrix is not
    # exported (word_index ids from the Keras Tokenizer start at 1).
    for word_num in range(1, vocab_size):
        word = reverse_word_index[word_num]
        embeddings = weights[word_num]
        out_m.write(word + '\n')
        out_v.write('\t'.join([str(x) for x in embeddings]) + '\n')

In [None]:
# Offer the two exported files as browser downloads when the notebook runs
# on Google Colab.
try:
  from google.colab import files
except ImportError:
  # google.colab is not importable here, so skip the download step.
  pass
else:
  # Only reached when the import succeeded.
  files.download('vecs.tsv')
  files.download('meta.tsv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>