In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np
import csv

In [2]:
import tensorflow_datasets as tfds
imdb, info = tfds.load("imdb_reviews", with_info=True, as_supervised=True)

In [3]:
train_data, test_data = imdb['train'], imdb['test']

training_sentences = []
training_labels = []

testing_sentences = []
testing_labels = []

train_w2v_sent = []
train_w2v_labels = []
test_w2v_sent = []
test_w2v_labels = []
temp = []

# str(s.tonumpy()) is needed in Python3 instead of just s.numpy()
for s,l in train_data:
    training_sentences.append(str(s.numpy()))
    training_labels.append(l.numpy())

    
    
for s,l in test_data:
    testing_sentences.append(str(s.numpy()))
    testing_labels.append(l.numpy())

    
training_labels_final = np.array(training_labels)
testing_labels_final = np.array(testing_labels)


In [4]:
vocab_size = 10000
embedding_dim = 200
max_length = 120
trunc_type='post'
oov_tok = "<OOV>"


from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(training_sentences)
padded = pad_sequences(sequences,maxlen=max_length, truncating=trunc_type)

testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences,maxlen=max_length)

In [5]:
print(len(sequences))
for i in range(len(sequences)):
    for j in range(len(sequences[i])):
        train_w2v_labels.append(sequences[i][j])
        if j == 0:
            temp = [sequences[i][j+1],sequences[i][j+2],sequences[i][j+3]]
            train_w2v_sent.append(temp)
            
        elif j == 1:
            temp = [sequences[i][j+1],sequences[i][j+2],sequences[i][j-1],sequences[i][j+3]]
            train_w2v_sent.append(temp)

        elif j == 2:
            temp = [sequences[i][j-1],sequences[i][j-2],sequences[i][j+3],sequences[i][j+1],sequences[i][j+2]]
            train_w2v_sent.append(temp)
            
        elif j == len(sequences[i])-2:
            temp = [sequences[i][j+1],sequences[i][j-1],sequences[i][j-2],sequences[i][j-3]]
            train_w2v_sent.append(temp)
            
        elif j == len(sequences[i])-1:
            temp = [sequences[i][j-1],sequences[i][j-2],sequences[i][j-3]]
            train_w2v_sent.append(temp)

        elif j == len(sequences[i])-3:
            temp = [sequences[i][j-1],sequences[i][j-2],sequences[i][j-3],sequences[i][j+1],sequences[i][j+2]]
            train_w2v_sent.append(temp)
            
        else:
            temp = [sequences[i][j-3],sequences[i][j-2],sequences[i][j-1],sequences[i][j+1],sequences[i][j+2],sequences[i][j+3]]
            train_w2v_sent.append(temp)
            

25000


In [6]:
print(len(testing_sequences))
for i in range(int(len(testing_sequences)/4)):
    for j in range(len(testing_sequences[i])):
        test_w2v_labels.append(testing_sequences[i][j])
        if j == 0:
            temp = [testing_sequences[i][j+1],testing_sequences[i][j+2],testing_sequences[i][j+3]]
            test_w2v_sent.append(temp)
            
        elif j == 1:
            temp = [testing_sequences[i][j+1],testing_sequences[i][j+2],testing_sequences[i][j-1],testing_sequences[i][j+3]]
            test_w2v_sent.append(temp)

        elif j == 2:
            temp = [testing_sequences[i][j-1],testing_sequences[i][j-2],testing_sequences[i][j+3],testing_sequences[i][j+1],testing_sequences[i][j+2]]
            test_w2v_sent.append(temp)
            
        elif j == len(testing_sequences[i])-2:
            temp = [testing_sequences[i][j+1],testing_sequences[i][j-1],testing_sequences[i][j-2],testing_sequences[i][j-3]]
            test_w2v_sent.append(temp)
            
        elif j == len(testing_sequences[i])-1:
            temp = [testing_sequences[i][j-1],testing_sequences[i][j-2],testing_sequences[i][j-3]]
            test_w2v_sent.append(temp)

        elif j == len(testing_sequences[i])-3:
            temp = [testing_sequences[i][j-1],testing_sequences[i][j-2],testing_sequences[i][j-3],testing_sequences[i][j+1],testing_sequences[i][j+2]]
            test_w2v_sent.append(temp)
            
        else:
            temp = [testing_sequences[i][j-3],testing_sequences[i][j-2],testing_sequences[i][j-1],testing_sequences[i][j+1],testing_sequences[i][j+2],testing_sequences[i][j+3]]
            test_w2v_sent.append(temp)
            

25000


In [7]:
train_padded = pad_sequences(train_w2v_sent,maxlen=6)
test_padded = pad_sequences(test_w2v_sent,maxlen=6)

In [8]:
print(train_w2v_sent[1])
print(test_w2v_sent[1])
print(train_padded[1])
print(test_padded[1])
print(train_w2v_labels[1])
print(test_w2v_labels[1])

[14, 35, 59, 439]
[25, 109, 59, 13]
[  0   0  14  35  59 439]
[  0   0  25 109  59  13]
12
44


In [9]:
train_labels_final = np.array(train_w2v_labels)
test_labels_final = np.array(test_w2v_labels)
# train_sent_final = np.array(train_w2v_sent)
# test_sent_final = np.array(test_w2v_sent)
train_sent_final = np.array(train_padded)
test_sent_final = np.array(test_padded)

In [10]:
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

def decode_review(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])

# print(decode_review(padded[3]))
# print(sequences[3])

In [11]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=6),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(6, activation='relu'),
#     tf.keras.layers.Dense(6, activation='tanh'),
    tf.keras.layers.Dense(vocab_size, activation='softmax')
])
model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(),optimizer='adam',metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 6, 200)            2000000   
_________________________________________________________________
flatten (Flatten)            (None, 1200)              0         
_________________________________________________________________
dense (Dense)                (None, 6)                 7206      
_________________________________________________________________
dense_1 (Dense)              (None, 10000)             70000     
Total params: 2,077,206
Trainable params: 2,077,206
Non-trainable params: 0
_________________________________________________________________


In [12]:
num_epochs = 6
# model.fit(padded, training_labels_final, epochs=num_epochs, validation_data=(testing_padded, testing_labels_final))
model.fit(train_sent_final,
          train_labels_final,
          batch_size=10000,
          epochs=num_epochs,
          validation_data=(test_sent_final, test_labels_final))

Epoch 1/6

KeyboardInterrupt: 

In [0]:
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)

(10000, 200)


In [0]:
import io

out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')
for word_num in range(1, vocab_size):
  word = reverse_word_index[word_num]
  embeddings = weights[word_num]
  out_m.write(word + "\n")
  out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")
out_v.close()
out_m.close()

In [0]:
try:
  from google.colab import files
except ImportError:
  pass
else:
  files.download('vecs.tsv')
  files.download('meta.tsv')