In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_hub as hub
import dataset as d
import random
from tqdm import tqdm

In [2]:
BATCH_SIZE = 4

In [3]:
# grab the universal sentence encoder from tf hub
url = "https://tfhub.dev/google/universal-sentence-encoder-large/5"
embed = hub.load(url)

In [4]:
# Loads n article summaries from wikipedia and makes sequences to predict on
data = d.load_data(10000)
train, test = d.split_data(data)
train = d.process_data(train, 10, 100)
x, y = d.shuffle_and_split(train)


9999it [00:00, 22217.09it/s]
100%|██████████| 9000/9000 [00:00<00:00, 23381.43it/s]


In [7]:
# essentially creates a corpus to use to use
# you should save the tokenizer.word_index to ensure that its consistent with
# whatwas trained on
tokenizer = d.create_tokenizer(data)
vocab_size = len(tokenizer.word_index) + 1

In [8]:
# The unviersal sentence encoder was causing issues with input size 
# so it was moved to the generator intstead of a keras layer
# @TODO : move the encoder to the model
def generator(x,y, batch_size):
    total = int(len(x) / batch_size)
    
    i = 0 
    one_hot = np.eye(vocab_size, dtype='int8')
    
    while True:
        idx = i % total
        batch_x = x[idx * batch_size: (idx+1) * batch_size]
        batch_y = y[idx * batch_size: (idx+1) * batch_size]
        
        e_x = [embed([a]) for a in batch_x]
        
        e_y = []
        for a in batch_y:
            if a in tokenizer.word_index:
                e_y.append(one_hot[tokenizer.word_index[a]])
            else:
                e_y.append(one_hot[0])

        i += 1
        
        yield (tf.reshape(tf.convert_to_tensor(e_x), (batch_size, -1)), \
               tf.convert_to_tensor(e_y))

In [9]:
model = keras.Sequential()
model.add(keras.Input((512)))
model.add(layers.Dense(512, activation="relu"))
model.add(layers.Dense(512*2, activation="relu"))
model.add(layers.Dense(512*2, activation="relu"))
model.add(layers.Dense(512*2, activation="relu"))
model.add(layers.Dense(512*8, activation="relu"))
model.add(layers.Dense(vocab_size, activation="softmax"))



In [10]:
model.build(input_shape=(None, 512))

In [11]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 512)               262656    
_________________________________________________________________
dense_1 (Dense)              (None, 1024)              525312    
_________________________________________________________________
dense_2 (Dense)              (None, 1024)              1049600   
_________________________________________________________________
dense_3 (Dense)              (None, 1024)              1049600   
_________________________________________________________________
dense_4 (Dense)              (None, 4096)              4198400   
_________________________________________________________________
dense_5 (Dense)              (None, 77748)             318533556 
Total params: 325,619,124
Trainable params: 325,619,124
Non-trainable params: 0
__________________________________________

In [12]:
# @TODO : add tensorboard callback
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy', tf.keras.metrics.CategoricalAccuracy(), ])

In [13]:
gen = generator(x,y, BATCH_SIZE)

In [14]:
model.fit_generator(generator=gen, epochs=2, steps_per_epoch=len(x))

Instructions for updating:
Please use Model.fit, which supports generators.


Instructions for updating:
Please use Model.fit, which supports generators.


Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7ef80be70610>

In [20]:
# Save the corpous word index and the model / weights 
import pickle
model.save("model_data/model.h5")



In [23]:
with open('model_data/corpus.pickle', 'wb') as corpus:
    pickle.dump([tokenizer.word_index, tokenizer.index_word], corpus)

'a'