In [26]:
from __future__ import absolute_import, division, print_function, unicode_literals
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf 
import numpy as np
import io
import json
from keras_preprocessing.text import tokenizer_from_json
from tensorflow.keras.utils import Sequence
import datetime
import os
from prefetch_generator import BackgroundGenerator, background,__doc__
import time
import threading

In [27]:
#PRINT VERSION!!!
tf.__version__

'2.0.0'

In [3]:
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

Num GPUs Available:  1


## import dataset
### I'm using the amazon food reviews dataset here

In [4]:
train = pd.read_csv('~/Data_Science/tests/reviews.csv')
train = train[['Summary','Text']]
train.head()

Unnamed: 0,Summary,Text
0,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,"""Delight"" says it all",This is a confection that has been around a fe...
3,Cough Medicine,If you are looking for the secret ingredient i...
4,Great taffy,Great taffy at a great price. There was a wid...


## getting word counts

In [5]:
train['text_length'] = train['Text'].str.count(' ')
train['text_length'].describe()

count    568454.000000
mean         81.005522
std          80.807102
min           2.000000
25%          33.000000
50%          57.000000
75%          99.000000
max        3525.000000
Name: text_length, dtype: float64

In [6]:
train['summary_length'] = train['Summary'].str.count(' ')
train['summary_length'].describe()

count    568427.000000
mean          3.128462
std           2.619420
min           0.000000
25%           1.000000
50%           3.000000
75%           4.000000
max          41.000000
Name: summary_length, dtype: float64

In [7]:
train.head()

Unnamed: 0,Summary,Text,text_length,summary_length
0,Good Quality Dog Food,I have bought several of the Vitality canned d...,48,3.0
1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,30,2.0
2,"""Delight"" says it all",This is a confection that has been around a fe...,98,3.0
3,Cough Medicine,If you are looking for the secret ingredient i...,42,1.0
4,Great taffy,Great taffy at a great price. There was a wid...,29,1.0


## bounding data lengths

In [8]:
#get rid of weirdness in test/train set

train = train[train['summary_length']>=2].reset_index(drop=True)
train = train[train['summary_length']<=20].reset_index(drop=True)
train = train[train['text_length']<=100].reset_index(drop=True)

In [9]:
print(train.shape)
print(train.head())

(276719, 4)
                                         Summary  \
0                          Good Quality Dog Food   
1                              Not as Advertised   
2                          "Delight" says it all   
3  Great!  Just as good as the expensive brands!   
4                         Wonderful, tasty taffy   

                                                Text  text_length  \
0  I have bought several of the Vitality canned d...           48   
1  Product arrived labeled as Jumbo Salted Peanut...           30   
2  This is a confection that has been around a fe...           98   
3  This saltwater taffy had great flavors and was...           52   
4  This taffy is so good.  It is very soft and ch...           27   

   summary_length  
0             3.0  
1             2.0  
2             3.0  
3             8.0  
4             2.0  


## cleaning data and making and saving test set

In [10]:
train['text_lower'] = train['Text'].str.lower()
train['text_no_punctuation'] = train['text_lower'].str.replace('[^\w\s]','')

In [11]:
### adding "_start_" and "_end_" delimeters to summary this tells the model where to start

train['summary_lower'] = train["Summary"].str.lower()
train['summary_no_punctuation'] =  '_start_' + ' ' +train['summary_lower'].str.replace('[^\w\s]','')+ ' ' +'_end_'

In [12]:
#shuffle dataset and reset index

train = train.sample(frac=1).reset_index(drop=True)

test = train[0:100]
train = train[100:]

test.to_csv('test_set.csv')

## playing with max features

In [13]:
#setting max features and max len for text and summarty for model

max_features1 = 100000
maxlen1 = 100

max_features2 = 100000
maxlen2 = 20

## making tokenizers and saving them

In [14]:
tok1 = tf.keras.preprocessing.text.Tokenizer(num_words=max_features1) 
tok1.fit_on_texts(list(train['text_no_punctuation'].astype(str))) #fit to cleaned text
tf_train_text =tok1.texts_to_sequences(list(train['text_no_punctuation'].astype(str)))
tf_train_text =tf.keras.preprocessing.sequence.pad_sequences(tf_train_text, maxlen=maxlen1) #let's execute pad step 

In [15]:
#save tokenizer for scoring later on

tokenizer1_json = tok1.to_json()
with io.open('tok1.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(tokenizer1_json, ensure_ascii=False))

In [16]:
#the processing has to be done for both 
#two different tokenizers

In [17]:
tok2 = tf.keras.preprocessing.text.Tokenizer(num_words=max_features2, filters = '*') 
tok2.fit_on_texts(list(train['summary_no_punctuation'].astype(str))) #fit to cleaned text
tf_train_summary = tok2.texts_to_sequences(list(train['summary_no_punctuation'].astype(str)))
tf_train_summary = tf.keras.preprocessing.sequence.pad_sequences(tf_train_summary, maxlen=maxlen2, padding ='post') 

In [18]:
tokenizer2_json = tok2.to_json()
with io.open('tok2.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(tokenizer2_json, ensure_ascii=False))

## setting dimensions and getting the shapes

In [19]:
vectorized_summary = tf_train_summary
# For Decoder Input, you don't need the last word as that is only for prediction
# when we are training using Teacher Forcing.
decoder_input_data = vectorized_summary[:, :-1]

# Decoder Target Data Is Ahead By 1 Time Step From Decoder Input Data (Teacher Forcing)
decoder_target_data = vectorized_summary[:, 1:]

print(f'Shape of decoder input: {decoder_input_data.shape}')
print(f'Shape of decoder target: {decoder_target_data.shape}')

vectorized_text = tf_train_text
# Encoder input is simply the body of the text
encoder_input_data = vectorized_text
doc_length = encoder_input_data.shape[1]
print(f'Shape of encoder input: {encoder_input_data.shape}')

Shape of decoder input: (276619, 19)
Shape of decoder target: (276619, 19)
Shape of encoder input: (276619, 100)


In [20]:
#setting size of vocabulary encoder and decoder

vocab_size_encoder = len(tok1.word_index) + 1 
vocab_size_decoder = len(tok2.word_index) + 1

In [21]:
#set latent dimension for embedding and hidden units

latent_dim = 100

## GloVe embedding layer

In [22]:
# Preparing GloVe

GLOVE_DIR = "/home/tiana/Data_Science/tests/GloVe"

embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.{}d.txt'.format(latent_dim)))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400001 word vectors.


In [23]:
#build embedding weights matrix for text

embedding_matrix = np.zeros((len(tok1.word_index) + 1, latent_dim))
for word, i in tok1.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

# model

In [24]:
########################
#### Encoder Model ####

#setting Encoder Input
encoder_inputs = tf.keras.Input(shape=(doc_length,), name='Encoder-Input')

# GloVe Embeding for encoder
x = tf.keras.layers.Embedding(vocab_size_encoder, 
                              latent_dim, 
                              name='Body-Word-Embedding',
                              weights=[embedding_matrix],
                              mask_zero=False, 
                              trainable=False)(encoder_inputs)

#Batch normalization is used so that the distribution of the inputs 
#to a specific layer doesn't change over time
x = tf.keras.layers.BatchNormalization(name='Encoder-Batchnorm-1')(x)


# We do not need the `encoder_output` just the hidden state
_, state_h = tf.keras.layers.GRU(latent_dim, return_state=True, name='Encoder-Last-GRU')(x)

# Set the encoder as a separate entity so we can encode without decoding if desired
encoder_model = tf.keras.Model(inputs=encoder_inputs, outputs=state_h, name='Encoder-Model')


seq2seq_encoder_out = encoder_model(encoder_inputs)



########################
#### Decoder Model ####
decoder_inputs = tf.keras.Input(shape=(None,), name='Decoder-Input')  # for teacher forcing

# Embedding For Decoder, not GloVe 
dec_emb = tf.keras.layers.Embedding(vocab_size_decoder, 
                                    latent_dim, 
                                    name='Decoder-Word-Embedding',
                                    mask_zero=False, )(decoder_inputs)

#batch normalization
dec_bn = tf.keras.layers.BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)

# Set up the decoder, using `decoder_state_input` as initial state.
decoder_gru = tf.keras.layers.GRU(latent_dim, return_state=True, return_sequences=True, name='Decoder-GRU')
#the decoder "decodes" the encoder out
decoder_gru_output, _ = decoder_gru(dec_bn, initial_state=seq2seq_encoder_out)
x = tf.keras.layers.BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)

# Dense layer for prediction
decoder_dense = tf.keras.layers.Dense(vocab_size_decoder, activation='softmax', name='Final-Output-Dense')
decoder_outputs = decoder_dense(x)


########################
#### Seq2Seq Model ####
seq2seq_Model = tf.keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)


#parallelize data on N GPUs if desired
#seq2seq_Model = tf.keras.utils.multi_gpu_model(seq2seq_Model, gpus=N)

seq2seq_Model.compile(optimizer=tf.keras.optimizers.Nadam(lr=0.001), 
                      loss='sparse_categorical_crossentropy', 
                      metrics=['accuracy'])

** Examine Model Architecture Summary **

In [25]:
#from seq2seq_utils import viz_model_architecture
seq2seq_Model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Decoder-Input (InputLayer)      [(None, None)]       0                                            
__________________________________________________________________________________________________
Decoder-Word-Embedding (Embeddi (None, None, 100)    2876300     Decoder-Input[0][0]              
__________________________________________________________________________________________________
Encoder-Input (InputLayer)      [(None, 100)]        0                                            
__________________________________________________________________________________________________
Decoder-Batchnorm-1 (BatchNorma (None, None, 100)    400         Decoder-Word-Embedding[0][0]     
______________________________________________________________________________________________

fit(x=None, y=None, 
    batch_size=None, 
    epochs=1, verbose=1, 
    callbacks=None, 
    validation_split=0.0, 
    validation_data=None, 
    shuffle=True, 
    class_weight=None, 
    sample_weight=None, 
    initial_epoch=0, 
    steps_per_epoch=None, 
    validation_steps=None, 
    validation_freq=1, 
    max_queue_size=10, 
    workers=1, 
    use_multiprocessing=False)

fit_generator(generator, 
              steps_per_epoch=None, 
              epochs=1, verbose=1, 
              callbacks=None, 
              validation_data=None, 
              validation_steps=None, 
              validation_freq=1, 
              class_weight=None, 
              max_queue_size=10, 
              workers=1, 
              use_multiprocessing=False, 
              shuffle=True, 
              initial_epoch=0)

The use of keras.utils.Sequence guarantees the ordering and guarantees the single use of every input 
per epoch when using use_multiprocessing=True.

#spencer generator

features = [encoder_input_data, decoder_input_data]
target = np.expand_dims(decoder_target_data, -1)
batch_size = 32
num_samples = ____

def S_generator(features, target, batch_size):
    while True:
        for offset in range(0, num_samples, batch_size):
            for x,y in zip(features, target):
                X_train = features[i]
                y_train = target[j]
                prefetched = dataset.prefetch(2)
            yield prefetched.as_numpy_iterator()

def generator(samples, batch_size=32):
    """
    Yields the next training batch.
    Suppose `samples` is an array [[image1_filename,label1], [image2_filename,label2],...].
    """
    num_samples = len(samples)
    while True: # Loop forever so the generator never terminates
        # Get index to start each batch: [0, batch_size, 2*batch_size, ..., max multiple of batch_size <= num_samples]
        for offset in range(0, num_samples, batch_size):
            # Get the samples you'll use in this batch
            batch_samples = samples[offset:offset+batch_size]

            # Initialise X_train and y_train arrays for this batch
            X_train = []
            y_train = []

            # For each example
            for batch_sample in batch_samples:
                # Load image (X)
                filename = './common_filepath/'+batch_sample[0]
                image = mpimg.imread(filename)
                # Read label (y)
                y = batch_sample[1]
                # Add example to arrays
                X_train.append(image)
                y_train.append(y)

            # Make sure they're numpy arrays (as opposed to lists)
            X_train = np.array(X_train)
            y_train = np.array(y_train)

            # The generator-y part: yield the next training batch            
            yield X_train, y_train

# Import list of train and validation data (image filenames and image labels)
# Note this is not valid code.
train_samples = ...
validation_samples = ...

# Create generator
train_generator = generator(train_samples, batch_size=32)
validation_generator = generator(validation_samples, batch_size=32)

#######################
# Use generator to train neural network in Keras
#######################

# Create model in Keras
from keras.models import Sequential
from keras.layers import Dense, Activation

model = Sequential([
    Dense(32, input_shape=(784,)),
    Activation('relu'),
    Dense(10),
    Activation('softmax'),
])

# Fit model using generator
model.fit_generator(train_generator, 
                    samples_per_epoch=len(train_samples), 
                    validation_data=validation_generator,
                    nb_val_samples=len(validation_samples), nb_epoch=100)

In [27]:
def data_generator_plain(features, labels, batch_size):

    batch_features = np.zeros((batch_size, IMG_SIZE, IMG_SIZE, NUM_CHANNELS), dtype=np.float64)
    batch_labels = np.zeros((batch_size, NUM_KEYPOINTS * 2), dtype=np.float64)

    while True:
        steps = len(batch_features) // batch_size
        for i in range(steps):
            for j in range(batch_size):
                batch_features[j] = features[(i*batch_size)+j]
                batch_labels[j] = labels[(i*batch_size)+j]

            yield batch_features, batch_labels

In [None]:
data = pd.read_csv("data.csv", header=0, delimiter="\t", quoting=3, encoding="utf-8")
y = data.label
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.2)

def data_genereator(data, batch_size):
        num_rows = int(data.shape[0])
        # Initialize a counter
        counter = 0
        while True:
            for content, label in zip(data['content'], data['label']):
                X_train[counter%batch_size] = transform(content)
                y_train[counter%batch_size] = np.asarray(label)
                counter = counter + 1
                if(counter%batch_size == 0):
                    yield X_train, y_train

In [58]:
def generator_queue(generator, max_q_size=10,
                    wait_time=0.05, nb_worker=1):
    '''Builds a threading queue out of a data generator.
    Used in `fit_generator`, `evaluate_generator`, `predict_generator`.
    '''
    q = queue.Queue()
    _stop = threading.Event()

    def data_generator_task():
        while not _stop.is_set():
            try:
                if q.qsize() < max_q_size:
                    try:
                        generator_output = next(generator)
                    except ValueError:
                        continue
                    q.put(generator_output)
                else:
                    time.sleep(wait_time)
            except Exception:
                _stop.set()
                raise

    generator_threads = [threading.Thread(target=data_generator_task)
                         for _ in range(nb_worker)]

    for thread in generator_threads:
        thread.daemon = True
        thread.start()

    return q, _stop

[19:19, 3/2/2020] Spencer Thomas Hoffman: 'How ds.repeat() works

As soon as all the entries are read from the dataset and you try to read the next element, the dataset will throw an error. That's where ds.repeat() comes into play. It will re-initialize the dataset, making it again like this:

[1,2,3] <= [4,5,6]'
[19:20, 3/2/2020] Spencer Thomas Hoffman: also repeat() does not work in place; you have to assign to a new var
[19:20, 3/2/2020] Spencer Thomas Hoffman: Better explanation:
[19:20, 3/2/2020] Spencer Thomas Hoffman: 'As we know, each epoch in the training process of a model takes in the whole dataset and breaks it into batches. This happens on every epoch. Suppose, we have a dataset with 100 samples. On every epoch, the 100 samples are broken into 5 batches ( of 20 each ) for feeding them to the model. But, if I have to train the model for say 5 epochs then, I need to repeat the dataset 5 times. Meaning, the total elements in the repeated dataset will have 500 samples ( 100 samples multipled 5 times ).

Now, this job is done by the tf.data.Dataset.repeat() method. Usually we pass the num_epochs argument to the method.

The iterator.get_next() is just a way of getting the next batch of data from the tf.data.Dataset. You are iterating the dataset batch by batch.

That's the difference. The tf.data.Dataset.repeat() repeats the samples in the dataset whereas iterator.get_next() one-by-one fetches the data in the form of batches.'
[19:21, 3/2/2020] Spencer Thomas Hoffman: IMO this actually should work
[19:21, 3/2/2020] Spencer Thomas Hoffman: another example
[19:21, 3/2/2020] Spencer Thomas Hoffman: dataset = tf.data.Dataset.from_tensor_slices((features,labels)).repeat().batch(BATCH_SIZE)
[19:22, 3/2/2020] Spencer Thomas Hoffman: iter = dataset.make_one_shot_iterator()
x, y = iter.get_next()
[19:24, 3/2/2020] Spencer Thomas Hoffman: I tried it with an array
[19:24, 3/2/2020] Spencer Thomas Hoffman: it worked
[19:24, 3/2/2020] Spencer Thomas Hoffman: In [54]: for x in dataset2.repeat(3):
    ...:     print(x)
    ...:
tf.Tensor([0 1 2], shape=(3,), dtype=int32)
tf.Tensor([3 4 5], shape=(3,), dtype=int32)
tf.Tensor([6 7 8], shape=(3,), dtype=int32)
tf.Tensor([ 9 10 11], shape=(3,), dtype=int32)
tf.Tensor([0 1 2], shape=(3,), dtype=int32)
tf.Tensor([3 4 5], shape=(3,), dtype=int32)
tf.Tensor([6 7 8], shape=(3,), dtype=int32)
tf.Tensor([ 9 10 11], shape=(3,), dtype=int32)
tf.Tensor([0 1 2], shape=(3,), dtype=int32)
tf.Tensor([3 4 5], shape=(3,), dtype=int32)
tf.Tensor([6 7 8], shape=(3,), dtype=int32)
tf.Tensor([ 9 10 11], shape=(3,), dtype=int32)
[19:24, 3/2/2020] Spencer Thomas Hoffman: Simple test
[19:25, 3/2/2020] Tiana Cornelius: hmmm - taking this in rn
[19:25, 3/2/2020] Spencer Thomas Hoffman: My code:
[19:25, 3/2/2020] Spencer Thomas Hoffman: dataset = tf.data.Dataset.from_tensor_slices([[0,1,2],[3,4,5],[6,7,8],[9,10,11]])
[19:25, 3/2/2020] Spencer Thomas Hoffman: dataset2 = dataset.prefetch(2)
[19:25, 3/2/2020] Spencer Thomas Hoffman: for x in dataset2.repeat(3):
    ...:     print(x)

In [54]:
dataset = tf.data.Dataset.from_tensor_slices([[0,1,2],[3,4,5],[6,7,8],[9,10,11]])
dataset2 = dataset.prefetch(2)
for x in dataset2.repeat(3):
    print(x)

tf.Tensor([0 1 2], shape=(3,), dtype=int32)
tf.Tensor([3 4 5], shape=(3,), dtype=int32)
tf.Tensor([6 7 8], shape=(3,), dtype=int32)
tf.Tensor([ 9 10 11], shape=(3,), dtype=int32)
tf.Tensor([0 1 2], shape=(3,), dtype=int32)
tf.Tensor([3 4 5], shape=(3,), dtype=int32)
tf.Tensor([6 7 8], shape=(3,), dtype=int32)
tf.Tensor([ 9 10 11], shape=(3,), dtype=int32)
tf.Tensor([0 1 2], shape=(3,), dtype=int32)
tf.Tensor([3 4 5], shape=(3,), dtype=int32)
tf.Tensor([6 7 8], shape=(3,), dtype=int32)
tf.Tensor([ 9 10 11], shape=(3,), dtype=int32)


for reusing data we could take the data and put it back into a dataframe or dictionary or something and then shuffle it and put it back into the inddpendent arrays

or what is this Keras sequence thing?

In [30]:
#test
encoder_input_data = [[1,2],[4,5]]
decoder_input_data = [[7,8],[10,11]]
y_t = [[12, 13, 14], [15, 16, 17]]

In [None]:
train_dataset = tf.data.Dataset.from_tensor_slices((x, y))

BATCH_SIZE = 64
BUFFER_SIZE = 10000

train_dataset = train_dataset.shuffle(BUFFER_SIZE).repeat().batch(BATCH_SIZE)

if BUFFER_SIZE % BATCH_SIZE != 0:
    parallel_steps = BUFFER_SIZE // BATCH_SIZE + 1
else:
    parallel_steps = BUFFER_SIZE // BATCH_SIZE

# This `fit` call will be distributed on 2 GPUs.
# Since the batch size is 64, each GPU will process 32 samples.
parallel_model.fit(train_dataset, epochs=10, steps_per_epoch = parallel_steps)


In [81]:
#np.repeat(a = np_array_2d, repeats = 2, axis = 0)

#X_train = [encoder_input_data, decoder_input_data]
X_enc = np.repeat(a=encoder_input_data, repeats=50, axis=0) # array with shape (276619, 100)
X_dec = np.repeat(a=decoder_input_data, repeats=50, axis=0) # array with shape (276619, 19)
y_t = np.expand_dims(np.repeat(a=decoder_target_data, repeats=50, axis=0), -1) #array with shape (276619, 19, 1)
#batch_size = 128

In [28]:
def iterate_minibatches(n_batches, rows=500):
    counter = 0
    for b_i in range(n_batches):
        time.sleep(0.1) #here it could read file or SQL-get or do some math
        Xe = X_enc[counter:(counter+rows)]
        Xd = X_dec[counter:(counter+rows)]
        y_train = y_t[counter:(counter+rows)]
        X_train = [Xe,Xd] 
        counter = counter + rows
        yield X_train,y_train
%time

print('/'+'-'*42+' Progress Bar ' + '-'*42 + '\\')

for b_x,b_y in iterate_minibatches(50):
    #training
    time.sleep(0.1)#you guessed it
    print('!',end=" ")
print()

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 6.68 µs
/------------------------------------------ Progress Bar ------------------------------------------\
! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! 


In [None]:
class generatorClass(Sequence):

    def __init__(self, x_set, y_set, batch_size):
        self.x, self.y = x_set, y_set
        self.batch_size = batch_size
        self.lock = threading.Lock()   #Set self.lock

    def __len__(self):
        return int(np.floor(len(self.x) / float(self.batch_size)))

    def __getitem__(self, idx):
        with self.lock:                #Use self.lock
            batch_x = self.x[idx * self.batch_size:(idx + 1) * self.batch_size]
            batch_y = self.y[idx * self.batch_size:(idx + 1) * self.batch_size]

            return ...

In [95]:
class SeqDataset(Sequence):
    """ Implements the Sequence iterator for a sequence of tokens """
    def __init__(self,
                 X_enc, 
                 X_def,
                 y_t,
                 look_back=1,
                 batch_size=5):
        #self.tokens_sequence = tokens_sequence - X_enc, X_dec
        #self.num_classes = num_classes - y_t
        self.look_back = look_back
        self.batch_size = batch_size

    def __getitem__(self,
                    index):
        """Gets batch at position `index`.
        # Arguments
            index: position of the batch in the Sequence.
        # Returns
            A batch
        """
        begin = index*self.batch_size
        if (index + 1) == self.__len__():
            # In the last batch we add all the remaining data
            end = len(self.tokens_sequence) - self.look_back
        else:
            end = (index+1)*self.batch_size
        x = np.array([self.tokens_sequence[i:i+self.look_back]
                      for i in range(begin, end)])
        y = np.array(self.tokens_sequence[
            (begin + self.look_back):(end + self.look_back)])
        y = y[:, np.newaxis]
        y = to_categorical(y, self.num_classes)
        return x, y

    def __len__(self):
        """Number of batch in the Sequence.
        # Returns
            The number of batches in the Sequence.
        """
        # We round to the floor, therefore we "skip" some data.
        # As a work around, the remainder is added to the last batch
        return int(np.floor(
            (len(self.tokens_sequence) - self.look_back) / self.batch_size))

    def on_epoch_end(self):
        """Method called at the end of every epoch.
        """
        pass

276619

In [42]:
data_len = len(encoder_input_data)
val_split = int(np.floor(data_len*.15))
train_split = int(np.floor(data_len*.85))

In [43]:
val_split

41492

In [44]:
train_split

235126

In [45]:
# separating into train and validation data

X_enc_train = encoder_input_data[0:train_split]
X_dec_train = decoder_input_data[0:train_split]
y_t_train = np.expand_dims(decoder_target_data, -1)[0:train_split]

X_enc_val = encoder_input_data[-val_split:-1]
X_dec_val = decoder_input_data[-val_split:-1]
y_t_val = np.expand_dims(decoder_target_data, -1)[-val_split:-1]

In [28]:
# rename variables for generator
#X_enc = encoder_input_data # array with shape (276619, 100)
#X_dec = decoder_input_data # array with shape (276619, 19)
#y_t = np.expand_dims(decoder_target_data, -1) #array with shape (276619, 19, 1)
#idx = 0

In [47]:
class generatorClass(Sequence):

    def __init__(self, X_enc, X_dec, y_t, batch_size):
        self.X_enc = X_enc
        self.X_dec = X_dec
        self.y_t = y_t
        self.batch_size = batch_size
        self.lock = threading.Lock()

    def __len__(self):
        return int(np.ceil(len(self.X_enc) / float(self.batch_size)))
            
    def __getitem__(self, idx):
        with self.lock:
            batch_index1 = idx * self.batch_size
            batch_index2 = (idx + 1) * self.batch_size
            batch_Xe = self.X_enc[batch_index1:batch_index2]
            batch_Xd = self.X_dec[batch_index1:batch_index2]
            batch_y = self.y_t[batch_index1:batch_index2]
            batch_X = [batch_Xe, batch_Xd]

            return batch_X, batch_y
    def on_epoch_end(self):
        """Method called at the end of every epoch.
        """
        pass

# train model

In [48]:
#tensorboard
log_dir="logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

#checkpoints
checkpointer = tf.keras.callbacks.ModelCheckpoint(
    filepath='/tmp/weights.{epoch:02d}-{val_loss:.2f}.hdf5', 
    monitor = 'val_loss',
    verbose=1, 
    save_best_only=True, 
    sav_freq='epoch')

#early_stopping
#early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss',  
#                                              patience=10, verbose=1, mode='auto', 
#                                              restore_best_weights=True)

#model
#X_train = [encoder_input_data, decoder_input_data]
#y_train = np.expand_dims(decoder_target_data, -1)
epochs = 10
batch_size = 700
idx = 0
gen_instance = generatorClass(X_enc_train, X_dec_train, y_t_train, batch_size)
val_instance = generatorClass(X_enc_val, X_dec_val, y_t_val, batch_size)
#n_batches = (int(encoder_input_data.shape[0]) // batch_size) * epochs
history = seq2seq_Model.fit_generator(generator = gen_instance,
                            epochs=epochs ,  
                            max_queue_size=50, 
                            validation_data = val_instance,
                            validation_freq=1,
                            steps_per_epoch = int(encoder_input_data.shape[0]) // batch_size,
                            callbacks=[tensorboard_callback, checkpointer], #early_stop],
                            use_multiprocessing=True,
                            workers=7) 

#save final model
#seq2seq_Model.save('200_epochs_amazon_glove.h5')

Epoch 1/10
Epoch 00001: val_loss improved from inf to 8.83589, saving model to /tmp/weights.01-8.84.hdf5
Epoch 2/10
Epoch 00002: val_loss improved from 8.83589 to 1.39935, saving model to /tmp/weights.02-1.40.hdf5
Epoch 3/10
Epoch 00003: val_loss improved from 1.39935 to 1.11244, saving model to /tmp/weights.03-1.11.hdf5
Epoch 4/10
Epoch 00004: val_loss improved from 1.11244 to 1.06726, saving model to /tmp/weights.04-1.07.hdf5
Epoch 5/10
Epoch 00005: val_loss improved from 1.06726 to 1.04713, saving model to /tmp/weights.05-1.05.hdf5
Epoch 6/10
Epoch 00007: val_loss improved from 1.03785 to 1.03630, saving model to /tmp/weights.07-1.04.hdf5
Epoch 8/10
Epoch 00008: val_loss improved from 1.03630 to 1.03600, saving model to /tmp/weights.08-1.04.hdf5
Epoch 9/10
Epoch 00009: val_loss improved from 1.03600 to 1.03517, saving model to /tmp/weights.09-1.04.hdf5
Epoch 10/10
Epoch 00010: val_loss did not improve from 1.03517


In [125]:
#save final model
seq2seq_Model.save('10_epochs_amazon_glove.h5')

# scoring

In [None]:
#load the model
seq2seq_Model = tf.keras.models.load_model('200_epochs_amazon_glove.h5')

# Show the model architecture
seq2seq_Model.summary()

In [None]:
#open the tokenizers

with open('tok1.json') as f:
    data = json.load(f)
    tok1 = tokenizer_from_json(data)
    
with open('tok2.json') as f:
    data = json.load(f)
    tok2 = tokenizer_from_json(data)

In [49]:
#look at test set
test.head()

Unnamed: 0,Summary,Text,text_length,summary_length,text_lower,text_no_punctuation,summary_lower,summary_no_punctuation
0,Great cookie...a little pricey,One of the things that I missed the most when ...,100,3.0,one of the things that i missed the most when ...,one of the things that i missed the most when ...,great cookie...a little pricey,_start_ great cookiea little pricey _end_
1,Always the right formula,I trust this brand--the flavors are blended ju...,20,3.0,i trust this brand--the flavors are blended ju...,i trust this brandthe flavors are blended just...,always the right formula,_start_ always the right formula _end_
2,the real taste of an Italian coffee,This is my favorite one. I'm Italian and I use...,50,6.0,this is my favorite one. i'm italian and i use...,this is my favorite one im italian and i used ...,the real taste of an italian coffee,_start_ the real taste of an italian coffee _end_
3,Love the bags,"No matter what kind of coffee that I brew, my ...",32,2.0,"no matter what kind of coffee that i brew, my ...",no matter what kind of coffee that i brew my h...,love the bags,_start_ love the bags _end_
4,"Unique flavor, excellent healthy snack",We love Garden of Eatin' chips. Our favorites ...,42,4.0,we love garden of eatin' chips. our favorites ...,we love garden of eatin chips our favorites va...,"unique flavor, excellent healthy snack",_start_ unique flavor excellent healthy snack ...


In [108]:
#pick a cell from the clean data to test and look at it
test_text = [test['text_no_punctuation'][8]]
test_text

['they are like most chinese fast food place and low price eatery fortune cookiesbr br many have motivating messages perfect for company functions but lack soulful messages often found in better fortune cookies']

In [109]:
# get the encoder's features for the decoder

tok1.fit_on_texts(test_text)

In [110]:
#tokenize test text

raw_tokenized = tok1.texts_to_sequences(test_text)
raw_tokenized = tf.keras.preprocessing.sequence.pad_sequences(raw_tokenized, maxlen=maxlen1)

In [111]:
#predict the encoder state of the new sentence
body_encoding = encoder_model.predict(raw_tokenized) 

In [112]:
#get output shapes of decoder word embedding
latent_dim = seq2seq_Model.get_layer('Decoder-Word-Embedding').output_shape[-1]

In [113]:
#get layer method for getting the embedding (word clusters)

decoder_inputs = seq2seq_Model.get_layer('Decoder-Input').input 
dec_emb = seq2seq_Model.get_layer('Decoder-Word-Embedding')(decoder_inputs)
dec_bn = seq2seq_Model.get_layer('Decoder-Batchnorm-1')(dec_emb)

gru_inference_state_input = tf.keras.Input(shape=(latent_dim,), name='hidden_state_input')

gru_out, gru_state_out = seq2seq_Model.get_layer('Decoder-GRU')([dec_bn, gru_inference_state_input])

# Reconstruct dense layers
dec_bn2 = seq2seq_Model.get_layer('Decoder-Batchnorm-2')(gru_out)
dense_out = seq2seq_Model.get_layer('Final-Output-Dense')(dec_bn2)

In [114]:
decoder_model = tf.keras.Model([decoder_inputs, gru_inference_state_input],
                          [dense_out, gru_state_out])

In [115]:
# save the encoder's embedding before its updated by decoder for later

original_body_encoding = body_encoding

In [116]:
state_value = np.array(tok2.word_index['_start_']).reshape(1, 1)

In [117]:
state_value

array([[1]])

In [118]:
decoded_sentence = []
stop_condition = False

In [119]:
vocabulary_inv = dict((v, k) for k, v in tok2.word_index.items())
#vocabulary_inv[0] = "<PAD/>"
#vocabulary_inv[1] = "unknown"

In [120]:
vocabulary_inv

{1: '_start_',
 2: '_end_',
 3: 'great',
 4: 'the',
 5: 'good',
 6: 'for',
 7: 'a',
 8: 'and',
 9: 'best',
 10: 'not',
 11: 'my',
 12: 'love',
 13: 'this',
 14: 'it',
 15: 'coffee',
 16: 'but',
 17: 'i',
 18: 'of',
 19: 'tea',
 20: 'to',
 21: 'is',
 22: 'taste',
 23: 'in',
 24: 'these',
 25: 'like',
 26: 'product',
 27: 'dog',
 28: 'very',
 29: 'price',
 30: 'flavor',
 31: 'ever',
 32: 'food',
 33: 'delicious',
 34: 'as',
 35: 'you',
 36: 'loves',
 37: 'with',
 38: 'favorite',
 39: 'too',
 40: 'are',
 41: 'dogs',
 42: 'on',
 43: 'tasty',
 44: 'excellent',
 45: 'better',
 46: 'so',
 47: 'snack',
 48: 'healthy',
 49: 'them',
 50: 'just',
 51: 'chocolate',
 52: 'than',
 53: 'what',
 54: 'at',
 55: 'free',
 56: 'no',
 57: 'one',
 58: 'yummy',
 59: 'tastes',
 60: 'hot',
 61: 'tasting',
 62: 'stuff',
 63: 'all',
 64: 'chips',
 65: 'cats',
 66: 'cat',
 67: 'really',
 68: 'nice',
 69: 'treat',
 70: 'sweet',
 71: 'have',
 72: 'its',
 73: 'little',
 74: 'me',
 75: 'buy',
 76: 'quality',
 77: 'pe

In [121]:
while not stop_condition:
    #print(1)
    preds, st = decoder_model.predict([state_value, body_encoding])

    pred_idx = np.argmax(preds[:, :, 2:]) + 2
    pred_word_str = vocabulary_inv[pred_idx]
    print(pred_word_str)
    if pred_word_str == '_end_' or len(decoded_sentence) >= maxlen2:
        stop_condition = True
        break
    decoded_sentence.append(pred_word_str)

    # update the decoder for the next word
    body_encoding = st
    state_value = np.array(pred_idx).reshape(1, 1)
    #print(state_value)

great
cookie
but
_end_


In [122]:
#compare to original summary

print([test['summary_no_punctuation'][8]])

['_start_ fast food fortune cookie _end_']
