In [1]:
import tensorflow as tf
import numpy as np
from load_data import get_index_tables, filter_len, index_words, find_longest_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
import gc

## Check tf version

In [2]:
# Display the installed TensorFlow version so the environment used for this run is recorded in the notebook.
tf.__version__

'2.1.0'

## Import Data

In [3]:
# Load the two vocabulary lookup tables: word -> integer id, and the inverse id -> word.
word2index, index2word = get_index_tables()
# Load the article texts and their summaries.
# NOTE(review): presumably filter_len drops examples based on length — confirm in load_data.
text, summary = filter_len()

## For Debugging

In [4]:
# Sanity check: show the first 11 (key, value) pairs of each lookup table.
# Pairing the items with a bounded range stops the iteration after 11 entries.
preview_count = 11

for position, pair in zip(range(preview_count), word2index.items()):
    print(pair)

print()

for position, pair in zip(range(preview_count), index2word.items()):
    print(pair)

('the', 0)
('to', 1)
('of', 2)
('a', 3)
('and', 4)
('in', 5)
('that', 6)
('for', 7)
('is', 8)
('on', 9)
('said', 10)

(0, 'the')
(1, 'to')
(2, 'of')
(3, 'a')
(4, 'and')
(5, 'in')
(6, 'that')
(7, 'for')
(8, 'is')
(9, 'on')
(10, 'said')


## Process Text and Summary Data

Turn the text and summary from lists of words into lists of ints using the word2index table.

In [5]:
# Encode both corpora as integer ids using the word2index lookup table.
indexed_text = index_words(text, word2index)
indexed_summary = index_words(summary, word2index)

# The word-level lists are not used again in this cell; drop the references
# and trigger a garbage-collection pass before the next processing step.
del text, summary
gc.collect()

# Spot-check a single encoded summary.
print(indexed_summary[1])

[0, 1386, 4214, 11, 323, 1, 3, 405, 243, 306, 7, 405, 52, 4934, 105, 1528, 7, 3633, 1121, 975, 459, 12, 40, 41, 16, 3, 455, 129, 5, 275, 99, 47, 17, 513, 214, 81, 53, 36, 24, 41, 13, 113, 73, 0, 1386, 9352, 286]


## Pad the Data

Zero-pad the text and summary sequences. First find the longest sequence in the texts and the longest in the summaries, then pad every text and every summary to its respective longest length.

In [10]:
# Pad each corpus to the length of its own longest sequence so that every
# corpus becomes a single rectangular array (one row per example).
# NOTE(review): find_longest_sequence is assumed to return a length (int) — confirm in load_data.
# NOTE(review): pad_sequences pads with 0 on the left ('pre') by default, while
# word2index maps 'the' to 0 (see the debugging output), so padding cannot be
# told apart from the token 'the' — confirm downstream code accounts for this.
longest_text, longest_summary = (
    find_longest_sequence(indexed_text),
    find_longest_sequence(indexed_summary),
)

text = pad_sequences(indexed_text, maxlen=longest_text)
summary = pad_sequences(indexed_summary, maxlen=longest_summary)

# Report (num_examples, padded_length) for each array, one per line.
print(text.shape, summary.shape, sep="\n")

(35878, 491)
(35878, 78)
