In [1]:
### Imports

import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
def generate_sequence(list_of_words, sequence_length):
    res = []
    for i in range(len(list_of_words)-sequence_length):
        res.append((list_of_words[i:i+sequence_length],list_of_words[i+sequence_length]))
    return res

def splitter(database, fraction, num_words):
    
    training = [x for x in database['tokens']]
    labels = [x for x in database['label_tokens']]

    X_train = np.array(training[:int(fraction*len(training))])
    X_test = np.array(training[int(fraction*len(training)):])

    y_train_base = np.array(labels)[:int(fraction*len(labels))]
    y_test_base = np.array(labels)[int(fraction*len(labels)):]

    y_train = np.zeros((len(y_train_base), num_words), dtype=np.int8)
    y_test = np.zeros((len(y_test_base), num_words), dtype=np.int8)

    # One hot encoding of labels
    for example_index, word_index in enumerate(y_train_base):
        y_train[example_index, word_index] = 1

    for example_index, word_index in enumerate(y_test_base):
        y_test[example_index, word_index] = 1

    print(f'The training sequence shape is {X_train.shape}, the training label shape is {y_train.shape}')
    print(f'The test sequence shape is {X_test.shape}, the test label shape is  {y_test.shape}')
    return X_train, X_test, y_train, y_test

In [3]:
books= []


delims = ' |\n|\ufeff|_'
punctuation = '.|,|;|:'
wrapping = r'\--(.--?)\--'
sequence_length = 100

for i in range(1,11):
    with open(f'../Data/ABT/{i}.txt', mode="r", encoding="utf-8") as file:
        contents = file.read().lower()
        contents = re.sub(r'[^\w\s]', '', contents)
        contents = re.split(delims, contents)
        books.append([x for x in contents if x != ''])

for ind, book in enumerate(books):
    books[ind] = generate_sequence(book, sequence_length=sequence_length)

master = []

for book in books:
    for sequence in book:
        master.append(sequence)

master_frame = pd.DataFrame(master, columns=['sequence','label'])
master_frame.sample(10)

Unnamed: 0,sequence,label
416318,"[after, him, and, settled, down, but, hardly, ...",light
626335,"[added, eh, she, said, nothing, to, that, it, ...",know
708629,"[presence, bowed, silently, and, went, out, th...",this
543168,"[i, owed, there, was, no, doubt, flattery, in,...",ago
85962,"[judgment, based, on, what, seemed, a, natural...",by
433982,"[remembered, because, her, untidy, back, hair,...",face
494117,"[him, from, all, directions, burying, him, so,...",it
220968,"[the, edge, of, the, wall, satan, has, come, t...",really
173165,"[separatedifferent, things, i, suppose, so, no...",our
135856,"[felt, himself, into, the, rôle, until, he, ha...",momentsone


In [4]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(lower=False)
tokenizer.fit_on_texts(master_frame['sequence'])
master_frame['tokens'] = tokenizer.texts_to_sequences(master_frame['sequence'])
master_frame['label_tokens'] = tokenizer.texts_to_sequences(master_frame['label'])

word_lexicon = tokenizer.word_index
word_index = tokenizer.index_word
num_words = len(word_lexicon) + 1
word_counts = tokenizer.word_counts
sorted_counts = dict(sorted(dict(word_counts).items(), key=lambda item: item[1],reverse=True))

master_frame.sample(5)

Unnamed: 0,sequence,label,tokens,label_tokens
628528,"[were, springs, in, her, feet, and, her, movem...",to,"[32, 6681, 7, 19, 246, 2, 19, 967, 72, 5, 2823...",[5]
167537,"[own, theories, and, prejudices, might, be, in...",whereas,"[79, 3114, 2, 7966, 136, 44, 7, 25, 525, 333, ...",[1581]
494021,"[changed, room, plucked, at, the, center, of, ...",burying,"[808, 97, 6281, 18, 1, 2131, 3, 10, 145, 115, ...",[5178]
648693,"[enduring, quality, in, his, character, led, h...",the,"[9913, 938, 7, 10, 889, 446, 17, 9, 8, 59, 237...",[1]
52974,"[the, heart, of, the, whole, thing, you, ask, ...",the,"[1, 145, 3, 1, 160, 155, 21, 551, 268, 11, 29,...",[1]


In [5]:
import tensorflow as tf
from keras.layers import Embedding, LSTM, Dropout, Bidirectional, Dense
from keras.models import Sequential, load_model

callback = tf.keras.callbacks.ModelCheckpoint(
                                                filepath='../Data/weights_RNN.h5',
                                                save_weights_only=True,
                                                monitor='val_accuracy',
                                                mode='max',
                                                save_best_only=True)

LSTM_cells = 64

model = Sequential()
model.add(Embedding(num_words, 300, input_length=sequence_length))
model.add(LSTM(LSTM_cells,return_sequences=True,dropout=0.1))
model.add(Bidirectional(LSTM(LSTM_cells,return_sequences=False,dropout=0.1)))
model.add(Dropout(0.5))
model.add(Dense(num_words, activation='softmax'))
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])

model.save('../Data/RNN_model.h5')

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 300)          7485600   
                                                                 
 lstm (LSTM)                 (None, 100, 64)           93440     
                                                                 
 bidirectional (Bidirectiona  (None, 128)              66048     
 l)                                                              
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense (Dense)               (None, 24952)             3218808   
                                                                 
Total params: 10,863,896
Trainable params: 10,863,896
Non-trainable params: 0
____________________________________________

In [6]:
BATCH_SIZE = 64
steps_per_epoch = 40
epochs = 40

step = 0.1
cut_list = step * np.arange(10)

for start, end in zip(cut_list, cut_list+step):
    
        temp_frame = master_frame.iloc[int(start*len(master_frame)):int(end*len(master_frame))+1]

        X_train, X_test, y_train, y_test = splitter(temp_frame,0.9, num_words)

        model = load_model('../Data/RNN_model.h5')

        model.fit(X_train, y_train, batch_size=64,
                steps_per_epoch=steps_per_epoch,
                epochs=epochs,
                callbacks=callback,
                validation_data=(X_test,y_test),
                validation_steps=10)

        del (X_train, X_test, y_train, y_test)

        model.save('../Data/RNN_model.h5')

The training sequence shape is (64737, 100), the training label shape is (64737, 24952)
The test sequence shape is (7193, 100), the test label shape is  (7193, 24952)
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
The training sequence shape is (64737, 100), the training label shape is (64737, 24952)
The test sequence shape is (7193, 100), the test label shape is  (7193, 24952)
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40


InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:CPU:0 to /job:localhost/replica:0/task:0/device:GPU:0 in order to run _EagerConst: Dst tensor is not initialized.