In [None]:
### Imports

import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
def generate_sequence(list_of_words, sequence_length):
    """Pair every window of ``sequence_length`` words with the word after it.

    Parameters
    ----------
    list_of_words : sequence
        The words (or any sliceable sequence of items) to window over.
    sequence_length : int
        Number of consecutive items in each window.

    Returns
    -------
    list of tuple
        ``(window, next_item)`` pairs, one per start position, where
        ``window`` is ``list_of_words[start:start + sequence_length]``.
        The list is empty when the input has no more than
        ``sequence_length`` items.
    """
    # A window needs one extra item after it to serve as its label, so the
    # number of usable start positions is len - sequence_length.
    window_count = len(list_of_words) - sequence_length
    return [
        (
            list_of_words[start:start + sequence_length],
            list_of_words[start + sequence_length],
        )
        for start in range(window_count)
    ]

def _one_hot(label_rows, width):
    """Return an int8 matrix with a 1 at each row's label position(s)."""
    encoded = np.zeros((len(label_rows), width), dtype=np.int8)
    for example_index, word_index in enumerate(label_rows):
        # word_index may be a single id or a list of ids; an empty list
        # leaves the row all zeros.
        encoded[example_index, word_index] = 1
    return encoded


def splitter(database, fraction, vocab_size=None):
    """Split tokenised sequences into train/test arrays with one-hot labels.

    Rows are split in their existing order (no shuffling): the first
    ``fraction`` of the rows form the training set, the rest the test set.

    Parameters
    ----------
    database : pandas.DataFrame or mapping of column name -> sequence
        Must provide a ``'tokens'`` column (one list of token ids per row)
        and a ``'label_tokens'`` column (the label's token id, or a list of
        ids, per row). Both columns are expected to have the same length.
    fraction : float
        Share of the rows assigned to the training set, in ``[0, 1]``.
    vocab_size : int, optional
        Width of the one-hot label vectors. When omitted, the module-level
        ``num_words`` is used, so that name must exist before the call.

    Returns
    -------
    X_train, X_test : numpy.ndarray
        Token rows stacked into arrays; 2-D when every row has the same
        number of tokens.
    y_train, y_test : numpy.ndarray
        ``int8`` one-hot label matrices of shape ``(rows, vocab_size)``.

    Raises
    ------
    ValueError
        If ``fraction`` is outside ``[0, 1]``.
    """
    if not 0 <= fraction <= 1:
        raise ValueError(f'fraction must be between 0 and 1, got {fraction!r}')

    if vocab_size is None:
        # Backward-compatible default: fall back to the notebook-level global.
        vocab_size = num_words

    # Materialise the columns as plain Python lists. Calling np.array directly
    # on a pandas Series of lists yields a 1-D object array of lists instead of
    # a 2-D integer matrix.
    tokens = list(database['tokens'])
    labels = list(database['label_tokens'])

    # Honour the requested fraction (previously a fixed 0.75 was used).
    split_at = int(fraction * len(tokens))

    X_train = np.array(tokens[:split_at])
    X_test = np.array(tokens[split_at:])

    # One hot encoding of labels
    y_train = _one_hot(labels[:split_at], vocab_size)
    y_test = _one_hot(labels[split_at:], vocab_size)

    print(f'The training sequence shape is {X_train.shape}, the training label shape is {y_train.shape}')
    print(f'The test sequence shape is {X_test.shape}, the test label shape is  {y_test.shape}')
    return X_train, X_test, y_train, y_test

In [None]:
# Patterns and sizes used while preparing the corpus. Only `delims` and
# `sequence_length` are consumed in this cell; `punctuation` and `wrapping`
# are defined here but not referenced by it.
delims = ' |\n|\ufeff|_'
punctuation = '.|,|;|:'
wrapping = r'\--(.--?)\--'
sequence_length = 50

# Read books 1..10, lower-case them, drop every character that is neither a
# word character nor whitespace, then split into non-empty words.
books = []
for book_number in range(1, 11):
    with open(f'../Data/ABT/{book_number}.txt', mode="r", encoding="utf-8") as handle:
        raw_text = handle.read().lower()
    stripped = re.sub(r'[^\w\s]', '', raw_text)
    words = [piece for piece in re.split(delims, stripped) if piece != '']
    books.append(words)

# Turn each book's word list into (window, next word) pairs. Windows are built
# per book, so no window spans two books.
books = [generate_sequence(words, sequence_length=sequence_length) for words in books]

# Flatten the per-book pair lists into one list, keeping book order.
master = [pair for book_pairs in books for pair in book_pairs]

master_frame = pd.DataFrame(master, columns=['sequence','label'])
master_frame.sample(10)

In [None]:
from keras.preprocessing.text import Tokenizer

# Fit the vocabulary on the word windows only, keeping case as-is.
tokenizer = Tokenizer(lower=False)
tokenizer.fit_on_texts(master_frame['sequence'])

# Encode the windows and their labels with the fitted vocabulary, storing the
# result next to the source columns.
for text_column, token_column in (('sequence', 'tokens'), ('label', 'label_tokens')):
    master_frame[token_column] = tokenizer.texts_to_sequences(master_frame[text_column])

# Vocabulary lookups. Note the naming: `word_lexicon` is the tokenizer's
# word_index attribute, while `word_index` is its index_word attribute.
word_lexicon = tokenizer.word_index
word_index = tokenizer.index_word
word_counts = tokenizer.word_counts

# One more than the number of vocabulary entries.
num_words = len(word_lexicon) + 1

# Word frequencies ordered from most to least frequent.
by_frequency = sorted(dict(word_counts).items(), key=lambda pair: pair[1], reverse=True)
sorted_counts = dict(by_frequency)

X_train, X_test, y_train, y_test = splitter(master_frame, 0.9)

master_frame.sample(5)

In [None]:
import tensorflow as tf
from keras.layers import Embedding, LSTM, Dropout, Bidirectional, Dense
from keras.models import Sequential

# Number of units in every LSTM layer below.
LSTM_cells = 64

# Next-word classifier: embedding -> two stacked LSTMs -> bidirectional LSTM
# -> dense head with a softmax over the `num_words` vocabulary slots.
model = Sequential()
# The input length follows `sequence_length` (the window size used to build
# the training sequences) instead of repeating the literal 50, so the two
# cannot drift apart.
model.add(Embedding(num_words, 300, input_length=sequence_length))
# The first two LSTMs return the full sequence so the next recurrent layer
# receives one vector per timestep.
model.add(LSTM(LSTM_cells,return_sequences=True,dropout=0.1))
model.add(LSTM(LSTM_cells,return_sequences=True,dropout=0.1))
# The last recurrent layer returns only its final output.
model.add(Bidirectional(LSTM(LSTM_cells,return_sequences=False,dropout=0.1)))
model.add(Dense(300, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_words, activation='softmax'))
# categorical_crossentropy matches the one-hot label matrices built by splitter().
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])

model.summary()

In [None]:
# Training configuration.
BATCH_SIZE = 64
steps_per_epoch = 40
epochs = 40

# Save the weights only when validation accuracy reaches a new maximum.
callback = tf.keras.callbacks.ModelCheckpoint(
                                                filepath='../Data/weights_RNN.h5',
                                                save_weights_only=True,
                                                monitor='val_accuracy',
                                                mode='max',
                                                save_best_only=True)

# Use the BATCH_SIZE constant rather than repeating the literal 64, and pass
# the callback as a list, which is the documented form of `callbacks`.
model.fit(X_train, y_train, batch_size=BATCH_SIZE,
          steps_per_epoch=steps_per_epoch,
          epochs=epochs,
          callbacks=[callback],
          validation_data=(X_test,y_test),
          validation_steps=10)