In [1]:
import re
import nltk
import numpy
import string
import numpy as np
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Dropout, LSTM, Embedding

In [2]:
nltk.download('punkt')
string.punctuation = string.punctuation + '“' + '”' +'-' + '’' + '‘' + '—'
string.punctuation = string.punctuation.replace('.', '')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
# path to text file
doc_path = "drive/MyDrive/Newcastle University/Deep Learning/Data/61262-0.txt"

In [4]:
# Loads the data and preprocesses data and stores corpus in raw_text
raw_text = open(doc_path, encoding = 'utf8').read()

file_nl_removed = ""
for line in raw_text:
  line_nl_removed = line.replace("\n", " ")           #removes newlines
  file_nl_removed += line_nl_removed

file_p = "".join([char for char in file_nl_removed if char not in string.punctuation])   #removes all special characters
sents = nltk.sent_tokenize(file_p)
print("The number of sentences is", len(sents)) #prints the number of sentences

string.punctuation = string.punctuation + '.'
file_q = "".join([char for char in file_p if char not in string.punctuation])   #removes even periods.
words = nltk.word_tokenize(file_q)
print("The number of tokens is", len(words)) #prints the number of tokens

average_tokens = round(len(words)/len(sents))
print("The average number of tokens per sentence is", average_tokens) #prints the average number of tokens per sentence

unique_tokens = set(words)
print("The number of unique tokens are", len(unique_tokens)) #prints the number of unique tokens

preprocessed_text = file_p.lower()       #converts corpus into lowercase

The number of sentences is 3945
The number of tokens is 55708
The average number of tokens per sentence is 14
The number of unique tokens are 7326


In [5]:
print(preprocessed_text[1500:1638])

appearance of mr. davenheim    x the adventure of the italian nobleman    xi the case of the missing will       poirot investigates       


In [6]:
print(preprocessed_text[1638:2000])

poirot investigates     i     the adventure of the western star  i was standing at the window of poirots rooms looking out idly on the street below.  thats queer i ejaculated suddenly beneath my breath.  what is mon ami asked poirot placidly from the depths of his comfortable chair.  deduce poirot from the following facts here is a young lady richly dressedfas


In [7]:
print(preprocessed_text[-18700:-18640])

h wonderwhat old andrew marsh would have thought     the end


In [8]:
# removing disclaimer and other non novel text
preprocessed_text = preprocessed_text[1638:-18640]

In [9]:
set(preprocessed_text)

{' ',
 '.',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '£',
 'à',
 'â',
 'æ',
 'ç',
 'è',
 'é',
 'ê',
 'ô',
 '•'}

In [10]:
# Uses the preprocessed data and create raw_text
raw_text = preprocessed_text   #periods have not been removed for better results

# creates mapping of unique characters to integers
chars = sorted(list(set(raw_text)))
char_to_int = dict((c, i) for i, c in enumerate(chars))
int_to_char = dict((i, c) for i, c in enumerate(chars))

In [11]:
# Prints the total characters and character vocab size
n_chars = len(raw_text)
n_vocab = len(chars)

print("The number of total characters are", n_chars)
print("\nThe character vocab size is", n_vocab)

The number of total characters are 285205

The character vocab size is 48


In [12]:
#Prepares dataset where the input is sequence of 100 characters and target is next character.
seq_length = 100

dataX = []
dataY = []

for i in range(0, n_chars - seq_length, 1):
  seq_in = raw_text[i:i + seq_length]
  seq_out = raw_text[i + seq_length]

  dataX.append([char_to_int[char] for char in seq_in])
  dataY.append(char_to_int[seq_out])

n_patterns = len(dataX)
print ("Total Patterns: ", n_patterns)

Total Patterns:  285105


In [13]:
# reshapes X to be [samples, time steps, features]
X = np.reshape(dataX, (n_patterns, seq_length, 1))

# one hot encodes the output variable
y = to_categorical(dataY)

In [14]:
embedding_dim =100
max_length =100

In [20]:
model1 = Sequential()
model1.add(Embedding(n_vocab, embedding_dim, input_length=max_length))
model1.add(LSTM(256, input_shape=(X.shape[1], embedding_dim),return_sequences=True))
model1.add(Dropout(0.2))
model1.add(LSTM(256))
model1.add(Dropout(0.2))
model1.add(Dense(y.shape[1], activation='softmax'))

In [21]:
cp = ModelCheckpoint('drive/MyDrive/Newcastle University/Deep Learning/Models/DL_LM_CB.h5', monitor='val_loss')

In [22]:
model1.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [23]:
model1.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 100, 100)          4800      
                                                                 
 lstm_2 (LSTM)               (None, 100, 256)          365568    
                                                                 
 dropout_2 (Dropout)         (None, 100, 256)          0         
                                                                 
 lstm_3 (LSTM)               (None, 256)               525312    
                                                                 
 dropout_3 (Dropout)         (None, 256)               0         
                                                                 
 dense_1 (Dense)             (None, 48)                12336     
                                                                 
Total params: 908,016
Trainable params: 908,016
Non-tr

In [24]:
model1.fit(X, y, epochs = 20, batch_size=64, callbacks=cp)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f7ae2195310>

In [25]:
# Generates the sequence similar to above methods. Gets the generated string using the model.
def predict_next_n_chars(pattern, n):
    for i in range(n):
      x = numpy.reshape(pattern, (1, len(pattern), 1))
      prediction = model1.predict(x, verbose=0)
      print (int_to_char[numpy.argmax(prediction)], end = '')   #get next char index.
      seq_in = [int_to_char[value] for value in pattern]
      pattern.append(numpy.argmax(prediction))
      pattern = pattern[1:len(pattern)]

In [42]:
#picks a random seed
start = numpy.random.randint(0, len(dataX)-1)
pattern = dataX[start]
input_str = ''.join([int_to_char[value] for value in pattern])
print ("Seed -",  input_str, sep = '\n\n')
print ("\nGenerated string -\n")

predict_next_n_chars(pattern, 50)

Seed -

drove there in a taxi.  mr. philip ridgeway was there before us and looked somewhat surprised to see

Generated string -

 him the story of the case of the case of the case

In [32]:
input_str = "The boy laughed at the fright he had caused. This time, the villagers left angrily. The third day, as the boy went up\
 the small hill, he suddenly saw a wolf attacking his sheep. He cried as hard as he could, “Wolf! Wolf! Wolf!”, but not \
 a single villager came to help him. The villagers thought that he was trying to fool them again and did not come to rescue \
 him or his sheep."

In [37]:
#Uses the first 100 characters from given input_str as input to generate next 200 characters. 
input_str = input_str.lower()
input_string = ''
for each in input_str:
  if each in chars:
    if (len (input_string)<100):
      input_string += each

In [46]:
pattern = []
pattern.append([char_to_int[char] for char in input_string])

In [47]:
print ("Seed -",  input_str, sep = '\n\n')
print ("\nGenerated string -\n")
predict_next_n_chars(pattern[0], 50)

Seed -

drove there in a taxi.  mr. philip ridgeway was there before us and looked somewhat surprised to see

Generated string -

i should have been a man of the station of the cas

In [40]:
pattern = []
pattern.append([char_to_int[char] for char in input_string])

In [41]:
print ("Seed -",  input_str, sep = '\n\n')
print ("\nGenerated string -\n")
predict_next_n_chars(pattern[0], 50)

Seed -

the boy laughed at the fright he had caused. this time, the villagers left angrily. the third day, as the boy went up the small hill, he suddenly saw a wolf attacking his sheep. he cried as hard as he could, “wolf! wolf! wolf!”, but not  a single villager came to help him. the villagers thought that he was trying to fool them again and did not come to rescue  him or his sheep.

Generated string -

i should have been a man of the station of the cas