## imports

In [1]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense , Dropout , LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

2022-11-05 15:22:37.147443: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Load ascii text and convert it to lower case

In [2]:
filename = "data.txt"
raw_text = open(filename, 'r', encoding='utf-8').read()
raw_text = raw_text.lower()

## Create mapping of unique chars to integers

In [3]:
chars = sorted(list(set(raw_text)))
print(chars,"\n")

# give an integer to each character, index in the list as the integer value for character
char_to_int = dict((c,i) for i,c in enumerate(chars))
print(char_to_int)

['\n', ' ', '!', '"', '#', '$', '%', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', '@', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'] 

{'\n': 0, ' ': 1, '!': 2, '"': 3, '#': 4, '$': 5, '%': 6, "'": 7, '(': 8, ')': 9, '*': 10, ',': 11, '-': 12, '.': 13, '/': 14, '0': 15, '1': 16, '2': 17, '3': 18, '4': 19, '5': 20, '6': 21, '7': 22, '8': 23, '9': 24, ':': 25, ';': 26, '?': 27, '@': 28, '[': 29, ']': 30, '_': 31, 'a': 32, 'b': 33, 'c': 34, 'd': 35, 'e': 36, 'f': 37, 'g': 38, 'h': 39, 'i': 40, 'j': 41, 'k': 42, 'l': 43, 'm': 44, 'n': 45, 'o': 46, 'p': 47, 'q': 48, 'r': 49, 's': 50, 't': 51, 'u': 52, 'v': 53, 'w': 54, 'x': 55, 'y': 56, 'z': 57}


## Getting the details of the dataset

In [4]:
n_chars = len(raw_text)
n_vocab = len(chars)
print("Total number of Characters: ", n_chars)
print("Total Vocab(Unique characters): ", n_vocab)

Total number of Characters:  163780
Total Vocab(Unique characters):  58


## Prepare the dataset of input to output pairs encoded as integers
<br>
select 100 letters at a time and count the number of 100 letter blocks

In [5]:
seq_length = 100 # can be changed
dataX = []
dataY = [] 
for i in range(0,n_chars - seq_length, 1):
    seq_in = raw_text[i:i + seq_length]
    seq_out = raw_text[i + seq_length]
    dataX.append([char_to_int[char] for char in seq_in])
    dataY.append(char_to_int[seq_out])

n_patterns = len(dataY)
print("Total Patterns: ", n_patterns)

Total Patterns:  163680


In [6]:
# second 100 character block (starting from r)
print(dataX[1])
# what character will come after that 100 character block
print(dataY[1])

[49, 46, 41, 36, 34, 51, 1, 38, 52, 51, 36, 45, 33, 36, 49, 38, 7, 50, 1, 32, 43, 40, 34, 36, 7, 50, 1, 32, 35, 53, 36, 45, 51, 52, 49, 36, 50, 1, 40, 45, 1, 54, 46, 45, 35, 36, 49, 43, 32, 45, 35, 11, 1, 33, 56, 1, 43, 36, 54, 40, 50, 1, 34, 32, 49, 49, 46, 43, 43, 0, 0, 51, 39, 40, 50, 1, 36, 33, 46, 46, 42, 1, 40, 50, 1, 37, 46, 49, 1, 51, 39, 36, 1, 52, 50, 36, 1, 46, 37, 1]
32


## Transform input sequences into form expected by LSTM network & Rescale integers to 0-1

In [7]:
# reshape x to be [samples, time steps, features]
X = np.reshape(dataX, (n_patterns, seq_length, 1))
# normalize - rescaling the integer values
X = X / float(n_vocab)
print(X)

[[[0.81034483]
  [0.84482759]
  [0.79310345]
  ...
  [0.01724138]
  [0.79310345]
  [0.63793103]]

 [[0.84482759]
  [0.79310345]
  [0.70689655]
  ...
  [0.79310345]
  [0.63793103]
  [0.01724138]]

 [[0.79310345]
  [0.70689655]
  [0.62068966]
  ...
  [0.63793103]
  [0.01724138]
  [0.55172414]]

 ...

 [[0.87931034]
  [0.79310345]
  [0.01724138]
  ...
  [0.79310345]
  [0.79310345]
  [0.72413793]]

 [[0.79310345]
  [0.01724138]
  [0.67241379]
  ...
  [0.79310345]
  [0.72413793]
  [0.86206897]]

 [[0.01724138]
  [0.67241379]
  [0.62068966]
  ...
  [0.72413793]
  [0.86206897]
  [0.22413793]]]


## Convert output values (single characters converted to integers) to one hot encoding

In [8]:
# one hot encode the output variable
y = np_utils.to_categorical(dataY)
print(y)

[[0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]


## Define LSTM model

In [9]:
model = Sequential()
# it can have one or more training samples
model.add(LSTM(256, input_shape = (X.shape[1], X.shape[2])))
# can reduce complexity by adding dropout
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation = 'softmax'))
model.compile(loss = 'categorical_crossentropy',optimizer = 'adam')

# define the checkpoint
filepath = 'weights-improvement-{epoch:02d}-{loss:.4f}.hdf5'
checkpoint = ModelCheckpoint(filepath = filepath, monitor = 'loss', verbose = 1, save_best_only = True, mode = 'min')
callbacks_list = [checkpoint]

2022-11-05 15:22:52.673595: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Fitting the model to data

In [10]:
# change the hyperparameter values and train model
# when no.of epochs increases model can be trainable well and loss will decrease 
epochs = 10
batch_size = 128
model.fit(X, y, epochs = epochs, batch_size = batch_size, callbacks = callbacks_list)

Epoch 1/10
Epoch 1: loss improved from inf to 2.97230, saving model to weights-improvement-01-2.9723.hdf5
Epoch 2/10
Epoch 2: loss improved from 2.97230 to 2.80086, saving model to weights-improvement-02-2.8009.hdf5
Epoch 3/10
Epoch 3: loss improved from 2.80086 to 2.71533, saving model to weights-improvement-03-2.7153.hdf5
Epoch 4/10
Epoch 4: loss improved from 2.71533 to 2.64485, saving model to weights-improvement-04-2.6449.hdf5
Epoch 5/10
Epoch 5: loss improved from 2.64485 to 2.59266, saving model to weights-improvement-05-2.5927.hdf5
Epoch 6/10
Epoch 6: loss improved from 2.59266 to 2.53773, saving model to weights-improvement-06-2.5377.hdf5
Epoch 7/10
Epoch 7: loss improved from 2.53773 to 2.48698, saving model to weights-improvement-07-2.4870.hdf5
Epoch 8/10
Epoch 8: loss improved from 2.48698 to 2.44015, saving model to weights-improvement-08-2.4402.hdf5
Epoch 9/10
Epoch 9: loss improved from 2.44015 to 2.39981, saving model to weights-improvement-09-2.3998.hdf5
Epoch 10/10
Ep

<keras.callbacks.History at 0x14436ea10>

## Generate text with the trained LSTM model

In [11]:
# get the weight with lowest loss
filename = "weights-improvement-10-2.4187.hdf5"
model.load_weights(filename)
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam')

## Create mapping of unique integers to chars

In [12]:
int_to_char = dict((i,c) for i,c in enumerate(chars))

## Generate a random seed

In [13]:
print(len(dataX))
start = np.random.randint(0, len(dataX) - 1)
print(start)
# dataX contains list of patterns
pattern = dataX[start]
print("Seed:")
print("\"", ''.join([int_to_char[value] for value in pattern]), "\"")

163680
53033
Seed:
"  said to herself, and nibbled a little of
the right-hand bit to try the effect: the next moment she  "


## Generate characters

In [15]:
# generate next 100 characters
length = 100
final = []

for i in range(length):
    # reshaping the seed sequence before passing it to the LSTM model
    x = np.reshape(pattern, (1, len(pattern), 1))
    # print(x)
    # normalize the integer values
    x = x / float(n_vocab)
    # print(x)
    # making the predictions
    prediction = model.predict(x, verbose = 0)
    # get the predicted value with maximum probability
    index = np.argmax(prediction)

    # convert the predicted integer values to char
    result = int_to_char[index]
    # print(result)
    final.append(result)
    # adding the predicted character to the sequence
    pattern.append(index)
    # removing the first character from the seed sequence
    pattern = pattern[1:len(pattern)]
print(final)


['w', 'o', 'r', 'k', 'e', ' ', 't', 'h', ' ', 't', 'h', 'e', ' ', 'w', 'o', 'r', 'k', ' ', ' ', "'", ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ']
