In [20]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [21]:
filename='/content/drive/MyDrive/data.txt'
raw_text = open(filename, 'r', encoding='utf-8').read()
raw_text = raw_text.lower()

In [22]:
chars = sorted(list(set(raw_text)))
print(chars)
char_to_int = dict((c,i) for i,c in enumerate(chars))

['\n', ' ', '!', '"', '#', '$', '%', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', '@', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [None]:
int_to_char = {i: char for i, char in enumerate(chars)} # for future perpus

In [23]:
# get the idea of the data
n_chars = len(raw_text)
n_vocab = len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab(unique characters) : ", n_vocab)

Total Characters:  163780
Total Vocab(unique characters) :  58


In [24]:
seq_length = 15
dataX = []
dataY = []

for i in range(0, n_chars - seq_length, 1):
  seq_in = raw_text[i:i + seq_length]
  seq_out = raw_text[i + seq_length]
  dataX.append([char_to_int[char] for char in seq_in])
  dataY.append(char_to_int[seq_out])
n_patterns = len(dataY)
print("Total Patterns: ", n_patterns)

Total Patterns:  163765


In [25]:
dataX

[[47, 49, 46, 41, 36, 34, 51, 1, 38, 52, 51, 36, 45, 33, 36],
 [49, 46, 41, 36, 34, 51, 1, 38, 52, 51, 36, 45, 33, 36, 49],
 [46, 41, 36, 34, 51, 1, 38, 52, 51, 36, 45, 33, 36, 49, 38],
 [41, 36, 34, 51, 1, 38, 52, 51, 36, 45, 33, 36, 49, 38, 7],
 [36, 34, 51, 1, 38, 52, 51, 36, 45, 33, 36, 49, 38, 7, 50],
 [34, 51, 1, 38, 52, 51, 36, 45, 33, 36, 49, 38, 7, 50, 1],
 [51, 1, 38, 52, 51, 36, 45, 33, 36, 49, 38, 7, 50, 1, 32],
 [1, 38, 52, 51, 36, 45, 33, 36, 49, 38, 7, 50, 1, 32, 43],
 [38, 52, 51, 36, 45, 33, 36, 49, 38, 7, 50, 1, 32, 43, 40],
 [52, 51, 36, 45, 33, 36, 49, 38, 7, 50, 1, 32, 43, 40, 34],
 [51, 36, 45, 33, 36, 49, 38, 7, 50, 1, 32, 43, 40, 34, 36],
 [36, 45, 33, 36, 49, 38, 7, 50, 1, 32, 43, 40, 34, 36, 7],
 [45, 33, 36, 49, 38, 7, 50, 1, 32, 43, 40, 34, 36, 7, 50],
 [33, 36, 49, 38, 7, 50, 1, 32, 43, 40, 34, 36, 7, 50, 1],
 [36, 49, 38, 7, 50, 1, 32, 43, 40, 34, 36, 7, 50, 1, 32],
 [49, 38, 7, 50, 1, 32, 43, 40, 34, 36, 7, 50, 1, 32, 35],
 [38, 7, 50, 1, 32, 43, 40, 34, 

In [26]:
#Pre Processing task
import numpy as np
X = np.reshape(dataX, (n_patterns, seq_length, 1) )

X = X/ float(n_vocab)
print(X)

[[[0.81034483]
  [0.84482759]
  [0.79310345]
  ...
  [0.77586207]
  [0.56896552]
  [0.62068966]]

 [[0.84482759]
  [0.79310345]
  [0.70689655]
  ...
  [0.56896552]
  [0.62068966]
  [0.84482759]]

 [[0.79310345]
  [0.70689655]
  [0.62068966]
  ...
  [0.62068966]
  [0.84482759]
  [0.65517241]]

 ...

 [[0.55172414]
  [0.56896552]
  [0.79310345]
  ...
  [0.79310345]
  [0.79310345]
  [0.72413793]]

 [[0.56896552]
  [0.79310345]
  [0.89655172]
  ...
  [0.79310345]
  [0.72413793]
  [0.86206897]]

 [[0.79310345]
  [0.89655172]
  [0.87931034]
  ...
  [0.72413793]
  [0.86206897]
  [0.22413793]]]


In [10]:
import tensorflow as tf
from tensorflow import keras
from keras import utils

y = utils.to_categorical(dataY)


In [15]:
from tensorflow.keras.layers import LSTM, Dense, Dropout, Attention
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import ModelCheckpoint

In [27]:
# define the LSTM model
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

#define the checkpoint
filepath = "weights-improvement-{epoch:02d}-{loss:.4f}.hdf5" #a file format
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

In [33]:
epochs = 10
batch_size = 500

In [34]:
model.fit(X,y, epochs=epochs, batch_size = batch_size, callbacks=callbacks_list)

Epoch 1/10
Epoch 1: loss improved from inf to 3.08257, saving model to weights-improvement-01-3.0826.hdf5
Epoch 2/10
Epoch 2: loss improved from 3.08257 to 2.95103, saving model to weights-improvement-02-2.9510.hdf5
Epoch 3/10
Epoch 3: loss improved from 2.95103 to 2.86995, saving model to weights-improvement-03-2.8700.hdf5
Epoch 4/10
Epoch 4: loss improved from 2.86995 to 2.81344, saving model to weights-improvement-04-2.8134.hdf5
Epoch 5/10
Epoch 5: loss improved from 2.81344 to 2.76696, saving model to weights-improvement-05-2.7670.hdf5
Epoch 6/10
Epoch 6: loss improved from 2.76696 to 2.73159, saving model to weights-improvement-06-2.7316.hdf5
Epoch 7/10
Epoch 7: loss improved from 2.73159 to 2.70049, saving model to weights-improvement-07-2.7005.hdf5
Epoch 8/10
Epoch 8: loss improved from 2.70049 to 2.67153, saving model to weights-improvement-08-2.6715.hdf5
Epoch 9/10
Epoch 9: loss improved from 2.67153 to 2.64660, saving model to weights-improvement-09-2.6466.hdf5
Epoch 10/10
Ep

<keras.src.callbacks.History at 0x78fbec4b5000>

In [38]:
filename = "weights-improvement-10-2.6222.hdf5" #give the filename given in results
model.load_weights(filename)
model.compile(loss='categorical_crossenrtropy', optimizer='adam')

In [41]:
int_to_char = {i: char for i, char in enumerate(chars)}

In [31]:
print(len(dataX))
start = np.random.randint(0, len(dataX) - 1)
print(start)
pattern = dataX[start]
print("Seed : ")
print("\"",''.join([int_to_char[value] for value in pattern]),"\"")

163765
19581
Seed : 
" hat i should th "


In [None]:
#the letters that can be predicted
length = 10
final = []
for i in range(length):
  x = np.reshape(pattern, (1, len(pattern), 1))
  x = x / float(n_vocab)

  prediction = model.predict(x, verbose = 0)
  index = np.argmax(prediction)

  result = int_to_char[index]
  final.append(result)

  pattern.append(index)

  pattern = pattern[1: len(pattern)]

print(final)

['a', 'n', 'd', ' ', 't', 'h', 'e', ' ', 't', 'a']
