In [4]:
#Load the whole corpus as one string, then derive a lower-cased, line-by-line view of it
# NOTE(review): no encoding is passed to open(), so the platform default is used — confirm
# this matches how datashakes.txt was saved.
with open('datashakes.txt', mode='r') as file:
    text = file.read()
# `text` keeps its original casing; only the per-line copy is lower-cased
lines = text.lower().split('\n')

In [17]:
#Tokenize the corpus: word list, fitted tokenizer, per-line id sequences and vocabulary size
from tensorflow.keras.preprocessing.text import text_to_word_sequence, Tokenizer

# Flat list of words taken from the full (un-split) text
words = text_to_word_sequence(text)

# The tokenizer learns its word -> integer id mapping from that word list
tokenizer = Tokenizer()
tokenizer.fit_on_texts(words)

# Each line of the corpus becomes a list of word ids
sequences = tokenizer.texts_to_sequences(lines)

# One more than the number of distinct words: Keras word ids start at 1, leaving id 0 free
vocabulary_size = 1 + len(tokenizer.word_index)

In [18]:
#Collect every prefix of length >= 2 of each encoded line, in line order
# A line with n ids contributes n-1 prefixes (lengths 2..n); lines with fewer than
# two ids contribute nothing.
subsequences = [
    sequence[:end]
    for sequence in sequences
    for end in range(2, len(sequence) + 1)
]

In [20]:
#Left-pad every subsequence to the length of the longest encoded line
from keras.preprocessing.sequence import pad_sequences

# Longest line, measured in word ids, sets the common width
sequence_length = max(map(len, sequences))

# NOTE: `sequences` is rebound here, from the per-line id lists to the padded
# array built from `subsequences`; later cells slice this array.
sequences = pad_sequences(subsequences, padding='pre', maxlen=sequence_length)

In [22]:
#Split each padded row into model input and one-hot encoded target
from keras.utils import to_categorical

# Inputs: every column except the last one
x = sequences[:, :-1]
# Targets: the last column, expanded to one-hot vectors of width vocabulary_size
y = to_categorical(sequences[:, -1], num_classes=vocabulary_size)

In [26]:
from keras.models import Sequential
from keras.layers import Embedding, Input, LSTM, Dropout, Dense

# Next-word model: word ids -> 100-d embeddings -> LSTM(100) -> dropout -> softmax over the vocabulary.
model = Sequential()
# Declare the input shape with an explicit Input layer. The `input_length` argument of
# Embedding is deprecated (and ignored) in Keras 3, so it is no longer passed.
# Each sample holds sequence_length - 1 ids because the last id of every padded row is the target.
model.add(Input(shape=(sequence_length - 1,)))
model.add(Embedding(input_dim=vocabulary_size, output_dim=100))
model.add(LSTM(100))
model.add(Dropout(0.1))
# One output unit per vocabulary id, matching the one-hot width of the targets
model.add(Dense(units=vocabulary_size, activation='softmax'))



In [27]:
# Training configuration: Adam optimizer, categorical cross-entropy loss, accuracy as the reported metric
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [30]:
# Train for 10 epochs. The returned History object is kept so the per-epoch metrics
# can be inspected later; binding it also stops its repr (which includes a memory
# address) from being echoed as the cell output.
history = model.fit(x, y, epochs=10)

Epoch 1/10
[1m485/485[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 20ms/step - accuracy: 0.0172 - loss: 7.0974
Epoch 2/10
[1m485/485[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 28ms/step - accuracy: 0.0265 - loss: 6.4583
Epoch 3/10
[1m485/485[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 25ms/step - accuracy: 0.0320 - loss: 6.3179
Epoch 4/10
[1m485/485[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 19ms/step - accuracy: 0.0451 - loss: 6.1923
Epoch 5/10
[1m485/485[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 20ms/step - accuracy: 0.0489 - loss: 5.9544
Epoch 6/10
[1m485/485[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 17ms/step - accuracy: 0.0558 - loss: 5.7818
Epoch 7/10
[1m485/485[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 22ms/step - accuracy: 0.0676 - loss: 5.5896
Epoch 8/10
[1m485/485[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 25ms/step - accuracy: 0.0746 - loss: 5.3998
Epoch 9/10
[1m485/485[0m

<keras.src.callbacks.history.History at 0x1a2fde76060>

In [31]:
#Measure loss and accuracy on the training data
# x and y are the same arrays that were passed to model.fit, so this reports how well
# the model fits the data it was trained on, not performance on held-out text.
loss, accuracy = model.evaluate(x, y)
print(f"\nTraining accuracy: {accuracy * 100:.2f}%")

[1m485/485[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 11ms/step - accuracy: 0.1314 - loss: 4.7706

Accuracy: 12.96%
