In [35]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd
import os

In [36]:
# Switch Keras to the 'mixed_float16' dtype policy for every layer created
# after this point.
from tensorflow.keras import mixed_precision

policy = mixed_precision.Policy(name='mixed_float16')
mixed_precision.set_global_policy(policy)

In [37]:
# Name of the corpus file, relative to the notebook's working directory.
# A relative name keeps user-specific absolute paths out of the notebook.
data_path = "t8.shakespeare.txt"

In [38]:
# Resolve the corpus file against the current working directory.
current_directory = os.getcwd()
file_name = "t8.shakespeare.txt"
data_path = os.path.join(current_directory, file_name)
print(f"File path: {data_path}")

# Read the corpus as plain text, one row per non-blank line.
# pd.read_fwf is a fixed-width *table* parser: it infers column boundaries
# from the first rows and converts NA-like tokens to NaN, so free text can be
# truncated, split into several columns, or dropped. Reading lines directly
# avoids all of that while producing the same single column labelled 0.
with open(data_path, encoding="utf-8") as corpus_file:
    stripped_lines = (raw_line.strip() for raw_line in corpus_file)
    non_blank_lines = [text for text in stripped_lines if text]
data = pd.DataFrame({0: non_blank_lines})

File path: /content/t8.shakespeare.txt


In [39]:
# Show the raw column labels, then give the first column a descriptive name.
print(data.columns)
data = data.rename({data.columns[0]: 'PlayerLine'}, axis='columns')

Index([0], dtype='int64')


In [40]:
# Collect every non-missing entry of the text column into a plain Python list.
corpus = data.loc[:, 'PlayerLine'].dropna().to_list()

In [41]:
# Keep only a prefix of the corpus (presumably to bound vocabulary size and
# training time); raise the cap to train on more text.
MAX_CORPUS_LINES = 1000
corpus = corpus[:MAX_CORPUS_LINES]

In [42]:
# Learn the word -> integer-id vocabulary from the corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# Word ids start at 1 (0 is the padding value), so the id space is one larger
# than the number of distinct words.
total_words = 1 + len(tokenizer.word_index)

In [43]:
# For each line, emit every prefix of its token ids that has at least two
# tokens: the last token of a prefix is the target, the ones before it the context.
input_sequences = [
    encoded[:stop]
    for encoded in (tokenizer.texts_to_sequences([text])[0] for text in corpus)
    for stop in range(2, len(encoded) + 1)
]

In [44]:
# Bring every n-gram to a common length; shorter sequences are padded on the
# left (padding='pre') so the target word stays in the last position.
max_sequence_len = 20
input_sequences = pad_sequences(
    input_sequences,
    padding='pre',
    maxlen=max_sequence_len,
)

In [45]:
# All tokens but the last form the model input; the last token is the label.
X, y = input_sequences[:, :-1], input_sequences[:, -1]

# One-hot encode the labels over the whole vocabulary.
y = tf.keras.utils.to_categorical(y, num_classes=total_words)

In [46]:
# Next-word model: token ids -> embeddings -> LSTM -> softmax over the vocabulary.
model = Sequential([
    # An explicit Input layer replaces Embedding's deprecated `input_length`
    # argument and still builds the model up front, so `model.summary()`
    # can report shapes and parameter counts before training.
    tf.keras.Input(shape=(max_sequence_len - 1,), dtype='int32'),
    Embedding(total_words, 50),  # Reduced embedding size
    LSTM(100),  # Reduced LSTM units
    # Keep the output probabilities in float32: when a float16 mixed-precision
    # global policy is active, a float16 softmax can be numerically unstable.
    Dense(total_words, activation='softmax', dtype='float32'),
])



In [47]:
# Configure training: Adam optimizer, cross-entropy against one-hot labels,
# and accuracy reported alongside the loss.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Print the layer-by-layer architecture (optional)
model.summary()

In [48]:
# Train for 20 passes over the data in mini-batches of 16, with per-epoch progress output.
model.fit(
    X,
    y,
    batch_size=16,
    epochs=20,
    verbose=1,
)


Epoch 1/20
[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 9ms/step - accuracy: 0.0241 - loss: 7.0756
Epoch 2/20
[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.0258 - loss: 6.4758
Epoch 3/20
[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - accuracy: 0.0337 - loss: 6.2857
Epoch 4/20
[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 9ms/step - accuracy: 0.0377 - loss: 6.1269
Epoch 5/20
[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - accuracy: 0.0473 - loss: 5.9362
Epoch 6/20
[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.0542 - loss: 5.7397
Epoch 7/20
[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 9ms/step - accuracy: 0.0567 - loss: 5.5623
Epoch 8/20
[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - accuracy: 0.0661 - loss: 5.3515
Epoch 9/20
[1m432/432[0m [32m━━━━━━━━

<keras.src.callbacks.history.History at 0x7fedda8287f0>

In [49]:
def predict_next_word(model, tokenizer, text, max_sequence_len):
    """Return the word the model scores highest as the continuation of ``text``.

    Args:
        model: Trained model; ``model.predict`` is called with a single padded
            sequence and must return one row of per-index scores.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and the
            ``index_word`` (id -> word) mapping.
        text: Seed text to continue.
        max_sequence_len: Sequence length used when preparing training data;
            the input is padded/truncated to ``max_sequence_len - 1`` tokens.

    Returns:
        The predicted word, or ``""`` when the highest-scoring index has no
        associated word (for example the padding index 0).
    """
    # Tokenize and left-pad the seed text to the model's input width
    token_list = tokenizer.texts_to_sequences([text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')

    # argmax over the single prediction row yields a length-1 array; take the
    # scalar so it can be used as a plain dictionary key.
    scores = model.predict(token_list)
    predicted_index = int(np.argmax(scores, axis=-1)[0])

    # Direct id -> word lookup instead of scanning every entry of word_index
    return tokenizer.index_word.get(predicted_index, "")

In [50]:
# Ask the trained model to continue a short prompt and report the result.
input_text = "where are my"
predicted_word = predict_next_word(
    model=model,
    tokenizer=tokenizer,
    text=input_text,
    max_sequence_len=max_sequence_len,
)

print(f"Input text: {input_text}")
print(f"Predicted next word: {predicted_word}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 109ms/step
Input text: where are my
Predicted next word: love
