<a href="https://colab.research.google.com/github/Greeshmasindhu24/Next-Word-Prediction-Using--LSTM/blob/main/6_Next_Word_Prediction_Using_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install PyMuPDF

import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re
import fitz

# Load and clean data
# NOTE: the PDF is read from Google Drive, so the drive has to be mounted at
# /content/drive before this runs.
# The context manager closes the document once every page has been read.
with fitz.open('/content/drive/MyDrive/Hands on/power-bi-question.pdf') as doc:
    # Join the per-page text in one pass instead of repeated `+=` concatenation.
    text = "".join(page.get_text().lower() for page in doc)

# Keep only lowercase letters and whitespace. The pattern is a raw string so
# that `\s` reaches the regex engine instead of being an invalid string escape.
text = re.sub(r'[^a-z\s]', '', text)
words = text.split()

# Tokenize: build the vocabulary from the cleaned corpus.
tokenizer = Tokenizer(oov_token='<UNK>')
tokenizer.fit_on_texts([text])
# One slot more than the number of known words (presumably index 0 is left
# for padding -- confirm against the Keras Tokenizer docs).
total_words = 1 + len(tokenizer.word_index)

# Prepare sequences: slide a window over the corpus so that every run of
# `max_len` consecutive words is a context and the word right after it is
# the target.
max_len = 5
positions = range(max_len, len(words))
X = [' '.join(words[pos - max_len:pos]) for pos in positions]
y = [words[pos] for pos in positions]

# Contexts become fixed-length integer sequences; targets become word ids.
X_seq = tokenizer.texts_to_sequences(X)
X_pad = pad_sequences(X_seq, maxlen=max_len, padding='post')
y_labels = [tokenizer.word_index[target] for target in y]

# Model: embedding -> LSTM -> softmax over the whole vocabulary.
model = tf.keras.Sequential([
    # `input_length` is intentionally not passed: it is deprecated in Keras 3
    # (it only triggers a warning there) and the sequence length is taken
    # from the training data when the model is first fitted.
    tf.keras.layers.Embedding(total_words, 64),
    tf.keras.layers.LSTM(128),
    # One output unit per vocabulary index; the output is a probability
    # distribution over candidate next words.
    tf.keras.layers.Dense(total_words, activation='softmax')
])

# Targets are integer word ids (not one-hot), hence the sparse loss.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

model.fit(X_pad, np.array(y_labels), epochs=10, batch_size=32)

# ✅ Accuracy calculation (manual word prediction accuracy)
# NOTE: this scores the same samples the model was fitted on, so the number
# reported is training accuracy, not accuracy on unseen text.
total = len(X_pad)

if total:
    # A single batched predict call replaces one predict call per sample,
    # which is dramatically slower for the same result.
    pred_probs = model.predict(X_pad, verbose=0)
    pred_labels = np.argmax(pred_probs, axis=1)
    correct = int(np.sum(pred_labels == np.array(y_labels)))
    accuracy = correct / total
else:
    # No samples (corpus shorter than the context window): avoid dividing by 0.
    correct = 0
    accuracy = 0.0

print(f"\n✅ Next-word prediction accuracy: {accuracy:.4f}")

# Prediction function
def predict_word(sentence, top_k=3):
    """Suggest a next word for ``sentence``.

    The sentence is lowercased and stripped of everything except letters and
    whitespace (the same cleaning applied to the training text), then its last
    ``max_len`` words are encoded with the module-level ``tokenizer`` and fed
    to the module-level ``model``.

    Args:
        sentence: Free-form input text.
        top_k: Number of most probable vocabulary entries to sample from.

    Returns:
        One word drawn uniformly at random from the ``top_k`` most probable
        vocabulary entries.

    Raises:
        ValueError: If ``top_k`` is smaller than 1.
    """
    # `[-0:]` would select the whole vocabulary, so reject non-positive values.
    if top_k < 1:
        raise ValueError("top_k must be at least 1")

    # Clean the input the same way the corpus was cleaned before tokenizing,
    # so that e.g. hyphenated words map to the same tokens as in training.
    cleaned = re.sub(r'[^a-z\s]', '', sentence.lower())
    context = cleaned.split()[-max_len:]

    seq = tokenizer.texts_to_sequences([' '.join(context)])[0]
    padded = pad_sequences([seq], maxlen=max_len, padding='post')
    probs = model.predict(padded, verbose=0)[0]

    # Rank indices from most to least probable and keep only those that map to
    # a real word: the softmax also has a slot for index 0, which has no entry
    # in `index_word` and would raise KeyError if it were sampled.
    ranked = np.argsort(probs)[::-1]
    candidates = [idx for idx in ranked if idx in tokenizer.index_word][:top_k]
    return tokenizer.index_word[np.random.choice(candidates)]

# Interactive prediction: keep prompting until the user enters 'q'.
while True:
    user_input = input("\nEnter your sentence (or 'q' to quit): ").strip()

    if user_input.lower() == 'q':
        print("Exiting...")
        break

    if user_input:
        predicted_word = predict_word(user_input)
        print(f"\nPredicted next word: '{predicted_word}'")
    else:
        # Blank input: ask again instead of predicting from nothing.
        print("Error: Please type a sentence.")

# Test prediction: one fixed example, run after the interactive session ends.
sample_sentence = "power bi is used for"
print("\nTest prediction for: 'power bi is used for'")
print("Predicted word:", predict_word(sample_sentence))

# Mount Google Drive
# Mounts the user's Drive at /content/drive.
# NOTE(review): this runs after the PDF under /content/drive/MyDrive has
# already been opened earlier in this cell, so it cannot be what makes that
# file available -- presumably a separate mount cell is executed first; confirm
# and consider moving or removing this call.
from google.colab import drive
drive.mount('/content/drive')


Collecting PyMuPDF
  Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m49.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.25.5




Epoch 1/10
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - accuracy: 0.0305 - loss: 6.5746
Epoch 2/10
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.0448 - loss: 5.7016
Epoch 3/10
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.0623 - loss: 5.6055
Epoch 4/10
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.0716 - loss: 5.3997
Epoch 5/10
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.0775 - loss: 5.3325
Epoch 6/10
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.0913 - loss: 5.1767
Epoch 7/10
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.1029 - loss: 5.0163
Epoch 8/10
[1m169/169[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.1212 - loss: 4.8642
Epoch 9/10
[1m169/169[0m [32m━━━━━━━━

In [None]:
# Mount Google Drive at /content/drive.
# NOTE(review): the training cell reads its PDF from /content/drive/MyDrive,
# so this cell presumably has to be run before it -- confirm the intended
# execution order.
from google.colab import drive
drive.mount('/content/drive')