# In [None]:
# =========================
# NEXT WORD PREDICTOR
# =========================

import re
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout,Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
import pickle
import random

# =========================
# 1. TEXT DATA (Your story)
# =========================
# Load the raw training corpus. The encoding is stated explicitly so the
# result does not depend on the platform default; undecodable bytes are
# replaced rather than raising, since the preprocessing step below strips
# every character other than letters, whitespace and periods anyway.
with open('dataForNextWordPredictor.txt', encoding='utf-8', errors='replace') as f:
    text_data = f.read()


# =========================
# 2. DATA PREPROCESSING
# =========================

def preprocess_text(text):
    """Normalise raw text and break it into a list of sentences.

    The text is lower-cased, every character other than ASCII letters,
    whitespace and periods is dropped, whitespace runs are collapsed to a
    single space, and the result is split on periods. Fragments that are
    empty after trimming are discarded.
    """
    lowered = text.lower()
    # Keep only letters, whitespace and periods.
    letters_only = re.sub(r'[^a-zA-Z\s\.]', '', lowered)
    # Squeeze any whitespace run (newlines included) into one space.
    collapsed = re.sub(r'\s+', ' ', letters_only)
    # A period marks a sentence boundary; drop fragments that trim to nothing.
    trimmed = (fragment.strip() for fragment in collapsed.split('.'))
    return [fragment for fragment in trimmed if fragment]

sentences = preprocess_text(text_data)
print(f"Total sentences: {len(sentences)}")
# Fail early with a clear message: an empty corpus would otherwise surface as
# an opaque IndexError on the sample lookup below.
if not sentences:
    raise ValueError("No sentences found in the input text; nothing to train on.")
print(f"Sample sentence: {sentences[0][:50]}...")

# =========================
# 3. TOKENIZATION
# =========================
# Learn the vocabulary from the cleaned sentences.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
# One slot more than the number of known words; presumably this leaves room
# for index 0, the value pad_sequences fills with — confirm against Keras docs.
total_words = 1 + len(tokenizer.word_index)
print(f"Total unique words: {total_words}")

# =========================
# 4. CREATE SEQUENCES
# =========================
# Every sentence contributes all of its token prefixes of length >= 2; the
# last token of each prefix is later split off as the prediction target.
input_sequences = []
for token_list in tokenizer.texts_to_sequences(sentences):
    input_sequences.extend(
        token_list[:end] for end in range(2, len(token_list) + 1)
    )

print(f"Total sequences created: {len(input_sequences)}")

# =========================
# 5. PAD SEQUENCES
# =========================
# The longest n-gram determines the common padded length.
max_sequence_len = max(map(len, input_sequences))
print(f"Maximum sequence length: {max_sequence_len}")

# Pad on the left so the target word always sits in the final column.
input_sequences = pad_sequences(
    input_sequences,
    padding='pre',
    maxlen=max_sequence_len,
)

# The final column is the label; everything before it is the model input.
X, y = input_sequences[:, :-1], input_sequences[:, -1]

print(f"Features shape: {X.shape}")
print(f"Labels shape: {y.shape}")

# =========================
# 6. TRAIN-TEST SPLIT
# =========================
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

print(f"\nTraining samples: {X_train.shape[0]}")
print(f"Testing samples: {X_test.shape[0]}")

# =========================
# 7. BUILD MODEL
# =========================


# Embedding -> bidirectional LSTM -> softmax over the whole vocabulary.
model = Sequential([
    # Inputs are the padded sequences with the label column removed.
    Embedding(total_words, 100, input_length=max_sequence_len-1),
    Bidirectional(LSTM(200, dropout=0.2, return_sequences=False)),
    Dense(total_words, activation='softmax')
])

# Labels are integer word indices (not one-hot), hence the sparse loss.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

# summary() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None" line after the table.
model.summary()

# =========================
# 8. TRAIN MODEL
# =========================

# Stop once validation loss has not improved for 25 epochs and roll back to
# the best weights seen.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=25,
    restore_best_weights=True,
)

# The held-out split doubles as the validation set monitored by early_stop.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=50,
    verbose=1,
    callbacks=[early_stop],
    validation_data=(X_test, y_test),
)

# =========================
# 9. PLOT TRAINING RESULTS
# =========================
banner = "=" * 50
print("\n" + banner)
print("STEP 6: Training Results")
print(banner)

plt.figure(figsize=(12, 4))

# One panel per metric, each showing the training and validation curves.
# Tuple layout: (train key, validation key, train label, validation label,
# panel title, y-axis label).
panels = [
    ('accuracy', 'val_accuracy', 'Training Accuracy', 'Validation Accuracy',
     'Model Accuracy', 'Accuracy'),
    ('loss', 'val_loss', 'Training Loss', 'Validation Loss',
     'Model Loss', 'Loss'),
]
for position, panel in enumerate(panels, start=1):
    train_key, val_key, train_label, val_label, panel_title, y_label = panel
    plt.subplot(1, 2, position)
    plt.plot(history.history[train_key], label=train_label)
    plt.plot(history.history[val_key], label=val_label)
    plt.title(panel_title)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.legend()
    plt.grid(True)

plt.tight_layout()
plt.show()

# =========================
# 10. PREDICTION FUNCTION
# =========================

def predict_next_words(model, tokenizer, text, num_words=3):
    """
    Extend a seed text with greedily predicted next words.

    Args:
        model: Trained model whose ``predict`` returns, for each input row,
            one score per word index.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index``.
        text: Seed text to continue.
        num_words: Maximum number of words to append.

    Returns:
        The seed text followed by the predicted words, each preceded by a
        single space. Fewer than ``num_words`` words are appended if the
        model predicts an index that has no vocabulary entry.

    Note:
        Reads the module-level ``max_sequence_len`` to pad the encoded text
        to the input length used when building the model.
    """
    # Invert the vocabulary once instead of scanning it for every prediction.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(num_words):
        # Encode the running text and left-pad it to the model's input length.
        token_list = tokenizer.texts_to_sequences([text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')

        # Greedy decoding: take the highest-scoring word index.
        predicted_probs = model.predict(token_list, verbose=0)[0]
        predicted_index = int(np.argmax(predicted_probs))

        output_word = index_to_word.get(predicted_index)
        if output_word is None:
            # No word maps to this index. Appending nothing would leave the
            # encoded text unchanged, so every later iteration would repeat
            # the same prediction and only pile up trailing spaces.
            break

        text += " " + output_word

    return text

# Test the model
# Seed phrases for a quick sanity check of the trained model.
test_phrases = ["sherlock"]

print("\nPredictions:")
for phrase in test_phrases:
    # Ask for two continuation words per seed.
    prediction = predict_next_words(model, tokenizer, phrase, num_words=2)
    print(f"'{phrase}' -> '{prediction}'")

# =========================
# 11. INTERACTIVE MODE
# =========================
print("\n" + "=" * 50)
print("STEP 8: Interactive Predictor")
print("=" * 50)
print("Type 'quit' to exit")

# Read seed phrases until the user types 'quit' or input ends.
while True:
    try:
        user_input = input("\nEnter a word or phrase: ")
    except EOFError:
        # stdin was closed (e.g. piped or non-interactive run): leave the
        # loop instead of crashing with a traceback.
        break

    # Normalise once so that ' Quit ' exits and blank lines are skipped.
    user_input = user_input.strip().lower()

    if user_input == 'quit':
        break

    if not user_input:
        continue

    # Make prediction
    prediction = predict_next_words(model, tokenizer, user_input, num_words=3)
    print(f"Next words: {prediction}")

