In [1]:
import nltk
import numpy as np
import random
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from tensorflow.keras.preprocessing.sequence import pad_sequences # type: ignore
from tensorflow.keras.utils import to_categorical # type: ignore
from tensorflow.keras.models import Sequential # type: ignore
from tensorflow.keras.layers import Embedding, LSTM, Dense # type: ignore

# Download necessary NLTK datasets
nltk.download('punkt')
nltk.download('gutenberg')

# Load Corpus from NLTK
from nltk.corpus import gutenberg
corpus_text = gutenberg.raw('shakespeare-caesar.txt').lower()

# Tokenization and cleaning: keep only alphabetic tokens
tokens = word_tokenize(corpus_text)
tokens = [t for t in tokens if t.isalpha()]

# ---------------------------------------
# 1. Traditional Feedforward NN (N-grams)
# ---------------------------------------
print("\n=== Traditional Model: Feedforward NN ===")

n = 3  # trigram model (2 words input to predict 3rd word)
X_ngrams = [' '.join(tokens[i:i+n-1]) for i in range(len(tokens)-n)]
y_ngrams = [tokens[i+n-1] for i in range(len(tokens)-n)]

# Vectorize input phrases
vectorizer = CountVectorizer()
X_vec = vectorizer.fit_transform(X_ngrams)

# Encode target words
le = LabelEncoder()
y_enc = le.fit_transform(y_ngrams)

# Train/Test split using .shape[0] instead of len()
split = int(0.8 * X_vec.shape[0])
X_train, X_test = X_vec[:split], X_vec[split:]
y_train, y_test = y_enc[:split], y_enc[split:]

# Train Feedforward NN (MLPClassifier) with increased max_iter and random_state
mlp = MLPClassifier(hidden_layer_sizes=(128,), max_iter=100, random_state=42)
mlp.fit(X_train, y_train)

# Predict & evaluate
y_pred = mlp.predict(X_test)
acc_ff = accuracy_score(y_test, y_pred)
print(f"Feedforward NN Accuracy: {acc_ff:.4f}")

# ---------------------------------------
# 2. Neural Network Model (LSTM)
# ---------------------------------------
print("\n=== Neural Network Model: LSTM ===")

sequence_len = 4  # 3 input words to predict 1 output word
sequences = []
for i in range(len(tokens) - sequence_len):
    seq = tokens[i:i + sequence_len]
    sequences.append(seq)

# Vocabulary indexing
word_index = {word: i + 1 for i, word in enumerate(set(tokens))}
index_word = {i: word for word, i in word_index.items()}
vocab_size = len(word_index) + 1

# Encode sequences to integers
encoded_sequences = []
for seq in sequences:
    encoded_sequences.append([word_index[word] for word in seq])

encoded_sequences = np.array(encoded_sequences)
X_lstm = encoded_sequences[:, :-1]  # first 3 words
y_lstm = to_categorical(encoded_sequences[:, -1], num_classes=vocab_size)  # 4th word one-hot

# Build LSTM model (input_length removed to avoid warning)
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=50))  # Removed input_length here
model.add(LSTM(100))
model.add(Dense(vocab_size, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train LSTM
model.fit(X_lstm, y_lstm, epochs=5, verbose=1)

# Evaluate LSTM accuracy on training data (no separate test split)
loss, acc_lstm = model.evaluate(X_lstm, y_lstm, verbose=0)
print(f"LSTM Model Accuracy: {acc_lstm:.4f}")

# Prediction helper for LSTM
def predict_next_word_lstm(model, seed_text):
    words = seed_text.lower().split()
    if len(words) != 3:
        return "Please provide exactly 3 words."
    encoded = [word_index.get(w, 0) for w in words]
    padded = pad_sequences([encoded], maxlen=3)
    pred_id = np.argmax(model.predict(padded, verbose=0), axis=-1)[0]
    return index_word.get(pred_id, "<UNK>")

# --- Next word prediction for phrase "brutus is noble" ---

# Feedforward NN expects 2 words input (because trigram model)
ff_input = "brutus is noble"
ff_vec = vectorizer.transform([ff_input])
ff_pred_word = le.inverse_transform(mlp.predict(ff_vec))[0]
print(f"\n[Feedforward NN] Prediction for '{ff_input}': {ff_pred_word}")

# LSTM expects 3 words input
lstm_input = "brutus is noble"
lstm_pred_word = predict_next_word_lstm(model, lstm_input)
print(f"[LSTM] Prediction for '{lstm_input}': {lstm_pred_word}")

# ---------------------------------------
# Comparison Summary
# ---------------------------------------
print("\n=== Model Accuracy Comparison ===")
print(f"Feedforward NN Accuracy: {acc_ff:.4f}")
print(f"LSTM Model Accuracy: {acc_lstm:.4f}")

if acc_ff > acc_lstm:
    print("Result: Feedforward NN performed better in next word prediction.")
elif acc_ff < acc_lstm:
    print("Result: LSTM performed better in next word prediction.")
else:
    print("Result: Both models performed equally well.")



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gilli\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\gilli\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!



=== Traditional Model: Feedforward NN ===




Feedforward NN Accuracy: 0.0430

=== Neural Network Model: LSTM ===
Epoch 1/5
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 12ms/step - accuracy: 0.0260 - loss: 6.9559
Epoch 2/5
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 12ms/step - accuracy: 0.0307 - loss: 6.2060
Epoch 3/5
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 11ms/step - accuracy: 0.0348 - loss: 6.0864
Epoch 4/5
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 11ms/step - accuracy: 0.0417 - loss: 5.9551
Epoch 5/5
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 11ms/step - accuracy: 0.0608 - loss: 5.7887
LSTM Model Accuracy: 0.0669

[Feedforward NN] Prediction for 'brutus is noble': wise
[LSTM] Prediction for 'brutus is noble': i

=== Model Accuracy Comparison ===
Feedforward NN Accuracy: 0.0430
LSTM Model Accuracy: 0.0669
Result: LSTM performed better in next word prediction.
