#RNN - Language Model - Next Word Predictions

In [10]:
import numpy as np
from nltk.corpus import brown
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from sklearn.model_selection import train_test_split
import nltk
nltk.download('brown')
nltk.download('punkt')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
# Step 1: Load and clean data
sentences = brown.sents()
sentences = [' '.join(sent).lower() for sent in sentences if len(sent) >= 3]
sentences = sentences[:5000]  # Limit for quick training

In [12]:
len(sentences)

5000

# Preprocessing the data

In [13]:
# Step 2: Tokenize
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1

In [22]:
vocab_size

13032

In [14]:
word_index

{'the': 1,
 'of': 2,
 'and': 3,
 'to': 4,
 'a': 5,
 'in': 6,
 'for': 7,
 'that': 8,
 'is': 9,
 'was': 10,
 'on': 11,
 "''": 12,
 'he': 13,
 'at': 14,
 'with': 15,
 'be': 16,
 'as': 17,
 'it': 18,
 'by': 19,
 'his': 20,
 'will': 21,
 'said': 22,
 'from': 23,
 'this': 24,
 'has': 25,
 'are': 26,
 'an': 27,
 'but': 28,
 'not': 29,
 'have': 30,
 'had': 31,
 'who': 32,
 'they': 33,
 'would': 34,
 'which': 35,
 'were': 36,
 'new': 37,
 'mrs': 38,
 'their': 39,
 'been': 40,
 'one': 41,
 'all': 42,
 'year': 43,
 'its': 44,
 'there': 45,
 'more': 46,
 'mr': 47,
 'two': 48,
 'or': 49,
 'i': 50,
 'last': 51,
 'other': 52,
 'when': 53,
 'state': 54,
 'out': 55,
 'up': 56,
 'first': 57,
 'president': 58,
 'than': 59,
 'after': 60,
 'about': 61,
 'no': 62,
 'some': 63,
 'home': 64,
 'over': 65,
 'also': 66,
 '000': 67,
 'into': 68,
 'time': 69,
 'only': 70,
 'we': 71,
 'three': 72,
 '1': 73,
 'her': 74,
 'if': 75,
 'made': 76,
 'what': 77,
 'house': 78,
 'years': 79,
 'can': 80,
 's': 81,
 'them': 8

# Prepare the training samples

In [15]:
# Step 3: Create training samples (trigrams: context → next word)
X, y = [], []
for sent in sentences:
    tokens = tokenizer.texts_to_sequences([sent])[0]
    for i in range(2, len(tokens)):
        context = tokens[i-2:i]
        target = tokens[i]
        X.append(context)
        y.append(target)

X = np.array(X)
y = np.array(y)

#Preparing the dataset and model building

In [16]:
# Step 4: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
# Step 5: Model
embedding_dim = 100
rnn_units = 128

model = Sequential()
# Changed here: input_shape instead of input_length
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_shape=(2,)))
model.add(SimpleRNN(rnn_units))
model.add(Dense(vocab_size, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

  super().__init__(**kwargs)


| Layer     | Formula                                        | Params    |
| --------- | ---------------------------------------------- | --------- |
| Embedding | vocab\_size × embedding\_dim                   | 1,303,200 |
| SimpleRNN | (input\_dim × units) + (units × units) + units | 29,312    |
| Dense     | (units × vocab\_size) + vocab\_size            | 1,681,128 |
| **Total** | Sum of all above                               | 3,013,640 |

vocab_size = 13032
units = 128
embedding_dim = 100

In [18]:
# Step 6: Train
model.fit(X_train, y_train, epochs=5, batch_size=256, validation_data=(X_test, y_test))

Epoch 1/5
[1m284/284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 136ms/step - accuracy: 0.0615 - loss: 8.2824 - val_accuracy: 0.0835 - val_loss: 7.2682
Epoch 2/5
[1m284/284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 130ms/step - accuracy: 0.0871 - loss: 6.9144 - val_accuracy: 0.0950 - val_loss: 7.1483
Epoch 3/5
[1m284/284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 127ms/step - accuracy: 0.1043 - loss: 6.5359 - val_accuracy: 0.1026 - val_loss: 7.1260
Epoch 4/5
[1m284/284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 121ms/step - accuracy: 0.1204 - loss: 6.2297 - val_accuracy: 0.1072 - val_loss: 7.1661
Epoch 5/5
[1m284/284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 136ms/step - accuracy: 0.1321 - loss: 5.9837 - val_accuracy: 0.1109 - val_loss: 7.2233


<keras.src.callbacks.history.History at 0x7f08a0c9b590>

# Evaluation the model

In [19]:

# Step 7: Evaluation on Test Set
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.4f}")

# Optional: Top-3 Accuracy
def top_k_accuracy(model, X, y_true, k=3):
    preds = model.predict(X, verbose=0)
    top_k_preds = np.argsort(preds, axis=1)[:, -k:]
    match = np.any(top_k_preds == y_true.reshape(-1, 1), axis=1)
    return np.mean(match)

top3 = top_k_accuracy(model, X_test, y_test, k=3)
print(f"Top-3 Accuracy: {top3:.4f}")

[1m568/568[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 11ms/step - accuracy: 0.1162 - loss: 7.2243
Test Accuracy: 0.1109
Top-3 Accuracy: 0.1796


#Sample Prediction

In [20]:
# Step 8: Sample Predictions
reverse_word_index = {v: k for k, v in word_index.items()}

print("Sample Predictions:")
for i in range(5):
    context = X_test[i]
    true_word = reverse_word_index.get(y_test[i], "<UNK>")
    pred = model.predict(np.array([context]), verbose=0)
    pred_word = reverse_word_index.get(np.argmax(pred), "<UNK>")
    print(f"Context: '{reverse_word_index[context[0]]} {reverse_word_index[context[1]]}' → Prediction: '{pred_word}' | Actual: '{true_word}'")


Sample Predictions:
Context: 'catholic atmosphere' → Prediction: 'of' | Actual: 'is'
Context: 'any test' → Prediction: 'the' | Actual: 'of'
Context: 'that an' → Prediction: 'own' | Actual: 'increase'
Context: 'city's snow' → Prediction: 'the' | Actual: 'clearing'
Context: 'interstate commerce' → Prediction: 'and' | Actual: 'commission'


#Manual Validation

In [21]:
context = 'i am'
# Tokenize the context words
context_sequence = tokenizer.texts_to_sequences([context.split()])[0]

# Ensure the context has two words, padding or truncating if necessary
if len(context_sequence) > 2:
    context_sequence = context_sequence[-2:]
elif len(context_sequence) < 2:
    # Handle cases where the context has fewer than two words
    print("Error: Context must contain at least two words.")
    pred_word = "<Error>"
else:
    # Reshape for the model
    context_sequence = np.array([context_sequence])

    # Predict the next word probabilities
    pred = model.predict(context_sequence, verbose=0)[0]

    # Get the index of the word with the highest probability
    predicted_word_index = np.argmax(pred)

    # Get the predicted word from the reverse word index
    pred_word = reverse_word_index.get(predicted_word_index, "<UNK>")

print (pred_word)

a
