In [None]:
from datasets import load_dataset

# Load the full dataset (or a predefined split)
wiki = load_dataset("wikipedia", "20220301.en", split="train")

# Take only a small portion manually (first 10,000 examples)
small_wiki = wiki.select(range(10000))

# Join non-empty text entries
texts = [item["text"] for item in small_wiki if item["text"]]
data = " ".join(texts)


In [2]:
import re
def simple_sentence_tokenizer(text):
    # Split on period, exclamation, or question marks followed by a space and uppercase letter
    sentence_endings = re.compile(r'(?<=[.!?])\s+(?=[A-Z])')
    return sentence_endings.split(text)

sentences = simple_sentence_tokenizer(data)

In [3]:
# Tokenization (with limited vocab size)
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=20000, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)

Basically we are converting our classification problem into a supervised learning problem by forming input-output pairs for each sentence

In [4]:
# Creating n-gram sequences with cap
from tensorflow.keras.preprocessing.sequence import pad_sequences

input_sequences = []
max_sequences = 200000
count = 0

for sentence in sentences:
    tokenized = tokenizer.texts_to_sequences([sentence])[0]

    for i in range(1, len(tokenized)):
        input_sequences.append(tokenized[:i+1])
        count += 1
        if count >= max_sequences:
            break
    if count >= max_sequences:
        break

print(f"Total input sequences: {len(input_sequences)}")

Total input sequences: 200000


In [5]:
# Capping max_len and pad sequences
max_len = min(max(len(x) for x in input_sequences), 100)
padded_input_sequences = pad_sequences(input_sequences, maxlen=max_len, padding='pre')
print(f"Padded shape: {padded_input_sequences.shape}")

Padded shape: (200000, 100)


In [6]:
padded_input_sequences

array([[    0,     0,     0, ...,     0,  5401,     8],
       [    0,     0,     0, ...,  5401,     8,     7],
       [    0,     0,     0, ...,     8,     7,   189],
       ...,
       [    0,     0,     0, ...,    33, 12307,  2181],
       [    0,     0,     0, ..., 12307,  2181,    70],
       [    0,     0,     0, ...,  2181,    70,  1665]])

In [7]:
X = padded_input_sequences[:,:-1]
y = padded_input_sequences[:,-1]
X.shape

(200000, 99)

In [8]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input, Embedding, Bidirectional, LSTM, Dense, Attention, Concatenate, Layer
)
from tensorflow.keras.layers import GlobalAveragePooling1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy, SparseTopKCategoricalAccuracy
from tensorflow.keras.layers import LayerNormalization

In [9]:
# Hyperparameters
vocab_size = 20000  # Must match tokenizer
embedding_dim = 128
lstm_units = 128
max_len = X.shape[1]

In [10]:
# Input
inputs = Input(shape=(max_len,), name="input")

# Embedding
x = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(inputs)

# BiLSTM
x = Bidirectional(LSTM(lstm_units, return_sequences=True))(x)

# Attention Mechanism
# We'll use built-in Additive Attention
attention = Attention(name="attention_layer")([x, x])  # Query and Value are both `x`
x = Concatenate()([x, attention])
x = LayerNormalization()(x)

# Optional Dense layers
x = Dense(128, activation='relu')(x)
x = GlobalAveragePooling1D()(x)
x = Dense(64, activation='relu')(x)

# Output
outputs = Dense(vocab_size, activation='softmax')(x)

In [11]:
# Model
model = Model(inputs=inputs, outputs=outputs)
model.compile(
    loss=SparseCategoricalCrossentropy(),
    optimizer=Adam(learning_rate=0.001),
    metrics=[
        SparseCategoricalAccuracy(name='accuracy'),
        SparseTopKCategoricalAccuracy(k=5, name='top_5_accuracy')
    ]
)
model.summary()

In [12]:
from tensorflow.keras.callbacks import EarlyStopping
early_stop = EarlyStopping(monitor='loss', patience=3, restore_best_weights=True)

batch_size = 128
dataset = tf.data.Dataset.from_tensor_slices((X, y))
dataset = dataset.shuffle(buffer_size=10000).batch(batch_size).prefetch(tf.data.AUTOTUNE)

# Train
model.fit(dataset, epochs=20, callbacks=[early_stop])

Epoch 1/20
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1099s[0m 699ms/step - accuracy: 0.0744 - loss: 7.0667 - top_5_accuracy: 0.2303
Epoch 2/20
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m934s[0m 598ms/step - accuracy: 0.0870 - loss: 6.5469 - top_5_accuracy: 0.2403
Epoch 3/20
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m936s[0m 599ms/step - accuracy: 0.1239 - loss: 6.2388 - top_5_accuracy: 0.2710
Epoch 4/20
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m889s[0m 569ms/step - accuracy: 0.1443 - loss: 5.9839 - top_5_accuracy: 0.2906
Epoch 5/20
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m882s[0m 564ms/step - accuracy: 0.1607 - loss: 5.7349 - top_5_accuracy: 0.3074
Epoch 6/20
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m879s[0m 562ms/step - accuracy: 0.1733 - loss: 5.5058 - top_5_accuracy: 0.3230
Epoch 7/20
[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m894s[0m 572ms/step -

<keras.src.callbacks.history.History at 0x20c4545f7a0>

In [13]:
results = model.evaluate(dataset, verbose=1)

# Print metrics
for name, value in zip(model.metrics_names, results):
    print(f"{name}: {value:.4f}")

[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m223s[0m 142ms/step - accuracy: 0.2230 - loss: 6.0284 - top_5_accuracy: 0.3717
loss: 5.3835
compile_metrics: 0.2596


In [None]:
# Generate text
def generate_text(seed_text, next_words=10, max_len=50):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_len, padding='pre')
        predicted_probs = model.predict(token_list, verbose=0)[0]
        predicted_id = tf.argmax(predicted_probs).numpy()
        predicted_word = tokenizer.index_word.get(predicted_id, '')
        seed_text += ' ' + predicted_word
    return seed_text

In [2]:
import heapq

def beam_search_decoder(seed_text, beam_width=3, next_words=10, max_len=50):
    sequences = [(seed_text, 0.0)]  # (sequence, score)

    for _ in range(next_words):
        all_candidates = []
        for seq, score in sequences:
            tokenized = tokenizer.texts_to_sequences([seq])[0]
            tokenized = pad_sequences([tokenized], maxlen=max_len, padding='pre')
            preds = model.predict(tokenized, verbose=0)[0]
            
            top_indices = np.argsort(preds)[-beam_width:]  # top beam_width words
            for idx in top_indices:
                word = tokenizer.index_word.get(idx, '')
                if not word: continue
                candidate = (seq + ' ' + word, score - np.log(preds[idx] + 1e-10))  # use log prob
                all_candidates.append(candidate)

        # Select best `beam_width` sequences
        sequences = heapq.nsmallest(beam_width, all_candidates, key=lambda tup: tup[1])

    return sequences[0][0]  # Return best scoring sequence


In [3]:
# BLEU Score
from nltk.translate.bleu_score import sentence_bleu

def evaluate_bleu(reference_sentence, generated_sentence):
    reference = [reference_sentence.split()]
    candidate = generated_sentence.split()
    return sentence_bleu(reference, candidate, weights=(0.5, 0.5))  # bigram BLEU


In [4]:
# Perplexity
import math
import numpy as np

def compute_perplexity(model, dataset):
    loss = model.evaluate(dataset, verbose=0)[0]
    return math.exp(loss)

In [None]:
while True:
    user_input = input("\nEnter a seed text (or type 'exit' to quit): ").strip()
    if user_input.lower() == 'exit':
        break
    try:
        next_words = int(input("How many words do you want to generate? (e.g., 5, 10): "))
    except ValueError:
        print("⚠️ Please enter a valid number.")
        continue

    generated_output = generate_text(user_input, next_words=next_words, max_len=X.shape[1])
    print(f"\n📝 Generated Text:\n{generated_output}")

    # Ask for reference sentence for BLEU
    reference = input("Enter reference sentence for BLEU score (or press Enter to skip): ").strip()
    if reference:
        bleu = evaluate_bleu(reference, generated_output)
        print(f"🔵 BLEU Score: {bleu:.4f}")
    else:
        print("ℹ️ BLEU Score skipped.")

    # Perplexity
    perplexity = compute_perplexity(model, dataset)
    print(f"📉 Perplexity: {perplexity:.2f}")

In [None]:
# with beam search decoding
while True:
    user_input = input("\nEnter a seed text (or type 'exit' to quit): ").strip()
    if user_input.lower() == 'exit':
        break
    try:
        next_words = int(input("How many words do you want to generate? (e.g., 5, 10): "))
    except ValueError:
        print("⚠️ Please enter a valid number.")
        continue

    generated_output = beam_search_decoder(user_input, beam_width=5, next_words=next_words, max_len=X.shape[1])
    print(f"\n📝 Generated Text:\n{generated_output}")

    # Ask for reference sentence for BLEU
    reference = input("Enter reference sentence for BLEU score (or press Enter to skip): ").strip()
    if reference:
        bleu = evaluate_bleu(reference, generated_output)
        print(f"🔵 BLEU Score: {bleu:.4f}")
    else:
        print("ℹ️ BLEU Score skipped.")

    # Perplexity
    perplexity = compute_perplexity(model, dataset)
    print(f"📉 Perplexity: {perplexity:.2f}")

In [None]:
# Save model
model.save("word-wave.keras")

# Save tokenizer
import pickle
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)
