In [2]:
! gdown --id 1O9YIeDxN1GeJUt62aK-weybfTVW5Hwj_

Downloading...
From: https://drive.google.com/uc?id=1O9YIeDxN1GeJUt62aK-weybfTVW5Hwj_
To: /content/1661-0.txt
100% 608k/608k [00:00<00:00, 107MB/s]


In [3]:
# ------------------------
# STEP 1: IMPORT LIBRARIES
# ------------------------

import re     # 're' is Python's regular expressions library for text cleaning


# ------------------------
# STEP 2: LOAD THE TEXT FILE
# ------------------------

# The file you downloaded via gdown gets saved with its original name.
# file_path = "1661-0.txt"

# Open and read the entire book into a single string variable
with open("/content/1661-0.txt", "r", encoding="utf-8") as f:
    text = f.read()


# ------------------------
# STEP 3: EXTRACT ONLY BOOK CONTENT
# (remove Project Gutenberg headers/footers)
# ------------------------

start_marker = "*** START OF THIS PROJECT"
end_marker   = "*** END OF THIS PROJECT"

start_idx = text.find(start_marker)
end_idx   = text.find(end_marker)

# Slice text from START marker to END marker
book_text = text[start_idx:end_idx]


# ------------------------
# STEP 4: BASIC CLEANING
# (lowercase, remove unusual symbols, collapse whitespace)
# ------------------------

# Convert to lowercase for uniformity
book_text = book_text.lower()

# Keep only letters, numbers, spaces, punctuations . , ! ?
book_text = re.sub(r"[^a-z0-9\s,.!?]", " ", book_text)

# Replace multiple spaces/newlines with a single space
book_text = re.sub(r"\s+", " ", book_text).strip()


# ------------------------
# STEP 5: DISPLAY SAMPLE OUTPUT
# ------------------------

print("Cleaned text preview:\n", book_text[:500])
print("\nTotal length of cleaned text:", len(book_text))


Cleaned text preview:
 start of this project gutenberg ebook the adventures of sherlock holmes produced by an anonymous project gutenberg volunteer and jose menendez cover the adventures of sherlock holmes by arthur conan doyle contents i. a scandal in bohemia ii. the red headed league iii. a case of identity iv. the boscombe valley mystery v. the five orange pips vi. the man with the twisted lip vii. the adventure of the blue carbuncle viii. the adventure of the speckled band ix. the adventure of the engineer s thumb

Total length of cleaned text: 552984


In [4]:
# ------------------------------------------
# STEP 2: TOKENIZATION AND VOCABULARY SETUP
# ------------------------------------------

from tensorflow.keras.preprocessing.text import Tokenizer

# Initialize the tokenizer
# - filters='' ensures we DO NOT remove punctuation like . , ? !
tokenizer = Tokenizer(filters='', lower=True, oov_token="<OOV>")

# Fit tokenizer on the cleaned text
tokenizer.fit_on_texts([book_text])

# Total vocabulary size
total_words = len(tokenizer.word_index) + 1

print("Total unique words (vocab size):", total_words)

# Show first 20 word mappings
print("\nSample word index mapping (first 20):")
sample_items = list(tokenizer.word_index.items())[:20]
print(sample_items)


Total unique words (vocab size): 12003

Sample word index mapping (first 20):
[('<OOV>', 1), ('the', 2), ('i', 3), ('and', 4), ('to', 5), ('of', 6), ('a', 7), ('in', 8), ('that', 9), ('it', 10), ('was', 11), ('he', 12), ('you', 13), ('his', 14), ('is', 15), ('my', 16), ('have', 17), ('as', 18), ('with', 19), ('had', 20)]


In [14]:
# ------------------------------------------
# STEP 3: CONVERT TEXT TO INTEGER SEQUENCES
# ------------------------------------------

# Convert entire book text into a sequence of integer tokens
sequences = tokenizer.texts_to_sequences([book_text])

# sequences is a list containing ONE long list → extract it
sequences = sequences[0]

print("Total tokens in the cleaned book:", len(sequences))

# Show first 50 tokens
print("\nFirst 50 tokens:\n", sequences[:50])


Total tokens in the cleaned book: 106055

First 50 tokens:
 [837, 6, 29, 2738, 2739, 3695, 2, 1533, 6, 112, 65, 3696, 43, 45, 5622, 2738, 2739, 5623, 4, 5624, 5625, 1534, 2, 1533, 6, 112, 65, 43, 611, 3697, 3698, 2740, 371, 7, 1095, 8, 1805, 2741, 2, 253, 484, 984, 2742, 7, 146, 6, 2743, 3699, 2, 612]


In [5]:
# ------------------------------------------
# STEP 4: CREATE INPUT–OUTPUT SEQUENCES (Beginner Friendly)
# ------------------------------------------

# 1. Convert the book text into numbers (tokens)
token_list = tokenizer.texts_to_sequences([book_text])[0]

print("Total number of tokens =", len(token_list))


# 2. Create MANY sequences where:
#    Input = some words
#    Output = next word
#    Example: [the, dog] → barked

input_sequences = []

# Start from index 3 (we need at least 2 words before predicting 3rd word)
for i in range(3, len(token_list)):

    # Create a sequence from the start up to i
    # Example: token_list[:5] means first 5 words
    # current_sequence = token_list[0:i]

    # Add this sequence to our list
    input_sequences.append(current_sequence)

print("Total sequences created =", len(input_sequences))


# 3. Find the maximum length of any sequence
# (This is needed so we can pad all sequences to same size)

max_seq_len = 0

for seq in input_sequences:
    if len(seq) > max_seq_len:
        max_seq_len = len(seq)

print("Maximum sequence length =", max_seq_len)


# 4. Pad all sequences on the left side with 0s
#    Example: [3, 5, 9] → [0, 0, 3, 5, 9] (to match max length)

from tensorflow.keras.preprocessing.sequence import pad_sequences

padded_sequences = pad_sequences(
    input_sequences,
    maxlen=max_seq_len,
    padding='pre'   # add 0's on the left
)

print("Padded sequences shape =", padded_sequences.shape)


# 5. Split sequences into Input (X) and Output (y)
#    Example sequence: [0, 0, 3, 5, 9]
#    X = [0, 0, 3, 5]
#    y = 9

import numpy as np

X = padded_sequences[:, :-1]   # all words except last
y = padded_sequences[:, -1]    # last word only

print("Shape of X =", X.shape)
print("Shape of y =", y.shape)


# 6. Convert y (labels) into one-hot encoding
#    Example: word 9 → [0 0 0 0 0 0 0 0 1 0 0 ... ]

# from tensorflow.keras.utils import to_categorical

# y = to_categorical(y, num_classes=total_words)

# print("Final shape of y =", y.shape)
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
model.add(Bidirectional(LSTM(150)))
model.add(Dense(total_words, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

history = model.fit(X, y, epochs=50, verbose=1)


Total number of tokens = 106055


NameError: name 'current_sequence' is not defined

In [None]:
# STEP 4: CREATE INPUT–OUTPUT SEQUENCES (Sliding Window Version)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional



token_list = tokenizer.texts_to_sequences([book_text])[0]
print("Total tokens =", len(token_list))

input_sequences = []
window_size = 20   # keep sequences short

for i in range(window_size, len(token_list)):
    current_sequence = token_list[i-window_size:i]
    input_sequences.append(current_sequence)

print("Total sequences =", len(input_sequences))

from tensorflow.keras.preprocessing.sequence import pad_sequences
padded_sequences = pad_sequences(input_sequences, maxlen=window_size, padding='pre')

X = padded_sequences[:, :-1]
y = padded_sequences[:, -1]

print("X shape =", X.shape)
print("y shape =", y.shape)

model = Sequential()
model.add(Embedding(total_words, 100, input_length=window_size-1))
model.add(Bidirectional(LSTM(150)))
model.add(Dense(total_words, activation='softmax'))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

history = model.fit(X, y, epochs=50, batch_size=128)


Total tokens = 106055
Total sequences = 106035
X shape = (106035, 19)
y shape = (106035,)
Epoch 1/50




[1m759/829[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m17s[0m 245ms/step - accuracy: 0.0542 - loss: 7.1689

In [16]:
# STEP 5 — TEXT GENERATION

def generate_text(seed_text, next_words=30):
    for _ in range(next_words):

        # Convert seed text → numbers (tokens)
        token_list = tokenizer.texts_to_sequences([seed_text])[0]

        # Pad tokens to match training input size
        token_list = pad_sequences([token_list], maxlen=X.shape[1], padding='pre')

        # Predict probabilities for next word
        predicted_probs = model.predict(token_list, verbose=0)

        # Pick the word with highest probability
        predicted_index = np.argmax(predicted_probs)

        # Convert index → word
        output_word = tokenizer.index_word.get(predicted_index, "")

        seed_text += " " + output_word

    return seed_text


In [17]:
# FUNCTION: Get Top-5 Most Likely Next Words

def predict_top_words(seed_text, top_k=5):
    # Convert seed text to tokens
    token_list = tokenizer.texts_to_sequences([seed_text])[0]

    # Pad sequence
    token_list = pad_sequences([token_list], maxlen=X.shape[1], padding='pre')

    # Predict probabilities for next word
    predicted_probs = model.predict(token_list, verbose=0)[0]

    # Get top-k highest probability indices
    top_indices = predicted_probs.argsort()[-top_k:][::-1]

    # Convert indices to words
    predicted_words = [tokenizer.index_word.get(idx, "") for idx in top_indices]

    return predicted_words



In [18]:

while True:
    user_input = input("\nEnter the beginning of a sentence: ")

    # If user enters empty input → skip
    if user_input.strip() == "":
        print("Please type something…")
        continue

    # Predict next words
    predictions = predict_top_words(user_input, top_k=7)
    print("\nTop predictions:", predictions)

    # Ask user if they want to continue
    choice = input("\nDo you want to try again? (yes/no): ").lower()

    if choice in ["no", "n", "exit", "quit"]:
        print("Exiting… Thank you!")
        break

print("Model stopped.")




Enter the beginning of a sentence: 
Please type something…

Enter the beginning of a sentence: The Sherlock Holmes entered the

Top predictions: ['kitchen', 'red', 'key', 'young', 'bed', 'door', 'house,', 'room.', 'door,', 'left']

Do you want to try again? (yes/no): n
Exiting… Thank you!
Model stopped.


In [19]:
model.save("nwp_model.h5")




In [20]:
import pickle
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)


In [24]:
from google.colab import files
files.download("nwp_model.h5")
files.download("tokenizer.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [1]:
!pip install gradio

import tensorflow as tf
import pickle
import numpy as np
import gradio as gr

# Load model
model = tf.keras.models.load_model("nwp_model.keras")

# Load tokenizer
with open("tokenizer.pkl", "rb") as f:
    tokenizer = pickle.load(f)

max_len = 18   # your window_size

def predict_words(sentence):
    seq = tokenizer.texts_to_sequences([sentence])[0]
    seq = seq[-max_len:]
    seq = np.array(seq).reshape(1, -1)

    preds = model.predict(seq)[0]
    top_5 = preds.argsort()[-5:][::-1]

    words = []
    for i in top_5:
        for k, v in tokenizer.word_index.items():
            if v == i:
                words.append(k)
                break

    return ", ".join(words)

ui = gr.Interface(
    fn=predict_words,
    inputs=gr.Textbox(label="Enter text"),
    outputs="text",
    title="Next Word Prediction (Gradio)"
)

ui.launch()



ValueError: File not found: filepath=nwp_model.keras. Please ensure the file is an accessible `.keras` zip file.