# **Next Word Prediction**
FACEIN Internship Project

## **Import Libraries**

In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
import numpy as np

# **Dataset Preprocessing**

In [2]:
with open('alllines.txt', 'r',encoding='utf-8') as file:
    text = file.read().lower()

In [3]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
total_words = len(tokenizer.word_index) + 1

print("Vocabulary size:", total_words)

Vocabulary size: 25576


Saving the tokens

In [4]:
import pickle

with open("next_word_prediction_tokens0.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

In [5]:
X = []  #inputs
y = []  #outputs

for l in text.split('\n'):
    token_list = tokenizer.texts_to_sequences([l])[0]
    for i in range(1, len(token_list)):
        X.append(token_list[:i])
        y.append(token_list[i])

X = X[:50000]
y = y[:50000]

#max_sequence_length = 40
#X = pad_sequences(X, maxlen=max_sequence_length, padding='pre')

X = pad_sequences(X, padding='pre')
y = np.array(y)


print("X shape:", X.shape)
print("y shape:", y.shape)
print("Example input (as word index):", X[0])
print("Example output (as word index):", y[0])

X shape: (50000, 55)
y shape: (50000,)
Example input (as word index): [  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
 315]
Example output (as word index): 3


## **Building & Training Model**

### **LSTM Model**

model = Sequential([
    Embedding(input_dim=total_words, output_dim=32), #input_length=1 latest versions might not require this field
    LSTM(100),
    Dense(total_words, activation='softmax')
    ])

model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
    )

model.summary()

### **Model Training**

In [6]:
from tensorflow.keras.models import load_model
model = load_model('next_word_prediction_model_0.h5')
model.summary()



In [7]:
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
from keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
model.fit(X, y, epochs=10, batch_size=1024, validation_split=0.2, callbacks=[early_stop])

In [12]:
loss, accuracy = model.evaluate(X, y)
print(f"Test Accuracy: {accuracy:.4f}")

[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 16ms/step - accuracy: 0.8342 - loss: 0.6553
Test Accuracy: 0.8309


In [13]:
print(f"Test Accuracy: {accuracy * 100:.2f}%")

Test Accuracy: 83.09%


Saving the model

In [None]:
model.save('next_word_prediction_model_0.h5')

## **Model Testing**

In [None]:
def interactive_predict(model, tokenizer, max_sequence_length):
    sequence = []
    index_word = {v: k for k, v in tokenizer.word_index.items()}

    while True:
        print("Current sequence:", " ".join(sequence))
        input_word = input("Enter next word (or '.' to finish): ").lower()
        if input_word == '.':
            break

        sequence.append(input_word)

        token_list = tokenizer.texts_to_sequences([sequence])[0]
        if not token_list:
            print("Unknown word. Try something else.")
            sequence.pop()
            continue

        padded_sequence = pad_sequences([token_list], maxlen=max_sequence_length, padding='pre')
        predicted_probs = model.predict(padded_sequence, verbose=0)[0]
        predicted_indices = np.argsort(predicted_probs)[-5:][::-1]

        predicted_words = [index_word.get(index, '?') for index in predicted_indices]

        print("Predicted next words:", predicted_words)
        print("-" * 20)

    print("Final sequence:", " ".join(sequence), '.')

interactive_predict(model, tokenizer, max_sequence_length=55)


Current sequence: 


## **Application**

In [1]:
%%writefile next_word_app.py
import streamlit as st
import numpy as np
from keras.preprocessing.sequence import pad_sequences
import pickle
from tensorflow.keras.models import load_model

st.markdown(
    """
    <style>
    .stApp {
        background-image: url("https://img.pikbest.com/wp/202405/nodes-1-3d-render-of-connected-by-lines-on-a-black-background_9857780.jpg!sw800");
        background-size: cover;
        background-repeat: no-repeat;
        background-attachment: fixed;
    }
    </style>
    """,
    unsafe_allow_html=True
)

#st.markdown("<h1 style='color:blue;'>Next Word Predictor (LSTM)</h1>", unsafe_allow_html=True)
#st.markdown("<p style='color:green;'>Enter a sequence of words, and the model will predict the next word.</p>", unsafe_allow_html=True)

st.title("Next Word Predictor (LSTM)")
st.write("Enter a sequence of words, and the model will predict the next word.")

#Load tokenizer and model
with open('next_word_prediction_tokens0.pkl', 'rb') as f:
    tokenizer = pickle.load(f)

model = load_model('next_word_prediction_model_0.h5')
max_sequence_length = 55
top_k = 5


#Initialize session state
if 'sequence' not in st.session_state:
    st.session_state.sequence = []

if 'done' not in st.session_state:
    st.session_state.done = False

if 'last_input' not in st.session_state:
    st.session_state.last_input = ""


def get_next_word_suggestions(current_sequence):
    token_list = tokenizer.texts_to_sequences([current_sequence])[0]
    if not token_list:
        return []

    token_list = pad_sequences([token_list], maxlen=max_sequence_length, padding='pre')
    predicted_probs = model.predict(token_list, verbose=0)[0]
    predicted_indices = np.argsort(predicted_probs)[-top_k:][::-1]

    index_word = tokenizer.index_word
    predicted_words = [index_word.get(i, "?") for i in predicted_indices]

    return predicted_words

#Interactive Loop
if not st.session_state.done:
    current_text = " ".join(st.session_state.sequence)
    input_text = st.text_input("Type your sentence here", value=current_text)

    # Detect new word without requiring enter after each word
    if input_text != st.session_state.last_input:
        words = input_text.strip().split()

        #fullstop or new word entry
        if len(words) > len(st.session_state.sequence):
            new_word = words[-1]

            if new_word == ".":
                st.session_state.done = True
            else:
                st.session_state.sequence.append(new_word)

            st.session_state.last_input = input_text
            st.rerun()

        # Handle case when user deletes or rewrites sentence
        elif len(words) < len(st.session_state.sequence):
            st.session_state.sequence = words
            st.session_state.last_input = input_text
            st.rerun()

        else:
            st.session_state.last_input = input_text

    #clickable suggestions
    suggestions = get_next_word_suggestions(input_text)
    if suggestions:
        st.write("### Suggestions:")
        cols = st.columns(top_k)
        for i in range(top_k):
            if cols[i].button(suggestions[i]):
                st.session_state.sequence.append(suggestions[i])
                st.session_state.last_input = " ".join(st.session_state.sequence)
                st.rerun()

#Final Display of Entered Sentence
else:
    full_sentence = " ".join(st.session_state.sequence) + "."
    st.success("Final Sentence:")
    st.write(f"`{full_sentence}`")

    if st.button("Reset"):
        st.session_state.sequence = []
        st.session_state.done = False
        st.session_state.last_input = ""
        st.rerun()

Overwriting next_word_app.py


In [2]:
! streamlit run next_word_app.py

^C
