# 📘 Next-Word Prediction using MLP (War and Peace)

In [None]:
# ✅ Install Dependencies (Colab only)
!pip install -q tensorflow


In [None]:
# ✅ Imports
import re
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import requests
import pickle


In [None]:
# ✅ Download and Clean Text (War and Peace)
def _clean_text(raw_text):
    """Lower-case `raw_text` and keep only letters, digits, spaces and periods."""
    text = raw_text.lower()
    # Turn every whitespace run (including line breaks) into a single space
    # BEFORE stripping other characters. Deleting "\r\n" outright would glue
    # the last word of a line onto the first word of the next line.
    text = re.sub(r'\s+', ' ', text)
    # The text is already lower-cased, so a-z covers all remaining letters.
    return re.sub(r'[^a-z0-9 .]', '', text)


def download_text(url="https://www.gutenberg.org/files/2600/2600-0.txt", timeout=30):
    """Download a plain-text book and normalise it for word-level modelling.

    Args:
        url: Address of the text file. Defaults to the Project Gutenberg
            edition of "War and Peace" used by this notebook.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The lower-cased text containing only the characters a-z, 0-9,
        space and period. Line breaks are converted to spaces.

    Raises:
        requests.HTTPError: If the server answers with an error status, so
            an error page is never mistaken for the book.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return _clean_text(response.text)

text = download_text()

# Cut the corpus at every period and keep only the trimmed fragments that
# contain more than five whitespace-separated words.
sentences = []
for fragment in text.split('.'):
    trimmed = fragment.strip()
    if len(trimmed.split()) > 5:
        sentences.append(trimmed)

print(f"Total sentences: {len(sentences)}")


Total sentences: 23843


In [None]:
# ✅ Tokenize into Words
# Number of preceding words the model sees when predicting the next word.
context_len = 5

# Fit a word-level tokenizer on the sentences; "<OOV>" is the placeholder
# token configured for words that are not in the fitted vocabulary.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)

# One more than the number of indexed words — presumably to leave room for
# index 0, which the tokenizer does not seem to assign to a word (TODO confirm).
vocab_size = 1 + len(tokenizer.word_index)
print(f"Vocab size: {vocab_size}")


Vocab size: 46190


In [None]:
# ✅ Create (X, y) Word Sequences
# Encode every sentence once with a single batched tokenizer call. Calling the
# tokenizer separately for every context window and every target word repeats
# the same work hundreds of thousands of times.
encoded_sentences = tokenizer.texts_to_sequences(sentences)

sequences = []  # (context words, target word) pairs, kept for inspection
X = []          # context windows as token ids
y = []          # token id of the word that follows each window
for sentence, token_ids in zip(sentences, encoded_sentences):
    words = sentence.split()
    for i in range(context_len, len(words)):
        sequences.append((words[i - context_len:i], words[i]))
    # Slide the window over the token ids themselves, so every row of X has
    # exactly context_len entries even if the tokenizer were to split a
    # sentence differently from str.split().
    for i in range(context_len, len(token_ids)):
        X.append(token_ids[i - context_len:i])
        y.append(token_ids[i])

X = np.array(X)
y = np.array(y)
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)


Shape of X: (395873, 5)
Shape of y: (395873,)


In [None]:
# ✅ Define and Compile MLP Model
# Size of the learned vector for each vocabulary entry.
embedding_dim = 64

# Architecture: embed each of the context_len word ids, concatenate the
# embeddings (Flatten), pass them through one hidden layer, and emit a
# softmax distribution over the whole vocabulary.
model = Sequential([
    # Declare the input with an explicit Input object instead of passing
    # `input_shape=` to the Embedding layer, which newer Keras versions
    # warn against for Sequential models.
    tf.keras.Input(shape=(context_len,)),
    Embedding(vocab_size, embedding_dim, name="embedding"),
    Flatten(),
    Dense(1024, activation='relu'),
    Dense(vocab_size, activation='softmax')
])


# Targets are integer word ids (not one-hot vectors), hence the sparse loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()


  super().__init__(**kwargs)


In [None]:
# ✅ Train the model (use GPU on Colab)
# Keep the History object returned by fit() so the per-epoch metrics remain
# available after training; binding it to a name also stops the cell from
# echoing the object's repr as its output.
history = model.fit(X, y, epochs=50, batch_size=512)


Epoch 1/50
[1m774/774[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 61ms/step - accuracy: 0.0758 - loss: 7.7071
Epoch 2/50
[1m774/774[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 60ms/step - accuracy: 0.1333 - loss: 6.1897
Epoch 3/50
[1m774/774[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 61ms/step - accuracy: 0.1651 - loss: 5.5416
Epoch 4/50
[1m774/774[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 61ms/step - accuracy: 0.2031 - loss: 4.8395
Epoch 5/50
[1m774/774[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 60ms/step - accuracy: 0.2556 - loss: 3.9580
Epoch 6/50
[1m774/774[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 61ms/step - accuracy: 0.4003 - loss: 3.0393
Epoch 7/50
[1m774/774[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 61ms/step - accuracy: 0.5000 - loss: 2.4400
Epoch 8/50
[1m774/774[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 61ms/step - accuracy: 0.5807 - loss: 2.0029
Epoch 9/50
[1m774/774[

<keras.src.callbacks.history.History at 0x7846d60a9210>

In [None]:
# ✅ Save model and tokenizer for use in Streamlit
# Output file names; presumably the Streamlit app loads these exact names,
# so keep them in sync with it (TODO confirm).
model_path = "mlp_model.h5"
tokenizer_path = "tokenizer.pkl"

model.save(model_path)

# The fitted tokenizer is pickled alongside the model so the same
# word-to-id mapping can be restored later.
with open(tokenizer_path, "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

print("Saved model and tokenizer.")




Saved model and tokenizer.


In [None]:
# ✅ Verify GPU is being used
# Ask TensorFlow for the physical GPU devices it can see and report the count.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available:", len(gpu_devices))


Num GPUs Available: 1
