In [14]:
## Data Loading
import nltk
nltk.download('gutenberg')
from nltk.corpus import gutenberg
import pandas as pd
import pickle


[nltk_data] Downloading package gutenberg to C:\Users\Rahul
[nltk_data]     patel\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [2]:
## Function to save a subset of a dataset to disk (~5GB by default)
def save_subset(dataset, max_size_mb=5000, output_file='openwebtext_subset.txt'):
    """
    Save a subset of the dataset to disk, limiting to max_size_mb (in MB).

    Items are written in iteration order, one per line. Writing stops at the
    first item that would push the file past the size budget; later items are
    not considered even if they would fit.

    Args:
        dataset: Iterable of mapping-like items; each item's 'text' entry is
            written as one line of the output file.
        max_size_mb: Size budget in megabytes (1 MB = 1024 * 1024 bytes).
        output_file: Path of the UTF-8 text file to create or overwrite.
    """
    max_size_bytes = max_size_mb * 1024 * 1024  # Convert MB to bytes
    current_size = 0

    # newline='\n' turns off platform newline translation, so each line
    # separator is exactly the one byte that is counted below.
    with open(output_file, 'w', encoding='utf-8', newline='\n') as f:
        for item in dataset:
            # Measure the full line, separator included, so the running total
            # matches the bytes that actually end up in the file.
            line = item['text'] + '\n'
            line_size = len(line.encode('utf-8'))
            if current_size + line_size > max_size_bytes:
                break
            f.write(line)
            current_size += line_size

    print(f"Saved subset to {output_file}, size: {current_size / (1024 * 1024):.2f} MB")


In [3]:
## Load Dataset
# Read the raw text of the 'austen-emma.txt' file from the NLTK Gutenberg corpus
# into memory as a single string.
print("Loading Dataset...")
data = gutenberg.raw('austen-emma.txt')
print("Dataset Loaded")

# Save File
# Write the raw text to 'emma.txt' (UTF-8) in the working directory; the
# preprocessing step reads the corpus back from this file.
with open('emma.txt', 'w', encoding='utf-8') as file:
    file.write(data)
    

Loading Dataset...
Dataset Loaded


In [4]:
## Data Preprocessing
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import tensorflow as tf

# Load the dataset
# Read the saved corpus and lower-case it so the same word in different
# casing is treated as one token.
with open('emma.txt', 'r',encoding='utf-8') as file:
    text = file.read().lower()

## Tokenize the text (convert words to integers)
# Fit a Keras Tokenizer on the whole corpus (passed as a single document) to
# build the word -> integer index mapping stored in tokenizer.word_index.
print("Tokenizing text...")
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
# Vocabulary size used for the embedding and output layers.
total_words = len(tokenizer.word_index) + 1  # +1 so index 0 (the padding value) is covered as well
print(f"Total words: {total_words}")


Tokenizing text...
Total words: 7233


In [5]:
## Input Sequences
# Each line of the corpus is encoded to integer ids, and every prefix of that
# line with at least two tokens becomes one training n-gram.
print("Generating input sequences...")
encoded_lines = (
    tokenizer.texts_to_sequences([raw_line])[0]
    for raw_line in text.split('\n')
)
input_sequences = [
    encoded[:prefix_len]
    for encoded in encoded_lines
    for prefix_len in range(2, len(encoded) + 1)
]
print("Total input sequences generated:", len(input_sequences))


Generating input sequences...
Total input sequences generated: 146818


In [None]:
## Pad Sequences
# Left-pad every n-gram to the length of the longest one so that all
# sequences share one length and stack into a single 2-D array.
max_sequence_length = max(map(len, input_sequences))
input_sequences = pad_sequences(
    input_sequences,
    padding='pre',
    maxlen=max_sequence_length,
)
print(f"Input sequences shape: {input_sequences.shape}")


Input sequences shape: (146818, 17)


In [7]:
# Display the padded sequences as the cell output (rows are left-padded with 0).
input_sequences

array([[   0,    0,    0, ...,    0,   32,   45],
       [   0,    0,    0, ...,   32,   45,   92],
       [   0,    0,    0, ...,   45,   92, 4410],
       ...,
       [   0,    0,    0, ...,  534,  260,    4],
       [   0,    0,    0, ...,  260,    4,    2],
       [   0,    0,    0, ...,    4,    2, 2784]], dtype=int32)

In [8]:
## Create predictors and label
print("Creating predictors and labels...")
# Predictors: every column except the last. Label: the last column, i.e. the
# index of the word that follows the predictor tokens.
x, y = input_sequences[:, :-1], input_sequences[:, -1]
# One-hot encode the labels over the vocabulary, matching the
# 'categorical_crossentropy' loss the model is compiled with.
# NOTE(review): this builds a dense (num_sequences, total_words) array; with the
# sizes printed in this notebook (146818 x 7233) that is very large. Integer
# labels with 'sparse_categorical_crossentropy' would presumably avoid it, but
# the compile step must change in the same commit -- confirm before switching.
y = tf.keras.utils.to_categorical(y, num_classes=total_words)
print("Created predictors and labels")


Creating predictors and labels...
Created predictors and labels


In [9]:
## Split the data into training and testing sets
print("Splitting data into training and testing sets...")
# Hold out 20% of the sequences (test_size=0.2); the fixed random_state makes
# the split repeatable. The held-out part is passed to model.fit as
# validation data.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)


Splitting data into training and testing sets...


In [None]:
## Training the Model (LSTM RNN)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

## Define the model
# Layer stack: embedding -> LSTM emitting the full sequence -> dropout ->
# LSTM emitting only its final output -> softmax over the vocabulary.
model = Sequential([
    Embedding(total_words, 100, input_length=max_sequence_length-1),  # Define input_length
    LSTM(200, return_sequences=True),
    Dropout(0.2),
    LSTM(100),
    Dense(total_words, activation='softmax'),
])
# Inputs are the padded sequences without their last (label) column.
model.build(input_shape=(None, max_sequence_length-1))  # Define input_shape for the model

## Compiling the model
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()
print("Model Compiled")


Model Compiled


In [12]:
## Training the model
# Fit for 60 epochs, evaluating on the held-out split after every epoch; the
# returned History object is kept in `history`.
# NOTE(review): in the run recorded below, val_loss is lowest at epoch 3
# (5.5615) and rises afterwards while the training loss keeps falling, which
# looks like overfitting. Consider early stopping or fewer epochs -- confirm
# whether fitting the training text this closely is intended.
print("Training the model...")
history = model.fit(x_train, y_train, epochs=60, validation_data=(x_test, y_test), verbose=1)
print("Model Training Completed")


Training the model...
Epoch 1/60
[1m3671/3671[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 34ms/step - accuracy: 0.0390 - loss: 6.4944 - val_accuracy: 0.0924 - val_loss: 5.8378
Epoch 2/60
[1m3671/3671[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 35ms/step - accuracy: 0.1016 - loss: 5.6070 - val_accuracy: 0.1127 - val_loss: 5.6265
Epoch 3/60
[1m3671/3671[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 35ms/step - accuracy: 0.1179 - loss: 5.2918 - val_accuracy: 0.1233 - val_loss: 5.5615
Epoch 4/60
[1m3671/3671[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 33ms/step - accuracy: 0.1296 - loss: 5.0723 - val_accuracy: 0.1279 - val_loss: 5.5697
Epoch 5/60
[1m3671/3671[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 31ms/step - accuracy: 0.1385 - loss: 4.9218 - val_accuracy: 0.1333 - val_loss: 5.6033
Epoch 6/60
[1m3671/3671[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m115s[0m 31ms/step - accuracy: 0.1440 - loss: 4.7960 - val_accuracy: 0.

In [15]:
## Saving the model
# Persist the trained model to 'Next_Word_Predictor_LSTM.h5' in the working directory.
# NOTE(review): the '.h5' suffix presumably selects Keras' legacy HDF5 format;
# newer Keras releases recommend '.keras' -- confirm against the installed version.
model.save('Next_Word_Predictor_LSTM.h5')

## Save the tokenizer
# Pickle the fitted tokenizer so the same word -> index mapping can be reused
# at prediction time. Only unpickle this file from a trusted source.
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
    



In [17]:
## Function to predict the next word
def predict_next_word(model, tokenizer, text, max_sequence_length):
    """
    Predict the word most likely to follow `text`.

    Args:
        model: Trained model whose input width is max_sequence_length - 1 and
            whose output is a probability distribution over word indices.
        tokenizer: Fitted Keras Tokenizer used to encode `text` and to map the
            predicted index back to a word.
        text: Input phrase; words unknown to the tokenizer are dropped by it.
        max_sequence_length: Length of the padded training sequences
            (predictor tokens plus the label position).

    Returns:
        The predicted word, or None when the predicted index has no word
        (for example index 0, the padding value).
    """
    token_list = tokenizer.texts_to_sequences([text])[0]
    # Left-pad short inputs and keep only the most recent tokens of long ones,
    # so the input always has the width the model was built for.
    token_list = pad_sequences(
        [token_list],
        maxlen=max_sequence_length - 1,
        padding='pre',
        truncating='pre',
    )

    # Predict the next word
    predicted = model.predict(token_list, verbose=0)
    # Reduce the single-row prediction to a plain int so the lookup does not
    # depend on comparing against a one-element array.
    predicted_word_index = int(np.argmax(predicted, axis=1)[0])

    # index_word is the tokenizer's reverse mapping (index -> word); using it
    # avoids scanning the whole vocabulary on every call.
    return tokenizer.index_word.get(predicted_word_index)

In [18]:
## Testing the model
input_text = "The evil of the actual disparity in their"
print("Input text: ", input_text)
# The model's input width is max_sequence_length - 1 (the label column is
# stripped from the predictors), so add 1 back to recover the original value.
max_sequence_length = model.input_shape[1] + 1  # +1 for the label position removed from the inputs
predicted_word = predict_next_word(model, tokenizer, input_text, max_sequence_length)
print("Predicted next word: ", predicted_word)


Input text:  The evil of the actual disparity in their
Predicted next word:  ages
