In [None]:
from datasets import Dataset

with open("/content/David_Copperfield.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()

# Remove empty lines and strip
lines = [line.strip() for line in lines if line.strip()]

# Create a Dataset from a list of dicts
dataset = Dataset.from_dict({"text": lines})


In [None]:
import pandas as pd
from datasets import load_dataset
from IPython.display import HTML, display

def display_table(dataset_or_sample):
  # A helper fuction to display a Transformer dataset or single sample contains multi-line string nicely
  pd.set_option("display.max_colwidth", None)
  pd.set_option("display.width", None)
  pd.set_option("display.max_rows", None)

  if isinstance(dataset_or_sample, dict):
      df = pd.DataFrame(dataset_or_sample, index=[0])
  else:
      df = pd.DataFrame(dataset_or_sample)

  html = df.to_html()
  styled_html = f"""<style> .dataframe th, .dataframe tbody td {{ text-align: left; padding-right: 30px; }} </style> {html}"""
  display(HTML(styled_html))


display_table(dataset.select(range(40)))

Unnamed: 0,text
0,By Charles Dickens.
1,"This ebook is the product of many hours of hard work by volunteers for Standard Ebooks, and builds on the hard work of other literature lovers made possible by the public domain."
2,This particular ebook is based on a transcription from Project Gutenberg and on digital scans from the Internet Archive.
3,"The source text and artwork in this ebook are believed to be in the United States public domain; that is, they are believed to be free of copyright restrictions in the United States. They may still be copyrighted in other countries, so users located outside of the United States must check their local laws before using this ebook. The creators of, and contributors to, this ebook dedicate their contributions to the worldwide public domain via the terms in the CC0 1.0 Universal Public Domain Dedication. For full license information, see the Uncopyright at the end of this ebook."
4,"Standard Ebooks is a volunteer-driven project that produces ebook editions of public domain literature using modern typography, technology, and editorial standards, and distributes them free of cost. You can download this and other ebooks carefully produced for true book lovers at standardebooks.org."
5,"I do not find it easy to get sufficiently far away from this book, in the first sensations of having finished it, to refer to it with the composure which this formal heading would seem to require. My interest in it, is so recent and strong; and my mind is so divided between pleasure and regret⁠—pleasure in the achievement of a long design, regret in the separation from many companions⁠—that I am in danger of wearying the reader whom I love, with personal confidences, and private emotions."
6,"Besides which, all that I could say of the story, to any purpose, I have endeavoured to say in it."
7,"It would concern the reader little, perhaps, to know, how sorrowfully the pen is laid down at the close of a two-years’ imaginative task; or how an author feels as if he were dismissing some portion of himself into the shadowy world, when a crowd of the creatures of his brain are going from him forever. Yet, I have nothing else to tell; unless, indeed, I were to confess (which might be of less moment still) that no one can ever believe this narrative, in the reading, more than I have believed it in the writing."
8,"Instead of looking back, therefore, I will look forward. I cannot close this volume more agreeably to myself, than with a hopeful glance towards the time when I shall again put forth my two green leaves once a month, and with a faithful remembrance of the genial sun and showers that have fallen on these leaves of David Copperfield, and made me happy."
9,"London, October, 1850."


In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import pickle
import re
from collections import Counter

class LSTMWordPredictor:
    def __init__(self, sequence_length=10, vocab_size=10000, embedding_dim=100, lstm_units=128):
        """
        Initialize the LSTM word predictor with configurable parameters.

        sequence_length: How many previous words to use for prediction
        vocab_size: Maximum number of unique words to keep in vocabulary
        embedding_dim: Dimension of word embeddings
        lstm_units: Number of LSTM units in the hidden layer
        """
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.lstm_units = lstm_units
        self.model = None
        self.tokenizer = None

    def preprocess_text(self, text):
        """
        Clean and preprocess the input text for training.
        This step is crucial for good model performance.
        """
        # Convert to lowercase for consistency
        text = text.lower()

        # Replace multiple spaces with single space
        text = re.sub(r'\s+', ' ', text)

        # Keep only letters, spaces, and basic punctuation
        text = re.sub(r'[^a-zA-Z\s\.\,\!\?\;\:]', '', text)

        # Split into sentences and then words
        sentences = re.split(r'[.!?]+', text)
        words = []

        for sentence in sentences:
            sentence_words = sentence.strip().split()
            if len(sentence_words) > self.sequence_length:  # Only keep longer sentences
                words.extend(sentence_words)

        return words

    def create_sequences(self, words):
        """
        Create input-output pairs for training the LSTM.
        Each sequence of 'sequence_length' words predicts the next word.
        """
        sequences = []
        next_words = []

        # Create overlapping sequences
        for i in range(len(words) - self.sequence_length):
            # Input: sequence of words
            seq = words[i:i + self.sequence_length]
            # Output: the next word
            next_word = words[i + self.sequence_length]

            sequences.append(seq)
            next_words.append(next_word)

        return sequences, next_words

    def prepare_data(self, text):
        """
        Complete data preparation pipeline from raw text to model-ready arrays.
        """
        print("Preprocessing text...")
        words = self.preprocess_text(text)
        print(f"Total words after preprocessing: {len(words)}")

        print("Creating sequences...")
        sequences, next_words = self.create_sequences(words)
        print(f"Created {len(sequences)} training sequences")

        # Initialize and fit tokenizer on all words
        print("Building vocabulary...")
        all_words = words  # Use all words for vocabulary
        self.tokenizer = Tokenizer(num_words=self.vocab_size, oov_token="<OOV>")
        self.tokenizer.fit_on_texts([all_words])

        # Convert sequences to numbers
        sequences_encoded = self.tokenizer.texts_to_sequences(sequences)
        next_words_encoded = self.tokenizer.texts_to_sequences([[word] for word in next_words])
        next_words_encoded = [seq[0] if seq else 0 for seq in next_words_encoded]

        # Convert to numpy arrays
        X = np.array(sequences_encoded)
        y = np.array(next_words_encoded)

        # Convert target to categorical (one-hot encoding)
        actual_vocab_size = min(self.vocab_size, len(self.tokenizer.word_index) + 1)
        #y = to_categorical(y, num_classes=actual_vocab_size)

        print(f"Input shape: {X.shape}")
        print(f"Output shape: {y.shape}")
        print(f"Actual vocabulary size: {actual_vocab_size}")

        return X, y, actual_vocab_size

    def build_model(self, actual_vocab_size):
        """
        Build the LSTM neural network architecture.
        This is where the magic happens - the model learns patterns in word sequences.
        """
        self.model = Sequential([
            # Embedding layer: converts word indices to dense vectors
            # This learns meaningful representations for each word
            Embedding(actual_vocab_size, self.embedding_dim,
                     input_length=self.sequence_length),

            # LSTM layer: the core of our model
            # It learns to remember relevant information from the sequence
            LSTM(self.lstm_units, dropout=0.2, recurrent_dropout=0.2),

            # Dropout for regularization to prevent overfitting
            Dropout(0.3),

            # Dense output layer: predicts probability for each word in vocabulary
            Dense(actual_vocab_size, activation='softmax')
        ])

        loss = tf.keras.losses.SparseCategoricalCrossentropy()

        # Compile with appropriate loss function for multi-class classification
        self.model.compile(
            loss=loss,
            optimizer='adam',
            metrics=['accuracy']
        )

        print("Model architecture:")
        self.model.summary()

        return self.model

    def train(self, text_file_path, epochs=50, batch_size=128, validation_split=0.1):
        """
        Complete training pipeline from text file to trained model.
        """
        try:
            # Read the text file
            print(f"Reading text from {text_file_path}...")
            with open(text_file_path, 'r', encoding='utf-8') as file:
                text = file.read()

            print(f"Loaded text with {len(text)} characters")

            # Prepare training data
            X, y, actual_vocab_size = self.prepare_data(text)

            # Build model
            print("Building model...")
            self.build_model(actual_vocab_size)

            # Train the model
            print("Starting training...")
            history = self.model.fit(
                X, y,
                batch_size=batch_size,
                epochs=epochs,
                validation_split=validation_split,
                verbose=1
            )

            print("Training completed!")
            return history

        except FileNotFoundError:
            print(f"Error: Could not find file '{text_file_path}'")
            print("Please make sure the David_Copperfield.txt file is in the same directory.")
            return None
        except Exception as e:
            print(f"An error occurred during training: {e}")
            return None

    def predict_next_word(self, seed_text, num_predictions=5):
        """
        Predict the next word given a seed text.
        Returns the top predictions with their probabilities.
        """
        if not self.model or not self.tokenizer:
            print("Model not trained yet. Please train the model first.")
            return []

        # Preprocess the seed text the same way as training data
        words = seed_text.lower().split()

        # Take the last 'sequence_length' words
        if len(words) >= self.sequence_length:
            sequence = words[-self.sequence_length:]
        else:
            # Pad with zeros if not enough words
            sequence = [''] * (self.sequence_length - len(words)) + words

        # Convert to numbers
        sequence_encoded = self.tokenizer.texts_to_sequences([sequence])[0]

        # Pad sequence to required length
        sequence_padded = pad_sequences([sequence_encoded],
                                      maxlen=self.sequence_length,
                                      padding='pre')

        # Get predictions
        predictions = self.model.predict(sequence_padded, verbose=0)[0]

        # Get top predictions
        top_indices = np.argsort(predictions)[-num_predictions:][::-1]

        # Convert back to words
        results = []
        reverse_word_map = {v: k for k, v in self.tokenizer.word_index.items()}

        for idx in top_indices:
            if idx in reverse_word_map:
                word = reverse_word_map[idx]
                probability = predictions[idx]
                results.append((word, probability))

        return results

    def generate_text(self, seed_text, num_words=20, temperature=1.0):
        """
        Generate a sequence of text by repeatedly predicting next words.
        Temperature controls randomness: lower = more predictable, higher = more creative.
        """
        if not self.model or not self.tokenizer:
            print("Model not trained yet. Please train the model first.")
            return seed_text

        generated = seed_text.lower().split()

        for _ in range(num_words):
            # Get the sequence for prediction
            if len(generated) >= self.sequence_length:
                sequence = generated[-self.sequence_length:]
            else:
                sequence = [''] * (self.sequence_length - len(generated)) + generated

            # Encode and predict
            sequence_encoded = self.tokenizer.texts_to_sequences([sequence])[0]
            sequence_padded = pad_sequences([sequence_encoded],
                                          maxlen=self.sequence_length,
                                          padding='pre')

            predictions = self.model.predict(sequence_padded, verbose=0)[0]

            # Apply temperature scaling for creativity control
            predictions = np.log(predictions + 1e-8) / temperature
            predictions = np.exp(predictions)
            predictions = predictions / np.sum(predictions)

            # Sample from the probability distribution
            next_index = np.random.choice(len(predictions), p=predictions)

            # Convert back to word
            reverse_word_map = {v: k for k, v in self.tokenizer.word_index.items()}
            if next_index in reverse_word_map:
                next_word = reverse_word_map[next_index]
                generated.append(next_word)
            else:
                break  # Stop if we can't find the word

        return ' '.join(generated)

    def save_model(self, model_path='lstm_word_predictor.h5', tokenizer_path='tokenizer.pkl'):
        """Save the trained model and tokenizer for later use."""
        if self.model:
            self.model.save(model_path)
            print(f"Model saved to {model_path}")

        if self.tokenizer:
            with open(tokenizer_path, 'wb') as f:
                pickle.dump(self.tokenizer, f)
            print(f"Tokenizer saved to {tokenizer_path}")

    def load_model(self, model_path='lstm_word_predictor.h5', tokenizer_path='tokenizer.pkl'):
        """Load a previously trained model and tokenizer."""
        try:
            self.model = tf.keras.models.load_model(model_path)
            print(f"Model loaded from {model_path}")

            with open(tokenizer_path, 'rb') as f:
                self.tokenizer = pickle.load(f)
            print(f"Tokenizer loaded from {tokenizer_path}")

            return True
        except Exception as e:
            print(f"Error loading model: {e}")
            return False


# Example usage and demonstration
if __name__ == "__main__":
    print("=== LSTM Next-Word Prediction for David Copperfield ===\n")

    # Initialize the predictor
    predictor = LSTMWordPredictor(
        sequence_length=10,  # Use 10 words to predict the next one
        vocab_size=5000,     # Keep top 5000 most common words
        embedding_dim=100,   # 100-dimensional word embeddings
        lstm_units=128       # 128 LSTM units
    )

    # Train the model
    print("Starting training process...")
    history = predictor.train('David_Copperfield.txt', epochs=10, batch_size=64)

    if history:
        print("\n=== Training completed! ===")

        # Save the model
        predictor.save_model()

        # Test predictions
        print("\n=== Testing the model ===")

        test_phrases = [
            "I was born on a",
            "The old man walked",
            "She looked at me with",
            "In the morning I would"
        ]

        for phrase in test_phrases:
            print(f"\nSeed: '{phrase}'")
            predictions = predictor.predict_next_word(phrase, num_predictions=3)
            print("Top predictions:")
            for word, prob in predictions:
                print(f"  {word}: {prob:.3f}")

        # Generate some text
        print("\n=== Text Generation Examples ===")
        for phrase in test_phrases[:2]:  # Just first two for brevity
            generated = predictor.generate_text(phrase, num_words=15, temperature=0.8)
            print(f"\nGenerated from '{phrase}':")
            print(f"'{generated}'")

    else:
        print("Training failed. Please check that David_Copperfield.txt is available.")
        print("\nTo use this model:")
        print("1. Download David Copperfield text file")
        print("2. Place it in the same directory as this script")
        print("3. Run the script again")

=== LSTM Next-Word Prediction for David Copperfield ===

Starting training process...
Reading text from David_Copperfield.txt...
Loaded text with 1933907 characters
Preprocessing text...
Total words after preprocessing: 300034
Creating sequences...
Created 300024 training sequences
Building vocabulary...
Input shape: (300024, 10)
Output shape: (300024,)
Actual vocabulary size: 5000
Building model...
Model architecture:


Starting training...
Epoch 1/10
[1m4220/4220[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m246s[0m 57ms/step - accuracy: 0.1035 - loss: 5.9412 - val_accuracy: 0.1675 - val_loss: 5.2451
Epoch 2/10
[1m4220/4220[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m264s[0m 58ms/step - accuracy: 0.1619 - loss: 5.1479 - val_accuracy: 0.1828 - val_loss: 5.0348
Epoch 3/10
[1m4220/4220[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m241s[0m 57ms/step - accuracy: 0.1755 - loss: 4.9123 - val_accuracy: 0.1909 - val_loss: 4.9456
Epoch 4/10
[1m4220/4220[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m259s[0m 56ms/step - accuracy: 0.1824 - loss: 4.7737 - val_accuracy: 0.1928 - val_loss: 4.8951
Epoch 5/10
[1m4220/4220[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m268s[0m 58ms/step - accuracy: 0.1856 - loss: 4.6613 - val_accuracy: 0.1964 - val_loss: 4.8692
Epoch 6/10
[1m4220/4220[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m261s[0m 58ms/step - accuracy: 0.1912 - loss: 4.5826 - val_accuracy: 0.1



Training completed!

=== Training completed! ===
Model saved to lstm_word_predictor.h5
Tokenizer saved to tokenizer.pkl

=== Testing the model ===

Seed: 'I was born on a'
Top predictions:
  <OOV>: 0.336
  little: 0.025
  few: 0.021

Seed: 'The old man walked'
Top predictions:
  out: 0.134
  up: 0.121
  away: 0.092

Seed: 'She looked at me with'
Top predictions:
  a: 0.300
  the: 0.121
  his: 0.105

Seed: 'In the morning I would'
Top predictions:
  have: 0.329
  be: 0.075
  not: 0.074

=== Text Generation Examples ===

Generated from 'I was born on a':
'i was born on a good deal of <OOV> but i feel now as i could even have had had'

Generated from 'The old man walked':
'the old man walked back to the <OOV> without telling me when i lived at all times in the'


In [8]:
phrase= "My deepest sorrows are"
generated = predictor.generate_text(phrase, num_words=128, temperature=0.8)
print(f"\nGenerated from '{phrase}':")
print(f"'{generated}'")


Generated from 'My deepest sorrows are':
'my deepest sorrows are not very much attached it to see the little <OOV> of my <OOV> take possession of the <OOV> of my <OOV> that <OOV> like, for another one of her lips, and she was a small outside table i am sure, but to <OOV> the subject of her <OOV> and to i told you, i should wish to feel it made the money into the <OOV> of my aunts having been sometimes to <OOV> he made a <OOV> of the fire of a <OOV> and the <OOV> <OOV> <OOV> the <OOV> <OOV> of mr jack <OOV> which was a too, and he had gone to mr peggotty, and mr i told him well, and i could <OOV> my throat, to be got i the time when i first came'
