In [None]:
import string

def enhanced_clean_text(text, start_marker="chapter i"):
    """Reduce raw book text to lowercase letters separated by single spaces.

    Args:
        text: The raw text to clean.
        start_marker: Text marking where the real content begins. It is
            matched case-insensitively and everything before its first
            occurrence is discarded. Pass ``None`` or ``""`` to keep the
            text from its beginning. If the marker is not found, nothing
            is discarded.

    Returns:
        The cleaned text: URL tokens dropped, every character that is not
        a letter or whitespace removed, lowercased, and whitespace runs
        collapsed to single spaces.
    """
    # Step 1: Drop the front matter. Both sides are lowercased so the match
    # is truly case-insensitive (a marker containing an uppercase letter can
    # never be found in lowercased text).
    if start_marker:
        start = text.lower().find(start_marker.lower())
        if start != -1:
            text = text[start:]

    # Step 2: Remove URL tokens. Leading punctuation is ignored so that
    # wrapped links such as "(http://...)" or "<https://...>" are caught too.
    words = [
        word for word in text.split()
        if not word.lstrip(string.punctuation).lower().startswith("http")
    ]
    text = " ".join(words)

    # Steps 3-4: Keep only letters and whitespace. This removes formatting
    # markers (_ ^ { }), punctuation and digits in a single pass. Note that
    # words joined by punctuation merge, e.g. "well-known" -> "wellknown".
    text = "".join(char for char in text if char.isalpha() or char.isspace())
    text = text.lower()

    # Step 5: Collapse the gaps left behind by removed tokens.
    return " ".join(text.split())

# Read the whole book into memory as UTF-8 text
with open("42671-0.txt", mode="r", encoding="utf-8") as file:
    raw_text = file.read()

# Run the cleaning pipeline over the raw text
cleaned_text = enhanced_clean_text(raw_text)

# Show a 500-character preview of the cleaned result
preview = cleaned_text[:500]
print(preview)

start of the project gutenberg ebook note project gutenberg also has an html version of this file which includes the original illustrations see hhtm or hzip httpwwwgutenbergorgfileshhhtm or httpwwwgutenbergorgfileshzip images of the original pages are available through internet archive see transcribers note text enclosed by underscores is in italics italics a carat character is used to denote superscription multiple superscripted characters are enclosed by curly brackets example mrs pride and pr


In [None]:
import numpy as np

# Character-level vocabulary: each distinct character, in sorted order
chars = sorted(set(cleaned_text))
index_to_char = dict(enumerate(chars))  # integer id -> character
char_to_index = {char: idx for idx, char in index_to_char.items()}  # character -> integer id

# Encode the full text as an array of integer ids
text_as_int = np.array(list(map(char_to_index.__getitem__, cleaned_text)))

# Slide a fixed-size window over the text: each window of `sequence_length`
# ids is one input, and the id immediately after the window is its target
sequence_length = 100
num_windows = len(text_as_int) - sequence_length

sequences = np.array(
    [text_as_int[start:start + sequence_length] for start in range(num_windows)]
)
targets = np.array(
    [text_as_int[start + sequence_length] for start in range(num_windows)]
)

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

# Custom dataset class
class TextDataset(Dataset):
    """Pairs each input sequence with its target as long-integer tensors.

    Both arguments are copied into ``torch.long`` tensors at construction;
    item ``idx`` is the tuple ``(sequences[idx], targets[idx])``.
    """

    def __init__(self, sequences, targets):
        self.sequences, self.targets = (
            torch.tensor(data, dtype=torch.long) for data in (sequences, targets)
        )

    def __len__(self):
        # The dataset size is the number of stored input sequences
        return len(self.sequences)

    def __getitem__(self, idx):
        sample = self.sequences[idx]
        label = self.targets[idx]
        return sample, label

# Split by position: the first 90% of the windows train the model and the
# remaining windows are held out for validation
num_sequences = len(sequences)
train_size = int(0.9 * num_sequences)
val_size = num_sequences - train_size

train_dataset = TextDataset(sequences[:train_size], targets[:train_size])
val_dataset = TextDataset(sequences[train_size:], targets[train_size:])

# Only the training batches are shuffled; validation keeps its original order
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=64)

In [None]:
import torch.nn as nn

# Define the LSTM model
class LSTMModel(nn.Module):
    """Embedding -> single LSTM -> linear layer predicting the next token.

    Args:
        vocab_size: Number of distinct tokens; also the size of the output.
        embed_size: Dimension of the token embeddings.
        hidden_size: Dimension of the LSTM hidden state.
    """

    def __init__(self, vocab_size, embed_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_size)
        self.lstm = nn.LSTM(input_size=embed_size, hidden_size=hidden_size, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x, hidden=None):
        """Score the token that follows each input sequence.

        Args:
            x: Integer token ids laid out as (batch, sequence).
            hidden: Optional initial LSTM state, passed through to the LSTM.

        Returns:
            A tuple of the (batch, vocab_size) scores computed from the last
            time step, and the LSTM state after the whole sequence.
        """
        embedded = self.embedding(x)
        lstm_out, hidden = self.lstm(embedded, hidden)
        last_step = lstm_out[:, -1, :]  # only the final position feeds the prediction
        return self.fc(last_step), hidden

# Model hyperparameters
vocab_size = len(chars)  # one output score per distinct character
embed_size, hidden_size = 64, 128

model = LSTMModel(
    vocab_size=vocab_size,
    embed_size=embed_size,
    hidden_size=hidden_size,
)

In [None]:
import torch.optim as optim

# Cross-entropy loss over the model's output scores, minimized with Adam
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

num_epochs = 20

for epoch in range(num_epochs):
    # --- Training pass: gradients enabled, weights updated per batch ---
    model.train()
    train_loss = 0
    # The batch variables are deliberately NOT called `inputs`/`targets`:
    # `targets` is also the module-level array the datasets are built from,
    # and overwriting it here would corrupt any later re-run of that cell.
    for batch_inputs, batch_targets in train_loader:
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)

        # Forward pass (the returned LSTM state is not needed for training)
        outputs, _ = model(batch_inputs)
        loss = criterion(outputs, batch_targets)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # --- Validation pass: no gradients, no weight updates ---
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch_inputs, batch_targets in val_loader:
            batch_inputs = batch_inputs.to(device)
            batch_targets = batch_targets.to(device)
            outputs, _ = model(batch_inputs)
            loss = criterion(outputs, batch_targets)
            val_loss += loss.item()

    # Report the mean per-batch loss for both passes
    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss/len(train_loader):.4f}, Val Loss: {val_loss/len(val_loader):.4f}")

Epoch 1/20, Train Loss: 1.5747, Val Loss: 1.3525
Epoch 2/20, Train Loss: 1.2992, Val Loss: 1.2610
Epoch 3/20, Train Loss: 1.2326, Val Loss: 1.2205
Epoch 4/20, Train Loss: 1.1966, Val Loss: 1.2061
Epoch 5/20, Train Loss: 1.1737, Val Loss: 1.1826
Epoch 6/20, Train Loss: 1.1560, Val Loss: 1.1804
Epoch 7/20, Train Loss: 1.1438, Val Loss: 1.1718
Epoch 8/20, Train Loss: 1.1336, Val Loss: 1.1653
Epoch 9/20, Train Loss: 1.1249, Val Loss: 1.1600
Epoch 10/20, Train Loss: 1.1173, Val Loss: 1.1523
Epoch 11/20, Train Loss: 1.1107, Val Loss: 1.1552
Epoch 12/20, Train Loss: 1.1054, Val Loss: 1.1534
Epoch 13/20, Train Loss: 1.0998, Val Loss: 1.1486
Epoch 14/20, Train Loss: 1.0956, Val Loss: 1.1530
Epoch 15/20, Train Loss: 1.0926, Val Loss: 1.1446
Epoch 16/20, Train Loss: 1.0887, Val Loss: 1.1435
Epoch 17/20, Train Loss: 1.0853, Val Loss: 1.1478
Epoch 18/20, Train Loss: 1.0833, Val Loss: 1.1403
Epoch 19/20, Train Loss: 1.0802, Val Loss: 1.1447
Epoch 20/20, Train Loss: 1.0782, Val Loss: 1.1452


In [None]:
def generate_text(model, seed_text, length, temperature=None):
    """Extend ``seed_text`` by ``length`` characters predicted by ``model``.

    Each step feeds the last ``sequence_length`` characters of the text so
    far to the model, starting from a fresh LSTM state, and appends the
    chosen next character.

    Relies on the module-level ``char_to_index``, ``index_to_char``,
    ``sequence_length`` and ``device``.

    Args:
        model: Callable returning ``(scores, hidden)`` for a (1, seq) tensor
            of character ids; must provide ``eval()``.
        seed_text: Starting text. Every character must be a key of
            ``char_to_index`` (a ``KeyError`` is raised otherwise).
        length: Number of characters to generate.
        temperature: ``None`` or a value <= 0 picks the highest-scoring
            character at every step (deterministic). A positive value
            samples from ``softmax(scores / temperature)`` instead, which
            helps avoid the repetitive loops greedy decoding falls into.

    Returns:
        ``seed_text`` followed by the generated characters.

    Raises:
        ValueError: If characters are requested but ``seed_text`` is empty.
    """
    if length > 0 and not seed_text:
        raise ValueError("seed_text must contain at least one character")

    model.eval()  # Set the model to evaluation mode
    generated_text = seed_text

    with torch.no_grad():
        for _ in range(length):
            # Encode the trailing context window as a batch of one
            window = generated_text[-sequence_length:]
            input_seq = torch.tensor(
                [char_to_index[char] for char in window], dtype=torch.long
            ).unsqueeze(0).to(device)

            # The window already holds the full context, so the LSTM state
            # is NOT carried over between steps; carrying it would make the
            # model process the overlapping characters a second time.
            scores, _ = model(input_seq)
            scores = scores.squeeze(0)  # Remove batch dimension

            if temperature is None or temperature <= 0:
                next_char_index = torch.argmax(scores).item()
            else:
                probabilities = torch.softmax(scores / temperature, dim=-1)
                next_char_index = torch.multinomial(probabilities, num_samples=1).item()

            generated_text += index_to_char[next_char_index]

    return generated_text

# Generate a 200-character continuation of a lowercased seed phrase
seed_text = "It is a truth universally acknowledged that".lower()
generated_text = generate_text(model=model, seed_text=seed_text, length=200)
print(generated_text)

it is a truth universally acknowledged that he had not to the companion of her family and the common and the common and the common and the common and the common and the common and the common and the common and the common and the common and the


In [None]:
import math

def calculate_perplexity(model, data_loader):
    """Return exp(mean cross-entropy) of ``model`` over ``data_loader``.

    Each batch's mean loss is weighted by its batch size, so the result is
    an average over examples rather than over batches. Batches are moved to
    the module-level ``device`` before the forward pass.

    Args:
        model: Callable returning ``(scores, hidden)`` for a batch of inputs.
        data_loader: Iterable of ``(inputs, targets)`` tensor pairs.

    Returns:
        The perplexity as a Python float.
    """
    model.eval()  # evaluation mode
    loss_fn = nn.CrossEntropyLoss()  # returns the mean loss of a batch
    summed_loss = 0
    num_examples = 0

    with torch.no_grad():
        for batch_x, batch_y in data_loader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            scores, _ = model(batch_x)
            batch_size = batch_x.size(0)
            # Undo the per-batch mean so differently sized batches weigh fairly
            summed_loss += loss_fn(scores, batch_y).item() * batch_size
            num_examples += batch_size

    return math.exp(summed_loss / num_examples)

# Perplexity of the trained model on the held-out validation batches
val_perplexity = calculate_perplexity(model=model, data_loader=val_loader)
print(f"Validation Perplexity: {val_perplexity:.2f}")

Validation Perplexity: 3.14


In [None]:
from collections import Counter

def calculate_entropy(generated_text):
    """Return the Shannon entropy, in bits, of the character distribution.

    Args:
        generated_text: The text whose characters are counted.

    Returns:
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct character; ``0`` for empty input.
    """
    frequencies = Counter(generated_text)
    num_symbols = sum(frequencies.values())

    # Relative frequency of every distinct character
    probabilities = (occurrences / num_symbols for occurrences in frequencies.values())
    return -sum(p * math.log2(p) for p in probabilities)

# Character-level entropy of the text generated above
entropy = calculate_entropy(generated_text=generated_text)
print(f"Entropy of Generated Text: {entropy:.2f}")

Entropy of Generated Text: 3.70


In [None]:
from nltk.translate.bleu_score import sentence_bleu

# Word-level tokens of the reference sentence and of the generated text
reference_sentence = "it is a truth universally acknowledged that a single man in possession of a good fortune must be in want of a wife"
reference_text = reference_sentence.split()
candidate_text = generated_text.lower().split()

# Score the candidate against the single reference
references = [reference_text]
bleu_score = sentence_bleu(references, candidate_text)
print(f"BLEU Score: {bleu_score:.2f}")

BLEU Score: 0.12
