In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import requests
from bs4 import BeautifulSoup
import re
from torch.utils.data import Dataset, DataLoader

### Dataset scraping and preparation

In [None]:
print("Scraping and preparing data")
url = "https://www.gutenberg.org/cache/epub/84/pg84-images.html"
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
raw_text = soup.get_text()
tokens = re.findall(r'\b\w+\b', raw_text.lower())

# Build Vocabulary
vocab = sorted(list(set(tokens)))
word_to_idx = {word: i for i, word in enumerate(vocab)}
idx_to_word = {i: word for i, word in enumerate(vocab)}
vocab_size = len(vocab)

# Define window_size variable
window_size = 100
seq_length = window_size - 1 # 99 tokens for input

inputs = []
targets = []

# Format into lists where inner lists contain 99 numbers
for i in range(len(tokens) - window_size + 1):
    seq_in = tokens[i : i + seq_length]
    seq_out = tokens[i + seq_length]

    # Input sequence and target are consecutive
    inputs.append([word_to_idx[w] for w in seq_in])
    targets.append(word_to_idx[seq_out])

class TextDataset(Dataset):
    def __init__(self, x_data, y_data):
        self.x = torch.tensor(x_data, dtype=torch.long)
        self.y = torch.tensor(y_data, dtype=torch.long)

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        return self.x[idx], self.y[idx]

dataset = TextDataset(inputs, targets)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)


Scraping and preparing data


### RNN model architecture

In [None]:
class TextGenerationRNN(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size):
        super(TextGenerationRNN, self).__init__()
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # one RNN layer
        self.rnn = nn.RNN(embed_size, hidden_size, batch_first=True)
        # Fully connected layer for output
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        # x shape: (batch_size, seq_length)
        embedded = self.embedding(x)
        # out shape: (batch_size, seq_length, hidden_size)
        out, hidden = self.rnn(embedded)
        # Take the output of the final token in the sequence to predict the next word
        last_out = out[:, -1, :]
        logits = self.fc(last_out)
        return logits

# Hyperparameters
embed_size = 64
hidden_size = 128
learning_rate = 0.005
epochs = 15

model = TextGenerationRNN(vocab_size, embed_size, hidden_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

### Model Training

In [None]:
print("Starting training...")
model.train()
for epoch in range(epochs):
    epoch_loss = 0
    for batch_inputs, batch_targets in dataloader:
        optimizer.zero_grad()
        outputs = model(batch_inputs)
        loss = criterion(outputs, batch_targets)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    if (epoch + 1) % 3 == 0 or epoch == 0:
        print(f"Epoch {epoch+1}/{epochs} | Loss: {epoch_loss/len(dataloader):.4f}")

Starting training...
Epoch 1/15 | Loss: 6.6138
Epoch 3/15 | Loss: 5.7538
Epoch 6/15 | Loss: 6.2714
Epoch 9/15 | Loss: 5.0601
Epoch 12/15 | Loss: 4.8546
Epoch 15/15 | Loss: 5.5523


### Continue Model Training

In [None]:
epochs = 24

In [None]:
print(f"Continuing training from epoch {epoch + 1}...")
model.train()
# Continue training from the last completed epoch up to the total number of epochs
for current_epoch in range(epoch, epochs):
    epoch_loss = 0
    for batch_inputs, batch_targets in dataloader:
        optimizer.zero_grad()
        outputs = model(batch_inputs)
        loss = criterion(outputs, batch_targets)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    if (current_epoch + 1) % 3 == 0 or current_epoch == 0:
        print(f"Epoch {current_epoch+1}/{epochs} | Loss: {epoch_loss/len(dataloader):.4f}")

print("Training complete.")

Continuing training from epoch 15...
Epoch 15/24 | Loss: 5.8290
Epoch 18/24 | Loss: 5.5928
Epoch 21/24 | Loss: 4.8260
Epoch 24/24 | Loss: 4.5780
Training complete.


### Text Generation

In [10]:
def generate_text(model, seed_words, num_words_to_generate=100):
    model.eval()
    words = seed_words.lower().split()

    with torch.no_grad():
        # generate a text of at least 100 words
        for _ in range(num_words_to_generate):
            # Take the most recent context window (up to 99 tokens)
            context_words = words[-seq_length:]

            # Convert context to indices (defaulting to 0 for unknown words in seed)
            context_indices = [word_to_idx.get(w, 0) for w in context_words]
            x_tensor = torch.tensor([context_indices], dtype=torch.long)

            # Predict the next word
            prediction = model(x_tensor)
            predicted_idx = torch.argmax(prediction, dim=-1).item()
            predicted_word = idx_to_word[predicted_idx]
            words.append(predicted_word)

    return ' '.join(words)

print("\nGenerating text...")
seed_sentence = "the monster looked at me and"
generated_output = generate_text(model, seed_sentence, num_words_to_generate=100)

print("\n--- Final Generated Text ---")
print(generated_output)


Generating text...

--- Final Generated Text ---
the monster looked at me and extinguish which i had arrived at the alteration except the growth of chamounix the cabin where i had arrived at the project gutenberg concept and the alps of the american hemisphere and tingling despairing and i had no right to the treacherous turk i cannot describe the uses of the project gutenberg ebook frankenstein the resources of the project gutenberg concept and ardent works in accordance and the range of the higher of chamounix the silence of the project gutenberg concept and the alps of the american hemisphere and tingling despairing and i shunned the inclemency of the project gutenberg
