In [1]:
import re
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Data Info


In [2]:
# Custom Dataset for Macbeth text
class TextDataset(Dataset):
    def __init__(self, text, seq_len, char2idx):
        self.seq_len = seq_len
        self.char2idx = char2idx

        # Encode entire text to integers
        encoded_text = [self.char2idx[c] for c in text]
        self.data = [encoded_text[i:i + seq_len + 1] for i in range(len(encoded_text) - seq_len)]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sequence = self.data[idx]
        return torch.tensor(sequence[:-1]), torch.tensor(sequence[1:])

In [3]:
# Load and preprocess the text data
with open('macbeth.txt', 'r', encoding='utf-8') as file:
    raw_text = file.read()

# Basic cleanup: remove license information, non-dialogue metadata, etc.
clean_text = re.sub(r"<<.*?>>", "", raw_text)  # Remove Project Gutenberg tags
clean_text = re.sub(r"\s+", " ", clean_text)  # Remove excessive whitespace
clean_text = clean_text.lower()  # Convert to lowercase

In [4]:
# Create a character set and mapping
chars = sorted(set(clean_text))
char2idx = {c: i for i, c in enumerate(chars)}
idx2char = {i: c for i, c in enumerate(chars)}

# Hyperparameters
seq_len = 100
batch_size = 64
hidden_dim = 256
num_layers = 2
lr = 0.001
num_epochs = 10

# Create the dataset and dataloader
dataset = TextDataset(clean_text, seq_len, char2idx)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [5]:
# Define the RNN model using LSTM
class ShakespeareRNN(nn.Module):
    def __init__(self, vocab_size, hidden_dim, num_layers):
        super(ShakespeareRNN, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.embedding = nn.Embedding(vocab_size, hidden_dim)
        self.lstm = nn.LSTM(hidden_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden):
        x = self.embedding(x)
        out, hidden = self.lstm(x, hidden)
        out = self.fc(out)
        return out, hidden

    def init_hidden(self, batch_size):
        return (torch.zeros(self.num_layers, batch_size, self.hidden_dim),
                torch.zeros(self.num_layers, batch_size, self.hidden_dim))


In [6]:
# Initialize the model, loss function, and optimizer
vocab_size = len(chars)
model = ShakespeareRNN(vocab_size, hidden_dim, num_layers)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)


In [8]:
# Training loop
model.train()
for epoch in range(num_epochs):
    total_loss = 0
    
    for input_seq, target_seq in dataloader:
        batch_size, seq_len = input_seq.size()  # Get the actual batch size

        # Initialize hidden state with the current batch size
        hidden = model.init_hidden(batch_size)

        optimizer.zero_grad()
        output, hidden = model(input_seq, hidden)
        
        # Since we reinitialize the hidden state at the start of each batch,
        # we do not need to detach the hidden states.
        loss = criterion(output.view(-1, vocab_size), target_seq.view(-1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {total_loss / len(dataloader):.4f}")


Epoch [1/10], Loss: 0.4895
Epoch [2/10], Loss: 0.2083
Epoch [3/10], Loss: 0.1684
Epoch [4/10], Loss: 0.1542
Epoch [5/10], Loss: 0.1457
Epoch [6/10], Loss: 0.1399
Epoch [7/10], Loss: 0.1356
Epoch [8/10], Loss: 0.1321
Epoch [9/10], Loss: 0.1290
Epoch [10/10], Loss: 0.1268


In [10]:
# Generate new text function
def generate_text(model, start_text, length, char2idx, idx2char):
    model.eval()
    with torch.no_grad():
        input_ids = torch.tensor([char2idx[c] for c in start_text]).unsqueeze(0)
        hidden = model.init_hidden(1)

        generated = start_text

        for _ in range(length):
            output, hidden = model(input_ids, hidden)
            pred_token = output[0, -1].argmax().item()

            generated += idx2char[pred_token]
            input_ids = torch.cat([input_ids, torch.tensor([[pred_token]])], dim=1)

        return generated

# Generate text based on the trained model
start_text = "macbeth: is this a dagger which i see before me,"
generated_text = generate_text(model, start_text, 500, char2idx, idx2char)
print(generated_text)

macbeth: is this a dagger which i see before me, the handle toward my hand? come, let me clutch thee. i have thee not, and yet i see thee still. art thou not, fatal vision, sensible to fear. leave all the rest to me. exeunt. scene vi. before macbeth's castle. hautboys and torches. enter a sewer and divers servants with dishes and service, who pass over the stage. then enter macbeth. macbeth. if it were done when 'tis done, then 'twere well it were a death, with two murtherers. now go to the door, and stay there till we call. exit attendant. w
