In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import re
import random

In [2]:
# Set seeds for reproducibility
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Set the seed
set_seed(42)  # You can change this to any integer value

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
class TextDataset(Dataset):
    def __init__(self, file_path, seq_length, vocab=None):
        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()
        
        text = re.sub(r'\s+', ' ', text)  # Clean up spaces
        self.vocab = vocab or sorted(set(text))
        self.char2idx = {c: i for i, c in enumerate(self.vocab)}
        self.idx2char = {i: c for i, c in enumerate(self.vocab)}
        self.seq_length = seq_length
        self.text_as_int = np.array([self.char2idx[c] for c in text])
        self.num_samples = len(self.text_as_int) - seq_length
        
    def __len__(self):
        return self.num_samples
    
    def __getitem__(self, idx):
        input_seq = self.text_as_int[idx:idx + self.seq_length]
        target_seq = self.text_as_int[idx + 1:idx + self.seq_length + 1]
        return torch.tensor(input_seq, dtype=torch.long), torch.tensor(target_seq, dtype=torch.long)


In [5]:
class GRUModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super(GRUModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.gru = nn.GRU(embed_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)
    
    def forward(self, x, hidden):
        x = self.embedding(x)
        out, hidden = self.gru(x, hidden)
        out = self.fc(out)
        return out, hidden
    
    def init_hidden(self, batch_size):
        return torch.zeros(num_layers, batch_size, hidden_size).to(device)


In [6]:
def train_and_generate(model, dataloader, criterion, optimizer, num_epochs, generation_length, generate_every_n_epochs):
    model.train()
    print("Started training")
    for epoch in range(1, num_epochs + 1):
        for inputs, targets in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()

            hidden = model.init_hidden(inputs.size(0))
            
            outputs, hidden = model(inputs, hidden)
            hidden = hidden.detach()
            loss = criterion(outputs.view(-1, vocab_size), targets.view(-1))
            loss.backward()
            optimizer.step()
        
        print(f'Epoch {epoch}/{num_epochs}, Loss: {loss.item()}')
        
        if epoch % generate_every_n_epochs == 0:
            model.eval()
            generated_text = generate_text(model, generation_length)
            print(f"Generated text after {epoch} epochs:\n{generated_text}\n")
            model.train()

In [7]:
def generate_text(model, generation_length, temperature=0.5):
    model.eval()
    generated_text = ""
    input_seq = torch.tensor([[random.randint(0, vocab_size-1)]], dtype=torch.long).to(device)
    hidden = model.init_hidden(1)
    
    for _ in range(generation_length):
        with torch.no_grad():
            output, hidden = model(input_seq, hidden)
            output = output.div(temperature).exp()
            next_char_idx = torch.multinomial(output[0, -1], 1).item()
            next_char = idx2char[next_char_idx]
            generated_text += next_char
            input_seq = torch.tensor([[next_char_idx]], dtype=torch.long).to(device)
    
    return generated_text


In [8]:
# Hyperparameters
file_path = 'trump_speeches_combined_processed.txt'
seq_length = 100
embed_size = 128
hidden_size = 256
num_layers = 2
batch_size = 64
num_epochs = 50
learning_rate = 0.001
generation_length = 500
generate_every_n_epochs = 5

# Initialize Dataset and DataLoader
dataset = TextDataset(file_path, seq_length)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, generator=torch.Generator().manual_seed(42))

# Vocabulary size
vocab_size = len(dataset.vocab)
char2idx = dataset.char2idx
idx2char = dataset.idx2char

# Initialize the model, loss function, and optimizer
model = GRUModel(vocab_size, embed_size, hidden_size, num_layers).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Train the model and generate text in a loop
train_and_generate(model, dataloader, criterion, optimizer, num_epochs, generation_length, generate_every_n_epochs)

Started training
Epoch 1/50, Loss: 0.8258545994758606
Epoch 2/50, Loss: 0.9102525115013123
Epoch 3/50, Loss: 0.8398287892341614
Epoch 4/50, Loss: 0.8924387693405151
Epoch 5/50, Loss: 0.8721145391464233
Generated text after 5 epochs:
 or 2,000 years ago, the democrats are coming in from china. we have the support of the united states is now in her change and they all start fifty, and enforce the right people. this is a big thing. i don't want to just read it. they have to create a fair, safe, sane, i said, "who is a disaster and they could rebuild its audience and they say it's so strongly into the clinton campaign. it’s going to be a lot of money for the farmers. he said her doin today. [applause] and we will make america g

Epoch 6/50, Loss: 0.9931888580322266
Epoch 7/50, Loss: 1.4136778116226196
Epoch 8/50, Loss: 1.3076834678649902
Epoch 9/50, Loss: 2.213380813598633
Epoch 10/50, Loss: 1.355588436126709
Generated text after 10 epochs:
4 we have to people the way, we’re going to be an