In [4]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import time
import matplotlib.pyplot as plt

# Load the text file (upload input.txt first)
with open("input.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Vocabulary
chars = sorted(list(set(text)))
vocab_size = len(chars)
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for ch, i in stoi.items()}
encoded_text = torch.tensor([stoi[c] for c in text], dtype=torch.long)

# Dataset
class CharDataset(Dataset):
    def __init__(self, data, seq_length):
        self.data = data
        self.seq_length = seq_length

    def __len__(self):
        return len(self.data) - self.seq_length

    def __getitem__(self, idx):
        return (
            self.data[idx:idx + self.seq_length],
            self.data[idx + self.seq_length]
        )


In [6]:
###Models (Transformer, LSTM)s
class TransformerCharModel(nn.Module):
    def __init__(self, vocab_size, emb_size=64, num_heads=2, num_layers=2, ff_dim=128, dropout=0.1, max_seq_len=512):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.pos_embedding = nn.Parameter(torch.randn(1, max_seq_len, emb_size))
        encoder_layer = nn.TransformerEncoderLayer(d_model=emb_size, nhead=num_heads, dim_feedforward=ff_dim, dropout=dropout, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(emb_size, vocab_size)

    def forward(self, x):
        seq_len = x.size(1)
        x = self.embedding(x) + self.pos_embedding[:, :seq_len, :]
        x = self.transformer(x)
        return self.fc(x[:, -1, :])  # Predict next char

class LSTMCharModel(nn.Module):
    def __init__(self, vocab_size, emb_size=64, hidden_size=128, num_layers=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.lstm = nn.LSTM(emb_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        out, _ = self.lstm(x)
        return self.fc(out[:, -1, :])


In [7]:
###Train & Evaluate Function
def train_model(model, train_loader, val_loader, num_epochs=5, lr=0.003):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    start_time = time.time()
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for x, y in train_loader:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            output = model(x)
            loss = criterion(output, y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss / len(train_loader):.4f}")
    elapsed = time.time() - start_time

    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for x, y in val_loader:
            x, y = x.to(device), y.to(device)
            preds = model(x).argmax(dim=1)
            correct += (preds == y).sum().item()
            total += y.size(0)
    acc = correct / total
    return total_loss / len(train_loader), acc, elapsed, sum(p.numel() for p in model.parameters())


In [8]:
##Basic Transformer + LSTM
seq_lengths = [20, 30, 50]

for model_type in ["Transformer", "LSTM"]:
    print(f"\n======= {model_type} RESULTS =======")
    for seq_len in seq_lengths:
        print(f"\nTraining {model_type} with seq_length = {seq_len}")
        dataset = CharDataset(encoded_text, seq_len)
        train_size = int(0.9 * len(dataset))
        val_size = len(dataset) - train_size
        train_set, val_set = torch.utils.data.random_split(dataset, [train_size, val_size])
        train_loader = DataLoader(train_set, batch_size=64, shuffle=True)
        val_loader = DataLoader(val_set, batch_size=64)

        if model_type == "Transformer":
            model = TransformerCharModel(vocab_size, num_heads=2, num_layers=2)
        else:
            model = LSTMCharModel(vocab_size)

        loss, acc, time_sec, params = train_model(model, train_loader, val_loader)
        print(f"Loss: {loss:.4f}, Acc: {acc:.4f}, Time: {time_sec:.2f}s, Params: {params}")




Training Transformer with seq_length = 20
Epoch 1, Loss: 2.0966
Epoch 2, Loss: 1.9779
Epoch 3, Loss: 1.9519
Epoch 4, Loss: 1.9439
Epoch 5, Loss: 1.9410
Loss: 1.9410, Acc: 0.4527, Time: 485.90s, Params: 108097

Training Transformer with seq_length = 30
Epoch 1, Loss: 2.0928
Epoch 2, Loss: 1.9822
Epoch 3, Loss: 1.9607
Epoch 4, Loss: 1.9427
Epoch 5, Loss: 1.9294
Loss: 1.9294, Acc: 0.4607, Time: 498.09s, Params: 108097

Training Transformer with seq_length = 50
Epoch 1, Loss: 2.0933
Epoch 2, Loss: 1.9720
Epoch 3, Loss: 1.9473
Epoch 4, Loss: 1.9395
Epoch 5, Loss: 1.9357
Loss: 1.9357, Acc: 0.4467, Time: 522.73s, Params: 108097


Training LSTM with seq_length = 20
Epoch 1, Loss: 1.6854
Epoch 2, Loss: 1.5021
Epoch 3, Loss: 1.4703
Epoch 4, Loss: 1.4570
Epoch 5, Loss: 1.4506
Loss: 1.4506, Acc: 0.5494, Time: 237.73s, Params: 243969

Training LSTM with seq_length = 30
Epoch 1, Loss: 1.6735
Epoch 2, Loss: 1.4975
Epoch 3, Loss: 1.4669
Epoch 4, Loss: 1.4542
Epoch 5, Loss: 1.4482
Loss: 1.4482, Acc: 

In [9]:
###Transformer Hyperparameter Grid Search
layers_list = [1, 2, 4]
heads_list = [2, 4]
seq_len = 30

print("\n=== Transformer Hyperparameter Comparison ===")
for layers in layers_list:
    for heads in heads_list:
        print(f"\nLayers: {layers}, Heads: {heads}")
        dataset = CharDataset(encoded_text, seq_len)
        train_size = int(0.9 * len(dataset))
        val_size = len(dataset) - train_size
        train_set, val_set = torch.utils.data.random_split(dataset, [train_size, val_size])
        train_loader = DataLoader(train_set, batch_size=64, shuffle=True)
        val_loader = DataLoader(val_set, batch_size=64)

        model = TransformerCharModel(vocab_size, num_heads=heads, num_layers=layers)
        loss, acc, time_sec, params = train_model(model, train_loader, val_loader)
        print(f"Loss: {loss:.4f}, Acc: {acc:.4f}, Time: {time_sec:.2f}s, Params: {params}")



=== Transformer Hyperparameter Comparison ===

Layers: 1, Heads: 2
Epoch 1, Loss: 2.1199
Epoch 2, Loss: 2.0297
Epoch 3, Loss: 2.0194
Epoch 4, Loss: 2.0167
Epoch 5, Loss: 2.0177
Loss: 2.0177, Acc: 0.4297, Time: 326.03s, Params: 74625

Layers: 1, Heads: 4
Epoch 1, Loss: 2.0844
Epoch 2, Loss: 1.9955
Epoch 3, Loss: 1.9963
Epoch 4, Loss: 1.9838
Epoch 5, Loss: 1.9815
Loss: 1.9815, Acc: 0.4333, Time: 330.74s, Params: 74625

Layers: 2, Heads: 2
Epoch 1, Loss: 2.0929
Epoch 2, Loss: 1.9767
Epoch 3, Loss: 1.9575
Epoch 4, Loss: 1.9470
Epoch 5, Loss: 1.9471
Loss: 1.9471, Acc: 0.4488, Time: 496.34s, Params: 108097

Layers: 2, Heads: 4
Epoch 1, Loss: 2.0563
Epoch 2, Loss: 1.9500
Epoch 3, Loss: 1.9291
Epoch 4, Loss: 1.9223
Epoch 5, Loss: 1.9219
Loss: 1.9219, Acc: 0.4639, Time: 497.07s, Params: 108097

Layers: 4, Heads: 2
Epoch 1, Loss: 2.1045
Epoch 2, Loss: 1.9969
Epoch 3, Loss: 2.0015
Epoch 4, Loss: 2.0463
Epoch 5, Loss: 2.5756
Loss: 2.5756, Acc: 0.2929, Time: 835.07s, Params: 175041

Layers: 4, Hea