<a href="https://colab.research.google.com/github/GordonYang02/Intro-to-DL/blob/main/homework3_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    confusion_matrix,
)
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
from torchvision import datasets, transforms

#Problem 2
from torch.utils.data import Dataset, DataLoader, random_split
import requests

# Select the compute device once for the notebook: CUDA when present, else CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(f"Using device: {device}")

# GPU details are only queried when CUDA was actually selected, so this cell
# also runs on CPU-only machines (previously the code was disabled inside a
# string literal, which the notebook then echoed as the cell's output).
if device.type == "cuda":
    devNumber = torch.cuda.current_device()
    devName = torch.cuda.get_device_name(devNumber)

    print(f"Current device number is: {devNumber}")
    print(f"GPU name is: {devName}")

Using device: cuda


'devNumber = torch.cuda.current_device()\ndevName = torch.cuda.get_device_name(devNumber)\n\nprint(f"Current device number is: {devNumber}")\nprint(f"GPU name is: {devName}")'

In [11]:
import time
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split

# --- Sample Data ---
# In-notebook corpus for the character-level experiments in this cell.
# Python joins the adjacent string literals into a single string; the explicit
# "\n" escapes are the only line breaks in the resulting text.
training_text = (
    "Next character prediction is a fundamental task in the field of natural language processing (NLP) "
    "that involves predicting the next character in a sequence of text based on the characters that precede it. "
    "This task is essential for various applications, including text auto-completion, spell checking, and even in the "
    "development of sophisticated AI models capable of generating human-like text.\n"
    "At its core, next character prediction relies on statistical models or deep learning algorithms to analyze a given "
    "sequence of text and predict which character is most likely to follow. These predictions are based on patterns and "
    "relationships learned from large datasets of text during the training phase of the model.\n"
    "One of the most popular approaches to next character prediction involves the use of Recurrent Neural Networks (RNNs), "
    "and more specifically, a variant called Long Short-Term Memory (LSTM) networks. RNNs are particularly well-suited for "
    "sequential data like text, as they can maintain information in 'memory' about previous characters to inform the prediction "
    "of the next character. LSTM networks enhance this capability by being able to remember long-term dependencies, making them "
    "even more effective for next character prediction tasks.\n"
    "Training a model for next character prediction involves feeding it large amounts of text data, allowing it to learn the "
    "probability of each character's appearance following a sequence of characters. During this training process, the model "
    "adjusts its parameters to minimize the difference between its predictions and the actual outcomes, thus improving its "
    "predictive accuracy over time.\n"
    "Once trained, the model can be used to predict the next character in a given piece of text by considering the sequence of "
    "characters that precede it. This can enhance user experience in text editing software, improve efficiency in coding environments "
    "with auto-completion features, and enable more natural interactions with AI-based chatbots and virtual assistants.\n"
    "In summary, next character prediction plays a crucial role in enhancing the capabilities of various NLP applications, making "
    "text-based interactions more efficient, accurate, and human-like. Through the use of advanced machine learning models like "
    "RNNs and LSTMs, next character prediction continues to evolve, opening new possibilities for the future of text-based technology."
)

# --- Create Character Mappings ---
# Vocabulary: every distinct character of the corpus, in sorted order.
all_chars = sorted(set(training_text))
# Index <-> character lookups; an index is the character's position in all_chars.
idx_to_char = dict(enumerate(all_chars))
char_to_idx = {ch: idx for idx, ch in idx_to_char.items()}

# --- Dataset Creation ---
def create_dataset(seq_len, text, mapping):
    """Convert text into fixed-length index windows and next-character targets.

    Args:
        seq_len: Number of characters in every input window; must be >= 1.
        text: String the window slides over, one character at a time.
        mapping: Dict from character to integer index. Every character that
            lands in a window or a target must be a key.

    Returns:
        Tuple ``(inputs, targets)`` of int64 NumPy arrays with shapes
        ``(n, seq_len)`` and ``(n,)`` where ``n = max(len(text) - seq_len, 0)``.
        The shapes hold even when ``n`` is 0.

    Raises:
        ValueError: If ``seq_len`` is smaller than 1.
        KeyError: If a needed character is missing from ``mapping``.
    """
    if seq_len < 1:
        raise ValueError("seq_len must be a positive integer")

    inputs, targets = [], []
    for pos in range(len(text) - seq_len):
        chunk = text[pos: pos + seq_len]
        inputs.append([mapping[c] for c in chunk])
        targets.append(mapping[text[pos + seq_len]])

    # The explicit dtype and reshape keep the result 2-D and integer-typed even
    # when the text is too short to produce a single window.
    return (
        np.array(inputs, dtype=np.int64).reshape(-1, seq_len),
        np.array(targets, dtype=np.int64),
    )

# --- Model Definition ---
class CharPredictor(nn.Module):
    """Embedding -> one recurrent layer (RNN/LSTM/GRU) -> linear head.

    The forward pass returns one row of class logits per input sequence,
    computed from the recurrent output at the last time step.
    """

    def __init__(self, vocab_size, hidden_dim, num_classes, cell_variant="RNN"):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.embed = nn.Embedding(vocab_size, hidden_dim)

        # Case-insensitive lookup of the recurrent layer class.
        recurrent_layers = {"RNN": nn.RNN, "LSTM": nn.LSTM, "GRU": nn.GRU}
        layer_cls = recurrent_layers.get(cell_variant.upper())
        if layer_cls is None:
            raise ValueError("cell_variant must be one of: RNN, LSTM, GRU")
        self.cell = layer_cls(hidden_dim, hidden_dim, batch_first=True)

        self.classifier = nn.Linear(hidden_dim, num_classes)

    def forward(self, seq):
        # Index 0 of the recurrent layer's return value is the per-step output;
        # the hidden state that accompanies it is not needed here.
        step_outputs = self.cell(self.embed(seq))[0]
        # batch_first=True, so axis 1 is time; keep only the last step.
        return self.classifier(step_outputs[:, -1, :])

# --- Training & Evaluation ---
def run_training(model, train_data, valid_data, num_epochs, lr, device=None):
    """Train ``model`` with one full-batch Adam step per epoch and validate each epoch.

    Args:
        model: Module that maps the input tensor to class logits.
        train_data: Tuple ``(X_train, y_train)`` of tensors.
        valid_data: Tuple ``(X_valid, y_valid)`` of tensors.
        num_epochs: Number of epochs (one optimizer step each); must be >= 1.
        lr: Learning rate handed to Adam.
        device: Optional torch device. When given, the model and all four
            tensors are moved to it before training; when ``None`` (default)
            everything stays where the caller placed it.

    Returns:
        Tuple ``(train_loss, val_accuracy, elapsed_seconds, param_count)``.
        ``train_loss`` is the loss from the last epoch's forward pass, i.e.
        measured before that epoch's optimizer step.

    Raises:
        ValueError: If ``num_epochs`` is smaller than 1 (there would be no loss
            or accuracy to report).
    """
    if num_epochs < 1:
        raise ValueError("num_epochs must be at least 1")

    X_train, y_train = train_data
    X_valid, y_valid = valid_data
    if device is not None:
        # Move the model before building the optimizer so the optimizer is
        # created over parameters that already live on the target device.
        model = model.to(device)
        X_train, y_train = X_train.to(device), y_train.to(device)
        X_valid, y_valid = X_valid.to(device), y_valid.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    t0 = time.time()
    for ep in range(num_epochs):
        # One gradient step on the entire training set.
        model.train()
        optimizer.zero_grad()
        logits = model(X_train)
        loss = criterion(logits, y_train)
        loss.backward()
        optimizer.step()

        # Validation without gradient tracking.
        model.eval()
        with torch.no_grad():
            val_logits = model(X_valid)
            val_loss = criterion(val_logits, y_valid)
            _, preds = torch.max(val_logits, dim=1)
            accuracy = (preds == y_valid).float().mean()
        if (ep + 1) % 10 == 0:
            print(f"Epoch {ep+1}: Train Loss={loss.item():.4f} | Val Loss={val_loss.item():.4f} | Val Acc={accuracy.item():.4f}")
    elapsed = time.time() - t0
    total_params = sum(p.numel() for p in model.parameters())
    return loss.item(), accuracy.item(), elapsed, total_params

# --- Main Experiment Loop ---
if __name__ == '__main__':
    # Experiment settings
    seq_options = [10, 20, 30]
    cell_types = ["RNN", "LSTM", "GRU"]
    hidden_dim = 128
    num_epochs = 100
    lr = 0.005

    # Run on the GPU when one is available. The model and every tensor handed
    # to run_training must live on the same device.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    experiment_results = []

    for cell in cell_types:
        print(f"\n=== Experiment: {cell} Model ===")
        for seq_len in seq_options:
            print(f"\n--- Sequence Length: {seq_len} ---")
            # Generate full dataset for current sequence length
            X_all, y_all = create_dataset(seq_len, training_text, char_to_idx)
            # Split into training and validation sets (fixed seed -> same split every run)
            X_train_np, X_val_np, y_train_np, y_val_np = train_test_split(
                X_all, y_all, test_size=0.2, random_state=42
            )
            # Convert numpy arrays to PyTorch tensors on the selected device
            X_train = torch.tensor(X_train_np, dtype=torch.long, device=device)
            y_train = torch.tensor(y_train_np, dtype=torch.long, device=device)
            X_val = torch.tensor(X_val_np, dtype=torch.long, device=device)
            y_val = torch.tensor(y_val_np, dtype=torch.long, device=device)

            # Initialize the model on the same device as the data
            model_inst = CharPredictor(
                vocab_size=len(all_chars),
                hidden_dim=hidden_dim,
                num_classes=len(all_chars),
                cell_variant=cell
            ).to(device)
            # Train and evaluate the model
            final_loss, final_acc, elapsed_time, param_count = run_training(
                model_inst, (X_train, y_train), (X_val, y_val), num_epochs, lr
            )
            experiment_results.append({
                'Cell Type': cell,
                'Sequence Length': seq_len,
                'Final Loss': final_loss,
                'Validation Accuracy': final_acc,
                'Time (s)': elapsed_time,
                'Parameter Count': param_count
            })

    # --- Print Summary ---
    # Kept inside the guard: experiment_results only exists when the
    # experiments above have actually run.
    print("\n=== Summary of Results ===")
    for res in experiment_results:
        print(f"{res['Cell Type']} | Seq Len: {res['Sequence Length']} | Loss: {res['Final Loss']:.4f} | "
              f"Val Acc: {res['Validation Accuracy']:.4f} | Time: {res['Time (s)']:.2f}s | Params: {res['Parameter Count']}")



=== Experiment: RNN Model ===

--- Sequence Length: 10 ---
Epoch 10: Train Loss=2.2529 | Val Loss=2.2937 | Val Acc=0.3866
Epoch 20: Train Loss=1.7820 | Val Loss=2.0591 | Val Acc=0.4286
Epoch 30: Train Loss=1.4257 | Val Loss=1.9322 | Val Acc=0.4811
Epoch 40: Train Loss=1.1156 | Val Loss=1.8887 | Val Acc=0.4979
Epoch 50: Train Loss=0.8352 | Val Loss=1.8972 | Val Acc=0.5336
Epoch 60: Train Loss=0.6036 | Val Loss=1.9626 | Val Acc=0.5567
Epoch 70: Train Loss=0.4017 | Val Loss=2.0809 | Val Acc=0.5609
Epoch 80: Train Loss=0.2560 | Val Loss=2.2224 | Val Acc=0.5483
Epoch 90: Train Loss=0.1584 | Val Loss=2.3785 | Val Acc=0.5399
Epoch 100: Train Loss=0.1054 | Val Loss=2.5253 | Val Acc=0.5441

--- Sequence Length: 20 ---
Epoch 10: Train Loss=2.2658 | Val Loss=2.3164 | Val Acc=0.3734
Epoch 20: Train Loss=1.8031 | Val Loss=2.0673 | Val Acc=0.4473
Epoch 30: Train Loss=1.4424 | Val Loss=1.9782 | Val Acc=0.4873
Epoch 40: Train Loss=1.1247 | Val Loss=1.9556 | Val Acc=0.4852
Epoch 50: Train Loss=0.8389 

In [13]:
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
import requests

# --- Data Acquisition ---
def fetch_shakespeare(url="https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt", timeout=30):
    """Download a text file over HTTP and return its body as a string.

    Args:
        url: Address to fetch; defaults to the Tiny Shakespeare corpus.
        timeout: Seconds passed to ``requests.get`` so an unresponsive server
            cannot block the notebook indefinitely.

    Returns:
        The response body decoded as text.

    Raises:
        RuntimeError: If the server answers with a status other than 200.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise RuntimeError("Failed to download the dataset.")
    return response.text

# --- Data Preparation ---
def prepare_shakespeare_data(seq_len, text):
    """Encode ``text`` as indices and cut it into windows with next-character targets.

    Args:
        seq_len: Number of characters in each input window; must be >= 1.
        text: Corpus string. Its sorted set of characters defines the vocabulary.

    Returns:
        Tuple ``(sequences, targets, char2idx, idx2char)``: a ``torch.long``
        tensor of shape ``(n, seq_len)``, a ``torch.long`` tensor of shape
        ``(n,)`` and the two lookup dicts, where
        ``n = max(len(text) - seq_len, 0)``.

    Raises:
        ValueError: If ``seq_len`` is smaller than 1.
    """
    if seq_len < 1:
        raise ValueError("seq_len must be a positive integer")

    # Create a sorted set of characters and mapping dictionaries
    unique_chars = sorted(set(text))
    char2idx = {ch: idx for idx, ch in enumerate(unique_chars)}
    idx2char = {idx: ch for idx, ch in enumerate(unique_chars)}

    # Encode the whole text once, directly into a tensor
    encoded = torch.tensor([char2idx[ch] for ch in text], dtype=torch.long)

    n_samples = encoded.numel() - seq_len
    if n_samples <= 0:
        # Text too short for a single window: return empty, correctly shaped tensors.
        sequences = torch.empty((0, seq_len), dtype=torch.long)
        targets = torch.empty((0,), dtype=torch.long)
    else:
        # unfold produces one window per start position as a strided view.
        # Dropping the final element first leaves exactly the windows that have
        # a following target character. contiguous()/clone() detach the results
        # from the shared storage of `encoded`.
        sequences = encoded[:-1].unfold(0, seq_len, 1).contiguous()
        targets = encoded[seq_len:].clone()

    # Return tensors along with the mappings
    return sequences, targets, char2idx, idx2char

# --- Custom Dataset ---
class ShakespeareDataset(Dataset):
    """Index-aligned pairing of input sequences with their targets."""

    def __init__(self, sequences, targets):
        # Both containers are stored as given; item i of one belongs to item i of the other.
        self.sequences, self.targets = sequences, targets

    def __len__(self):
        # The dataset size is the number of stored input sequences.
        n_items = len(self.sequences)
        return n_items

    def __getitem__(self, idx):
        sequence = self.sequences[idx]
        target = self.targets[idx]
        return sequence, target

# --- Model Definition ---
class CharRNNModel(nn.Module):
    """Embedding -> stacked LSTM or GRU -> linear head producing ``out_dim`` logits."""

    def __init__(self, vocab_size, hidden_dim, out_dim, cell_type="LSTM", num_layers=1):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.embed = nn.Embedding(vocab_size, hidden_dim)

        # Case-insensitive choice of the recurrent layer class.
        supported = {"LSTM": nn.LSTM, "GRU": nn.GRU}
        rnn_cls = supported.get(cell_type.upper())
        if rnn_cls is None:
            raise ValueError("cell_type must be either 'LSTM' or 'GRU'")
        self.rnn = rnn_cls(hidden_dim, hidden_dim, num_layers=num_layers, batch_first=True)

        # Projection from the recurrent output to the output classes
        self.fc = nn.Linear(hidden_dim, out_dim)

    def forward(self, x):
        # x holds token indices laid out as (batch_size, sequence_length)
        step_outputs = self.rnn(self.embed(x))[0]
        # Only the last time step feeds the classifier
        return self.fc(step_outputs[:, -1, :])

# --- Training and Evaluation ---
def train_model(model, train_loader, valid_loader, epochs, lr, device):
    """Train ``model`` with mini-batch Adam and evaluate it after every epoch.

    Args:
        model: Module mapping a batch of inputs to class logits.
        train_loader: Iterable of ``(inputs, targets)`` training batches that supports ``len()``.
        valid_loader: Iterable of ``(inputs, targets)`` validation batches that supports ``len()``.
        epochs: Number of passes over ``train_loader``; must be >= 1.
        lr: Learning rate handed to Adam.
        device: Torch device the model and every batch are moved to.

    Returns:
        Tuple ``(avg_train_loss, val_accuracy, total_time, param_count)`` from
        the last epoch. Losses are averaged over batches.

    Raises:
        ValueError: If ``epochs`` is smaller than 1 or either loader has no
            batches (the averages would be undefined).
    """
    if epochs < 1:
        raise ValueError("epochs must be at least 1")
    if len(train_loader) == 0 or len(valid_loader) == 0:
        raise ValueError("train_loader and valid_loader must each contain at least one batch")

    loss_fn = nn.CrossEntropyLoss()
    # Move the model first so the optimizer is built over on-device parameters.
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    start_time = time.time()
    for epoch in range(epochs):
        model.train()
        running_loss = 0.0
        for batch_x, batch_y in train_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            optimizer.zero_grad()
            preds = model(batch_x)
            loss = loss_fn(preds, batch_y)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        avg_train_loss = running_loss / len(train_loader)

        # Validation loop
        model.eval()
        val_loss_total = 0.0
        correct = 0
        count = 0
        with torch.no_grad():
            for v_x, v_y in valid_loader:
                v_x, v_y = v_x.to(device), v_y.to(device)
                v_preds = model(v_x)
                loss_val = loss_fn(v_preds, v_y)
                val_loss_total += loss_val.item()
                _, predicted_labels = torch.max(v_preds, dim=1)
                correct += (predicted_labels == v_y).sum().item()
                count += v_y.size(0)
        avg_val_loss = val_loss_total / len(valid_loader)
        val_accuracy = correct / count

        if (epoch + 1) % 10 == 0:
            print(f"Epoch {epoch+1}: Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | Val Acc: {val_accuracy:.4f}")

    total_time = time.time() - start_time
    param_count = sum(p.numel() for p in model.parameters())
    return avg_train_loss, val_accuracy, total_time, param_count

# --- Main Experiment ---
if __name__ == "__main__":
    # Check for GPU availability
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Download the tiny Shakespeare dataset
    shakespeare_text = fetch_shakespeare()

    # Hyperparameters and settings
    seq_lengths = [20, 30, 50]  # Include sequence length 50 as requested
    cell_variants = ["LSTM", "GRU"]
    hidden_dim = 128
    num_epochs = 50
    batch_size = 128
    learning_rate = 0.005
    num_layers = 1  # Can adjust to experiment with deeper RNNs

    results = []

    # Loop over sequence lengths and model types
    for seq_len in seq_lengths:
        print(f"\n=== Processing Sequence Length: {seq_len} ===")
        X_data, y_data, char2idx, idx2char = prepare_shakespeare_data(seq_len, shakespeare_text)
        dataset = ShakespeareDataset(X_data, y_data)

        # Split dataset into training and validation subsets (80 % / 20 %)
        train_size = int(0.8 * len(dataset))
        valid_size = len(dataset) - train_size
        train_set, valid_set = random_split(dataset, [train_size, valid_size])

        train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
        valid_loader = DataLoader(valid_set, batch_size=batch_size, shuffle=False)

        # Both cell types are trained on the same split for this sequence length.
        for cell in cell_variants:
            print(f"\n--- Training {cell} model with sequence length {seq_len} ---\n")
            model = CharRNNModel(
                vocab_size=len(char2idx),
                hidden_dim=hidden_dim,
                out_dim=len(char2idx),
                cell_type=cell,
                num_layers=num_layers
            )
            train_loss, val_acc, train_time, model_params = train_model(
                model, train_loader, valid_loader, num_epochs, learning_rate, device
            )
            results.append({
                "Cell Type": cell,
                "Seq Length": seq_len,
                "Train Loss": train_loss,
                "Val Acc": val_acc,
                "Time (s)": train_time,
                "Model Size": model_params
            })

    # --- Summary of Results ---
    # Kept inside the guard: `results` only exists when the experiments ran.
    print("\n=== Final Model Comparison ===")
    for res in results:
        print(f"{res['Cell Type']} | Seq Len: {res['Seq Length']} | Loss: {res['Train Loss']:.4f} | Val Acc: {res['Val Acc']:.4f} | Time: {res['Time (s)']:.2f}s | Params: {res['Model Size']}")




=== Processing Sequence Length: 20 ===

--- Training LSTM model with sequence length 20 ---

Epoch 10: Train Loss: 1.5437 | Val Loss: 1.5639 | Val Acc: 0.5265
Epoch 20: Train Loss: 1.5946 | Val Loss: 1.6108 | Val Acc: 0.5153
Epoch 30: Train Loss: 1.6395 | Val Loss: 1.6616 | Val Acc: 0.5025
Epoch 40: Train Loss: 1.6615 | Val Loss: 1.6788 | Val Acc: 0.4995
Epoch 50: Train Loss: 1.6783 | Val Loss: 1.6913 | Val Acc: 0.4915

--- Training GRU model with sequence length 20 ---

Epoch 10: Train Loss: 1.8482 | Val Loss: 1.8565 | Val Acc: 0.4568
Epoch 20: Train Loss: 1.8582 | Val Loss: 1.8808 | Val Acc: 0.4480
Epoch 30: Train Loss: 1.8616 | Val Loss: 1.8535 | Val Acc: 0.4585
Epoch 40: Train Loss: 1.8523 | Val Loss: 1.8584 | Val Acc: 0.4589
Epoch 50: Train Loss: 1.8428 | Val Loss: 1.8611 | Val Acc: 0.4554

=== Processing Sequence Length: 30 ===

--- Training LSTM model with sequence length 30 ---

Epoch 10: Train Loss: 1.5261 | Val Loss: 1.5479 | Val Acc: 0.5354
Epoch 20: Train Loss: 1.5768 | Va

In [20]:
"""Problem 2 Adjust hyperparameters (fully connected network, number of hidden layers, and the number of hidden states)"""

import time
import torch
import torch.nn as nn
import torch.optim as optim
import requests
from torch.utils.data import Dataset, DataLoader, random_split

# Download the Tiny Shakespeare dataset
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
# The timeout keeps an unresponsive server from hanging the cell.
response = requests.get(url, timeout=30)
# Stop on an HTTP error instead of silently using an error page as the corpus.
response.raise_for_status()
text = response.text  # the whole text

# Function to convert text into sequences of a given length
def get_data(seq_len, source_text=None):
    """Turn a corpus into index windows of length ``seq_len`` plus next-character targets.

    Args:
        seq_len: Number of characters in each input window; must be >= 1.
        source_text: Corpus string. When ``None`` (default) the module-level
            ``text`` variable is used, which is the original behaviour.

    Returns:
        Tuple ``(X, y, char2idx, idx2char)``: ``torch.long`` tensors of shape
        ``(n, seq_len)`` and ``(n,)`` and the two lookup dicts, where
        ``n = max(len(corpus) - seq_len, 0)``.

    Raises:
        ValueError: If ``seq_len`` is smaller than 1.
    """
    if seq_len < 1:
        raise ValueError("seq_len must be a positive integer")
    corpus = text if source_text is None else source_text

    # Get unique characters and build mappings
    chars = sorted(set(corpus))
    char2idx = {c: i for i, c in enumerate(chars)}
    idx2char = {i: c for i, c in enumerate(chars)}

    # Encode the text into numbers, once, directly as a tensor
    encoded = torch.tensor([char2idx[c] for c in corpus], dtype=torch.long)

    n_samples = encoded.numel() - seq_len
    if n_samples <= 0:
        # Corpus too short for a single window: empty but correctly shaped output.
        X = torch.empty((0, seq_len), dtype=torch.long)
        y = torch.empty((0,), dtype=torch.long)
    else:
        # unfold gives every window as a strided view; dropping the last element
        # first keeps only windows that are followed by a target character.
        X = encoded[:-1].unfold(0, seq_len, 1).contiguous()
        y = encoded[seq_len:].clone()
    return X, y, char2idx, idx2char

# Create a simple Dataset class
class MyDataset(Dataset):
    """Index-aligned pairing of inputs ``X`` with labels ``y``."""

    def __init__(self, X, y):
        # Stored as given; element i of X belongs to element i of y.
        self.X, self.y = X, y

    def __len__(self):
        # Size is taken from the inputs.
        n_items = len(self.X)
        return n_items

    def __getitem__(self, idx):
        features = self.X[idx]
        label = self.y[idx]
        return features, label

# Define a simple RNN model using either LSTM or GRU
class SimpleRNN(nn.Module):
    """Embedding -> single LSTM or GRU layer -> linear head over the vocabulary.

    Args:
        vocab_size: Number of distinct tokens; also the number of output logits.
        hidden_size: Width of the embedding and of the recurrent state.
        model_type: ``"LSTM"`` or ``"GRU"`` (case-insensitive).

    Raises:
        ValueError: If ``model_type`` names anything else. Previously any other
            value silently produced a GRU, which hid typos.
    """

    def __init__(self, vocab_size, hidden_size, model_type="LSTM"):
        super(SimpleRNN, self).__init__()
        self.emb = nn.Embedding(vocab_size, hidden_size)
        kind = model_type.upper()
        if kind == "LSTM":
            self.rnn = nn.LSTM(hidden_size, hidden_size, batch_first=True)
        elif kind == "GRU":
            self.rnn = nn.GRU(hidden_size, hidden_size, batch_first=True)
        else:
            raise ValueError("model_type must be either 'LSTM' or 'GRU'")
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """Return one row of vocabulary logits per input sequence."""
        x = self.emb(x)           # shape: [batch, seq_len, hidden_size]
        out, _ = self.rnn(x)      # shape: [batch, seq_len, hidden_size]
        out = self.fc(out[:, -1, :])  # use last time step
        return out

# Training loop function
def train_model(model, train_loader, val_loader, epochs, lr, device):
    """Train ``model`` with mini-batch Adam and evaluate it after every epoch.

    Args:
        model: Module mapping a batch of inputs to class logits.
        train_loader: Iterable of ``(inputs, targets)`` training batches that supports ``len()``.
        val_loader: Iterable of ``(inputs, targets)`` validation batches that supports ``len()``.
        epochs: Number of passes over ``train_loader``; must be >= 1.
        lr: Learning rate handed to Adam.
        device: Torch device the model and every batch are moved to.

    Returns:
        Tuple ``(avg_loss, acc, total_time, num_params)`` from the last epoch.
        Losses are averaged over batches.

    Raises:
        ValueError: If ``epochs`` is smaller than 1 or either loader has no
            batches (the averages would be undefined).
    """
    if epochs < 1:
        raise ValueError("epochs must be at least 1")
    if len(train_loader) == 0 or len(val_loader) == 0:
        raise ValueError("train_loader and val_loader must each contain at least one batch")

    criterion = nn.CrossEntropyLoss()
    # Move the model first so the optimizer is built over on-device parameters.
    model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    start = time.time()

    for ep in range(epochs):
        model.train()
        total_loss = 0
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            preds = model(xb)
            loss = criterion(preds, yb)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        avg_loss = total_loss / len(train_loader)

        # Evaluate on validation set
        model.eval()
        val_loss = 0
        correct = 0
        total = 0
        with torch.no_grad():
            for xb, yb in val_loader:
                xb, yb = xb.to(device), yb.to(device)
                outputs = model(xb)
                batch_loss = criterion(outputs, yb)
                val_loss += batch_loss.item()
                _, preds = torch.max(outputs, 1)
                correct += (preds == yb).sum().item()
                total += yb.size(0)
        avg_val_loss = val_loss / len(val_loader)
        acc = correct / total
        if (ep + 1) % 10 == 0:
            print(f"Epoch {ep+1}: Train {avg_loss:.4f}, Val {avg_val_loss:.4f}, Acc {acc:.4f}")

    total_time = time.time() - start
    num_params = sum(p.numel() for p in model.parameters())
    return avg_loss, acc, total_time, num_params

# Experiment configuration
seq_lengths = [20, 30]   # you can add 50 if needed
model_types = ["LSTM", "GRU"]
hidden_size, epochs, batch_size, lr = 64, 50, 128, 0.01
results = []
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# One dataset/split per sequence length, shared by every model type
for sl in seq_lengths:
    print(f"\nProcessing sequence length: {sl}")
    X, y, c2i, i2c = get_data(sl)
    dataset = MyDataset(X, y)

    # 80 % of the samples go to training, the remainder to validation
    train_size = int(0.8 * len(dataset))
    val_size = len(dataset) - train_size
    train_set, val_set = random_split(dataset, [train_size, val_size])

    # Only the training batches are shuffled
    train_loader = DataLoader(train_set, shuffle=True, batch_size=batch_size)
    val_loader = DataLoader(val_set, shuffle=False, batch_size=batch_size)

    for typ in model_types:
        print(f"Training {typ} model with seq_len {sl}")
        model = SimpleRNN(len(c2i), hidden_size, model_type=typ)
        tr_loss, val_acc, t_time, m_size = train_model(
            model, train_loader, val_loader, epochs, lr, device
        )
        # Record layout: (model type, seq length, train loss, val accuracy, seconds, parameter count)
        results.append((typ, sl, tr_loss, val_acc, t_time, m_size))

# Report one line per trained model
print("\nFinal Results:")
for r in results:
    print(f"{r[0]} | Seq: {r[1]} | Loss: {r[2]:.4f} | Acc: {r[3]:.4f} | Time: {r[4]:.2f}s | Size: {r[5]}")



Processing sequence length: 20
Training LSTM model with seq_len 20
Epoch 10: Train 1.8000, Val 1.8293, Acc 0.4620
Epoch 20: Train 1.8491, Val 1.8675, Acc 0.4493
Epoch 30: Train 1.8838, Val 1.8950, Acc 0.4436
Epoch 40: Train 1.9002, Val 1.9067, Acc 0.4412
Epoch 50: Train 1.9165, Val 1.9237, Acc 0.4411
Training GRU model with seq_len 20
Epoch 10: Train 2.0195, Val 2.0394, Acc 0.4036
Epoch 20: Train 2.0560, Val 2.0622, Acc 0.4108
Epoch 30: Train 2.0707, Val 2.0946, Acc 0.3990
Epoch 40: Train 2.1019, Val 2.1088, Acc 0.3884
Epoch 50: Train 2.1130, Val 2.1364, Acc 0.3908

Processing sequence length: 30
Training LSTM model with seq_len 30
Epoch 10: Train 1.7862, Val 1.7984, Acc 0.4667
Epoch 20: Train 1.8458, Val 1.8456, Acc 0.4605
Epoch 30: Train 1.8799, Val 1.8894, Acc 0.4495
Epoch 40: Train 1.8849, Val 1.8975, Acc 0.4483
Epoch 50: Train 1.8977, Val 1.8998, Acc 0.4473
Training GRU model with seq_len 30
Epoch 10: Train 2.0078, Val 2.0031, Acc 0.4127
Epoch 20: Train 2.0619, Val 2.0662, Acc 0.

In [19]:
'''What if we increase the sequence length to 50? Perform the training and report the accuracy and model complexity results.'''

import time
import torch
import torch.nn as nn
import torch.optim as optim
import requests
from torch.utils.data import Dataset, DataLoader, random_split

#  Get the Data
def download_shakespeare(timeout=30):
    """Download the Tiny Shakespeare corpus and return it as a string.

    Args:
        timeout: Seconds passed to ``requests.get`` so an unresponsive server
            cannot block the notebook indefinitely.

    Returns:
        The response body decoded as text.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so an
            error page is never returned as if it were the corpus.
    """
    url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()
    return r.text

#   Create Sequences
def make_sequences(seq_length, raw_text):
    """Encode ``raw_text`` and cut it into windows with next-character targets.

    Args:
        seq_length: Number of characters in each input window; must be >= 1.
        raw_text: Corpus string. Its sorted set of characters defines the vocabulary.

    Returns:
        Tuple ``(seqs, targets, mapping)``: ``torch.long`` tensors of shape
        ``(n, seq_length)`` and ``(n,)`` and the character-to-index dict,
        where ``n = max(len(raw_text) - seq_length, 0)``.

    Raises:
        ValueError: If ``seq_length`` is smaller than 1.
    """
    if seq_length < 1:
        raise ValueError("seq_length must be a positive integer")

    # Create character-to-index map
    all_chars = sorted(set(raw_text))
    mapping = {ch: i for i, ch in enumerate(all_chars)}

    # Encode the text once, directly into a tensor
    encoded = torch.tensor([mapping[ch] for ch in raw_text], dtype=torch.long)

    n_samples = encoded.numel() - seq_length
    if n_samples <= 0:
        # Text too short for a single window: empty but correctly shaped output.
        seqs = torch.empty((0, seq_length), dtype=torch.long)
        targets = torch.empty((0,), dtype=torch.long)
    else:
        # unfold gives every window as a strided view; dropping the last element
        # first keeps only windows that are followed by a target character.
        seqs = encoded[:-1].unfold(0, seq_length, 1).contiguous()
        targets = encoded[seq_length:].clone()
    return seqs, targets, mapping

#   Dataset Definition
class ShakespeareDataset(Dataset):
    """Index-aligned pairing of input sequences with their targets."""

    def __init__(self, sequences, targets):
        # Stored as given; element i of one belongs to element i of the other.
        self.seqs, self.targs = sequences, targets

    def __len__(self):
        # Size is taken from the stored input sequences.
        n_items = len(self.seqs)
        return n_items

    def __getitem__(self, idx):
        sequence = self.seqs[idx]
        target = self.targs[idx]
        return sequence, target

#   Define the Model
class MyRNNModel(nn.Module):
    """Embedding -> single LSTM or GRU layer -> linear head over the vocabulary.

    Args:
        vocab_size: Number of distinct tokens; also the number of output logits.
        hidden_dim: Width of the embedding and of the recurrent state.
        cell_type: ``'LSTM'`` or ``'GRU'`` (case-insensitive).

    Raises:
        ValueError: If ``cell_type`` names anything else. Previously any other
            value silently produced a GRU, which hid typos.
    """

    def __init__(self, vocab_size, hidden_dim, cell_type='LSTM'):
        super(MyRNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_dim)
        # Use either LSTM or GRU based on input
        kind = cell_type.upper()
        if kind == 'LSTM':
            self.rnn = nn.LSTM(hidden_dim, hidden_dim, batch_first=True)
        elif kind == 'GRU':
            self.rnn = nn.GRU(hidden_dim, hidden_dim, batch_first=True)
        else:
            raise ValueError("cell_type must be either 'LSTM' or 'GRU'")
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, input_seq):
        """Return one row of vocabulary logits per input sequence."""
        emb = self.embedding(input_seq)   # [batch, seq_len, hidden_dim]
        out, _ = self.rnn(emb)              # [batch, seq_len, hidden_dim]
        last_out = out[:, -1, :]            # use the output from the final time step
        return self.fc(last_out)

#   Training and Evaluation
def train_and_test(model, train_loader, valid_loader, num_epochs, lr, dev):
    """Train ``model`` with mini-batch Adam and evaluate it after every epoch.

    Args:
        model: Module mapping a batch of inputs to class logits.
        train_loader: Iterable of ``(inputs, labels)`` training batches that supports ``len()``.
        valid_loader: Iterable of ``(inputs, labels)`` validation batches that supports ``len()``.
        num_epochs: Number of passes over ``train_loader``; must be >= 1.
        lr: Learning rate handed to Adam.
        dev: Torch device the model and every batch are moved to.

    Returns:
        Tuple ``(train_avg, acc, elapsed, param_total)`` from the last epoch.
        Losses are averaged over batches.

    Raises:
        ValueError: If ``num_epochs`` is smaller than 1 or either loader has no
            batches (the averages would be undefined).
    """
    if num_epochs < 1:
        raise ValueError("num_epochs must be at least 1")
    if len(train_loader) == 0 or len(valid_loader) == 0:
        raise ValueError("train_loader and valid_loader must each contain at least one batch")

    loss_function = nn.CrossEntropyLoss()
    # Move the model first so the optimizer is built over on-device parameters.
    model.to(dev)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    t0 = time.time()

    for epoch in range(1, num_epochs+1):
        model.train()
        sum_loss = 0.0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(dev), labels.to(dev)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = loss_function(outputs, labels)
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()
        train_avg = sum_loss / len(train_loader)

        # Evaluation step
        model.eval()
        total_loss, correct, total = 0.0, 0, 0
        with torch.no_grad():
            for inp, lab in valid_loader:
                inp, lab = inp.to(dev), lab.to(dev)
                out = model(inp)
                total_loss += loss_function(out, lab).item()
                correct += (out.argmax(dim=1) == lab).sum().item()
                total += lab.size(0)
        valid_avg = total_loss / len(valid_loader)
        acc = correct / total
        if epoch % 10 == 0:
            print(f"Epoch {epoch}: TrainLoss = {train_avg:.4f}, ValidLoss = {valid_avg:.4f}, Accuracy = {acc:.4f}")

    elapsed = time.time() - t0
    param_total = sum(p.numel() for p in model.parameters())
    return train_avg, acc, elapsed, param_total

#  Main Experiment (Sequence Length = 50)
def main(seq_length=50, hidden_dim=64, num_epochs=50, batch_size=128, lr=0.01):
    """Train an LSTM and a GRU character model on one shared split and print the results.

    The defaults reproduce the original hard-coded experiment (sequence length 50).

    Args:
        seq_length: Number of characters in each input window.
        hidden_dim: Embedding / recurrent state width of the models.
        num_epochs: Training epochs per model.
        batch_size: Batch size of both data loaders.
        lr: Learning rate handed to the training routine.
    """
    raw_text = download_shakespeare()
    X, Y, char_map = make_sequences(seq_length, raw_text)

    # 80 % of the samples train the models, the remainder validates them.
    dataset = ShakespeareDataset(X, Y)
    train_len = int(0.8 * len(dataset))
    valid_len = len(dataset) - train_len
    train_set, valid_set = random_split(dataset, [train_len, valid_len])
    train_dl = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    valid_dl = DataLoader(valid_set, batch_size=batch_size, shuffle=False)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    results = {}
    for cell in ['LSTM', 'GRU']:
        print(f"\nTraining model using {cell} cell with sequence length = {seq_length}")
        net = MyRNNModel(len(char_map), hidden_dim=hidden_dim, cell_type=cell)
        t_loss, v_acc, run_time, num_params = train_and_test(
            net, train_dl, valid_dl, num_epochs=num_epochs, lr=lr, dev=device
        )
        results[cell] = (t_loss, v_acc, run_time, num_params)

    print("\nFinal Results:")
    for cell_type, metrics in results.items():
        tl, acc, rt, params = metrics
        print(f"{cell_type}: TrainLoss = {tl:.4f}, ValidAcc = {acc:.4f}, Time = {rt:.2f}s, Params = {params}")

if __name__ == '__main__':
    main()



Training model using LSTM cell with sequence length = 50
Epoch 10: TrainLoss = 1.7689, ValidLoss = 1.7733, Accuracy = 0.4745
Epoch 20: TrainLoss = 1.8365, ValidLoss = 1.8507, Accuracy = 0.4486
Epoch 30: TrainLoss = 1.8865, ValidLoss = 1.8929, Accuracy = 0.4483
Epoch 40: TrainLoss = 1.8962, ValidLoss = 1.9097, Accuracy = 0.4437
Epoch 50: TrainLoss = 1.8935, ValidLoss = 1.8943, Accuracy = 0.4485

Training model using GRU cell with sequence length = 50
Epoch 10: TrainLoss = 2.0276, ValidLoss = 2.0328, Accuracy = 0.4103
Epoch 20: TrainLoss = 2.0654, ValidLoss = 2.0666, Accuracy = 0.3957
Epoch 30: TrainLoss = 2.0960, ValidLoss = 2.1114, Accuracy = 0.3945
Epoch 40: TrainLoss = 2.1312, ValidLoss = 2.1335, Accuracy = 0.3918
Epoch 50: TrainLoss = 2.1695, ValidLoss = 2.1824, Accuracy = 0.3737

Final Results:
LSTM: TrainLoss = 1.8935, ValidAcc = 0.4485, Time = 727.30s, Params = 41665
GRU: TrainLoss = 2.1695, ValidAcc = 0.3737, Time = 673.06s, Params = 33345
