<a href="https://colab.research.google.com/github/JingchenYan1/Real-Time-ML/blob/main/Homework3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import time
from torch.utils.data import Dataset, DataLoader

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Device:", device)

# Training corpus for this cell: a short essay on next-character prediction.
# The literal is used verbatim as data, so its characters (including the
# blank lines between paragraphs) define the vocabulary built below.
text = """Next character prediction is a fundamental task in the field of natural language processing (NLP) that involves predicting the next character in a sequence of text based on the characters that precede it. This task is essential for various applications, including text auto-completion, spell checking, and even in the development of sophisticated AI models capable of generating human-like text.

At its core, next character prediction relies on statistical models or deep learning algorithms to analyze a given sequence of text and predict which character is most likely to follow. These predictions are based on patterns and relationships learned from large datasets of text during the training phase of the model.

One of the most popular approaches to next character prediction involves the use of Recurrent Neural Networks (RNNs), and more specifically, a variant called Long Short-Term Memory (LSTM) networks. RNNs are particularly well-suited for sequential data like text, as they can maintain information in 'memory' about previous characters to inform the prediction of the next character. LSTM networks enhance this capability by being able to remember long-term dependencies, making them even more effective for next character prediction tasks.

Training a model for next character prediction involves feeding it large amounts of text data, allowing it to learn the probability of each character's appearance following a sequence of characters. During this training process, the model adjusts its parameters to minimize the difference between its predictions and the actual outcomes, thus improving its predictive accuracy over time.

Once trained, the model can be used to predict the next character in a given piece of text by considering the sequence of characters that precede it. This can enhance user experience in text editing software, improve efficiency in coding environments with auto-completion features, and enable more natural interactions with AI-based chatbots and virtual assistants.

In summary, next character prediction plays a crucial role in enhancing the capabilities of various NLP applications, making text-based interactions more efficient, accurate, and human-like. Through the use of advanced machine learning models like RNNs and LSTMs, next character prediction continues to evolve, opening new possibilities for the future of text-based technology."""

# Vocabulary: every distinct character of the corpus, in sorted order so the
# character <-> index mapping is deterministic.
chars = sorted(set(text))
vocab_size = len(chars)
print("Vocabulary size:", vocab_size)

# Two-way lookup tables between characters and their integer ids.
char_to_idx = dict(zip(chars, range(vocab_size)))
idx_to_char = dict(enumerate(chars))

# Encode the whole corpus as one 1-D tensor of character ids.
text_indices = list(map(char_to_idx.__getitem__, text))
text_tensor = torch.tensor(text_indices, dtype=torch.long)

class TextDataset(Dataset):
    """Sliding-window dataset for next-character prediction.

    Sample ``i`` is the pair ``(text_tensor[i:i + seq_length],
    text_tensor[i + seq_length])``: a window of ``seq_length`` encoded
    characters and the single character that follows it.

    Args:
        text_tensor: Encoded text; must support ``len()``, slicing and
            integer indexing (the script passes a 1-D ``torch.long`` tensor).
        seq_length: Number of characters in each input window (>= 1).

    Raises:
        ValueError: If ``seq_length`` is smaller than 1.
    """

    def __init__(self, text_tensor, seq_length):
        if seq_length < 1:
            raise ValueError("seq_length must be a positive integer")
        self.text_tensor = text_tensor
        self.seq_length = seq_length

    def __len__(self):
        # Clamp at zero: when the text is not longer than one window there are
        # no samples, and len() rejects a negative return value.
        return max(len(self.text_tensor) - self.seq_length, 0)

    def __getitem__(self, idx):
        """Return ``(window, next_char)`` for sample ``idx``.

        Negative indices count from the end, as for built-in sequences.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        num_samples = len(self)
        if idx < 0:
            # Rebind instead of "+=" so a tensor index is never mutated in place.
            idx = idx + num_samples
        if not 0 <= idx < num_samples:
            raise IndexError("TextDataset index out of range")
        window_end = idx + self.seq_length
        return self.text_tensor[idx:window_end], self.text_tensor[window_end]

class CharRNN(nn.Module):
    """Next-character classifier: embedding -> recurrent stack -> linear head.

    Args:
        rnn_type: Which recurrent layer to use: ``'RNN'``, ``'LSTM'`` or ``'GRU'``.
        vocab_size: Number of distinct tokens; also the size of the output logits.
        embed_size: Dimension of the token embeddings.
        hidden_size: Hidden dimension of the recurrent layer.
        num_layers: Number of stacked recurrent layers.

    Raises:
        ValueError: If ``rnn_type`` is not one of the supported names.
    """

    def __init__(self, rnn_type, vocab_size, embed_size, hidden_size, num_layers=1):
        super().__init__()
        recurrent_layers = {'RNN': nn.RNN, 'LSTM': nn.LSTM, 'GRU': nn.GRU}

        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn_type = rnn_type

        layer_cls = recurrent_layers.get(rnn_type)
        if layer_cls is None:
            raise ValueError("Unknown rnn_type")
        self.rnn = layer_cls(embed_size, hidden_size, num_layers, batch_first=True)

        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """Map a batch of token-id sequences to logits for the next token."""
        embedded = self.embedding(x)
        # Every supported layer returns (outputs, state); the state (a tensor
        # for RNN/GRU, a tuple for LSTM) is not needed here.
        sequence_out, _ = self.rnn(embedded)
        # batch_first=True, so index 1 is time: keep only the last step.
        last_step = sequence_out[:, -1, :]
        return self.fc(last_step)

def train_epoch(model, dataloader, criterion, optimizer):
    """Train ``model`` for one pass over ``dataloader`` and return the mean loss.

    Batches are moved to the module-level ``device`` before the forward pass.

    Args:
        model: Network to optimise; switched to training mode.
        dataloader: Iterable of ``(inputs, targets)`` batches exposing ``.dataset``.
        criterion: Loss callable applied as ``criterion(outputs, targets)``.
        optimizer: Optimizer stepped once per batch.

    Returns:
        Sum of ``batch loss * batch size`` divided by ``len(dataloader.dataset)``.
    """
    model.train()
    running_loss = 0.0
    for batch_inputs, batch_targets in dataloader:
        batch_inputs = batch_inputs.to(device)
        batch_targets = batch_targets.to(device)

        optimizer.zero_grad()
        batch_loss = criterion(model(batch_inputs), batch_targets)
        batch_loss.backward()
        optimizer.step()

        # Weight by batch size so a smaller final batch is not over-counted
        # (assumes the criterion returns a per-batch mean).
        running_loss += batch_loss.item() * batch_inputs.size(0)
    return running_loss / len(dataloader.dataset)

def evaluate_model(model, dataloader, criterion):
    """Compute mean loss and top-1 accuracy of ``model`` over ``dataloader``.

    Runs in evaluation mode with gradients disabled; batches are moved to the
    module-level ``device``.

    Args:
        model: Network to evaluate.
        dataloader: Iterable of ``(inputs, targets)`` batches exposing ``.dataset``.
        criterion: Loss callable applied as ``criterion(outputs, targets)``.

    Returns:
        ``(loss, accuracy)``: the size-weighted batch losses divided by
        ``len(dataloader.dataset)``, and the fraction of samples whose
        highest-scoring class equals the target.
    """
    model.eval()
    loss_sum = 0.0
    num_correct = 0
    num_seen = 0
    with torch.no_grad():
        for batch_inputs, batch_targets in dataloader:
            batch_inputs = batch_inputs.to(device)
            batch_targets = batch_targets.to(device)

            logits = model(batch_inputs)
            loss_sum += criterion(logits, batch_targets).item() * batch_inputs.size(0)

            # Index of the largest logit per row is the predicted class.
            predicted = logits.max(dim=1).indices
            num_correct += (predicted == batch_targets).sum().item()
            num_seen += batch_targets.size(0)
    accuracy = num_correct / num_seen
    return loss_sum / len(dataloader.dataset), accuracy

# Final-epoch metrics for each run, keyed by (rnn_type, seq_length).
results = {}
# Hyperparameters shared by every run in this cell.
num_epochs = 20
batch_size = 32
embed_size = 64
hidden_size = 128
learning_rate = 0.001

# Sweep over input window lengths; the data split is rebuilt per length and
# then shared by the three model types trained on it.
for seq_length in [10, 20, 30]:
    dataset = TextDataset(text_tensor, seq_length)
    dataset_size = len(dataset)
    # 80/20 train/validation split over individual windows.
    # NOTE(review): consecutive windows overlap, so a random split puts
    # near-identical windows in both subsets; the split is also unseeded.
    train_size = int(0.8 * dataset_size)
    val_size = dataset_size - train_size
    train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    for rnn_type in ['RNN', 'LSTM', 'GRU']:
        print(f"\nTraining model: {rnn_type}, Sequence length: {seq_length}")
        # A fresh model and optimizer for every (rnn_type, seq_length) pair.
        model = CharRNN(rnn_type, vocab_size, embed_size, hidden_size).to(device)
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)
        criterion = nn.CrossEntropyLoss()

        # The timed region covers both training and the per-epoch validation.
        start_time = time.time()
        for epoch in range(num_epochs):
            train_loss = train_epoch(model, train_loader, criterion, optimizer)
            val_loss, val_acc = evaluate_model(model, val_loader, criterion)
            print(f"Epoch {epoch + 1:2d}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")
        elapsed_time = time.time() - start_time

        # Count of trainable parameters only.
        num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

        # train_loss / val_loss / val_acc hold the values from the last epoch.
        results[(rnn_type, seq_length)] = {
            "train_loss": train_loss,
            "val_loss": val_loss,
            "val_acc": val_acc,
            "time": elapsed_time,
            "num_params": num_params
        }
        print(f"{rnn_type} Model (Sequence Length {seq_length}) Training Time: {elapsed_time:.2f} sec, Parameters: {num_params}")
        print("-" * 60)

# One line per run with the metrics recorded after its final epoch.
print("\nFinal results summary:")
for key in results:
    res = results[key]
    rnn_type, seq_length = key
    print(
        f"Model: {rnn_type:3s}, Sequence Length: {seq_length:2d}, Train Loss: {res['train_loss']:.4f}, "
        f"Val Loss: {res['val_loss']:.4f}, Val Acc: {res['val_acc']:.4f}, "
        f"Time: {res['time']:.2f}s, Params: {res['num_params']}"
    )


Device: cuda
Vocabulary size: 45

Training model: RNN, Sequence length: 10
Epoch  1/20, Train Loss: 3.1070, Val Loss: 2.6928, Val Acc: 0.2977
Epoch  2/20, Train Loss: 2.5079, Val Loss: 2.4294, Val Acc: 0.3669
Epoch  3/20, Train Loss: 2.2711, Val Loss: 2.2895, Val Acc: 0.3899
Epoch  4/20, Train Loss: 2.0916, Val Loss: 2.2030, Val Acc: 0.4025
Epoch  5/20, Train Loss: 1.9497, Val Loss: 2.1480, Val Acc: 0.4277
Epoch  6/20, Train Loss: 1.8276, Val Loss: 2.0818, Val Acc: 0.4298
Epoch  7/20, Train Loss: 1.7095, Val Loss: 2.0756, Val Acc: 0.4465
Epoch  8/20, Train Loss: 1.6131, Val Loss: 2.0352, Val Acc: 0.4654
Epoch  9/20, Train Loss: 1.5196, Val Loss: 2.0185, Val Acc: 0.4696
Epoch 10/20, Train Loss: 1.4294, Val Loss: 2.0341, Val Acc: 0.4633
Epoch 11/20, Train Loss: 1.3534, Val Loss: 2.0017, Val Acc: 0.4843
Epoch 12/20, Train Loss: 1.2779, Val Loss: 1.9867, Val Acc: 0.4717
Epoch 13/20, Train Loss: 1.2046, Val Loss: 2.0203, Val Acc: 0.4717
Epoch 14/20, Train Loss: 1.1329, Val Loss: 1.9929, Val

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import time
import math
from torch.utils.data import Dataset, DataLoader
import requests

# Download the Tiny Shakespeare corpus used as training text for this cell.
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
# Bound the wait (seconds) so a stalled connection cannot hang the notebook.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of training on an error-page body.
response.raise_for_status()
text = response.text

# Vocabulary: every distinct character of the corpus, in sorted order so the
# character <-> integer mapping is deterministic.
chars = sorted(set(text))
vocab_size = len(chars)
char_to_int = dict(zip(chars, range(vocab_size)))
int_to_char = dict(enumerate(chars))

# The whole corpus as a list of character ids.
encoded_text = list(map(char_to_int.__getitem__, text))


def create_sequences_targets(encoded_text, sequence_length):
    """Build every stride-1 training window and its next-token target.

    Row ``i`` of the first result is ``encoded_text[i:i + sequence_length]``
    and entry ``i`` of the second is ``encoded_text[i + sequence_length]``.

    Args:
        encoded_text: 1-D sequence of integer token ids.
        sequence_length: Number of tokens per input window (>= 1).

    Returns:
        ``(sequences, targets)`` as ``torch.long`` tensors of shape
        ``(N, sequence_length)`` and ``(N,)`` with
        ``N = max(len(encoded_text) - sequence_length, 0)``.

    Raises:
        ValueError: If ``sequence_length`` is smaller than 1.
    """
    if sequence_length < 1:
        raise ValueError("sequence_length must be a positive integer")

    data = torch.as_tensor(encoded_text, dtype=torch.long)
    num_samples = data.numel() - sequence_length
    if num_samples <= 0:
        # Too short for even one (window, target) pair: return correctly
        # shaped empties so downstream code sees a consistent rank.
        return (torch.empty((0, sequence_length), dtype=torch.long),
                torch.empty((0,), dtype=torch.long))

    # unfold exposes all stride-1 windows as a view, avoiding a Python loop
    # over every position. The last window has no following token, so it is
    # dropped; contiguous() materialises an independent dense tensor.
    sequences = data.unfold(0, sequence_length, 1)[:num_samples].contiguous()
    targets = data[sequence_length:].clone()
    return sequences, targets


class CharDataset(Dataset):
    """Dataset pairing each input sequence with its next-character target.

    Args:
        sequences: Indexable collection of input windows.
        targets: Indexable collection of targets, one per window.

    Raises:
        ValueError: If ``sequences`` and ``targets`` differ in length.
    """

    def __init__(self, sequences, targets):
        # A length mismatch would otherwise surface only mid-epoch as an
        # IndexError, or silently leave trailing targets unused.
        if len(sequences) != len(targets):
            raise ValueError(
                f"sequences and targets must have the same length "
                f"(got {len(sequences)} and {len(targets)})"
            )
        self.sequences = sequences
        self.targets = targets

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, index):
        """Return the ``(sequence, target)`` pair stored at ``index``."""
        return self.sequences[index], self.targets[index]


class CharModel(nn.Module):
    """Next-character classifier built from an embedding, an LSTM or GRU
    stack, and a linear output layer.

    Args:
        model_type: ``"LSTM"`` or ``"GRU"``.
        vocab_size: Number of distinct tokens; also the size of the output logits.
        embed_dim: Dimension of the token embeddings.
        hidden_size: Hidden dimension of the recurrent layer.
        num_layers: Number of stacked recurrent layers.

    Raises:
        ValueError: If ``model_type`` is neither ``"LSTM"`` nor ``"GRU"``.
    """

    def __init__(self, model_type, vocab_size, embed_dim, hidden_size, num_layers=1):
        super().__init__()
        supported = {"LSTM": nn.LSTM, "GRU": nn.GRU}

        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.model_type = model_type

        recurrent_cls = supported.get(model_type)
        if recurrent_cls is None:
            raise ValueError("model_type must be LSTM or GRU")
        self.rnn = recurrent_cls(embed_dim, hidden_size, num_layers, batch_first=True)

        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """Map a batch of token-id sequences to logits for the next token."""
        # Both layer types return (outputs, state); the state is discarded.
        outputs, _ = self.rnn(self.embedding(x))
        # batch_first=True, so index 1 is time: classify from the last step.
        return self.fc(outputs[:, -1, :])


def train_epoch(model, dataloader, criterion, optimizer, device):
    """Train ``model`` for one pass over ``dataloader`` and return the mean loss.

    Args:
        model: Network to optimise; switched to training mode.
        dataloader: Iterable of ``(inputs, targets)`` batches exposing ``.dataset``.
        criterion: Loss callable applied as ``criterion(outputs, targets)``.
        optimizer: Optimizer stepped once per batch.
        device: Device each batch is moved to before the forward pass.

    Returns:
        Sum of ``batch loss * batch size`` divided by ``len(dataloader.dataset)``.
    """
    model.train()
    weighted_loss_sum = 0.0
    for batch in dataloader:
        batch_inputs, batch_targets = (part.to(device) for part in batch)

        optimizer.zero_grad()
        batch_loss = criterion(model(batch_inputs), batch_targets)
        batch_loss.backward()
        optimizer.step()

        # Weight by batch size so a smaller final batch is not over-counted
        # (assumes the criterion returns a per-batch mean).
        weighted_loss_sum += batch_loss.item() * batch_inputs.size(0)
    return weighted_loss_sum / len(dataloader.dataset)


def evaluate_model(model, dataloader, criterion, device):
    """Compute mean loss and top-1 accuracy of ``model`` over ``dataloader``.

    Runs in evaluation mode with gradients disabled.

    Args:
        model: Network to evaluate.
        dataloader: Iterable of ``(inputs, targets)`` batches exposing ``.dataset``.
        criterion: Loss callable applied as ``criterion(outputs, targets)``.
        device: Device each batch is moved to before the forward pass.

    Returns:
        ``(avg_loss, accuracy)``: the size-weighted batch losses divided by
        ``len(dataloader.dataset)``, and the fraction of samples whose
        highest-scoring class equals the target.
    """
    model.eval()
    loss_sum = 0.0
    num_correct = 0
    num_seen = 0
    with torch.no_grad():
        for batch in dataloader:
            batch_inputs, batch_targets = (part.to(device) for part in batch)

            logits = model(batch_inputs)
            loss_sum += criterion(logits, batch_targets).item() * batch_inputs.size(0)

            # Index of the largest logit per row is the predicted class.
            predicted = logits.max(dim=1).indices
            num_correct += (predicted == batch_targets).sum().item()
            num_seen += batch_targets.size(0)
    return loss_sum / len(dataloader.dataset), num_correct / num_seen


def generate_text(model, seed_text, gen_length, device):
    """Sample ``gen_length`` characters from ``model`` as a continuation of ``seed_text``.

    Characters are encoded and decoded with the module-level ``char_to_int``
    and ``int_to_char`` tables. Each step samples from the softmax of the
    model's output, then slides the context window: the oldest character is
    dropped and the sampled one appended, so the context always has the same
    length as ``seed_text``.

    Args:
        model: Network returning one row of logits for a ``(1, T)`` batch of ids.
        seed_text: Non-empty prompt; every character must be in ``char_to_int``.
        gen_length: Number of characters to generate.
        device: Device on which the context tensor is created.

    Returns:
        ``seed_text`` followed by the generated characters.

    Raises:
        ValueError: If ``seed_text`` is empty.
        KeyError: If ``seed_text`` contains a character missing from ``char_to_int``.
    """
    if not seed_text:
        # An empty context would only fail later, deep inside the model.
        raise ValueError("seed_text must contain at least one character")

    model.eval()
    pieces = [seed_text]
    # Shape (1, len(seed_text)): a batch holding the single seed sequence.
    input_seq = torch.tensor(
        [[char_to_int[ch] for ch in seed_text]], dtype=torch.long, device=device
    )
    with torch.no_grad():
        for _ in range(gen_length):
            probs = torch.softmax(model(input_seq), dim=1)
            # Sampling from the (1, vocab) row yields a (1, 1) index tensor on
            # the model's device, so it can be appended without a host copy.
            next_idx = torch.multinomial(probs, 1)
            pieces.append(int_to_char[next_idx.item()])
            input_seq = torch.cat([input_seq[:, 1:], next_idx], dim=1)
    # Join once at the end rather than growing a string inside the loop.
    return "".join(pieces)


# Use the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# Hyperparameters shared by every run in this cell.
batch_size = 128
num_epochs = 10
learning_rate = 0.001
embed_dim = 128
hidden_size = 256
num_layers = 2

# Final-epoch metrics for each run, keyed by (model_type, seq_length).
results = {}

# Sweep over input window lengths; windows and the data split are rebuilt per
# length and then shared by both model types trained on them.
for seq_length in [20, 30, 50]:
    print(f"\n=== Sequence Length = {seq_length} ===")
    sequences, targets = create_sequences_targets(encoded_text, seq_length)
    dataset = CharDataset(sequences, targets)
    # 80/20 split over individual windows; the held-out part is used for the
    # per-epoch validation below.
    # NOTE(review): windows are taken at every position, so a random split
    # puts overlapping windows in both subsets; the split is also unseeded.
    train_size = int(len(dataset) * 0.8)
    test_size = len(dataset) - train_size
    train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])
    train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

    for model_type in ["LSTM", "GRU"]:
        print(f"\n--- Training Model: {model_type} ---")
        # A fresh model and optimizer for every (model_type, seq_length) pair.
        model = CharModel(model_type, vocab_size, embed_dim, hidden_size, num_layers=num_layers).to(device)
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)
        criterion = nn.CrossEntropyLoss()

        # The timed region covers both training and the per-epoch validation.
        start_time = time.time()
        for epoch in range(num_epochs):
            train_loss = train_epoch(model, train_loader, criterion, optimizer, device)
            val_loss, val_acc = evaluate_model(model, test_loader, criterion, device)
            # Perplexity is the exponential of the validation loss.
            perplexity = math.exp(val_loss)
            print(f"Epoch {epoch + 1:2d}/{num_epochs}, Train Loss: {train_loss:.4f}, " +
                  f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}, Perplexity: {perplexity:.2f}")
        elapsed_time = time.time() - start_time

        # Count of trainable parameters only.
        num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

        # The metric variables hold the values from the last epoch.
        results[(model_type, seq_length)] = {
            "train_loss": train_loss,
            "val_loss": val_loss,
            "val_acc": val_acc,
            "perplexity": perplexity,
            "time": elapsed_time,
            "num_params": num_params
        }
        print(
            f"{model_type} Model (Sequence Length {seq_length}) Training Time: {elapsed_time:.2f} sec, Parameters: {num_params}")

        # Qualitative check: sample 100 characters from the trained model.
        # NOTE(review): the seed is 4 characters long, while the models are
        # trained on windows of 20-50 characters.
        seed = "The "
        generated = generate_text(model, seed, 100, device)
        print("Generated Text Example:")
        print(generated)
        print("-" * 60)

# One line per run with the metrics recorded after its final epoch.
print("\n=== Experiment Results Summary ===")
for key in results:
    res = results[key]
    model_type, seq_length = key
    print(
        f"Model: {model_type:4s}, Sequence Length: {seq_length:2d}, Final Train Loss: {res['train_loss']:.4f}, "
        f"Final Val Loss: {res['val_loss']:.4f}, Val Acc: {res['val_acc']:.4f}, "
        f"Perplexity: {res['perplexity']:.2f}, Time: {res['time']:.2f}s, Params: {res['num_params']}"
    )


Device: cuda

=== Sequence Length = 20 ===

--- Training Model: LSTM ---
Epoch  1/10, Train Loss: 1.6668, Val Loss: 1.4746, Val Acc: 0.5505, Perplexity: 4.37
Epoch  2/10, Train Loss: 1.4084, Val Loss: 1.3955, Val Acc: 0.5698, Perplexity: 4.04
Epoch  3/10, Train Loss: 1.3446, Val Loss: 1.3628, Val Acc: 0.5775, Perplexity: 3.91
Epoch  4/10, Train Loss: 1.3073, Val Loss: 1.3500, Val Acc: 0.5822, Perplexity: 3.86
Epoch  5/10, Train Loss: 1.2809, Val Loss: 1.3392, Val Acc: 0.5860, Perplexity: 3.82
Epoch  6/10, Train Loss: 1.2602, Val Loss: 1.3353, Val Acc: 0.5857, Perplexity: 3.80
Epoch  7/10, Train Loss: 1.2432, Val Loss: 1.3356, Val Acc: 0.5868, Perplexity: 3.80
Epoch  8/10, Train Loss: 1.2292, Val Loss: 1.3335, Val Acc: 0.5872, Perplexity: 3.79
Epoch  9/10, Train Loss: 1.2172, Val Loss: 1.3341, Val Acc: 0.5872, Perplexity: 3.80
Epoch 10/10, Train Loss: 1.2067, Val Loss: 1.3307, Val Acc: 0.5905, Perplexity: 3.78
LSTM Model (Sequence Length 20) Training Time: 551.66 sec, Parameters: 946625