<a href="https://colab.research.google.com/github/Hbrand03/Real-Time-Machine-Learning/blob/main/Homework_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Problem 1

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import time

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Data and Encoding
text = """ Next character prediction is a fundamental task in the field of natural language processing (NLP) that involves predicting the next character in a sequence of text based on the characters that precede it. This task is essential for various applications, including text auto-completion, spell checking, and even in the development of sophisticated AI models capable of generating human-like text.

At its core, next character prediction relies on statistical models or deep learning algorithms to analyze a given sequence of text and predict which character is most likely to follow. These predictions are based on patterns and relationships learned from large datasets of text during the training phase of the model.

One of the most popular approaches to next character prediction involves the use of Recurrent Neural Networks (RNNs), and more specifically, a variant called Long Short-Term Memory (LSTM) networks. RNNs are particularly well-suited for sequential data like text, as they can maintain information in 'memory' about previous characters to inform the prediction of the next character. LSTM networks enhance this capability by being able to remember long-term dependencies, making them even more effective for next character prediction tasks.

Training a model for next character prediction involves feeding it large amounts of text data, allowing it to learn the probability of each character's appearance following a sequence of characters. During this training process, the model adjusts its parameters to minimize the difference between its predictions and the actual outcomes, thus improving its predictive accuracy over time.

Once trained, the model can be used to predict the next character in a given piece of text by considering the sequence of characters that precede it. This can enhance user experience in text editing software, improve efficiency in coding environments with auto-completion features, and enable more natural interactions with AI-based chatbots and virtual assistants.

In summary, next character prediction plays a crucial role in enhancing the capabilities of various NLP applications, making text-based interactions more efficient, accurate, and human-like. Through the use of advanced machine learning models like RNNs and LSTMs, next character prediction continues to evolve, opening new possibilities for the future of text-based technology. """

chars = sorted(list(set(text)))
vocab_size = len(chars)
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for ch, i in stoi.items()}

def encode(s): return [stoi[c] for c in s]
def decode(l): return ''.join([itos[i] for i in l])

data = torch.tensor(encode(text), dtype=torch.long)

# Create batches
def get_batches(data, seq_len, batch_size):
    X, Y = [], []
    for i in range(0, len(data) - seq_len - 1, seq_len):
        X.append(data[i:i+seq_len])
        Y.append(data[i+1:i+seq_len+1])
    X, Y = torch.stack(X), torch.stack(Y)
    return DataLoader(TensorDataset(X, Y), batch_size=batch_size)

# Model Definitions
class TransformerModel(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, num_heads=4, num_layers=2):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)
        encoder_layer = nn.TransformerEncoderLayer(embed_dim, num_heads)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers)
        self.fc = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        x = self.embed(x)
        x = x.permute(1, 0, 2)
        x = self.transformer(x)
        x = x.permute(1, 0, 2)
        return self.fc(x)

# Training Function
def train_model(model, dataloader, vocab_size, epochs=10):
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()
    model.train()
    start = time.time()
    for epoch in range(epochs):
        total_loss = 0
        for x, y in dataloader:
            x, y = x.to(device), y.to(device)
            output = model(x)
            loss = criterion(output.view(-1, vocab_size), y.view(-1))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
    end = time.time()
    return total_loss / len(dataloader), end - start

# Example run for one sequence length
seq_len = 10
batch_size = 32
dataloader = get_batches(data, seq_len, batch_size)
model = TransformerModel(vocab_size)
loss, duration = train_model(model, dataloader, vocab_size)
print(f"Transformer (seq_len={seq_len}): Loss={loss:.4f}, Time={duration:.2f}s")

seq_len = 20
batch_size = 32
dataloader = get_batches(data, seq_len, batch_size)
model = TransformerModel(vocab_size)
loss, duration = train_model(model, dataloader, vocab_size)
print(f"Transformer (seq_len={seq_len}): Loss={loss:.4f}, Time={duration:.2f}s")

seq_len = 30
batch_size = 32
dataloader = get_batches(data, seq_len, batch_size)
model = TransformerModel(vocab_size)
loss, duration = train_model(model, dataloader, vocab_size)
print(f"Transformer (seq_len={seq_len}): Loss={loss:.4f}, Time={duration:.2f}s")




Transformer (seq_len=10): Loss=1.3527, Time=28.05s




Transformer (seq_len=20): Loss=2.0190, Time=16.70s
Transformer (seq_len=30): Loss=2.2289, Time=13.29s


# Problem 2

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import requests
import time
import math

# Download Tiny Shakespeare
url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
text = requests.get(url).text

# Create character vocabulary
chars = sorted(list(set(text)))
vocab_size = len(chars)
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for ch, i in stoi.items()}

def encode(s): return [stoi[c] for c in s]
def decode(l): return ''.join([itos[i] for i in l])

# Hyperparams
device = 'cuda' if torch.cuda.is_available() else 'cpu'
block_size = 30
batch_size = 64
train_ratio = 0.9

# Prepare dataset
data = torch.tensor(encode(text), dtype=torch.long)
n = int(train_ratio * len(data))
train_data = data[:n]
val_data = data[n:]

def get_batch(split):
    d = train_data if split == 'train' else val_data
    ix = torch.randint(len(d) - block_size, (batch_size,))
    x = torch.stack([d[i:i+block_size] for i in ix])
    y = torch.stack([d[i+1:i+block_size+1] for i in ix])
    return x.to(device), y.to(device)

# Transformer Block
class TransformerBlock(nn.Module):
    def __init__(self, embed_dim, n_heads, dropout):
        super().__init__()
        self.attn = nn.MultiheadAttention(embed_dim, n_heads, dropout=dropout, batch_first=True)
        self.ln1 = nn.LayerNorm(embed_dim)
        self.ff = nn.Sequential(
            nn.Linear(embed_dim, 4 * embed_dim),
            nn.ReLU(),
            nn.Linear(4 * embed_dim, embed_dim),
        )
        self.ln2 = nn.LayerNorm(embed_dim)

    def forward(self, x):
        attn_out, _ = self.attn(x, x, x)
        x = self.ln1(x + attn_out)
        ff_out = self.ff(x)
        return self.ln2(x + ff_out)

# Full Transformer Model
class TransformerModel(nn.Module):
    def __init__(self, num_layers, n_heads, block_size, vocab_size, embed_dim=128, dropout=0.1):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, embed_dim)
        self.pos_embedding = nn.Parameter(torch.zeros(1, block_size, embed_dim))
        self.blocks = nn.Sequential(*[
            TransformerBlock(embed_dim, n_heads, dropout) for _ in range(num_layers)
        ])
        self.ln_f = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        token_emb = self.token_embedding(x)
        x = token_emb + self.pos_embedding[:, :x.size(1), :]
        x = self.blocks(x)
        x = self.ln_f(x)
        return self.head(x)

# Train/eval function
def train_and_evaluate(num_layers, n_heads, block_size):
    model = TransformerModel(num_layers=num_layers, n_heads=n_heads, block_size=block_size, vocab_size=vocab_size).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
    loss_fn = nn.CrossEntropyLoss()

    print(f"\nTraining Transformer (layers={num_layers}, heads={n_heads}, seq_len={block_size})")

    start_time = time.time()
    for step in range(200):  # You can increase this for better results
        model.train()
        xb, yb = get_batch('train')
        logits = model(xb)
        loss = loss_fn(logits.view(-1, vocab_size), yb.view(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if step % 50 == 0 or step == 199:
            model.eval()
            with torch.no_grad():
                xval, yval = get_batch('val')
                val_logits = model(xval)
                val_loss = loss_fn(val_logits.view(-1, vocab_size), yval.view(-1))
            print(f"Step {step:03d}: Train Loss={loss.item():.4f}, Val Loss={val_loss.item():.4f}")

    end_time = time.time()
    print(f"Total Time: {end_time - start_time:.2f}s")
    print(f"Model Params: {sum(p.numel() for p in model.parameters())/1e6:.2f}M")

# Run experiments
configs = [
    (1, 2), (2, 2), (4, 2),
    (1, 4), (2, 4), (4, 4),
]

# Run all configs for block_size 20 and 30
for block_size in [20, 30]:
    for layers, heads in configs:
        train_and_evaluate(num_layers=layers, n_heads=heads, block_size=block_size)

# Optional: test seq_len = 50
print("\nTesting longer context window (seq_len = 50)...")
train_and_evaluate(num_layers=2, n_heads=2, block_size=50)



Training Transformer (layers=1, heads=2, seq_len=20)
Step 000: Train Loss=4.2916, Val Loss=4.0617
Step 050: Train Loss=2.4438, Val Loss=2.4395
Step 100: Train Loss=2.2504, Val Loss=2.3222
Step 150: Train Loss=2.1705, Val Loss=2.2118
Step 199: Train Loss=2.0587, Val Loss=2.0705
Total Time: 48.08s
Model Params: 0.22M

Training Transformer (layers=2, heads=2, seq_len=20)
Step 000: Train Loss=4.3453, Val Loss=3.9056
Step 050: Train Loss=2.3458, Val Loss=2.3323
Step 100: Train Loss=2.1345, Val Loss=2.1130
Step 150: Train Loss=2.0233, Val Loss=1.9953
Step 199: Train Loss=1.9610, Val Loss=1.9441
Total Time: 75.40s
Model Params: 0.42M

Training Transformer (layers=4, heads=2, seq_len=20)
Step 000: Train Loss=4.4531, Val Loss=3.7734
Step 050: Train Loss=2.3228, Val Loss=2.2603
Step 100: Train Loss=2.1233, Val Loss=2.0710
Step 150: Train Loss=1.9928, Val Loss=1.9939
Step 199: Train Loss=1.9578, Val Loss=1.8290
Total Time: 135.29s
Model Params: 0.81M

Training Transformer (layers=1, heads=4, seq

# Problem 3

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
import random

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Dataset
english_to_french = [
    ("I am cold", "J'ai froid"),
    ("You are tired", "Tu es fatigué"),
    ("He is hungry", "Il a faim"),
    ("She is happy", "Elle est heureuse"),
    ("We are friends", "Nous sommes amis"),
    ("They are students", "Ils sont étudiants"),
    ("The cat is sleeping", "Le chat dort"),
    ("The sun is shining", "Le soleil brille"),
    ("We love music", "Nous aimons la musique"),
    ("She speaks French fluently", "Elle parle français couramment"),
    ("He enjoys reading books", "Il aime lire des livres"),
    ("They play soccer every weekend", "Ils jouent au football chaque week-end"),
    ("The movie starts at 7 PM", "Le film commence à 19 heures"),
    ("She wears a red dress", "Elle porte une robe rouge"),
    ("We cook dinner together", "Nous cuisinons le dîner ensemble"),
]

# Build char vocab
all_text = "".join(e + f for e, f in english_to_french)
chars = sorted(set(all_text))
char2idx = {ch: idx+1 for idx, ch in enumerate(chars)}  # 0 = padding
char2idx["<pad>"] = 0
idx2char = {idx: ch for ch, idx in char2idx.items()}

def encode(text, max_len):
    return [char2idx[c] for c in text.ljust(max_len)]

def decode(indices):
    return "".join([idx2char[i] for i in indices if i != 0])

# Dataset class
class Seq2SeqDataset(Dataset):
    def __init__(self, data):
        self.pairs = data
        self.max_src = max(len(e) for e, _ in data)
        self.max_tgt = max(len(f) for _, f in data) + 2  # + <sos> and <eos>

    def __len__(self): return len(self.pairs)

    def __getitem__(self, idx):
        src, tgt = self.pairs[idx]
        src_enc = encode(src, self.max_src)
        tgt_enc = [char2idx["<pad>"]] + encode(tgt, self.max_tgt - 2) + [char2idx["<pad>"]]
        return torch.tensor(src_enc), torch.tensor(tgt_enc)

# Positional Encoding
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=100):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        pos = torch.arange(0, max_len).unsqueeze(1)
        div = torch.exp(torch.arange(0, d_model, 2) * -(torch.log(torch.tensor(10000.0)) / d_model))
        pe[:, 0::2] = torch.sin(pos * div)
        pe[:, 1::2] = torch.cos(pos * div)
        self.pe = pe.unsqueeze(0).to(device)

    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

# Transformer Model
class TransformerModel(nn.Module):
    def __init__(self, vocab_size, embed_size, num_layers, num_heads):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.pos_encoder = PositionalEncoding(embed_size)
        self.transformer = nn.Transformer(
            d_model=embed_size,
            nhead=num_heads,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dim_feedforward=512,
            dropout=0.1,
            batch_first=True
        )
        self.fc_out = nn.Linear(embed_size, vocab_size)

    def forward(self, src, tgt):
        src = src.to(device)
        tgt = tgt.to(device)
        src_mask = self.transformer.generate_square_subsequent_mask(src.size(1)).to(device)
        tgt_mask = self.transformer.generate_square_subsequent_mask(tgt.size(1)).to(device)

        src_emb = self.embedding(src)
        tgt_emb = self.embedding(tgt)
        src_emb = self.pos_encoder(src_emb)
        tgt_emb = self.pos_encoder(tgt_emb)

        output = self.transformer(src_emb, tgt_emb, src_mask=src_mask, tgt_mask=tgt_mask)
        return self.fc_out(output)

# Training + Validation
def train_model(model, train_loader, val_loader, num_epochs=20):
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    for epoch in range(num_epochs):
        model.train()
        for src, tgt in train_loader:
            optimizer.zero_grad()
            tgt_input = tgt[:, :-1].to(device)
            tgt_output = tgt[:, 1:].to(device)
            src = src.to(device)

            output = model(src, tgt_input)
            loss = criterion(output.reshape(-1, vocab_size), tgt_output.reshape(-1))
            loss.backward()
            optimizer.step()

        # Validation
        model.eval()
        val_loss = 0
        correct = 0
        total = 0
        with torch.no_grad():
            for src, tgt in val_loader:
                tgt_input = tgt[:, :-1].to(device)
                tgt_output = tgt[:, 1:].to(device)
                src = src.to(device)

                output = model(src, tgt_input)
                loss = criterion(output.reshape(-1, vocab_size), tgt_output.reshape(-1))
                val_loss += loss.item()

                preds = output.argmax(dim=-1)
                correct += (preds == tgt_output).sum().item()
                total += (tgt_output != 0).sum().item()

        print(f"Epoch {epoch}: Val Loss = {val_loss:.4f}, Val Accuracy = {correct/total:.2f}")

# Run model for different configurations
vocab_size = len(char2idx)
dataset = Seq2SeqDataset(english_to_french)
train_data, val_data = random_split(dataset, [12, 3])
train_loader = DataLoader(train_data, batch_size=4, shuffle=True)
val_loader = DataLoader(val_data, batch_size=4)

configs = [(l, h) for l in [1, 2, 4] for h in [2, 4]]
for num_layers, num_heads in configs:
    print(f"\nTraining Transformer: Layers={num_layers}, Heads={num_heads}")
    model = TransformerModel(vocab_size=vocab_size, embed_size=128, num_layers=num_layers, num_heads=num_heads).to(device)
    train_model(model, train_loader, val_loader, num_epochs=10)

    # Qualitative testing
    model.eval()
    print("Sample translations:")
    for i in range(3):
        src, _ = random.choice(english_to_french)
        src_enc = torch.tensor(encode(src, dataset.max_src)).unsqueeze(0).to(device)
        tgt = torch.tensor([char2idx["<pad>"]]).unsqueeze(0).to(device)
        for _ in range(30):
            output = model(src_enc, tgt)
            next_char = output[0, -1].argmax().item()
            tgt = torch.cat([tgt, torch.tensor([[next_char]]).to(device)], dim=1)
            if next_char == char2idx["<pad>"]:
                break
        print(f"ENG: {src}")
        print(f"FR : {decode(tgt.squeeze().tolist())}\n")


# Problem 4

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from torch.utils.data import DataLoader, Dataset
from transformers import Transformer

# Assume the data is in the form of sentence pairs for French (input) and English (output).
# We will preprocess this dataset accordingly.

# Step 1: Dataset Preprocessing
class TranslationDataset(Dataset):
    def __init__(self, input_data, target_data, input_tokenizer, target_tokenizer):
        self.input_data = input_data
        self.target_data = target_data
        self.input_tokenizer = input_tokenizer
        self.target_tokenizer = target_tokenizer

    def __len__(self):
        return len(self.input_data)

    def __getitem__(self, idx):
        input_sentence = self.input_data[idx]
        target_sentence = self.target_data[idx]

        # Tokenize the sentences
        input_tokens = self.input_tokenizer(input_sentence, padding='max_length', truncation=True, return_tensors='pt')
        target_tokens = self.target_tokenizer(target_sentence, padding='max_length', truncation=True, return_tensors='pt')

        return {
            'input_ids': input_tokens['input_ids'].squeeze(),
            'attention_mask': input_tokens['attention_mask'].squeeze(),
            'labels': target_tokens['input_ids'].squeeze()
        }

# Assume we have the following data
input_data = ["Il aime lire des livres", "Nous sommes fatigués", "Elle est heureuse"]
target_data = ["He likes to read books", "We are tired", "She is happy"]

# Load pre-trained tokenizers (e.g., from Hugging Face)
from transformers import BertTokenizer
input_tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
target_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Step 2: Model Configuration

class TransformerModel(nn.Module):
    def __init__(self, num_layers, num_heads, vocab_size, d_model=512, d_ff=2048, dropout=0.1):
        super(TransformerModel, self).__init__()

        self.encoder = nn.Embedding(vocab_size, d_model)
        self.decoder = nn.Embedding(vocab_size, d_model)

        self.transformer = nn.Transformer(
            d_model=d_model,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            nhead=num_heads,
            dim_feedforward=d_ff,
            dropout=dropout
        )

        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        # Encoder and decoder
        enc_output = self.encoder(src)
        dec_output = self.decoder(tgt)

        # Transformer forward pass
        transformer_output = self.transformer(enc_output, dec_output, src_mask=src_mask, tgt_mask=tgt_mask)

        output = self.fc_out(transformer_output)
        return output

# Step 3: Training Function

def train_model(model, train_loader, optimizer, criterion, device):
    model.train()
    epoch_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        output = model(input_ids, labels, src_mask=None, tgt_mask=None)

        # Compute loss
        loss = criterion(output.view(-1, output.shape[-1]), labels.view(-1))
        loss.backward()

        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(train_loader)

# Step 4: Evaluation Function

def evaluate_model(model, val_loader, criterion, device):
    model.eval()
    epoch_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            output = model(input_ids, labels, src_mask=None, tgt_mask=None)
            loss = criterion(output.view(-1, output.shape[-1]), labels.view(-1))

            epoch_loss += loss.item()

            # Calculate accuracy
            _, predicted = output.max(dim=-1)
            correct += (predicted == labels).sum().item()
            total += labels.numel()

    accuracy = correct / total
    return epoch_loss / len(val_loader), accuracy

# Step 5: Hyperparameter Tuning

num_epochs = 10
batch_size = 32
learning_rate = 1e-4

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Loop over different configurations of layers and heads
configs = [(1, 2), (1, 4), (2, 2), (2, 4), (4, 2), (4, 4)]
for num_layers, num_heads in configs:
    print(f"Training Transformer: Layers={num_layers}, Heads={num_heads}")

    # Create model
    vocab_size = len(input_tokenizer)  # Use the size of the input tokenizer vocab
    model = TransformerModel(num_layers, num_heads, vocab_size).to(device)

    # Optimizer and criterion
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss(ignore_index=input_tokenizer.pad_token_id)

    # Prepare dataset and dataloader
    dataset = TranslationDataset(input_data, target_data, input_tokenizer, target_tokenizer)
    train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Training and evaluation loop
    for epoch in range(num_epochs):
        train_loss = train_model(model, train_loader, optimizer, criterion, device)
        val_loss, val_accuracy = evaluate_model(model, train_loader, criterion, device)

        print(f"Epoch {epoch}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}, Val Accuracy = {val_accuracy:.2f}")

    # Sample translation evaluation
    print("Sample translations:")
    sample_sentences = ["He enjoys reading books", "You are tired", "I am cold"]
    for sentence in sample_sentences:
        input_tokens = input_tokenizer(sentence, return_tensors='pt').to(device)
        with torch.no_grad():
            translated = model(input_tokens['input_ids'], input_tokens['input_ids'])
            translated_sentence = target_tokenizer.decode(translated.argmax(dim=-1).cpu().numpy().flatten())
            print(f"ENG: {sentence}")
            print(f"FR : {translated_sentence}")

