# Introduction to Deep Learning 67822 - [Ex1](https://docs.google.com/document/d/11Q1ejfwTH_tHjdQob0gYLA3bS88lNsBStpBWz085rB0/edit?tab=t.0)
### NAME1 (ID1) & NAME2 (ID2)

### Section 1: Load and Prepare the Data
#### Split training data (from the .txt files)
We are training a model to classify 9-mer peptides based on whether they are detected by the immune system via specific HLA alleles. Each positive sample is associated with one of six common alleles. The negative samples are peptides not detected by any of the alleles.

When splitting the data into training and test sets, it’s crucial to avoid introducing bias. One tempting idea is to take the first 90% of each file for training and the last 10% for testing. However, this assumes that the peptide order inside each file is random — which may not be true. The files might be sorted by binding strength, similarity, or even alphabetically, which could skew the distribution.

To prevent such biases and ensure fair training and evaluation, we use a **stratified random split per allele**:

1. We load and shuffle the peptides from each positive allele file individually.
2. We split each file into a 90% training / 10% test set.
3. We do the same for the negative examples (from `negs.txt`).
4. Finally, we combine all subsets and shuffle them again.

This approach ensures that all alleles are represented in both the training and test sets, that the overall class balance between positive and negative samples is maintained, and that no ordering bias from the original files leaks into the learning process.

In [318]:
import os
import random
from pathlib import Path

# Config
data_dir = Path("Data/HLA_Dataset")
train_ratio = 0.9
# Fixed seed: without it every notebook run produces a different train/test
# split, so the metrics reported further down cannot be reproduced.
split_seed = 42
split_rng = random.Random(split_seed)

# Define allele-to-label mapping (label 0 is used for the negative class)
allele_label_map = {
    "A0101": 1,
    "A0201": 2,
    "A0203": 3,
    "A0207": 4,
    "A0301": 5,
    "A2402": 6,
    "NEG": 0,
}


def _read_peptides(path):
    """Return the non-empty lines of *path*, stripped of surrounding whitespace."""
    with open(path, encoding="utf-8") as handle:
        return [line.strip() for line in handle if line.strip()]


def _shuffled_split(items, ratio, rng):
    """Shuffle *items* in place with *rng* and cut them at ``int(len(items) * ratio)``.

    Returns:
        A ``(head, tail)`` pair of lists: the first ``ratio`` share of the
        shuffled items and the remainder.
    """
    rng.shuffle(items)
    cut = int(len(items) * ratio)
    return items[:cut], items[cut:]


# Locate all positive allele files. sorted() because glob order depends on the
# filesystem; an unstable file order would defeat the fixed seed above.
allele_files = sorted(f for f in data_dir.glob("*.txt") if "neg" not in f.name)

# Store train/test samples as (peptide, allele, label) tuples
pos_train, pos_test = [], []

# Process each allele file separately so every allele gets its own 90/10 split
for file in allele_files:
    allele = file.stem.replace("_pos", "")
    # A KeyError here means data_dir holds a .txt file that is not in allele_label_map
    label = allele_label_map[allele]
    peptides = _read_peptides(file)
    train_part, test_part = _shuffled_split(peptides, train_ratio, split_rng)
    pos_train += [(pep, allele, label) for pep in train_part]
    pos_test += [(pep, allele, label) for pep in test_part]

# Process negatives with the same shuffle-then-split procedure
neg_peptides = _read_peptides(data_dir / "negs.txt")
neg_train_part, neg_test_part = _shuffled_split(neg_peptides, train_ratio, split_rng)
neg_train = [(pep, "NEG", allele_label_map["NEG"]) for pep in neg_train_part]
neg_test = [(pep, "NEG", allele_label_map["NEG"]) for pep in neg_test_part]

# Combine and shuffle so positives and negatives are interleaved
train_data = pos_train + neg_train
test_data = pos_test + neg_test
split_rng.shuffle(train_data)
split_rng.shuffle(test_data)

# Print final stats
from collections import Counter

def print_dist(data, name, label_map=None):
    """Print the number and percentage of samples per class label in *data*.

    Args:
        data: Iterable of ``(peptide, allele, label)`` tuples; only the label
            (third element) is read.
        name: Display name of the split, used in the header line.
        label_map: Mapping from allele name to integer label, used to show a
            readable name next to each label. Defaults to the module-level
            ``allele_label_map``.

    Labels that have no entry in the mapping are printed with the name ``?``
    instead of aborting the report.
    """
    if label_map is None:
        label_map = allele_label_map

    # Invert the mapping once instead of scanning it for every label.
    # setdefault keeps the first allele name if two names share a label.
    label_names = {}
    for allele_name, lbl in label_map.items():
        label_names.setdefault(lbl, allele_name)

    counter = Counter(label for _, _, label in data)
    total = sum(counter.values())
    print(f"\n{name} set distribution:")
    for lbl in sorted(counter):
        allele = label_names.get(lbl, "?")
        count = counter[lbl]
        print(f"  {allele:6s} (label {lbl}): {count} samples ({100 * count/total:.2f}%)")

# Report each split's size as a share of all samples, then the per-class breakdown.
# The train line previously ended in a stray ")" that showed up in the output.
total_samples = len(train_data) + len(test_data)
print(f"Train set size: {len(train_data)} ({len(train_data) / total_samples * 100:.2f}%)")
print(f"Test set size: {len(test_data)} ({len(test_data) / total_samples * 100:.2f}%)")
print_dist(train_data, "Train")
print_dist(test_data, "Test")

Train set size: 33642 (89.99%))
Test set size: 3741 (10.01%)

Train set distribution:
  NEG    (label 0): 22042 samples (65.52%)
  A0101  (label 1): 1142 samples (3.39%)
  A0201  (label 2): 2352 samples (6.99%)
  A0203  (label 3): 1645 samples (4.89%)
  A0207  (label 4): 2982 samples (8.86%)
  A0301  (label 5): 1493 samples (4.44%)
  A2402  (label 6): 1986 samples (5.90%)

Test set distribution:
  NEG    (label 0): 2450 samples (65.49%)
  A0101  (label 1): 127 samples (3.39%)
  A0201  (label 2): 262 samples (7.00%)
  A0203  (label 3): 183 samples (4.89%)
  A0207  (label 4): 332 samples (8.87%)
  A0301  (label 5): 166 samples (4.44%)
  A2402  (label 6): 221 samples (5.91%)


### Section 2 – Peptide Representation

#### How would you represent these 9-mers of amino acids?

We considered two approaches:

**1. One-hot encoding:**  
Each amino acid is represented as a 20-dimensional one-hot vector. For a 9-mer peptide, this would require 180 input features. While this is straightforward, it’s sparse and does not capture biological similarities between amino acids.

**2. Embedding (used):**  
Instead, we map each amino acid to a dense embedding vector of size `d` (e.g., 4 or 8). This allows the model to learn meaningful representations during training, such as that hydrophobic or acidic amino acids may behave similarly.

Each peptide is converted to 9 indices (integers from 0–19), then embedded to get a `9 × d` matrix, which is then flattened for input to an MLP.

#### How would you represent the associated alleles?

Each positive sample comes from a known allele, and each negative sample is from none. We label:
- `0` → NEG (non-detecting)
- `1–6` → Alleles A0101 to A2402

This forms a 7-class multi-class classification problem.


In [319]:
# One-letter amino-acid codes; a residue's position in this string is its index
AMINO_ACIDS = 'ACDEFGHIKLMNPQRSTVWY'
aa_to_idx = dict(zip(AMINO_ACIDS, range(len(AMINO_ACIDS))))


def peptide_to_indices(peptide):
    """Translate a peptide string into the list of its residues' integer indices.

    Every character is looked up in ``aa_to_idx``; a character that is not in
    the table raises ``KeyError``.
    """
    return [aa_to_idx[residue] for residue in peptide]

import torch

# Encode each peptide as a list of residue indices and collect its label,
# building the torch.long feature/label tensors for both splits directly.
X_train = torch.tensor(
    [peptide_to_indices(peptide) for peptide, _, _ in train_data],
    dtype=torch.long,
)
y_train = torch.tensor([label for _, _, label in train_data], dtype=torch.long)

X_test = torch.tensor(
    [peptide_to_indices(peptide) for peptide, _, _ in test_data],
    dtype=torch.long,
)
y_test = torch.tensor([label for _, _, label in test_data], dtype=torch.long)

# Report the shape of every tensor
for tensor_name, tensor in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print(f"{tensor_name} shape:", tensor.shape)

# Show the first training sample as a sanity check
print("\nExample input (peptide indices):", X_train[0])
print("Corresponding label (allele class):", y_train[0])

X_train shape: torch.Size([33642, 9])
y_train shape: torch.Size([33642])
X_test shape: torch.Size([3741, 9])
y_test shape: torch.Size([3741])

Example input (peptide indices): tensor([17, 19, 12,  0, 15,  8, 10,  4, 12])
Corresponding label (allele class): tensor(6)


### Section 3 – Network Architecture

#### What will the network’s input dimension be?
With embeddings of size `d` and peptides of length 9, the input dimension is `9 × d`.  
For example, using `d = 4`, the input to the MLP is of size 36.

#### Implement an MLP that keeps this dimension for 2 inner layers
We construct a small feedforward neural network (MLP) with the following layers:
- **Embedding layer:** Maps 20 amino acid types to `d`-dimensional learnable vectors.
- **Flatten layer:** Concatenates the 9 embedded amino acids into a single vector of size `9 × d`.
- **Two hidden layers:** Fully connected, both using the same dimension (`9 × d`) with ReLU activations.
- **Output layer:** A linear layer with 7 outputs, representing the 7 classification labels (6 alleles + NEG).

We use `CrossEntropyLoss` as our loss function, and the `Adam` optimizer. During training, we track both training and test loss.

#### Does the input dimension cause training problems?
In our setup, each amino acid is embedded into a small vector (e.g., 4D), so a peptide of length 9 becomes a 36D input vector (`9 × 4`), and the hidden layers also use this dimension.

This is a relatively low-dimensional space (especially compared to one-hot encoding with 180 features). The network trains quickly and converges within a few epochs. No numerical instability or overfitting is observed.

**Conclusion:** The embedding-based representation allows the model to learn efficiently without overfitting or struggling with too high-dimensional sparse inputs.

#### Architecture Overview
We use a Multi-Layer Perceptron (MLP) that receives a 9-mer peptide encoded as indices of amino acids (integers from 0 to 19).
- We pass the input through an `nn.Embedding` layer that maps each amino acid to a learnable dense vector of dimension `d` (e.g., 4 or 8).
- The resulting tensor of shape `[batch_size, 9, d]` is flattened into `[batch_size, 9 × d]`.
- This is passed through two fully connected layers of the same size, with ReLU activations.
- The final layer outputs 7 logits corresponding to 7 classes (6 alleles + negative class).
- We apply `CrossEntropyLoss`, which internally applies log-softmax followed by the negative log-likelihood loss.

We also track training and test loss and accuracy over multiple epochs.


#### Defining the MLP Model

In [320]:
import torch
import torch.nn as nn

# Configurable constants (used as the default sizes of PeptideClassifier)
PEPTIDE_LENGTH = 9
NUM_AMINO_ACIDS = 20
NUM_CLASSES = 7


class PeptideClassifier(nn.Module):
    """
    Feedforward classifier for peptide sequences using fixed-width hidden layers.
    - Embedding layer: (num_amino_acids → emb_dim)
    - Flattened input of size peptide_length * emb_dim
    - Two hidden layers of that same dimension, each followed by ReLU
    - Output layer: num_classes logits

    With the default arguments this is a classifier for 9-mers over 20 amino
    acids with 7 output classes.
    """

    def __init__(self, emb_dim=4, peptide_length=PEPTIDE_LENGTH,
                 num_amino_acids=NUM_AMINO_ACIDS, num_classes=NUM_CLASSES):
        """
        Initializes the model layers.
        Args:
            emb_dim (int): Size of the embedding vector for each amino acid.
            peptide_length (int): Number of residues per input peptide.
            num_amino_acids (int): Number of distinct residue indices
                (rows of the embedding table).
            num_classes (int): Number of output logits.
        """
        super().__init__()

        self.embedding = nn.Embedding(num_amino_acids, emb_dim)
        input_dim = peptide_length * emb_dim

        # Layer order and attribute names are kept as-is so the parameter names
        # in state_dict() (embedding.*, model.1.*, model.3.*, model.5.*) stay stable.
        self.model = nn.Sequential(
            nn.Flatten(),
            nn.Linear(input_dim, input_dim),
            nn.ReLU(),
            nn.Linear(input_dim, input_dim),
            nn.ReLU(),
            nn.Linear(input_dim, num_classes),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Forward pass of the model.
        Args:
            x (Tensor): Integer residue indices of shape (B, peptide_length)
        Returns:
            Tensor: Class logits of shape (B, num_classes)
        """
        x = self.embedding(x)  # (B, peptide_length, emb_dim)
        return self.model(x)   # (B, num_classes)


#### Loss & Optimization 

In [321]:
# Imports
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

# Instantiate the classifier (PeptideClassifier must already be defined in this session)
model = PeptideClassifier(emb_dim=4)

# Hyperparameters
EPOCHS, BATCH_SIZE, LEARNING_RATE = 30, 256, 0.001

# Wrap the tensors as datasets and batch them; only the training loader shuffles
train_dataset, test_dataset = (
    TensorDataset(X_train, y_train),
    TensorDataset(X_test, y_test),
)
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Inverse-frequency class weights, normalised to sum to 1.
# NOTE(review): these weights are computed but not passed to the loss below,
# so training uses an unweighted cross-entropy — confirm that this is intended.
class_counts = torch.bincount(y_train)
class_weights = 1.0 / (class_counts.to(torch.float32) + 1e-6)
class_weights /= class_weights.sum()

# Loss: plain cross-entropy over the class logits
loss_fn = nn.CrossEntropyLoss()

# Optimizer: Adam over all model parameters
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

# Sanity check
print("Model, optimizer, and loss function initialized!")

Model, optimizer, and loss function initialized!


#### Training Loop

In [322]:
import matplotlib.pyplot as plt

# Per-epoch history (read later when plotting)
train_losses, test_losses, accuracies = [], [], []

# Dataset sizes do not change during training, so look them up once
num_train_samples = len(train_loader.dataset)
num_test_samples = len(test_loader.dataset)

for epoch in range(EPOCHS):
    # ---- Optimisation pass over the training set ----
    model.train()
    running_loss = 0.0

    for batch_x, batch_y in train_loader:
        logits = model(batch_x)
        loss = loss_fn(logits, batch_y)

        optimizer.zero_grad()  # clear gradients left over from the previous step
        loss.backward()
        optimizer.step()

        # Scale the batch loss by the batch size so the epoch figure is a per-sample average
        running_loss += loss.item() * batch_x.shape[0]

    epoch_train_loss = running_loss / num_train_samples
    train_losses.append(epoch_train_loss)

    # ---- Evaluation pass over the test set (no gradient tracking) ----
    model.eval()
    test_loss, correct, total = 0.0, 0, 0

    with torch.no_grad():
        for batch_x, batch_y in test_loader:
            logits = model(batch_x)
            loss = loss_fn(logits, batch_y)
            test_loss += loss.item() * batch_x.shape[0]

            # Predicted class = index of the largest logit
            preds = torch.argmax(logits, dim=1)
            correct += preds.eq(batch_y).sum().item()
            total += batch_y.shape[0]

    epoch_test_loss = test_loss / num_test_samples
    epoch_accuracy = 100 * correct / total
    test_losses.append(epoch_test_loss)
    accuracies.append(epoch_accuracy)

    # One summary line per epoch
    print(" | ".join((
        f"Epoch {epoch+1:2d}/{EPOCHS}",
        f"Train Loss: {epoch_train_loss:.4f}",
        f"Test Loss: {epoch_test_loss:.4f}",
        f"Accuracy: {epoch_accuracy:.2f}%",
    )))

Epoch  1/30 | Train Loss: 1.3667 | Test Loss: 1.1748 | Accuracy: 65.49%
Epoch  2/30 | Train Loss: 1.0803 | Test Loss: 0.9615 | Accuracy: 65.20%
Epoch  3/30 | Train Loss: 0.9044 | Test Loss: 0.8625 | Accuracy: 66.11%
Epoch  4/30 | Train Loss: 0.8396 | Test Loss: 0.8206 | Accuracy: 66.11%
Epoch  5/30 | Train Loss: 0.8030 | Test Loss: 0.7935 | Accuracy: 67.39%
Epoch  6/30 | Train Loss: 0.7784 | Test Loss: 0.7763 | Accuracy: 68.08%
Epoch  7/30 | Train Loss: 0.7598 | Test Loss: 0.7629 | Accuracy: 68.43%
Epoch  8/30 | Train Loss: 0.7463 | Test Loss: 0.7522 | Accuracy: 68.83%
Epoch  9/30 | Train Loss: 0.7348 | Test Loss: 0.7424 | Accuracy: 69.10%
Epoch 10/30 | Train Loss: 0.7251 | Test Loss: 0.7381 | Accuracy: 69.79%
Epoch 11/30 | Train Loss: 0.7175 | Test Loss: 0.7327 | Accuracy: 69.95%
Epoch 12/30 | Train Loss: 0.7112 | Test Loss: 0.7238 | Accuracy: 70.73%
Epoch 13/30 | Train Loss: 0.7052 | Test Loss: 0.7219 | Accuracy: 70.30%
Epoch 14/30 | Train Loss: 0.7001 | Test Loss: 0.7181 | Accuracy:

####  Plot Train/Test Loss

In [323]:
import plotly.graph_objects as go
import plotly.offline as pyo
# Initialise Plotly's notebook mode before any figure is shown
pyo.init_notebook_mode(connected=True)


# X values: one per epoch; ticks at epoch 1 and at every 5th epoch
epochs = [*range(1, EPOCHS + 1)]
x_ticks = [1, *range(5, EPOCHS + 1, 5)]

# Style shared by the x and y axes of both figures
axis_style = {
    "showgrid": True,
    "gridcolor": 'gray',
    "gridwidth": 1,
    "zeroline": True,
    "zerolinewidth": 2,
    "linecolor": 'white',
    "linewidth": 2,
    "mirror": True,
}

# Figure-level layout shared by both figures (dark theme, fixed size, boxed legend)
layout_style = {
    "template": "plotly_dark",
    "plot_bgcolor": 'rgba(40, 40, 40, 1)',
    "paper_bgcolor": 'rgba(30, 30, 30, 1)',
    "width": 800,
    "height": 420,
    "margin": {"l": 40, "r": 30, "t": 40, "b": 40},
    "legend": {
        "x": 0.98,
        "y": 0.98,
        "xanchor": 'right',
        "yanchor": 'top',
        "bgcolor": 'rgba(0,0,0,0.3)',
        "bordercolor": 'white',
        "borderwidth": 1,
    },
}

# Loss Plot: the train and test curves differ only in their name and data
fig_loss = go.Figure()
for curve_name, curve_values in (("Train Loss", train_losses), ("Test Loss", test_losses)):
    fig_loss.add_trace(go.Scatter(
        x=epochs,
        y=curve_values,
        mode='lines+markers',
        name=curve_name,
        line={"width": 3},
        # Hover label shows the epoch and the loss to four decimals
        hovertext=[f"Epoch {e}: {v:.4f}" for e, v in zip(epochs, curve_values)],
        hoverinfo="text",
    ))

fig_loss.update_layout(
    title="Train vs Test Loss",
    xaxis={"title": "Epoch", "tickmode": 'array', "tickvals": x_ticks, **axis_style},
    yaxis={"title": "Loss", **axis_style},
    **layout_style,
)
pyo.iplot(fig_loss)

# Accuracy Plot: test accuracy per epoch
accuracy_trace = go.Scatter(
    x=epochs,
    y=accuracies,
    mode='lines+markers',
    name='Test Accuracy',
    line={"width": 3},
    hovertext=[f"Epoch {e}: {v:.2f}%" for e, v in zip(epochs, accuracies)],
    hoverinfo="text",
)
fig_acc = go.Figure()
fig_acc.add_trace(accuracy_trace)

# Dashed orange horizontal reference line at 70% spanning all epochs
fig_acc.add_shape(
    type='line',
    x0=1, x1=EPOCHS,
    y0=70, y1=70,
    line={"color": 'orange', "width": 2, "dash": 'dash'},
)

fig_acc.update_layout(
    title="Test Accuracy Over Time",
    xaxis={"title": "Epoch", "tickmode": 'array', "tickvals": x_ticks, **axis_style},
    yaxis={"title": "Accuracy (%)", **axis_style},
    **layout_style,
)
pyo.iplot(fig_acc)
