## Section 1: PyTorch Basics and Training Loops

### Moving the device to the gpu

In [None]:
## Model Instantiation and MPS GPU Device Setup

import torch
from torch import nn

# Device setup: MPS (for Mac), fallback to CPU
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu") #torch.backends.cuda for non mac
print(f"Using device: {device}")

# Instantiate your model
model = HandGestureCNN(num_classes=6).to(device)

### PyTorch Basics and Training Loops
* nn.Module 
* nn.Sequential


nn.Module: The base class for all neural networks in Pytorch. Ir organizes layers and parameters.

nn.Sequential: A simpler way to stack layers in order


In [None]:
# Example: Building a linear model
import torch
import torch.nn as nn

# Define a simple model
linear_model = nn.Linear(1, 1) # 1 input feature 1 output feature
print(linear_model)

# Accessing parameters
print("weight:", linear_model.weight)
print("Bias", linear_model.bias)

Linear(in_features=1, out_features=1, bias=True)
weight: Parameter containing:
tensor([[-0.5132]], requires_grad=True)
Bias Parameter containing:
tensor([0.8347], requires_grad=True)


### Optimizer and Loss Functions
* Optimizers: Update model parameters based on gradients(e.g., SGD, ADAM)
* Loss Functions: Measure how far predictions are from the target (e.g., MSELoss, CrossEntropyLoss)

In [3]:
# Example: Defining Optimizer and Loss functions

import torch.optim as optim

# Define optimizer and loss function
optimizer = optim.SGD(linear_model.parameters(), lr=0.01)
loss_fn = nn.MSELoss()

### Trainig loops 

* Steps:
    - 1. Forward pass: Compute predictions.
    - 2. Compute loss: Compare predictions to adjust weights.
    - 3. Backward pass: Compute gradients.
    - 4. Update paramenters: Use optimizer to adjust weights

In [None]:
# Example: Training Loop 

# sample data
x_train = torch.tensor([[1.0],[2.0],[3.0]])
y_train = torch.tensor([[4.0],[8.0],[12.0]])

# Training loop
n_epochs = 100
for epoch in  range(n_epochs):
    # Forward pass
    y_pred = linear_model(x_train)
    loss = loss_fn(y_pred, y_train)

    # Backward pass 
    optimizer.zero_grad()   #clears old gradients
    loss.backward()         #computes new gradients via back propagations

    # Update parameters
    optimizer.step()        #updates the parameters using gradients

    # Print loss evey 10 epochs
    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}") #item() is a method used in PyTorch and NumPy for tensors or arrays that contain only one element. It is used to extract that single value as a standard Python scalar.


Epoch 0, Loss: 80.6839
Epoch 10, Loss: 8.3697
Epoch 20, Loss: 1.4517
Epoch 30, Loss: 0.7622
Epoch 40, Loss: 0.6672
Epoch 50, Loss: 0.6302
Epoch 60, Loss: 0.6000
Epoch 70, Loss: 0.5718
Epoch 80, Loss: 0.5449
Epoch 90, Loss: 0.5193


## Section 2: Convolutional Neural Networks (CNNs):

### 1. What are CNNs?
* CNNs are specialized neural networks designed for processing structured grid data, such as images
* They are widely used in computer vision tasks like image classification, object detection, and segmentation


### 2. Key components of CNNs
-   1. Convolution Layers (nn.Conv2d or nn.Conv3d):
    * Extracts features from input images by applying filters (kernels).
    * Each filter detects specific patterns like edges, textures, or shapes.
    * Output: Feature maps

### Convolutional Neural Networks (CNNs)

#### Key Components:
1. **Convolutional Layers (`nn.Conv2d`)**: Extracts features from images.
2. **Activation Functions (`nn.ReLU`)**: Introduce non-Linearity.
3. **Pooling Layers (`nn.MaxPool2d`)**: Downsample feature maps. 
4. **Fully Connected Layers (`nn.Linear`)**: Combine features for predictions. 
5. **Dropout (`nn.Dropout`)**: Prevents overfitting. 

In [6]:
# Example: Simple CNN

import torch
import torch.nn as nn

# Define a simple CNN 
class SimpleCNN(nn.Module):
    def __init__(self):
        super(SimpleCNN, self).__init__()

        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1) #Input: 3 channels, output: 16 channels
        self.pool = nn.MaxPool2d(2) # Down sample by 2
        self.fc = nn.Linear(16 * 16 * 16, 10) # Fully connected layer for 10 classes

    def forward(self, x):
        x = self.pool(nn.Relu()(self.conv1(x))) # Conv -> ReLU -> Pool
        x = x.view(-1, 16 * 16 * 16) # Flatten
        x = self.fc(x) # Fully connected Layer

# Instantiate the model
model = SimpleCNN()
print(model)

SimpleCNN(
  (conv1): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc): Linear(in_features=4096, out_features=10, bias=True)
)


#### Training a CNN
- Using the same training loop as before:
1. Forward pass: Compute Prediction.
2. Compute loss: Compare prediction to ground truth
3. Backward pass: Compute gradients. 
4. Update parameters: Use optimizer to adjust the weights.

In [None]:
# Example: Training Loop for CNN

# Sample data (random tensor for demonstration)
x_train = torch.rand(8, 3, 32, 32) # Batch of 8 RGB images (3 channels, 32X32)
y_train = torch.randint(0, 10, (8,))  # Batch of 8 labels (10 classes)

# Define loss and optimizer
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# Training loop
n_epochs = 10
for epoch in range(n_epochs):
    # Forward pass
    y_pred = model(x_train)
    loss = loss_fn(y_pred, y_train)

    # Backward pass
    optimizer.zero_grad()
    loss.backward()

    # Update parameters
    optimizer.step()

    # Print loss every epoch
    print(f"Epoch {epoch + 1}, Loss: {loss.item():.4f}")

### More advanced Architecture: 
This architecture has:
* 3 convolutional blocks with increasing depth
* Dropout before the classifier to reduce overfitting
* Final fully connected layer with 6 outputs (1 per gesture class)

In [None]:
## Defining the model

import torch.nn as nn
import torch.nn.functional as F

class HandGestureCNN(nn.Module):
    def __init__(self, num_classes=6):
        super(HandGestureCNN, self).__init__()
        
        self.conv_block1 = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, padding=1), #in_channels, out_channels, kernel_size, -OPTIONALS-> stride, padding, dilation, groups, bias, padding_mode='zeroes'
            nn.BatchNorm2d(32),                         # in
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        
        self.conv_block2 = nn.Sequential(
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )

        self.conv_block3 = nn.Sequential(
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )

        # Assuming input size is 224x224 → after 3 pools (divided by 8) = 28x28
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(128 * 28 * 28, num_classes)

    def forward(self, x):
        x = self.conv_block1(x)
        x = self.conv_block2(x)
        x = self.conv_block3(x)
        x = x.view(x.size(0), -1)
        x = self.dropout(x)
        return self.fc(x)

In [None]:
## Defining the Optimizer and Loss Function

import torch.optim as optim

# Loss function for multi-class classification
#Label smoothing to cross entropy below #criterion = nn.CrossEntropyLoss()
criterion = nn.CrossEntropyLoss(label_smoothing=0.1)  # 0.1 is a good starting point, helps prevent overconfidence

# Adam optimizer with optional weight decay (L2 regularization)
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)

In [None]:
## defining scheduler after optimizer
from torch.optim.lr_scheduler import ReduceLROnPlateau

scheduler = ReduceLROnPlateau(optimizer, mode='min', patience=2, factor=0.5, verbose=True)
#This reduces LR by half if val loss doesn’t improve after 2 epochs.

In [None]:
## TRAINING LOOP !!! Includes: 
    # Process tracking for training and validation, 
    # Saving the best model to checkpoints/best_model.pth
    # Support both MPS or CPU

import time

def train_model(model, train_loader, val_loader, criterion, optimizer, device, num_epochs=10, patience=3):
    best_val_loss = float('inf')
    no_improve_epochs = 0 #logic to automatically stop when overfitting while not watching the training, patience defaulted to 3
    history = {
        'train_loss': [],
        'val_loss': [],
        'val_acc': []
    }

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        start_time = time.time()

        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item() * inputs.size(0)

        epoch_train_loss = running_loss / len(train_loader.dataset)
        history['train_loss'].append(epoch_train_loss)

        #  Evaluate on validation set
        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0

        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                val_loss += loss.item() * inputs.size(0)

                _, preds = torch.max(outputs, 1)
                correct += (preds == labels).sum().item()
                total += labels.size(0)

        epoch_val_loss = val_loss / len(val_loader.dataset)
        epoch_val_acc = correct / total

        history['val_loss'].append(epoch_val_loss)
        history['val_acc'].append(epoch_val_acc)
        
        #  Save best model & check early stopping
        if epoch_val_loss < best_val_loss:
            best_val_loss = epoch_val_loss
            no_improve_epochs = 0

            torch.save({
                'model_state': model.state_dict(),
                'optimizer_state': optimizer.state_dict()
            }, 'checkpoints/full_checkpoint.pth')

            torch.save(model.state_dict(), "checkpoints/best_model.pth")
            print(f"✅ Saved new best model at epoch {epoch+1}")

        else:
            no_improve_epochs += 1
            print(f"⏳ No improvement for {no_improve_epochs} epoch(s)")

        #  Print epoch summary
        duration = time.time() - start_time
        print(f"Epoch {epoch+1}/{num_epochs} | "
              f"Train Loss: {epoch_train_loss:.4f} | "
              f"Val Loss: {epoch_val_loss:.4f} | "
              f"Val Acc: {epoch_val_acc*100:.2f}% | "
              f"Time: {duration:.1f}s")
        
        scheduler.step(epoch_val_loss)## step the learning rate rescheduler
#
        if no_improve_epochs >= patience:
            print(f"🛑 Early stopping triggered after {epoch+1} epochs.")
            print(f" Best model was at val loss: {best_val_loss:.4f}")
            break

    return history



### Train the model

In [None]:
num_epochs = 20  # todo: mess around with different number of epochs
history = train_model(model, train_loader, val_loader, criterion, optimizer, device, num_epochs)

### Section 3:  Recurrent Neural Networks (RNNs)

1. What are RNNs?
2. Key components of RNNs:
    - Hidden states
    - Sequence processing
3. Building an RNN in PyTorch
4. Training and evaluating an RNN

### Recurrent Neural Networks (RNNs)

#### What are RNNs?
- RNNs are a type of neural network designed for sequential data.
- They process input sequences one step at a time, maintaining a **hidden state** that captures information about previous steps.
- Common applications include:
  - Time-series forecasting
  - Natural Language Processing (NLP)
  - Speech recognition

#### Key Components of RNNs:
1. **Hidden States**:
   - RNNs maintain a hidden state that is updated at each time step.
   - This allows them to capture temporal dependencies in the data.

2. **Sequence Processing**:
   - RNNs process sequences element by element, making them suitable for tasks where order matters.

3. **Variants of RNNs**:
   - **Vanilla RNN**: Basic RNN structure.
   - **LSTM (Long Short-Term Memory)**: Handles long-term dependencies better by using gates.
   - **GRU (Gated Recurrent Unit)**: A simplified version of LSTM with fewer parameters.

In [7]:
# Example: Simple RNN in PyTorch

import torch
import torch.nn as nn

# Define a simple RNN
class SimpleRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleRNN, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        out, hidden = self.rnn(x)  # RNN layer
        out = self.fc(out[:, -1, :])  # Fully connected layer (use last time step)
        return out

# Instantiate the model
input_size = 1
hidden_size = 16
output_size = 1
model = SimpleRNN(input_size, hidden_size, output_size)
print(model)

SimpleRNN(
  (rnn): RNN(1, 16, batch_first=True)
  (fc): Linear(in_features=16, out_features=1, bias=True)
)


#### Training an RNN
- Use the same training loop structure as before:
  1. Forward pass: Compute predictions.
  2. Compute loss: Compare predictions to ground truth.
  3. Backward pass: Compute gradients.
  4. Update parameters: Use optimizer to adjust weights.

In [10]:
# Example: Training Loop for RNN

# Sample data (sine wave for demonstration)
import numpy as np
x = np.linspace(0, 2 * np.pi, 100)
y = np.sin(x)

x_train = torch.tensor(y[:-1], dtype=torch.float32).view(1, -1, 1)  # Input sequence
y_train = torch.tensor(y[1:], dtype=torch.float32).view(1, -1, 1)  # Target sequence

# Define loss and optimizer
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Training loop
n_epochs = 200
for epoch in range(n_epochs):
    # Forward pass
    y_pred = model(x_train)
    loss = loss_fn(y_pred, y_train)

    # Backward pass
    optimizer.zero_grad()
    loss.backward()

    # Update parameters
    optimizer.step()

    # Print loss every 20 epochs
    if epoch % 20 == 0:
        print(f"Epoch {epoch + 1}, Loss: {loss.item():.4f}")

  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 1, Loss: 0.5296
Epoch 21, Loss: 0.5001
Epoch 41, Loss: 0.5001
Epoch 61, Loss: 0.5000
Epoch 81, Loss: 0.5000
Epoch 101, Loss: 0.5000
Epoch 121, Loss: 0.5000
Epoch 141, Loss: 0.5000
Epoch 161, Loss: 0.5000
Epoch 181, Loss: 0.5000


#### LSTM and GRU
- **LSTM (Long Short-Term Memory)**:
  - Handles long-term dependencies using gates (input, forget, and output gates).
  - More effective for tasks like text generation and time-series forecasting.

- **GRU (Gated Recurrent Unit)**:
  - A simplified version of LSTM with fewer parameters.
  - Faster to train while still handling long-term dependencies.

#### Example: LSTM in PyTorch

In [9]:
# Example: LSTM in PyTorch

class SimpleLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleLSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        out, (hidden, cell) = self.lstm(x)  # LSTM layer
        out = self.fc(out[:, -1, :])  # Fully connected layer (use last time step)
        return out

# Instantiate the LSTM model
model = SimpleLSTM(input_size, hidden_size, output_size)
print(model)

SimpleLSTM(
  (lstm): LSTM(1, 16, batch_first=True)
  (fc): Linear(in_features=16, out_features=1, bias=True)
)
