# PyTorch Intermediate Deep Learning Cheat Sheet

**Focus:** Intermediate concepts for Computer Vision and Sequential Models

**Contents:**
- Custom CNNs & Pretrained Models
- Transfer Learning & Data Augmentation
- Training Loops & Optimization
- RNNs, LSTMs, GRUs
- Attention Mechanisms
- Model Evaluation & Saving
- Performance Tips

In [2]:
# Essential Imports for Intermediate PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

# Computer Vision
import torchvision
import torchvision.transforms as transforms
from torchvision import models

# Utilities
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt

## 🖼️ Computer Vision

### Custom CNN Architectures

In [4]:
# Custom CNN with Modern Techniques
class CustomCNN(nn.Module):
    """VGG-style image classifier for 3-channel inputs.

    Three stages of (Conv3x3 -> BatchNorm -> ReLU) x 2 followed by 2x2 max
    pooling, widening 64 -> 128 -> 256 channels.  An adaptive average pool
    then fixes the spatial size at 7x7, so the dense head works for any
    input resolution.

    Args:
        num_classes: Number of output logits.
        dropout_rate: Dropout probability used in both classifier dropouts.
    """

    def __init__(self, num_classes=10, dropout_rate=0.5):
        super().__init__()

        # (in_channels, out_channels) for each stage; layers are appended in
        # the same order as a hand-written stack so state_dict keys
        # (features.0, features.1, ...) are unchanged.
        stage_channels = ((3, 64), (64, 128), (128, 256))
        stack = []
        for in_ch, out_ch in stage_channels:
            stack += self._conv_bn_relu(in_ch, out_ch)
            stack += self._conv_bn_relu(out_ch, out_ch)
            stack.append(nn.MaxPool2d(2, 2))
        self.features = nn.Sequential(*stack)

        # Fixed 7x7 output regardless of input resolution
        self.adaptive_pool = nn.AdaptiveAvgPool2d((7, 7))

        # Dense head with dropout before each linear layer
        flat_features = 256 * 7 * 7
        self.classifier = nn.Sequential(
            nn.Dropout(dropout_rate),
            nn.Linear(flat_features, 512),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout_rate),
            nn.Linear(512, num_classes),
        )

    @staticmethod
    def _conv_bn_relu(in_ch, out_ch):
        """Return the [conv, batch-norm, ReLU] triple used throughout the backbone."""
        return [
            nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_ch),
            nn.ReLU(inplace=True),
        ]

    def forward(self, x):
        """Map an image batch (N, 3, H, W) to logits of shape (N, num_classes)."""
        pooled = self.adaptive_pool(self.features(x))
        flattened = pooled.view(pooled.size(0), -1)
        return self.classifier(flattened)

# Build a 10-class instance with lighter dropout and report its total size
model = CustomCNN(10, 0.3)
print(f"Model parameters: {sum(map(torch.numel, model.parameters())):,}")

Model parameters: 7,575,370


### Pretrained Models & Transfer Learning

In [5]:
# Transfer Learning with Pretrained Models

# 1. Feature Extraction (Freeze backbone)
def create_feature_extractor(num_classes, model_name='resnet18'):
    """Build a frozen ImageNet backbone with a fresh, trainable classifier head.

    Every pretrained parameter has ``requires_grad`` set to False; only the
    newly created final ``nn.Linear`` layer is trainable.

    Args:
        num_classes: Number of outputs of the replacement head.
        model_name: ``'resnet18'`` or ``'efficientnet_b0'``.

    Returns:
        The torchvision model with its head replaced.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names
            (previously this surfaced as an UnboundLocalError on return).
    """
    supported = ('resnet18', 'efficientnet_b0')
    # Validate before touching torchvision so a typo fails fast, without
    # triggering a weight download.
    if model_name not in supported:
        raise ValueError(
            f"Unsupported model_name: {model_name!r}; expected one of {supported}"
        )

    # `weights=` replaces the deprecated `pretrained=True`; IMAGENET1K_V1 is
    # the checkpoint that `pretrained=True` used to select.
    if model_name == 'resnet18':
        model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
    else:
        model = models.efficientnet_b0(
            weights=models.EfficientNet_B0_Weights.IMAGENET1K_V1
        )

    # Freeze the whole backbone
    for param in model.parameters():
        param.requires_grad = False

    # Replace the classifier; a new nn.Linear is trainable by default
    if model_name == 'resnet18':
        model.fc = nn.Linear(model.fc.in_features, num_classes)
    else:
        model.classifier[1] = nn.Linear(model.classifier[1].in_features, num_classes)

    return model

# 2. Fine-tuning (Unfreeze some layers)
def create_fine_tuned_model(num_classes, model_name='resnet18', freeze_layers=True):
    """Build a pretrained ResNet-18 prepared for partial fine-tuning.

    With ``freeze_layers=True`` every parameter outside ``layer4`` and ``fc``
    is frozen.  The original ``fc`` is then replaced by a small dropout MLP
    head, which is trainable.

    Args:
        num_classes: Number of outputs of the new head.
        model_name: Only ``'resnet18'`` is supported.
        freeze_layers: If False, the whole network stays trainable.

    Returns:
        The torchvision ResNet-18 with a custom head.

    Raises:
        ValueError: If ``model_name`` is unsupported (previously this
            surfaced as an UnboundLocalError on return).
    """
    if model_name != 'resnet18':
        raise ValueError(
            f"Unsupported model_name: {model_name!r}; expected 'resnet18'"
        )

    # `weights=` replaces the deprecated `pretrained=True`; IMAGENET1K_V1 is
    # the checkpoint that `pretrained=True` used to select.
    model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)

    if freeze_layers:
        # Keep only the last residual stage (and the head) trainable
        for name, param in model.named_parameters():
            if not name.startswith(('layer4.', 'fc.')):
                param.requires_grad = False

    # Custom classifier with dropout; read in_features before overwriting fc
    in_features = model.fc.in_features
    model.fc = nn.Sequential(
        nn.Dropout(0.5),
        nn.Linear(in_features, 256),
        nn.ReLU(),
        nn.Dropout(0.3),
        nn.Linear(256, num_classes),
    )

    return model

# Example usage: build both transfer-learning variants for a 5-class task
feature_extractor = create_feature_extractor(5, 'resnet18')
fine_tuned_model = create_fine_tuned_model(5, 'resnet18')

# Report only the parameters that will actually receive gradient updates
print(f"Trainable params in feature extractor: {sum(map(torch.numel, (p for p in feature_extractor.parameters() if p.requires_grad))):,}")
print(f"Trainable params in fine-tuned: {sum(map(torch.numel, (p for p in fine_tuned_model.parameters() if p.requires_grad))):,}")



Trainable params in feature extractor: 2,565
Trainable params in fine-tuned: 8,526,341


In [6]:
# Advanced Data Augmentation
# Training pipeline: resize, random geometric/colour perturbations, then
# tensor conversion and normalisation with ImageNet channel statistics.
train_transforms = transforms.Compose([
    transforms.Resize(size=(224, 224)),
    transforms.RandomHorizontalFlip(0.5),
    transforms.RandomRotation(15),
    transforms.ColorJitter(0.2, 0.2, 0.2, 0.1),  # brightness, contrast, saturation, hue
    transforms.RandomAffine(0, translate=(0.1, 0.1), scale=(0.9, 1.1)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),  # ImageNet mean / std
])

# Validation pipeline: deterministic, same resize and normalisation, no augmentation
val_transforms = transforms.Compose([
    transforms.Resize(size=(224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])

# Advanced Training Loop with Learning Rate Scheduling
def _run_epoch(model, loader, criterion, device, optimizer=None, desc=None):
    """Run one pass over ``loader`` and return ``(mean_loss, accuracy)`` as floats.

    When ``optimizer`` is given the pass is a training pass (gradients
    enabled, one optimizer step per batch); otherwise it is evaluation only.
    Both metrics are averaged over the samples actually seen, so they stay
    correct with ``drop_last=True`` or a sub-sampling sampler.
    """
    is_training = optimizer is not None
    loss_sum = 0.0
    num_correct = 0
    num_seen = 0

    # Force the grad mode explicitly so the result does not depend on any
    # enclosing no_grad()/enable_grad() context of the caller.
    with torch.set_grad_enabled(is_training):
        for inputs, labels in tqdm(loader, desc=desc):
            inputs, labels = inputs.to(device), labels.to(device)

            if is_training:
                optimizer.zero_grad()

            outputs = model(inputs)
            loss = criterion(outputs, labels)

            if is_training:
                loss.backward()
                optimizer.step()

            batch_size = inputs.size(0)
            # criterion returns the batch mean; re-weight by batch size
            loss_sum += loss.item() * batch_size
            # .item() keeps the counters as plain Python ints (no device tensors)
            num_correct += (outputs.argmax(dim=1) == labels).sum().item()
            num_seen += batch_size

    if num_seen == 0:
        # Empty loader: report zeros instead of crashing
        return 0.0, 0.0
    return loss_sum / num_seen, num_correct / num_seen


def train_model(model, train_loader, val_loader, num_epochs=25, device='cuda',
                checkpoint_path='best_model.pth'):
    """Train ``model`` with AdamW, cosine LR annealing and label smoothing.

    After every epoch the model is validated; whenever validation accuracy
    improves, a checkpoint (epoch, model and optimizer state, accuracy) is
    written to ``checkpoint_path``.

    Args:
        model: Classifier returning logits of shape (N, num_classes).
        train_loader: Iterable of ``(inputs, labels)`` training batches.
        val_loader: Iterable of ``(inputs, labels)`` validation batches.
        num_epochs: Number of epochs; also the cosine schedule period.
        device: Device the model and batches are moved to.
        checkpoint_path: Where the best checkpoint is saved.

    Returns:
        ``(model, (train_losses, val_losses, train_accs, val_accs))``.  The
        model holds the weights of the *last* epoch; the best weights are in
        the checkpoint file.  Each history list has one float per epoch.
    """
    model = model.to(device)

    # Label smoothing regularises the targets; AdamW applies decoupled weight decay
    criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
    optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)

    # Learning rate scheduler
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs)
    # Alternative: StepLR for step-wise decay
    # scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

    best_val_acc = 0.0
    train_losses, val_losses = [], []
    train_accs, val_accs = [], []

    for epoch in range(num_epochs):
        print(f'Epoch {epoch+1}/{num_epochs}')
        print('-' * 10)

        # Training phase
        model.train()
        epoch_loss, epoch_acc = _run_epoch(
            model, train_loader, criterion, device,
            optimizer=optimizer, desc='Training',
        )
        train_losses.append(epoch_loss)
        train_accs.append(epoch_acc)

        # Validation phase (eval mode: dropout off, batch-norm uses running stats)
        model.eval()
        val_epoch_loss, val_epoch_acc = _run_epoch(
            model, val_loader, criterion, device, desc='Validation',
        )
        val_losses.append(val_epoch_loss)
        val_accs.append(val_epoch_acc)

        print(f'Train Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')
        print(f'Val Loss: {val_epoch_loss:.4f} Acc: {val_epoch_acc:.4f}')
        print(f'LR: {scheduler.get_last_lr()[0]:.6f}')

        # Save best model; best_val_acc is a plain float so the checkpoint
        # carries no device-bound tensor for this field.
        if val_epoch_acc > best_val_acc:
            best_val_acc = val_epoch_acc
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'best_val_acc': best_val_acc,
            }, checkpoint_path)

        scheduler.step()  # Update learning rate once per epoch
        print()

    return model, (train_losses, val_losses, train_accs, val_accs)

# Cell confirmation: remind the reader how to call the training helper
print("Training function defined. Use: model, history = train_model(model, train_loader, val_loader)")

Training function defined. Use: model, history = train_model(model, train_loader, val_loader)


## 🔄 Sequential Models

### RNNs, LSTMs, and GRUs

In [7]:
# Sequential Models: RNN, LSTM, GRU

class TextClassifierRNN(nn.Module):
    """Bidirectional recurrent text classifier (LSTM, GRU or vanilla RNN).

    Token ids are embedded, passed through a multi-layer bidirectional RNN,
    and the final hidden states of the last layer (forward and backward
    direction concatenated) are fed to a small MLP head.

    Args:
        vocab_size: Size of the embedding table; id 0 is the padding index.
        embed_dim: Embedding width.
        hidden_dim: Hidden size per direction.
        num_layers: Number of stacked recurrent layers.
        num_classes: Number of output logits.
        dropout: Dropout probability for the head and between RNN layers.
        rnn_type: ``'LSTM'``, ``'GRU'``; any other value selects ``nn.RNN``.
    """

    def __init__(self, vocab_size, embed_dim=128, hidden_dim=256, num_layers=2,
                 num_classes=2, dropout=0.3, rnn_type='LSTM'):
        super().__init__()

        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.rnn_type = rnn_type

        # Embedding layer; the padding row stays zero and receives no gradient
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)

        # Inter-layer dropout only exists with more than one layer; passing a
        # non-zero value for a single layer just triggers a UserWarning.
        rnn_cls = {'LSTM': nn.LSTM, 'GRU': nn.GRU}.get(rnn_type, nn.RNN)
        self.rnn = rnn_cls(
            embed_dim, hidden_dim, num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0.0,
            bidirectional=True,
        )

        # Classifier (bidirectional doubles the feature size)
        self.classifier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, num_classes)
        )

    def forward(self, x, lengths=None):
        """Classify a batch of token-id sequences.

        Args:
            x: Long tensor of shape (batch_size, seq_len), padded with 0.
            lengths: Optional true lengths (tensor or sequence of ints).
                When given, sequences are packed so padding never reaches the
                RNN; without it the final states include the padded steps.

        Returns:
            Logits of shape (batch_size, num_classes).
        """
        embedded = self.embedding(x)  # (batch_size, seq_len, embed_dim)

        if lengths is not None:
            # pack_padded_sequence requires lengths on the CPU, even when the
            # batch itself lives on the GPU.
            cpu_lengths = torch.as_tensor(lengths, dtype=torch.int64).cpu()
            embedded = pack_padded_sequence(
                embedded, cpu_lengths, batch_first=True, enforce_sorted=False
            )

        # Only the final states are needed, so the per-step output is
        # discarded (and never unpacked).
        _, state = self.rnn(embedded)
        hidden = state[0] if self.rnn_type == 'LSTM' else state  # LSTM returns (h_n, c_n)

        # hidden: (num_layers * 2, batch_size, hidden_dim); the last two rows
        # are the forward and backward states of the top layer.
        final_hidden = torch.cat([hidden[-2], hidden[-1]], dim=1)

        return self.classifier(final_hidden)

# Example usage: same architecture with two different recurrent cells
model_lstm = TextClassifierRNN(10000, rnn_type='LSTM')
model_gru = TextClassifierRNN(10000, rnn_type='GRU')

# A GRU has three gates to the LSTM's four, hence the smaller count
print(f"LSTM model parameters: {sum(map(torch.numel, model_lstm.parameters())):,}")
print(f"GRU model parameters: {sum(map(torch.numel, model_gru.parameters())):,}")

LSTM model parameters: 3,779,330
GRU model parameters: 3,187,458


### Basic Attention Mechanism

In [8]:
# Basic Attention Mechanism for Sequence Models

class AttentionRNN(nn.Module):
    """Bidirectional LSTM classifier with additive attention pooling.

    Each time step of the LSTM output is scored by a linear layer; the
    softmax of those scores over the sequence weights a sum of the LSTM
    outputs, which is then classified by an MLP head.

    Args:
        vocab_size: Size of the embedding table; id 0 is the padding index.
        embed_dim: Embedding width.
        hidden_dim: LSTM hidden size per direction.
        num_classes: Number of output logits.
        dropout: Dropout probability in the classifier head.
    """

    def __init__(self, vocab_size, embed_dim=128, hidden_dim=256, num_classes=2, dropout=0.3):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)

        # Attention mechanism: one scalar score per time step
        self.attention = nn.Linear(hidden_dim * 2, 1)  # Bidirectional LSTM

        # Classifier
        self.classifier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, num_classes)
        )

    def forward(self, x, lengths=None):
        """Classify a batch and return the attention distribution.

        Args:
            x: Long tensor of shape (batch_size, seq_len), padded with 0.
            lengths: Optional true lengths (tensor or sequence of ints).
                When given, padded steps are excluded from the LSTM (via
                packing) and get exactly zero attention weight.  When
                omitted, every position is attended to.

        Returns:
            ``(logits, attention_weights)`` with shapes
            (batch_size, num_classes) and (batch_size, seq_len).
        """
        embedded = self.embedding(x)  # (batch_size, seq_len, embed_dim)

        if lengths is None:
            lstm_out, _ = self.lstm(embedded)  # (batch_size, seq_len, hidden_dim * 2)
            scores = self.attention(lstm_out)  # (batch_size, seq_len, 1)
        else:
            seq_len = x.size(1)
            lengths = torch.as_tensor(lengths, dtype=torch.int64)

            # Pack so the backward direction does not start from padding;
            # pack_padded_sequence needs the lengths on the CPU.
            packed = pack_padded_sequence(
                embedded, lengths.cpu(), batch_first=True, enforce_sorted=False
            )
            packed_out, _ = self.lstm(packed)
            # total_length keeps the time dimension equal to the input's
            lstm_out, _ = pad_packed_sequence(
                packed_out, batch_first=True, total_length=seq_len
            )

            scores = self.attention(lstm_out)  # (batch_size, seq_len, 1)

            # -inf scores become exactly 0 after the softmax
            positions = torch.arange(seq_len, device=x.device).unsqueeze(0)
            pad_mask = positions >= lengths.to(x.device).unsqueeze(1)  # (batch_size, seq_len)
            scores = scores.masked_fill(pad_mask.unsqueeze(-1), float('-inf'))

        attention_weights = F.softmax(scores, dim=1)  # Normalize across sequence

        # Weighted sum (attention-weighted representation)
        attended = torch.sum(lstm_out * attention_weights, dim=1)  # (batch_size, hidden_dim * 2)

        output = self.classifier(attended)
        return output, attention_weights.squeeze(-1)  # Return attention for visualization

# Sequence padding utility for batching
def collate_sequences(batch):
    """Collate ``(sequence, label)`` pairs of varying length into one padded batch.

    Intended as a ``DataLoader`` ``collate_fn``.

    Args:
        batch: Iterable of ``(sequence, label)`` pairs; each sequence may be
            a list of token ids or an existing tensor.

    Returns:
        ``(padded_sequences, labels, lengths)`` where ``padded_sequences`` is
        (batch_size, max_len) right-padded with 0, ``labels`` is a tensor
        built from the labels, and ``lengths`` holds the pre-padding length
        of every sequence.
    """
    sequences, labels = zip(*batch)

    # Get lengths before padding
    lengths = torch.tensor([len(seq) for seq in sequences])

    # as_tensor converts lists but passes tensors through, avoiding the
    # "copy construct from a tensor" warning torch.tensor() raises for them.
    # pad_sequence allocates a new output, so inputs are never aliased.
    padded_sequences = torch.nn.utils.rnn.pad_sequence(
        [torch.as_tensor(seq) for seq in sequences],
        batch_first=True,
        padding_value=0
    )

    return padded_sequences, torch.tensor(labels), lengths

# Example usage
attention_model = AttentionRNN(10000)
print(f"Attention model parameters: {sum(map(torch.numel, attention_model.parameters())):,}")

# Example forward pass on random non-padding ids: batch of 4, length 20
sample_input = torch.randint(low=1, high=100, size=(4, 20))
output, attention_weights = attention_model(sample_input)
print(f"Output shape: {output.shape}")
print(f"Attention weights shape: {attention_weights.shape}")

Attention model parameters: 2,202,883
Output shape: torch.Size([4, 2])
Attention weights shape: torch.Size([4, 20])


## 🎯 Model Evaluation & Saving

### Evaluation Best Practices

In [9]:
# Model Evaluation Functions

def evaluate_model(model, test_loader, device='cuda', return_predictions=False):
    """Evaluate a classifier on a held-out loader.

    The model is put in eval mode and run without gradients.  The caller is
    expected to have moved the model to ``device`` already; only the batches
    are moved here.

    Args:
        model: Classifier returning logits of shape (N, num_classes).
        test_loader: Iterable of ``(inputs, labels)`` batches.
        device: Device the batches are moved to.
        return_predictions: Also return per-sample predictions and labels.

    Returns:
        ``(accuracy, avg_loss)`` where accuracy is a percentage and avg_loss
        is the mean cross-entropy per sample; with
        ``return_predictions=True`` additionally the lists of predicted and
        true labels.

    Raises:
        ZeroDivisionError: If the loader yields no samples.
    """
    model.eval()  # Set to evaluation mode

    all_predictions = []
    all_labels = []
    total_loss = 0.0
    correct = 0
    total = 0

    criterion = nn.CrossEntropyLoss()

    with torch.no_grad():  # Disable gradient computation
        for inputs, labels in tqdm(test_loader, desc='Evaluating'):
            inputs, labels = inputs.to(device), labels.to(device)

            outputs = model(inputs)
            batch_size = labels.size(0)

            # criterion returns a batch mean; weight it by the batch size so
            # a smaller final batch does not skew the dataset average.
            total_loss += criterion(outputs, labels).item() * batch_size

            predicted = outputs.argmax(dim=1)
            total += batch_size
            correct += (predicted == labels).sum().item()

            if return_predictions:
                all_predictions.extend(predicted.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

    accuracy = 100 * correct / total
    avg_loss = total_loss / total

    print(f'Test Accuracy: {accuracy:.2f}%')
    print(f'Test Loss: {avg_loss:.4f}')

    if return_predictions:
        return accuracy, avg_loss, all_predictions, all_labels
    return accuracy, avg_loss

# Model Saving and Loading
def save_model(model, optimizer, epoch, loss, filepath):
    """Write a resumable training checkpoint to ``filepath``.

    The checkpoint bundles the epoch index, model and optimizer state dicts,
    the loss value, and a printable description of the architecture.
    """
    checkpoint = dict(
        epoch=epoch,
        model_state_dict=model.state_dict(),
        optimizer_state_dict=optimizer.state_dict(),
        loss=loss,
        model_architecture=str(model),  # human-readable layer listing
    )
    torch.save(checkpoint, filepath)
    print(f"Model saved to {filepath}")

def load_model(model, optimizer, filepath, device='cuda'):
    """Restore model (and optionally optimizer) state from a checkpoint.

    Args:
        model: Module whose architecture matches the saved state dict.
        optimizer: Optimizer to restore, or None to load weights only.
        filepath: Path or file-like object accepted by ``torch.load``.
        device: ``map_location`` for the loaded tensors.

    Returns:
        ``(model, optimizer, epoch, loss)``.  ``loss`` is None when the
        checkpoint has no ``'loss'`` entry, as with checkpoints that store
        ``'best_val_acc'`` instead.
    """
    # NOTE(security): torch.load unpickles data; only load checkpoints from
    # sources you trust.
    checkpoint = torch.load(filepath, map_location=device)

    model.load_state_dict(checkpoint['model_state_dict'])
    if optimizer is not None and 'optimizer_state_dict' in checkpoint:
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    epoch = checkpoint['epoch']
    loss = checkpoint.get('loss')  # optional: not every checkpoint records it

    print(f"Model loaded from {filepath}")
    if loss is None:
        print(f"Epoch: {epoch}, Loss: n/a")
    else:
        print(f"Epoch: {epoch}, Loss: {loss:.4f}")

    return model, optimizer, epoch, loss

# Model inference function
def predict_single(model, input_tensor, device='cuda', return_probabilities=False):
    """Predict the class of one input.

    A 3-D input is treated as an unbatched sample and gets a leading batch
    dimension; the model is switched to eval mode and run without gradients.

    Returns:
        The predicted class index, or ``(index, probabilities)`` when
        ``return_probabilities`` is True, where probabilities is the softmax
        row for the sample as a NumPy array.
    """
    model.eval()
    with torch.no_grad():
        batch = input_tensor.to(device)
        if batch.dim() == 3:
            # Unbatched sample -> batch of one
            batch = batch.unsqueeze(0)

        logits = model(batch)
        class_probs = F.softmax(logits, dim=1)
        top_class = logits.argmax(dim=1)

        if not return_probabilities:
            return top_class.item()
        return top_class.item(), class_probs.cpu().numpy()[0]

# Cell confirmation: the evaluation / checkpoint / inference helpers above are now defined
print("Evaluation functions defined!")

Evaluation functions defined!


## ⚡ Performance Optimization Tips

### Training Optimizations

In [10]:
# Performance Optimization Techniques

# 1. Mixed Precision Training (for faster training on modern GPUs)
from torch.cuda.amp import autocast, GradScaler

def train_with_mixed_precision(model, train_loader, optimizer, criterion, device='cuda'):
    """Training loop with mixed precision for faster training"""
    scaler = GradScaler()  # For gradient scaling
    model.train()
    
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        
        optimizer.zero_grad()
        
        # Use autocast for forward pass
        with autocast():
            outputs = model(inputs)
            loss = criterion(outputs, labels)
        
        # Scale loss and backward pass
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

# 2. Learning Rate Schedulers for Better Convergence
def get_scheduler(optimizer, scheduler_type='cosine', **kwargs):
    """Create a learning-rate scheduler by name.

    Supported names and the keyword overrides they read (defaults in
    parentheses):

    * ``'cosine'``: CosineAnnealingLR; ``T_max`` (50)
    * ``'step'``: StepLR; ``step_size`` (10), ``gamma`` (0.1)
    * ``'plateau'``: ReduceLROnPlateau in ``'min'`` mode; ``patience`` (5),
      ``factor`` (0.5)
    * ``'warmup'``: OneCycleLR; ``max_lr`` (1e-3), ``steps_per_epoch`` (100),
      ``epochs`` (25)

    Raises:
        ValueError: For any other ``scheduler_type``.
    """
    schedulers = optim.lr_scheduler
    option = kwargs.get

    if scheduler_type == 'cosine':
        period = option('T_max', 50)
        return schedulers.CosineAnnealingLR(optimizer, T_max=period)

    if scheduler_type == 'step':
        every = option('step_size', 10)
        decay = option('gamma', 0.1)
        return schedulers.StepLR(optimizer, step_size=every, gamma=decay)

    if scheduler_type == 'plateau':
        wait = option('patience', 5)
        shrink = option('factor', 0.5)
        return schedulers.ReduceLROnPlateau(optimizer, mode='min', patience=wait, factor=shrink)

    if scheduler_type == 'warmup':
        peak_lr = option('max_lr', 1e-3)
        per_epoch = option('steps_per_epoch', 100)
        n_epochs = option('epochs', 25)
        return schedulers.OneCycleLR(
            optimizer, max_lr=peak_lr, steps_per_epoch=per_epoch, epochs=n_epochs
        )

    raise ValueError(f"Unknown scheduler type: {scheduler_type}")

# 3. Dropout Schedules and Regularization
class DropoutScheduler:
    """Linearly move the dropout probability of a model between two values.

    The probability goes from ``initial_dropout`` at epoch 0 to
    ``final_dropout`` at ``total_epochs`` and stays there afterwards.  Both
    decreasing and increasing schedules are supported.

    Args:
        model: Module whose ``nn.Dropout`` layers are updated in place.
        initial_dropout: Probability at epoch 0.
        final_dropout: Probability from ``total_epochs`` onwards.
        total_epochs: Length of the linear ramp; a value <= 0 applies
            ``final_dropout`` immediately.
    """

    def __init__(self, model, initial_dropout=0.5, final_dropout=0.1, total_epochs=100):
        self.model = model
        self.initial_dropout = initial_dropout
        self.final_dropout = final_dropout
        self.total_epochs = total_epochs

    def step(self, epoch):
        """Set the dropout probability for ``epoch`` and return it."""
        if self.total_epochs <= 0:
            progress = 1.0  # degenerate ramp: avoid division by zero
        else:
            # Clamp so the value never leaves the [initial, final] interval,
            # whichever of the two is larger.
            progress = min(max(epoch / self.total_epochs, 0.0), 1.0)

        if progress >= 1.0:
            # Return the end value exactly, free of floating-point residue
            current_dropout = self.final_dropout
        else:
            current_dropout = (
                self.initial_dropout
                - (self.initial_dropout - self.final_dropout) * progress
            )

        # Update all nn.Dropout layers
        for module in self.model.modules():
            if isinstance(module, nn.Dropout):
                module.p = current_dropout

        return current_dropout

# 4. Early Stopping Implementation
class EarlyStopping:
    """Stop training once the validation loss has stopped improving.

    Call the instance once per epoch with the validation loss.  It returns
    True when the loss has failed to improve by more than ``min_delta`` for
    ``patience`` consecutive calls.

    Args:
        patience: Number of non-improving calls tolerated before stopping.
        min_delta: Minimum decrease in loss that counts as an improvement.
        restore_best_weights: Load the best weights back into the model at
            the moment stopping is signalled.
    """

    def __init__(self, patience=7, min_delta=0, restore_best_weights=True):
        self.patience = patience
        self.min_delta = min_delta
        self.restore_best_weights = restore_best_weights
        self.best_loss = None
        self.counter = 0
        self.best_weights = None

    def __call__(self, val_loss, model):
        """Record ``val_loss``; return True if training should stop."""
        if self.best_loss is None:
            # First call establishes the baseline
            self.best_loss = val_loss
            self.save_checkpoint(model)
        elif val_loss < self.best_loss - self.min_delta:
            self.best_loss = val_loss
            self.counter = 0
            self.save_checkpoint(model)
        else:
            self.counter += 1

        if self.counter >= self.patience:
            if self.restore_best_weights:
                model.load_state_dict(self.best_weights)
            return True
        return False

    def save_checkpoint(self, model):
        """Snapshot the model's current weights as the best seen so far."""
        # state_dict() tensors alias the live parameters and a shallow
        # dict.copy() keeps that aliasing, so later optimizer steps would
        # silently overwrite the "best" weights.  Clone every tensor.
        self.best_weights = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }

# 5. Gradient Clipping for Stable Training
def train_with_gradient_clipping(model, train_loader, optimizer, criterion, max_norm=1.0,
                                 device=None):
    """Run one training epoch, clipping the global gradient norm each step.

    Clipping happens after ``backward()`` and before ``optimizer.step()``,
    rescaling all gradients together so their combined L2 norm is at most
    ``max_norm``.

    Args:
        model: Module to train.
        train_loader: Iterable of ``(inputs, labels)`` batches.
        optimizer: Optimizer over ``model``'s parameters.
        criterion: Loss callable ``criterion(outputs, labels)``.
        max_norm: Upper bound for the total gradient norm.
        device: Optional device to move each batch to; when None the batches
            are used where they already are.
    """
    model.train()

    for inputs, labels in train_loader:
        if device is not None:
            inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()

        # Clip gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)

        optimizer.step()

# Summary banner for this cell, emitted as one write
print("\n".join((
    "Performance optimization tools defined!",
    "\nKey Tips:",
    "- Use mixed precision for faster training on modern GPUs",
    "- Implement early stopping to prevent overfitting",
    "- Use appropriate learning rate schedules",
    "- Apply gradient clipping for RNNs and deep networks",
    "- Consider dropout scheduling for better regularization",
)))

Performance optimization tools defined!

Key Tips:
- Use mixed precision for faster training on modern GPUs
- Implement early stopping to prevent overfitting
- Use appropriate learning rate schedules
- Apply gradient clipping for RNNs and deep networks
- Consider dropout scheduling for better regularization


### Common Loss Functions for Different Tasks

In [11]:
# Common Loss Functions Reference

# 1. Classification Tasks
# Lookup table of ready-made criteria, keyed by a short descriptive name.
classification_losses = dict(
    CrossEntropy=nn.CrossEntropyLoss(),  # default choice for multi-class
    CrossEntropy_weighted=nn.CrossEntropyLoss(weight=torch.tensor([0.5, 2.0])),  # per-class weights for imbalance
    CrossEntropy_smoothed=nn.CrossEntropyLoss(label_smoothing=0.1),  # softened targets as regularisation
    BCEWithLogits=nn.BCEWithLogitsLoss(),  # binary / multi-label on raw logits
    Focal=None,  # placeholder: no built-in focal loss
)

# Focal Loss for imbalanced datasets
class FocalLoss(nn.Module):
    """Focal loss for multi-class classification on raw logits.

    Per sample: ``alpha * (1 - p_t) ** gamma * CE`` where ``CE`` is the
    cross-entropy and ``p_t = exp(-CE)`` is the probability assigned to the
    true class.  Well-classified samples (large ``p_t``) are down-weighted,
    so training focuses on hard examples.

    Args:
        alpha: Constant scale applied to every sample's loss.
        gamma: Focusing exponent; ``gamma=0`` reduces to cross-entropy.
        reduce: If True return the batch mean, otherwise per-sample losses.
    """

    def __init__(self, alpha=1, gamma=2, reduce=True):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduce = reduce

    def forward(self, inputs, targets):
        """Compute the loss for logits (N, C) and class-index targets (N,)."""
        # reduction='none' is the supported spelling of the deprecated
        # reduce=False and keeps one loss value per sample.
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-ce_loss)  # probability of the true class
        focal_loss = self.alpha * (1 - pt) ** self.gamma * ce_loss

        if self.reduce:
            return torch.mean(focal_loss)
        return focal_loss

# 2. Regression Tasks
# Lookup table of ready-made regression criteria.
regression_losses = dict(
    MSE=nn.MSELoss(),         # squared error: penalises large residuals heavily
    MAE=nn.L1Loss(),          # absolute error
    Huber=nn.SmoothL1Loss(),  # quadratic near zero, linear for large residuals
)

# 3. Custom Loss Combinations
class CombinedLoss(nn.Module):
    """Weighted sum of several loss functions applied to the same outputs.

    Args:
        losses: Sequence of callables ``loss_fn(outputs, targets)``.  When
            all of them are ``nn.Module`` instances they are registered as
            submodules, so ``.to(device)`` / ``.double()`` on this loss also
            moves their buffers (e.g. class weights).
        weights: One scalar weight per loss.

    Raises:
        ValueError: If ``losses`` and ``weights`` differ in length.
    """

    def __init__(self, losses, weights):
        super().__init__()
        losses = list(losses)
        weights = list(weights)
        if len(losses) != len(weights):
            # zip() would silently drop the surplus entries
            raise ValueError(
                f"Got {len(losses)} losses but {len(weights)} weights"
            )

        if all(isinstance(loss_fn, nn.Module) for loss_fn in losses):
            # ModuleList registers the sub-losses with this module
            self.losses = nn.ModuleList(losses)
        else:
            # Plain callables (e.g. functions from torch.nn.functional)
            # cannot be registered; keep them in an ordinary list.
            self.losses = losses
        self.weights = weights

    def forward(self, outputs, targets):
        """Return ``sum(weight_i * loss_i(outputs, targets))``."""
        total_loss = 0
        for loss_fn, weight in zip(self.losses, self.weights):
            total_loss += weight * loss_fn(outputs, targets)
        return total_loss

# Example usage: blend plain cross-entropy (70%) with focal loss (30%)
focal_loss = FocalLoss(alpha=1, gamma=2)
combined_loss = CombinedLoss([nn.CrossEntropyLoss(), focal_loss], [0.7, 0.3])

# Summary banner for this cell, emitted as one write
print("\n".join((
    "Loss functions reference complete!",
    "\nChoose loss based on task:",
    "- Multi-class: CrossEntropyLoss",
    "- Binary: BCEWithLogitsLoss",
    "- Imbalanced: FocalLoss or weighted CrossEntropyLoss",
    "- Regression: MSELoss (sensitive to outliers) or SmoothL1Loss (robust)",
)))

Loss functions reference complete!

Choose loss based on task:
- Multi-class: CrossEntropyLoss
- Binary: BCEWithLogitsLoss
- Imbalanced: FocalLoss or weighted CrossEntropyLoss
- Regression: MSELoss (sensitive to outliers) or SmoothL1Loss (robust)


## 📝 Quick Reference Summary

### Key Patterns to Remember

**Model Definition:**
- Always use `nn.Module` as base class
- Use `nn.Sequential` for simple stacks
- Add `BatchNorm` after conv layers for stability
- Use `nn.Dropout` for regularization

**Training Loop:**
- Set `model.train()` before training
- Set `model.eval()` before evaluation
- Use `torch.no_grad()` during validation/inference
- Always call `optimizer.zero_grad()` before backward pass

**Transfer Learning:**
- Freeze early layers: `param.requires_grad = False`
- Replace final classifier layer
- Use lower learning rates for pretrained layers

**Sequential Models:**
- Use `pack_padded_sequence` for variable lengths
- Apply gradient clipping for RNNs: `clip_grad_norm_`
- Bidirectional RNNs double the output feature size (`hidden_dim * 2`), so size the following layers accordingly

**Performance:**
- Use mixed precision training with `autocast`
- Implement early stopping
- Choose appropriate learning rate schedules
- Save model state dict, not the entire model

**Common Gotchas:**
- Always move data to device: `.to(device)`
- Use `model.eval()` for inference
- `model.eval()` is what switches dropout and batch norm to inference behavior; call `model.train()` again before resuming training
- Check tensor shapes frequently during development

In [12]:
# Quick Start Examples

# 1. Image Classification with Transfer Learning
# Template only: the triple-quoted block is a bare string expression, so none
# of it runs. It freezes a pretrained ResNet-18, swaps in a new `fc` head and
# sets up loss, optimizer and cosine scheduler. `num_classes` must be defined
# by whoever copies it.
# NOTE(review): newer torchvision releases prefer the `weights=` argument over
# `pretrained=True` - confirm against the installed version.
"""
model = models.resnet18(pretrained=True)
for param in model.parameters():
    param.requires_grad = False
model.fc = nn.Linear(model.fc.in_features, num_classes)

criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=50)
"""

# 2. Text Classification with LSTM
# Template only (not executed). When copying, note that the clipping call
# belongs inside the training loop, between `loss.backward()` and
# `optimizer.step()`, not straight after the optimizer is created.
"""
model = TextClassifierRNN(vocab_size=10000, rnn_type='LSTM')
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Don't forget gradient clipping for RNNs!
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
"""

# 3. Model Evaluation Template
# Template only (not executed): eval mode plus no_grad, with every batch moved
# to `device` before the forward pass.
"""
model.eval()
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        # Process outputs...
"""

# Closing banner listing the topics covered, emitted as one write
print("\n".join((
    "\n✅ PyTorch Intermediate Cheat Sheet Complete!",
    "\nThis cheat sheet covers:",
    "- Custom CNN architectures with modern techniques",
    "- Transfer learning strategies",
    "- Advanced training loops with schedulers",
    "- RNN/LSTM/GRU implementations",
    "- Basic attention mechanisms",
    "- Model evaluation and saving",
    "- Performance optimization tips",
    "- Common loss functions",
    "\nReady for intermediate deep learning projects! 🚀",
)))


✅ PyTorch Intermediate Cheat Sheet Complete!

This cheat sheet covers:
- Custom CNN architectures with modern techniques
- Transfer learning strategies
- Advanced training loops with schedulers
- RNN/LSTM/GRU implementations
- Basic attention mechanisms
- Model evaluation and saving
- Performance optimization tips
- Common loss functions

Ready for intermediate deep learning projects! 🚀


In [13]:
# Device Setup and Compatibility Check
# Pick the GPU when one is visible, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Hardware details are only meaningful when CUDA is present
if not torch.cuda.is_available():
    print("CUDA not available. Using CPU.")
else:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"CUDA Version: {torch.version.cuda}")
    # total_memory is in bytes; report decimal gigabytes
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

# Library versions, useful when reporting issues
print(f"PyTorch Version: {torch.__version__}")
print(f"Torchvision Version: {torchvision.__version__}")

Using device: cuda
GPU: NVIDIA GeForce RTX 4070 Laptop GPU
CUDA Version: 12.1
Memory: 8.6 GB
PyTorch Version: 2.5.1+cu121
Torchvision Version: 0.20.1+cpu


---

**🔥 Pro Tips:**
- Always validate your model architecture with dummy input: `model(torch.randn(1, 3, 224, 224))`
- Use `torchsummary` or `torchinfo` to inspect model architecture
- Monitor GPU memory usage during training
- Save checkpoints frequently for long training runs
- Use tensorboard or wandb for experiment tracking
- Test data loading pipeline separately before training

**🔗 Useful Libraries:**
- `timm` - Modern computer vision models
- `transformers` - For NLP (when ready for advanced topics)
- `pytorch-lightning` - Simplified training loops
- `torchmetrics` - Additional evaluation metrics

**Happy Deep Learning! 🤖**