## 1. Installation & Setup

In [123]:
# Install PyTorch (run in a terminal, not in this cell)
# pip install torch torchvision torchaudio

# Import essentials
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms

# Pick the compute device: use the GPU when CUDA reports itself available, else the CPU
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print(f"Using device: {device}")

Using device: cpu


## 2. Tensor Basics
### Creating Tensors


In [124]:
# From data
x = torch.tensor([1, 2, 3])                  # 1-D tensor built from a flat Python list
x = torch.tensor([[1, 2], [3, 4]], dtype=torch.float32)  # 2-D tensor with an explicit dtype (rebinds x)

# Special tensors
zeros = torch.zeros(3, 4)                    # All zeros
ones = torch.ones(2, 3)                      # All ones
empty = torch.empty(2, 3)                    # Uninitialized
rand = torch.rand(3, 3)                      # Uniform [0, 1)
randn = torch.randn(3, 3)                    # Normal dist N(0,1)
arange = torch.arange(0, 10, 2)              # [0, 2, 4, 6, 8]
linspace = torch.linspace(0, 1, 5)           # 5 points from 0 to 1
eye = torch.eye(3)                           # Identity matrix

# Like another tensor (here: the 2x2 float32 tensor bound to x above)
x_zeros = torch.zeros_like(x)                # Zeros built from x as the template
x_ones = torch.ones_like(x)                  # Ones built from x as the template

### Tensor Properties

In [125]:
# Each line below is a bare expression; in a notebook only the last one
# (x.numel()) is displayed as the cell output.
x = torch.randn(3, 4, 5)
x.shape                 # torch.Size([3, 4, 5])
x.size()                # Same as shape
x.dtype                 # Data type
x.device                # cpu or cuda
x.requires_grad         # Gradient tracking
x.ndim                  # Number of dimensions (3)
x.numel()               # Total elements (3 * 4 * 5 = 60)

60

### Type Conversion

In [126]:
import numpy as np      # Required by the NumPy interop lines at the end of this cell

x = torch.randn(3, 4)
x.int()                 # to int32
x.long()                # to int64
x.float()               # to float32
x.double()              # to float64
x.half()                # to float16
x.bool()                # to boolean

# Device conversion
# x.to('cuda')            # Move to GPU (2.5 GB+ torch version)
x.cpu()                 # Move to CPU
x.numpy()               # To NumPy (must be on CPU)
arr = np.array(x.numpy())  # From Tensor to NumPy (np.array makes an independent copy)
torch.from_numpy(arr)   # From NumPy to Tensor

tensor([[-0.2738,  1.0455,  0.4718,  0.9705],
        [-1.2570,  0.3437,  0.9924,  1.4733],
        [-1.3379,  0.7186, -1.4230, -1.2065]])

## 3. Tensor Operations
### Basic Math

In [127]:
a = torch.tensor([1, 2, 3])
b = torch.tensor([4, 5, 6])

# Element-wise operations (each returns a new tensor; a and b are untouched)
a + b                   # Addition
a - b                   # Subtraction
a * b                   # Multiplication
a / b                   # Division
a ** 2                  # Power
torch.add(a, b)         # Same as a + b
a.add_(b)               # In-place (note the underscore): a itself becomes a + b

# Aggregations
# NOTE: these run after the in-place add above, so they see the updated a
# ([5, 7, 9]), not the original [1, 2, 3].
a.sum()                 # Sum all elements
a.float().mean()                # Mean (a is converted to float first)
a.float().std()                 # Standard deviation (a is converted to float first)
a.max()                 # Maximum value
a.min()                 # Minimum value
a.argmax()              # Index of max
a.argmin()              # Index of min (last expression, so this is the cell output)

tensor(0)

### Matrix Operations

In [128]:
A = torch.randn(3, 4)
B = torch.randn(4, 5)

# Matrix multiplication: three spellings of the same (3, 4) x (4, 5) product
C = torch.matmul(A, B)           # (3, 5)
C = A @ B                        # Same
C = torch.mm(A, B)               # 2D only

# Batch matrix multiplication
A_batch = torch.randn(10, 3, 4)
B_batch = torch.randn(10, 4, 5)
C_batch = torch.bmm(A_batch, B_batch)  # (10, 3, 5)

# Transpose (three equivalent ways to swap the two dims of the 2-D tensor A)
A.T                              # Transpose
A.transpose(0, 1)                # Specify dimensions
A.permute(1, 0)                  # General permutation

# Other operations (A is rebound to a square 4x4 matrix for these)
A = torch.randn(4, 4)
A.inverse()                      # Matrix inverse
torch.det(A)                     # Determinant
eigenvalues, eigenvectors = torch.linalg.eig(A)  # Eigenvalues and eigenvectors

### Reshaping

In [129]:
x = torch.randn(2, 3, 4)         # 24 elements in total

# Reshape
x.view(6, 4)                     # Must be contiguous
x.reshape(6, 4)                  # Works on non-contiguous
x.view(-1)                       # Flatten (24,)
x.view(2, -1)                    # Auto-calculate (2, 12)

# Squeeze & Unsqueeze (x is rebound to a tensor with two size-1 dims)
x = torch.randn(1, 3, 1, 4)
x.squeeze()                      # Remove all 1-dims → (3, 4)
x.squeeze(0)                     # Remove specific dim → (3, 1, 4)
x.unsqueeze(0)                   # Add dim at position 0
x.unsqueeze(-1)                  # Add dim at end

# Flatten (still operating on the (1, 3, 1, 4) tensor)
x.flatten()                      # Flatten all
x.flatten(start_dim=1)           # Flatten from dim 1 → (1, 12); last expression, shown as cell output

tensor([[-0.4413, -0.5042,  0.5971,  1.0270, -1.1113,  0.0516,  0.8574, -0.1873,
         -0.0059,  0.5350, -0.5601, -2.6708]])

### Indexing & Slicing

In [130]:
x = torch.randn(4, 5, 6)

# Basic indexing (x is 3-D, so each index/slice applies to a dimension)
x[0]                             # First entry along dim 0
x[:2]                            # First 2 entries along dim 0
x[1:3]                           # Entries 1 and 2 along dim 0
x[:, 0]                          # Everything along dim 0, index 0 along dim 1
x[..., 0]                        # Index 0 along the last dimension

# Boolean indexing
mask = x > 0
x[mask]                          # All positive values

# Fancy indexing
indices = torch.tensor([0, 2])
x[indices]                       # Select entries 0 and 2 along dim 0

# Selecting along a chosen dimension: keep indices 0 and 2 of dim 1
output = torch.index_select(x, dim=1, index=indices)

print(output.shape)              # dim 1 shrinks from 5 to 2

torch.Size([4, 2, 6])


### Concatenation & Stacking

In [131]:
a = torch.randn(2, 3)
b = torch.randn(2, 3)

# Concatenate (joins along an existing dimension)
torch.cat([a, b], dim=0)         # (4, 3)
torch.cat([a, b], dim=1)         # (2, 6)

# Stack (adds new dimension)
torch.stack([a, b], dim=0)       # (2, 2, 3)
torch.stack([a, b], dim=1)       # (2, 2, 3)

# Split
torch.split(a, 1, dim=0)         # Split into pieces of size 1 along dim 0
torch.chunk(a, 2, dim=1)         # Split into 2 chunks along dim 1 (3 columns → 2 + 1); shown as cell output

(tensor([[-1.2197,  0.7318],
         [-0.5083, -0.5122]]),
 tensor([[0.7136],
         [0.7368]]))

## 4. Autograd (Automatic Differentiation)

In [132]:
# Enable gradient tracking
x = torch.tensor([2.0], requires_grad=True)
y = torch.tensor([3.0], requires_grad=True)

# Forward pass
z = x ** 2 + y ** 3

# Backward pass
z.backward(retain_graph=True)    # Compute gradients and retain graph to use again

# Access gradients
print(x.grad)                    # dz/dx = 2x = 4
print(y.grad)                    # dz/dy = 3y² = 27

# Gradient accumulation
z.backward()                     # Gradients accumulate! (x.grad is now 8, y.grad is now 54)
x.grad.zero_()                   # Zero gradients before next pass (only x.grad is reset here; y.grad is left as is)

# Prevent gradient tracking
with torch.no_grad():
    z = x ** 2 + y ** 3          # No gradients computed (z is rebound to the untracked result)

# Detach from computation graph
z_detached = z.detach()          # Tensor without gradients

tensor([4.])
tensor([27.])


### Custom Functions

In [133]:
class MyFunction(torch.autograd.Function):
    """Custom autograd op: squares its input and supplies the gradient by hand."""

    @staticmethod
    def forward(ctx, x):
        # Keep the input around; backward needs it to form d(x^2)/dx.
        ctx.save_for_backward(x)
        return x.pow(2)

    @staticmethod
    def backward(ctx, grad_output):
        (saved_input,) = ctx.saved_tensors
        local_grad = 2 * saved_input          # derivative of x^2
        return grad_output * local_grad       # chain rule


# Usage
x = torch.tensor([2.0], requires_grad=True)
y = MyFunction.apply(x)
y.backward()
print(x.grad)  # 4.0

tensor([4.])


## 5. Neural Network Layers
### Basic Layers

In [134]:
# Catalogue of common layer constructors. Every layer is only instantiated
# here (and bound to a name); none of them is applied to data in this cell.
import torch.nn as nn

# Linear (Fully Connected)
linear = nn.Linear(in_features=10, out_features=5, bias=True)

# Convolutional
conv2d = nn.Conv2d(in_channels=3, out_channels=64, 
                   kernel_size=3, stride=1, padding=1)
conv1d = nn.Conv1d(in_channels=1, out_channels=16, kernel_size=3)

# Pooling
maxpool = nn.MaxPool2d(kernel_size=2, stride=2)
avgpool = nn.AvgPool2d(kernel_size=2)
adaptivepool = nn.AdaptiveAvgPool2d((1, 1))  # Argument is the target output size

# Normalization
batchnorm = nn.BatchNorm2d(num_features=64)
layernorm = nn.LayerNorm(normalized_shape=[10, 20])
groupnorm = nn.GroupNorm(num_groups=8, num_channels=64)

# Dropout
dropout = nn.Dropout(p=0.5)
dropout2d = nn.Dropout2d(p=0.5)

# Recurrent (all three share the same input/hidden sizes and depth)
rnn = nn.RNN(input_size=10, hidden_size=20, num_layers=2)
lstm = nn.LSTM(input_size=10, hidden_size=20, num_layers=2)
gru = nn.GRU(input_size=10, hidden_size=20, num_layers=2)

# Attention
attention = nn.MultiheadAttention(embed_dim=512, num_heads=8)

# Embedding
embedding = nn.Embedding(num_embeddings=1000, embedding_dim=128)

### Activation Functions

In [135]:
# All available as nn.Module or F.function
relu = nn.ReLU()
# Or: F.relu(x)

# Common activations
# These are bare constructor expressions: the objects are created and
# discarded; only the last one is displayed as the cell output.
nn.ReLU()
nn.LeakyReLU(negative_slope=0.01)
nn.PReLU()
nn.ELU()
nn.GELU()
nn.Sigmoid()
nn.Tanh()
nn.Softmax(dim=1)
nn.LogSoftmax(dim=1)

LogSoftmax(dim=1)

## 6. Building Models
### Sequential Model

In [136]:
# Three-layer MLP (784 -> 256 -> 128 -> 10) with ReLU + dropout between the
# linear layers, assembled from an explicit layer list.
mlp_layers = [
    nn.Linear(784, 256), nn.ReLU(), nn.Dropout(0.2),
    nn.Linear(256, 128), nn.ReLU(), nn.Dropout(0.2),
    nn.Linear(128, 10),
]
model = nn.Sequential(*mlp_layers)

### Custom Model (Recommended)

In [137]:
class MyModel(nn.Module):
    """Single-hidden-layer classifier: Linear -> ReLU -> Linear.

    Args:
        input_size: number of input features.
        hidden_size: width of the hidden layer.
        num_classes: number of output scores produced per sample.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return the raw output of the final linear layer for ``x``."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)

# Build the network and place its parameters on the selected device in one step
model = MyModel(784, 256, 10).to(device)

### CNN Example

In [138]:
class CNN(nn.Module):
    """Small two-block convolutional classifier for 1-channel 28x28 images.

    Each block is conv(3x3, padding=1) -> ReLU -> 2x2 max-pool, so a 28x28
    input is reduced to 64 feature maps of 7x7 before the two linear layers.
    The final layer returns 10 raw class scores per sample.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.fc1 = nn.Linear(64 * 7 * 7, 128)
        self.fc2 = nn.Linear(128, 10)
        self.dropout = nn.Dropout(0.25)

    def forward(self, x):
        """Map a batch shaped (N, 1, 28, 28) to scores shaped (N, 10).

        Raises:
            RuntimeError: if the spatial size is not 28x28, because the
                flattened feature count no longer matches ``fc1``.
        """
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        # Flatten per sample. The previous view(-1, 64 * 7 * 7) silently moved
        # surplus spatial elements into the batch dimension when the input
        # was not 28x28, producing a wrong batch size instead of an error.
        x = x.flatten(start_dim=1)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x

model = CNN().to(device)

### Model Utilities

In [139]:
# Print model architecture
print(model)

# Count parameters
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

# Access layers
for name, param in model.named_parameters():
    print(f"{name}: {param.shape}")

# Freeze/unfreeze layers
for param in model.parameters():
    param.requires_grad = False  # Freeze: parameter is excluded from gradient computation

# Unfreeze again so the model is trainable in the cells that follow; leaving
# everything frozen would make every later optimizer step a no-op.
for param in model.parameters():
    param.requires_grad = True   # Unfreeze

# Apply function to all modules
def init_weights(m):
    """Initialise ``nn.Linear`` modules; intended for ``model.apply(init_weights)``.

    Linear weights get Xavier-uniform initialisation and biases are set to
    0.01. Modules of any other type are left untouched.

    Args:
        m: a module visited by ``nn.Module.apply``.
    """
    if isinstance(m, nn.Linear):
        torch.nn.init.xavier_uniform_(m.weight)
        # Layers built with bias=False have m.bias set to None; skip them
        # instead of raising AttributeError.
        if m.bias is not None:
            torch.nn.init.constant_(m.bias, 0.01)

model.apply(init_weights)

CNN(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=3136, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=10, bias=True)
  (dropout): Dropout(p=0.25, inplace=False)
)
conv1.weight: torch.Size([32, 1, 3, 3])
conv1.bias: torch.Size([32])
conv2.weight: torch.Size([64, 32, 3, 3])
conv2.bias: torch.Size([64])
fc1.weight: torch.Size([128, 3136])
fc1.bias: torch.Size([128])
fc2.weight: torch.Size([10, 128])
fc2.bias: torch.Size([10])


CNN(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=3136, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=10, bias=True)
  (dropout): Dropout(p=0.25, inplace=False)
)

## 7. Loss Functions

In [140]:
# Catalogue of loss constructors. Each line rebinds `criterion`, so only the
# last assignment is in effect after the cell runs; pick the one you need.

# Classification
criterion = nn.CrossEntropyLoss()                    # Multi-class (expects raw logits)
criterion = nn.BCELoss()                             # Binary (expects probabilities, i.e. after sigmoid)
criterion = nn.BCEWithLogitsLoss()                   # Binary (with logits)
criterion = nn.NLLLoss()                             # Negative Log Likelihood (expects log-probabilities)

# Regression
criterion = nn.MSELoss()                             # Mean Squared Error
criterion = nn.L1Loss()                              # Mean Absolute Error
criterion = nn.SmoothL1Loss()                        # Smooth L1 (Huber-style; see also nn.HuberLoss)

# Other
# 'batchmean' is the reduction that matches the mathematical KL divergence;
# the default 'mean' averages over every element and PyTorch warns about it.
criterion = nn.KLDivLoss(reduction='batchmean')      # KL Divergence
criterion = nn.CosineEmbeddingLoss()                 # Cosine similarity

# Usage
# outputs = model(inputs)
# loss = criterion(outputs, targets)

## 8. Optimizers

In [141]:
import torch.optim as optim

# Common optimizers
# Each assignment rebinds `optimizer`, so only the last one (Adagrad) is the
# optimizer in effect for the rest of the cell.
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
optimizer = optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999))
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)
optimizer = optim.RMSprop(model.parameters(), lr=0.01, alpha=0.99)
optimizer = optim.Adagrad(model.parameters(), lr=0.01)

# Optimizer step
# The backward call is commented out because no loss exists in this cell, so
# step() below runs without freshly computed gradients.
optimizer.zero_grad()      # Clear gradients
# loss.backward()            # Compute gradients
optimizer.step()           # Update weights

# Learning rate scheduler
# As above, each line rebinds `scheduler`; the last one (ReduceLROnPlateau)
# is what remains bound afterwards.
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.95)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=100)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', 
                                                  factor=0.1, patience=10)

# Scheduler step
# scheduler.step()           # After each epoch
# Or for ReduceLROnPlateau:
# scheduler.step(val_loss)

## 9. Dataset & DataLoader
### Custom Dataset

In [142]:
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    """Map-style dataset pairing ``data[idx]`` with ``labels[idx]``.

    Args:
        data: indexable collection of samples; its length is the dataset length.
        labels: indexable collection of labels, indexed in parallel with ``data``.
        transform: optional callable applied to each sample (not to the label)
            when it is fetched.
    """

    def __init__(self, data, labels, transform=None):
        self.data = data
        self.labels = labels
        self.transform = transform

    def __len__(self):
        """Return the number of samples, taken from ``data``."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(sample, label)`` for ``idx``, transforming the sample if configured."""
        sample = self.data[idx]
        label = self.labels[idx]

        # Compare against None rather than testing truthiness: a callable that
        # defines __len__/__bool__ (e.g. an empty container-like pipeline) is
        # falsy and would otherwise be skipped silently.
        if self.transform is not None:
            sample = self.transform(sample)

        return sample, label

# Create dataset
# dataset = CustomDataset(data, labels)

# Create DataLoader
# dataloader = DataLoader(dataset, 
#                        batch_size=32, 
#                        shuffle=True, 
#                        num_workers=4,
#                        pin_memory=True)  # Faster GPU transfer

# Iterate
# for batch_idx, (data, labels) in enumerate(dataloader):
    # data, labels = data.to(device), labels.to(device)
    # Training code here

### Built-in Datasets

In [143]:
from torchvision import datasets, transforms

# Transforms
# MNIST images are single-channel 28x28. The ImageNet-style pipeline used
# before (Resize to 224 + 3-channel mean/std) fails inside Normalize on a
# 1-channel tensor, and 224x224 inputs do not fit the CNN defined earlier in
# this notebook (its first linear layer expects 64 * 7 * 7 features).
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.1307,), std=(0.3081,))  # Commonly used MNIST mean / std
])

# Load datasets (downloaded into ./data on first use)
train_dataset = datasets.MNIST(root='./data', train=True,
                               download=True, transform=transform)
test_dataset = datasets.MNIST(root='./data', train=False,
                              download=True, transform=transform)

# Other datasets: CIFAR10, CIFAR100, ImageNet, etc.

## 10. Training Loop

In [144]:
def train(model, dataloader, criterion, optimizer, device):
    """Run one training epoch and return ``(epoch_loss, epoch_acc)``.

    Args:
        model: module to optimise; switched to training mode.
        dataloader: iterable yielding ``(inputs, targets)`` batches. It does
            not need to support ``len()``; batches are counted as they arrive.
        criterion: callable mapping ``(outputs, targets)`` to a scalar loss tensor.
        optimizer: optimizer whose ``zero_grad``/``step`` are called per batch.
        device: device the batches are moved to before the forward pass.

    Returns:
        Tuple of the mean per-batch loss and the accuracy in percent, where the
        prediction is the index of the largest output along dim 1.

    Raises:
        ZeroDivisionError: if ``dataloader`` yields no batches.
    """
    model.train()  # Set to training mode

    # The batch total is only used in the progress message; some iterables
    # (generators, iterable-style loaders) do not define len().
    try:
        expected_batches = len(dataloader)
    except TypeError:
        expected_batches = None

    running_loss = 0.0
    correct = 0
    total = 0
    num_batches = 0

    for batch_idx, (inputs, targets) in enumerate(dataloader):
        # Move to device
        inputs, targets = inputs.to(device), targets.to(device)

        # Zero gradients left over from the previous batch
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, targets)

        # Backward pass
        loss.backward()

        # Update weights
        optimizer.step()

        # Statistics
        running_loss += loss.item()
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()
        num_batches += 1

        # Print progress every 100 batches
        if batch_idx % 100 == 0:
            if expected_batches is not None:
                position = f'{batch_idx}/{expected_batches}'
            else:
                position = f'{batch_idx}'
            print(f'Batch {position}, Loss: {loss.item():.4f}')

    # Average over the batches actually processed rather than len(dataloader)
    epoch_loss = running_loss / num_batches
    epoch_acc = 100. * correct / total

    return epoch_loss, epoch_acc

## 11. Evaluation Loop

In [145]:
def evaluate(model, dataloader, criterion, device):
    """Evaluate ``model`` over ``dataloader`` and return ``(epoch_loss, epoch_acc)``.

    Args:
        model: module to evaluate; switched to evaluation mode.
        dataloader: iterable yielding ``(inputs, targets)`` batches. It does
            not need to support ``len()``; batches are counted as they arrive.
        criterion: callable mapping ``(outputs, targets)`` to a scalar loss tensor.
        device: device the batches are moved to before the forward pass.

    Returns:
        Tuple of the mean per-batch loss and the accuracy in percent, where the
        prediction is the index of the largest output along dim 1.

    Raises:
        ZeroDivisionError: if ``dataloader`` yields no batches.
    """
    model.eval()  # Set to evaluation mode
    running_loss = 0.0
    correct = 0
    total = 0
    num_batches = 0

    with torch.no_grad():  # Disable gradient computation
        for inputs, targets in dataloader:
            inputs, targets = inputs.to(device), targets.to(device)

            outputs = model(inputs)
            loss = criterion(outputs, targets)

            running_loss += loss.item()
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
            num_batches += 1

    # Average over the batches actually processed rather than len(dataloader),
    # so generators and iterable-style loaders work too.
    epoch_loss = running_loss / num_batches
    epoch_acc = 100. * correct / total

    return epoch_loss, epoch_acc

## 12. Saving & Loading Models

In [146]:
# Save entire model
# This pickles the whole object, so loading it needs weights_only=False,
# which can execute arbitrary code: only do this with files you trust.
torch.save(model, 'model.pth')
model = torch.load('model.pth', weights_only=False)

# Save only weights (recommended)
# A state dict holds only tensors and plain containers, so it can be read
# back with the restricted (safe) loader.
torch.save(model.state_dict(), 'model_weights.pth')
model.load_state_dict(torch.load('model_weights.pth', weights_only=True))

# Save checkpoint
checkpoint = {
    'epoch': 100,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': 0.05,    # Example value; in a real run store a plain float such as loss.item()
    'accuracy': 98
}
torch.save(checkpoint, 'checkpoint.pth')

# Load checkpoint (tensors + plain Python values only, so the safe loader is enough)
checkpoint = torch.load('checkpoint.pth', weights_only=True)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
loss = checkpoint['loss']

# Load for inference only
model.load_state_dict(torch.load('model_weights.pth', weights_only=True))
model.eval()

CNN(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=3136, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=10, bias=True)
  (dropout): Dropout(p=0.25, inplace=False)
)

## 13. Transfer Learning

In [147]:
import torchvision.models as models

# Load pretrained model
# `pretrained=True` is deprecated in torchvision; the `weights` argument is
# the supported way to request pretrained parameters.
model = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)

# Freeze all layers
for param in model.parameters():
    param.requires_grad = False

# Replace final layer (the new layer's parameters are trainable by default)
num_features = model.fc.in_features
model.fc = nn.Linear(num_features, 10)  # Assuming 10 classes

# Option A: only train final layer
optimizer = optim.Adam(model.fc.parameters(), lr=0.001)

# Option B: fine-tune entire model. This unfreezes everything and rebinds
# `optimizer`, so after the cell runs Option B is the one in effect.
for param in model.parameters():
    param.requires_grad = True
optimizer = optim.Adam(model.parameters(), lr=0.0001)

## 14. Image Transforms

In [148]:
from torchvision import transforms

# Channel statistics shared by every pipeline below
imagenet_mean = [0.485, 0.456, 0.406]
imagenet_std = [0.229, 0.224, 0.225]

# Common transforms
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.ToTensor(),
    transforms.Normalize(mean=imagenet_mean, std=imagenet_std),
])

# Data augmentation (random crop / flip / rotation / colour changes)
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(15),
    transforms.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3, hue=0.1),
    transforms.ToTensor(),
    transforms.Normalize(mean=imagenet_mean, std=imagenet_std),
])

# Test transform (no augmentation: fixed resize + centre crop only)
test_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=imagenet_mean, std=imagenet_std),
])

## 15. Common Patterns
### Early Stopping

In [None]:
class EarlyStopping:
    """Signal when a monitored loss has stopped improving.

    Call the instance once per epoch with the validation loss. A call counts
    as an improvement only when the loss is lower than the best value so far
    by more than ``min_delta``; any other value (a plateau, a worse loss, or
    NaN) increments the patience counter.

    Args:
        patience: number of consecutive non-improving calls after which
            ``early_stop`` becomes True.
        min_delta: minimum decrease of the loss that counts as an improvement.

    Attributes are plain fields: ``counter`` (current run of non-improving
    calls), ``best_loss`` (lowest accepted loss, None before the first call)
    and ``early_stop`` (set to True once patience is exhausted).
    """

    def __init__(self, patience=7, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_loss = None
        self.early_stop = False

    def __call__(self, val_loss):
        """Record ``val_loss`` and update ``counter`` / ``early_stop``."""
        if self.best_loss is None:
            # First observation becomes the baseline.
            self.best_loss = val_loss
        elif val_loss < self.best_loss - self.min_delta:
            # Strict improvement. Written as a "<" test so that NaN (for which
            # every comparison is False) lands in the no-improvement branch
            # instead of overwriting best_loss and resetting the counter.
            self.best_loss = val_loss
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True

# Usage
# Sketch: assumes num_epochs, train_loader, val_loader, model, criterion,
# optimizer and device are already defined.
early_stopping = EarlyStopping(patience=5)
for epoch in range(num_epochs):
    # train() and evaluate() both return (loss, accuracy); unpack them so the
    # early-stopping check receives the scalar loss rather than a tuple.
    train_loss, train_acc = train(model, train_loader, criterion, optimizer, device)
    val_loss, val_acc = evaluate(model, val_loader, criterion, device)
    early_stopping(val_loss)
    if early_stopping.early_stop:
        print("Early stopping")
        break

### Model Ensembling

In [None]:
# Sketch: assumes model1, model2, model3 and inputs are already defined.
# The list and loop variable get their own names so they do not overwrite
# the `models` (torchvision.models) alias or the global `model` used in
# other cells of this notebook.
ensemble_members = [model1, model2, model3]
for member in ensemble_members:
    member.eval()

# Ensemble prediction: average the members' outputs element-wise
with torch.no_grad():
    outputs = [member(inputs) for member in ensemble_members]
    ensemble_output = torch.stack(outputs).mean(dim=0)

## 16. Tips & Best Practices

In [None]:
# NOTE(review): this cell is a collection of independent snippets, not a
# runnable program. Names such as loss, large_tensor, tensors, result, y,
# inputs and num_epochs are not defined in this cell and must come from
# your own code.

# Always set random seeds for reproducibility
torch.manual_seed(42)
torch.cuda.manual_seed(42)
torch.backends.cudnn.deterministic = True

# Use .item() to get Python numbers
loss_value = loss.item()  # Not loss (which is a tensor)

# Free memory explicitly
del large_tensor
torch.cuda.empty_cache()

# Use appropriate data types
x = x.float()  # For most operations
x = x.long()   # For indices/labels

# Batch operations when possible
# Bad: one forward call per tensor
for i in range(len(tensors)):
    result.append(model(tensors[i]))

# Good: stack the tensors and make a single forward call
result = model(torch.stack(tensors))

# Use in-place operations carefully (they save memory)
x.add_(y)    # In-place: modifies x itself
x = x + y    # Creates new tensor

# Move data to GPU once, not in loop
inputs = inputs.to(device)
for epoch in range(num_epochs):
    outputs = model(inputs)  # Already on GPU