In [1]:
import torch
import torch.nn as nn
import torchmetrics
import numpy as np
import matplotlib.pyplot as plt
import torchvision

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [3]:
import torchvision.transforms.v2 as T

toTensor = T.Compose([T.ToImage(), T.ToDtype(torch.float32, scale = True)])
train_valid_dataset = torchvision.datasets.CIFAR10(root = "./dataset/", train = True, download=True, transform=toTensor)
test_dataset = torchvision.datasets.CIFAR10(root = "./dataset/", train=False, download=True, transform=toTensor)


In [4]:
train_set, valid_set = torch.utils.data.random_split(train_valid_dataset, [45_000, 5_000])


In [5]:
from torch.utils.data import DataLoader

batch_size = 128
train_loader = DataLoader(train_set, batch_size=batch_size)
valid_loader = DataLoader(valid_set, batch_size = batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [6]:
def use_he_init(module):
    if isinstance(module, nn.Linear):
        nn.init.kaiming_uniform_(module.weight)
        nn.init.zeros_(module.bias)

In [37]:
def build_deep_model(n_hidden, n_neurons, n_inputs, n_outputs):
    layers = [nn.Flatten(), nn.Linear(n_inputs, n_neurons),nn.BatchNorm1d(n_neurons), nn.SiLU()]
    for _ in range(n_hidden - 1):
        layers += [nn.Linear(n_neurons, n_neurons),nn.BatchNorm1d(n_neurons), nn.SiLU()]
    layers += [nn.Linear(n_neurons, n_outputs)]
    model = nn.Sequential(*layers)
    model.apply(use_he_init)

    return model

In [38]:
model = build_deep_model(20, 100, 3*32*32, 10).to(device)
optimizer = torch.optim.NAdam(model.parameters(), lr = 0.125)
loss_fn = nn.CrossEntropyLoss()
accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=10).to(device)

In [39]:
def evaluate_tm(model: nn.Module, data_loader: DataLoader, metric: torchmetrics.Accuracy):
    model.eval()
    metric.reset()
    with torch.inference_mode():
        for X_batch, y_batch in data_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            y_pred = model(X_batch)
            metric.update(y_pred, y_batch)
    return metric.compute()

In [40]:
len(train_loader)

352

In [41]:
import time

def train_model(model: nn.Module, optimizer: torch.optim, loss_fn, metric: torchmetrics.Accuracy,
                train_loader: DataLoader, valid_loader: DataLoader, n_epochs: int, patience: int = 10,
                checkpoint_path: str = None, scheduler = None):
    checkpoint_path = checkpoint_path or "cifar-10.pt"
    history = {"train_losses":[],
               "train_metrics": [],
               "valid_metrics": []}
    best_metric = 0.0
    patience_counter = 0

    for epoch in range(n_epochs):
        total_loss = 0.0
        metric.reset()
        model.train()
        t0 = time.time()
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            y_pred = model(X_batch)
            loss = loss_fn(y_pred, y_batch)
            total_loss += loss.item()
            # print(f"------Batch statistics-----", f"Loss: {loss.item()}", f"Total Loss: {total_loss}")

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            metric.update(y_pred, y_batch)

        train_metric = metric.compute().item()
        valid_metric = evaluate_tm(model=model, data_loader=valid_loader, metric=metric).item()
        if valid_metric > best_metric:
            torch.save(model.state_dict(), checkpoint_path)
            best_metric = valid_metric
            best = " (best)"
            patience_counter = 0
        else:
            patience_counter += 1
            best = ""

        t1 = time.time()
        history["train_losses"].append(total_loss / len(train_loader))
        history["train_metrics"].append(train_metric)
        history["valid_metrics"].append(valid_metric)
        print(f"Epoch: {epoch + 1}/{n_epochs}",
              f"Train Loss: {history["train_losses"][-1]:.5f}",
              f"Train Metrics: {history["train_metrics"][-1]:.4f}",
              f"Valid Metrics: {history["valid_metrics"][-1]:.4f}{best} in {t1 - t0:.1f}s")
        if scheduler is not None:
            scheduler.step()
        if patience_counter >= patience:
            print("Early Stopping")
            break
    
    model.load_state_dict(torch.load(checkpoint_path))
    return history



In [42]:
n_epochs = 100
history = train_model(model, optimizer, loss_fn, accuracy, train_loader,
                      valid_loader, n_epochs)

Epoch: 1/100 Train Loss: 2.13337 Train Metrics: 0.1780 Valid Metrics: 0.1982 (best) in 6.1s


KeyboardInterrupt: 

In [43]:
class Standardize(nn.Module):
    def __init__(self, sample):
        super().__init__()
        flat = torch.flatten(sample, start_dim = 1)
        mean = flat.mean(dim = 0, keepdim=True)
        std = flat.std(dim = 0, keepdim=True)
        self.register_buffer("mean", mean)
        self.register_buffer("std", std)
    
    def forward(self, X):
        return (X - self.mean) / self.std


In [44]:
all_images = torch.stack([img for img,_ in train_set])
standardize = Standardize(all_images)

In [45]:
def use_lecun_init(module):
    if isinstance(module, nn.Linear):
        nn.init.kaiming_normal_(module.weight, mode="fan_in", nonlinearity="linear")
        nn.init.zeros_(module.bias)

In [46]:
def build_deep_model_with_selu(n_hidden, n_neurons, n_inputs, n_outputs):
    layers = [nn.Flatten(), standardize, nn.Linear(n_inputs, n_neurons), nn.SELU()]
    for _ in range(n_hidden - 1):
        layers += [nn.Linear(n_neurons, n_neurons), nn.SELU()]
    layers += [nn.Linear(n_neurons, n_outputs)]
    model = nn.Sequential(*layers)
    model.apply(use_lecun_init)
    return model

In [47]:
torch.manual_seed(43)
model = build_deep_model_with_selu(n_hidden=20, n_neurons=100, n_inputs=3*32*32, n_outputs=10).to(device)


In [48]:
optimizer = torch.optim.SGD(model.parameters(), lr = 0.125)
loss_fn = nn.CrossEntropyLoss()
accuracy = torchmetrics.Accuracy(task = "multiclass", num_classes=10).to(device)

In [49]:
X_new, y_new = next(iter(train_loader))
y_pred = model(X_new.to(device))
y_pred

tensor([[ 1.3894,  0.6269,  0.8503,  ..., -0.0957, -1.2816, -1.7623],
        [ 0.3867, -0.7577, -0.0635,  ...,  0.6493, -1.4471, -1.2958],
        [ 0.4182,  0.6508, -1.2004,  ...,  0.5712,  0.1454, -1.2473],
        ...,
        [-0.4236, -1.1303,  1.2127,  ...,  0.7121,  0.3489, -0.5387],
        [-0.4858,  0.2503, -0.1575,  ...,  0.0178, -1.1537,  0.2549],
        [-1.1143, -0.2282,  0.9924,  ...,  1.2536,  0.7671,  0.7227]],
       device='cuda:0', grad_fn=<AddmmBackward0>)

In [50]:
y_new = y_new.to(device)

In [51]:
loss = loss_fn(y_pred, y_new)
loss.item()

2.716379165649414

In [52]:
len(train_loader)

352

In [53]:
n_epochs = 100
history = train_model(model, optimizer, loss_fn, accuracy, train_loader,
                      valid_loader, n_epochs)

Epoch: 1/100 Train Loss: nan Train Metrics: 0.1004 Valid Metrics: 0.1014 (best) in 5.1s


KeyboardInterrupt: 

In [None]:
def build_deep_model_with_alpha_dropout(n_hidden, n_neurons, n_inputs, n_outputs, dropout_rate):
    layers = [nn.Flatten(), standardize, nn.Linear(n_inputs, n_neurons), 
              nn.SELU(), nn.AlphaDropout(dropout_rate)]
    for _ in range(n_hidden - 1):
        layers += [nn.Linear(n_neurons, n_neurons), 
                   nn.SELU(), nn.AlphaDropout(dropout_rate)]
    layers += [nn.Linear(n_neurons, n_outputs)]
    model = nn.Sequential(*layers)
    model.apply(use_lecun_init)
    return model

In [55]:
torch.manual_seed(42)

model = build_deep_model_with_alpha_dropout(n_hidden=20, n_neurons=100, n_inputs=3*32*32, n_outputs=10, dropout_rate=0.1).to(device)
optimizer = torch.optim.NAdam(model.parameters(), lr = 1e-3)
loss_fn = nn.CrossEntropyLoss()
accuracy = torchmetrics.Accuracy(task = "multiclass", num_classes=10).to(device)
n_epochs = 100
history = train_model(model, optimizer, loss_fn, accuracy, train_loader, valid_loader, n_epochs)


Epoch: 1/100 Train Loss: 2.15371 Train Metrics: 0.1957 Valid Metrics: 0.2046 (best) in 5.5s
Epoch: 2/100 Train Loss: 1.96613 Train Metrics: 0.2520 Valid Metrics: 0.2162 (best) in 5.7s
Epoch: 3/100 Train Loss: 1.88170 Train Metrics: 0.2818 Valid Metrics: 0.3136 (best) in 4.9s
Epoch: 4/100 Train Loss: 1.82748 Train Metrics: 0.3151 Valid Metrics: 0.3332 (best) in 5.0s
Epoch: 5/100 Train Loss: 1.78960 Train Metrics: 0.3313 Valid Metrics: 0.3328 in 5.2s
Epoch: 6/100 Train Loss: 1.73935 Train Metrics: 0.3548 Valid Metrics: 0.3602 (best) in 4.8s
Epoch: 7/100 Train Loss: 1.70990 Train Metrics: 0.3702 Valid Metrics: 0.3586 in 4.7s
Epoch: 8/100 Train Loss: 1.67694 Train Metrics: 0.3869 Valid Metrics: 0.3558 in 4.9s
Epoch: 9/100 Train Loss: 1.65503 Train Metrics: 0.3947 Valid Metrics: 0.3862 (best) in 5.2s
Epoch: 10/100 Train Loss: 1.63925 Train Metrics: 0.4026 Valid Metrics: 0.3782 in 5.5s
Epoch: 11/100 Train Loss: 1.61455 Train Metrics: 0.4130 Valid Metrics: 0.3974 (best) in 6.2s
Epoch: 12/100 

In [57]:
torch.manual_seed(42)

n_epochs = 60
model = build_deep_model_with_alpha_dropout(n_hidden=20, n_neurons=100, n_inputs=3*32*32, n_outputs=10, dropout_rate=0.1).to(device)
optimizer = torch.optim.NAdam(model.parameters(), lr = 1e-3)
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr = 1e-2, epochs=n_epochs, steps_per_epoch=len(train_loader))
loss_fn = nn.CrossEntropyLoss()
accuracy = torchmetrics.Accuracy(task = "multiclass", num_classes=10).to(device)

history = train_model(model, optimizer, loss_fn, accuracy, train_loader,
                      valid_loader, n_epochs, patience=20, scheduler=scheduler)

Epoch: 1/60 Train Loss: 2.26607 Train Metrics: 0.1722 Valid Metrics: 0.2160 (best) in 5.3s
Epoch: 2/60 Train Loss: 2.01805 Train Metrics: 0.2334 Valid Metrics: 0.2520 (best) in 5.1s
Epoch: 3/60 Train Loss: 1.95107 Train Metrics: 0.2530 Valid Metrics: 0.2716 (best) in 5.4s
Epoch: 4/60 Train Loss: 1.91026 Train Metrics: 0.2703 Valid Metrics: 0.2926 (best) in 5.0s
Epoch: 5/60 Train Loss: 1.87044 Train Metrics: 0.2857 Valid Metrics: 0.3044 (best) in 4.9s
Epoch: 6/60 Train Loss: 1.83257 Train Metrics: 0.3069 Valid Metrics: 0.3232 (best) in 4.8s
Epoch: 7/60 Train Loss: 1.80264 Train Metrics: 0.3244 Valid Metrics: 0.3440 (best) in 4.8s
Epoch: 8/60 Train Loss: 1.76939 Train Metrics: 0.3379 Valid Metrics: 0.3254 in 4.8s
Epoch: 9/60 Train Loss: 1.73349 Train Metrics: 0.3514 Valid Metrics: 0.3600 (best) in 4.9s
Epoch: 10/60 Train Loss: 1.70743 Train Metrics: 0.3666 Valid Metrics: 0.3422 in 5.2s
Epoch: 11/60 Train Loss: 1.67766 Train Metrics: 0.3800 Valid Metrics: 0.3676 (best) in 5.4s
Epoch: 12/6