In [84]:
import torch

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
print("HIP version:", torch.version.hip)
print("Device count:", torch.cuda.device_count())
# Asking for the name of device 0 raises when no GPU is present, which would
# defeat the CPU fallback chosen above -- only query it when one exists.
if torch.cuda.is_available():
    print("Device 0:", torch.cuda.get_device_name(0))
else:
    print("Device 0: none (running on CPU)")

Using device: cuda
HIP version: 6.2.41133-dd7f95766
Device count: 1
Device 0: AMD Radeon RX 6800


In [None]:
from PIL import Image
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import random_split, DataLoader
import torchvision
import torchvision.transforms as transforms
from torchvision.models import resnet18
from torch.utils.tensorboard import SummaryWriter
from torch.cuda.amp import GradScaler, autocast
from pytorch_grad_cam import GradCAM
from pytorch_grad_cam.utils.model_targets import ClassifierOutputTarget
from pytorch_grad_cam.utils.image import show_cam_on_image
import cv2
import numpy as np
import matplotlib.pyplot as plt

In [86]:
# Training-time pipeline: AutoAugment (CIFAR10 policy), padded random crop and
# horizontal flip on the PIL image, then tensor conversion and per-channel
# normalisation.
_train_mean = (0.4914, 0.4822, 0.4465)
_train_std = (0.2023, 0.1994, 0.2010)
_train_steps = [
    transforms.AutoAugment(transforms.AutoAugmentPolicy.CIFAR10),
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(_train_mean, _train_std),
]
transform_train = transforms.Compose(_train_steps)

# Evaluation-time pipeline: no augmentation, only tensor conversion and the
# same normalisation constants the training pipeline uses.
_eval_mean = (0.4914, 0.4822, 0.4465)
_eval_std = (0.2023, 0.1994, 0.2010)
transform_test = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize(_eval_mean, _eval_std)]
)

In [87]:
# Loader settings shared by the train / validation / test loaders.
BATCH_SIZE = 128
NUM_WORKERS = 16

train_data = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_train)
# The official CIFAR-10 test set; it is split below into validation and test halves.
heldout_data = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_test)

val_size = 5000
test_size = len(heldout_data) - val_size

# Use a dedicated, seeded generator so the validation/test partition is the
# same on every run. Without it the split depends on the global RNG state, so
# after a kernel restart (e.g. when reloading a checkpoint) images that were
# used for model selection could end up in the "test" split.
split_generator = torch.Generator().manual_seed(42)
val_data, test_data = random_split(heldout_data, [val_size, test_size], generator=split_generator)

train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS, pin_memory=True)
val_loader = DataLoader(val_data, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True)
test_loader = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS, pin_memory=True)

print(f'Training dataset: {len(train_loader.dataset)}')
print(f'Validation dataset: {len(val_loader.dataset)}')
print(f'Test dataset: {len(test_loader.dataset)}')


Files already downloaded and verified
Files already downloaded and verified
Training dataset: 50000
Validation dataset: 5000
Test dataset: 5000


In [88]:
# Pull the first (image, label) pair from the training set for inspection.
# train_data was built with transform_train, so the image comes back transformed.
image, label = train_data[0]

In [89]:
# Show the dimensions of the sampled image tensor.
image.shape

torch.Size([3, 32, 32])

In [90]:
# Short human-readable class labels.
# NOTE(review): presumably ordered by CIFAR-10 label index -- verify against train_data.classes.
class_names = [
    'plane',
    'car',
    'bird',
    'cat',
    'deer',
    'dog',
    'frog',
    'horse',
    'ship',
    'truck',
]

In [91]:
class MyResnet(nn.Module):
    """ResNet-18 with a modified stem and classification head.

    Starting from an untrained torchvision ``resnet18``:

    * the first convolution is replaced by a 3x3, stride-1, padding-1 conv
      (3 -> 64 channels, no bias),
    * the stem max-pool is replaced by ``nn.Identity``,
    * the final fully connected layer is replaced by ``Linear(512, num_classes)``.

    NOTE(review): presumably these changes are meant to suit the 32x32 inputs
    used in this notebook -- confirm.

    Args:
        num_classes: Number of output logits produced by the final layer.
    """

    def __init__(self, num_classes=10):
        super().__init__()
        resnet = resnet18(weights=None)
        resnet.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        resnet.maxpool = nn.Identity()
        resnet.fc = nn.Linear(512, num_classes)
        self.backbone = resnet

    def forward(self, x):
        """Run ``x`` through the adapted backbone and return its output."""
        logits = self.backbone(x)
        return logits


In [92]:
# Build the model and move its parameters onto the selected device in one step.
net = MyResnet().to(device)

In [None]:
# Loss, mixed-precision scaler, optimiser and LR schedule for the training loop.
loss_function = nn.CrossEntropyLoss()
# GradScaler takes the device *type string* ("cuda"/"cpu"), not a torch.device
# object. Gradient scaling is only needed for float16 autocast on the GPU, so
# it is switched off when running on the CPU.
scaler = torch.amp.GradScaler(device.type, enabled=(device.type == "cuda"))

num_of_epochs = 75

optimizer = optim.SGD(net.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)

# One cosine half-period over the whole run (the scheduler is stepped once per epoch).
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_of_epochs)

# NOTE(review): this seed is set after the model has been initialised and after
# the dataset cell has run, so it only affects randomness from here on
# (e.g. shuffling / augmentation). Move it to the top of the notebook if
# reproducible weight initialisation is wanted.
torch.manual_seed(42)

<torch._C.Generator at 0x7b06db798b10>

In [94]:
# TensorBoard writer created with default settings; the training loop logs its
# per-epoch loss, accuracy and learning-rate scalars through it.
writer = SummaryWriter()

In [95]:
def load_checkpoint(model, optimizer, filename="trained_model.pth", scheduler=None, scaler=None):
    """Restore training state from a checkpoint file and return its epoch.

    The checkpoint is expected to be a dict with the keys ``model_state``,
    ``optimizer_state``, ``scheduler_state``, ``scaler_state`` and ``epoch``.

    Args:
        model: Module whose ``load_state_dict`` receives ``model_state``.
        optimizer: Optimizer whose ``load_state_dict`` receives ``optimizer_state``.
        filename: Path of the checkpoint file to read.
        scheduler: LR scheduler to restore. When omitted, the notebook-level
            ``scheduler`` variable is used if it exists.
        scaler: Gradient scaler to restore. When omitted, the notebook-level
            ``scaler`` variable is used if it exists.

    Returns:
        The value stored under ``epoch`` in the checkpoint.
    """
    # Keep older call sites working: they relied on the module-level objects.
    if scheduler is None:
        scheduler = globals().get("scheduler")
    if scaler is None:
        scaler = globals().get("scaler")

    # Load tensors straight onto the device the model lives on, so a checkpoint
    # written on the GPU can also be restored on a CPU-only machine.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else "cpu"

    # weights_only=True restricts unpickling to tensors and plain containers,
    # which is all a checkpoint of this layout contains.
    checkpoint = torch.load(filename, map_location=target_device, weights_only=True)

    model.load_state_dict(checkpoint['model_state'])
    optimizer.load_state_dict(checkpoint['optimizer_state'])
    if scheduler is not None:
        scheduler.load_state_dict(checkpoint['scheduler_state'])
    if scaler is not None:
        scaler.load_state_dict(checkpoint['scaler_state'])

    return checkpoint['epoch']

In [96]:
#checkpoint = load_checkpoint(net, optimizer, filename="trained_model.pth")
#start_epoch = checkpoint + 1

In [97]:
# Train from the first epoch. When resuming, derive this from the epoch
# returned by load_checkpoint(...) + 1 instead.
start_epoch = 0

In [98]:

# Train from start_epoch to num_of_epochs. After every epoch the model is
# evaluated on the validation loader, metrics are written to TensorBoard, and a
# checkpoint is saved whenever validation accuracy reaches a new best.
highest_val_accuracy = 0.0

# Follow the selected device instead of hard-coding 'cuda': float16 autocast is
# only enabled when actually running on the GPU.
use_amp = device.type == "cuda"

for epoch in range(start_epoch, num_of_epochs):
    # ---- training pass ----
    net.train()

    running_loss = 0.0
    for inputs, labels in train_loader:
        inputs = inputs.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)

        optimizer.zero_grad(set_to_none=True)

        with torch.amp.autocast(device_type=device.type, dtype=torch.float16, enabled=use_amp):
            outputs = net(inputs)
            loss = loss_function(outputs, labels)

        # Scale the loss before backward, then let the scaler drive the
        # optimiser step and update its own scale factor.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        running_loss += loss.item()

    # Mean of the per-batch losses.
    avg_train_loss = running_loss / len(train_loader)

    # ---- validation pass ----
    net.eval()
    val_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = inputs.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True)
            outputs = net(inputs)
            loss = loss_function(outputs, labels)
            val_loss += loss.item()

            # Predicted class = index of the largest logit.
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    avg_val_loss = val_loss / len(val_loader)
    val_accuracy = 100 * correct / total

    # Remember the learning rate this epoch was trained with, then advance the
    # schedule *before* checkpointing. Saving first would store a scheduler and
    # optimiser that are one step behind, so resuming at `epoch + 1` would
    # repeat this epoch's learning rate and shift the rest of the schedule.
    epoch_lr = optimizer.param_groups[0]["lr"]
    scheduler.step()

    if val_accuracy > highest_val_accuracy:
        highest_val_accuracy = val_accuracy

        checkpoint = {
            "epoch": epoch,
            "model_state": net.state_dict(),
            "optimizer_state": optimizer.state_dict(),
            "scheduler_state": scheduler.state_dict(),
            "scaler_state": scaler.state_dict(),
            "config": {
                "conv1_kernel": 3,
                "remove_maxpool": True,
                "num_classes": 10
            },
            "best_val_accuracy": highest_val_accuracy
        }

        torch.save(checkpoint, "trained_model.pth")

    # TensorBoard steps are 1-based epoch numbers.
    global_step = epoch + 1
    writer.add_scalar("Loss/Train", avg_train_loss, global_step)
    writer.add_scalar("Loss/Val", avg_val_loss, global_step)
    writer.add_scalar("Accuracy/Val", val_accuracy, global_step)
    writer.add_scalar("Params/LR", epoch_lr, global_step)

    print(
        f'Epoch ({epoch + 1}/{num_of_epochs}), '
        f'Train Loss: {avg_train_loss:.4f}, '
        f'Val Loss: {avg_val_loss:.4f}, '
        f'Val Accuracy: {val_accuracy:.2f}%'
        )

writer.flush()

print(f'Highest Validation Accuracy: {highest_val_accuracy:.2f}%')

Epoch (1/75),Train Loss: 2.3244, Val Loss: 1.8535, Val Accuracy: 31.80%
Epoch (2/75),Train Loss: 1.9036, Val Loss: 1.6257, Val Accuracy: 39.92%
Epoch (3/75),Train Loss: 1.6696, Val Loss: 1.4399, Val Accuracy: 50.22%
Epoch (4/75),Train Loss: 1.4397, Val Loss: 1.1981, Val Accuracy: 57.12%
Epoch (5/75),Train Loss: 1.2285, Val Loss: 1.0191, Val Accuracy: 64.86%
Epoch (6/75),Train Loss: 1.0570, Val Loss: 0.7792, Val Accuracy: 72.20%
Epoch (7/75),Train Loss: 0.9294, Val Loss: 1.0551, Val Accuracy: 65.18%
Epoch (8/75),Train Loss: 0.8530, Val Loss: 1.3172, Val Accuracy: 62.84%
Epoch (9/75),Train Loss: 0.8050, Val Loss: 0.6980, Val Accuracy: 75.78%
Epoch (10/75),Train Loss: 0.7652, Val Loss: 0.6643, Val Accuracy: 78.00%
Epoch (11/75),Train Loss: 0.7449, Val Loss: 0.6992, Val Accuracy: 76.60%
Epoch (12/75),Train Loss: 0.7200, Val Loss: 0.5830, Val Accuracy: 79.66%
Epoch (13/75),Train Loss: 0.6998, Val Loss: 0.5458, Val Accuracy: 81.52%
Epoch (14/75),Train Loss: 0.6783, Val Loss: 0.7411, Val Accu