In [1]:
import torch
import torchvision
import torchvision.models as models
import torchvision.transforms as transforms
import torch.backends.cudnn as cudnn


import pickle
import os
from tqdm import tqdm

from torchinfo import summary


In [2]:
model = models.resnet18(weights=None)
num_classes = 10
model.fc = torch.nn.Linear(model.fc.in_features, num_classes)

num_params=sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Number of trainable parameters: {num_params:,}")



Number of trainable parameters: 11,181,642


In [5]:
# transform_train = transforms.Compose([
#     transforms.RandomHorizontalFlip(),
#     transforms.RandomCrop(32, padding=4),
#     transforms.ToTensor(),
#     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))  # Normalizing the images
# ])

# transform_test = transforms.Compose([
#     transforms.ToTensor(),
#     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
# ])



def calculate_mean_std(dataset):
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=64, shuffle=False, num_workers=2)
    mean_sum = torch.zeros(3)
    std_sum = torch.zeros(3)
    num_batches = 0

    for images, _ in dataloader:
        num_batches += 1
        for i in range(3):
            mean_sum[i] += images[:,i,:,:].mean()
            std_sum[i] += images[:,i,:,:].std()

    mean = mean_sum / num_batches
    std = std_sum / num_batches

    return mean, std


# Path for the dataset and the mean/std file
data_path = '../data'
mean_std_file = os.path.join(data_path, 'cifar10_mean_std.pkl')


# Check if mean/std file exists, calculate if not
if not os.path.exists(mean_std_file):
    # Load CIFAR-10 without normalization
    trainset_raw = torchvision.datasets.CIFAR10(root=data_path, train=True, download=True, transform=transforms.ToTensor())
    mean, std = calculate_mean_std(trainset_raw)
    with open(mean_std_file, 'wb') as f:
        pickle.dump((mean, std), f)
    print("Mean and Std Dev calculated and saved.")
else:
    with open(mean_std_file, 'rb') as f:
        mean, std = pickle.load(f)
    print("Mean and Std Dev loaded from file.")

print(f"Mean: {mean}, Std: {std}")

# Use the calculated/loaded mean and std for normalization
transform_train = transforms.Compose([
    transforms.RandomAffine(degrees=20, translate=(0.1, 0.1), scale=(0.8, 1.2), shear=10),
    transforms.RandomHorizontalFlip(),
    transforms.RandomCrop(32, padding=4),
    transforms.ToTensor(),
#     transforms.ConvertImageDtype(torch.float32),
    transforms.RandomErasing(p=0.5, scale=(0.02, 0.33), ratio=(0.3, 3.3), value=0),
    transforms.Normalize(mean, std)
])
# transform_train = transforms.Compose([
#     transforms.RandomCrop(32, padding=4),
#     transforms.RandomHorizontalFlip(),
#     transforms.ToTensor(),
#     transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
# ])

transform_test = transforms.Compose([
    transforms.ToTensor(),
#     transforms.ConvertImageDtype(torch.float32),
    transforms.Normalize(mean, std)
])

# transform_test = transforms.Compose([
#     transforms.ToTensor(),
#     transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
# ])


Mean and Std Dev loaded from file.
Mean: tensor([0.4915, 0.4822, 0.4466]), Std: tensor([0.2463, 0.2428, 0.2607])


In [3]:
def get_default_device():
    """Pick GPU if available, else CPU"""
    if torch.cuda.is_available():
        return torch.device('cuda')
    else:
        return torch.device('cpu')
    
def to_device(data, device):
    """Move tensor(s) to chosen device"""
    if isinstance(data, (list,tuple)):
        return [to_device(x, device) for x in data]
    return data.to(device, non_blocking=True)

class DeviceDataLoader():
    """Wrap a dataloader to move data to a device"""
    def __init__(self, dl, device):
        self.dl = dl
        self.device = device
        
    def __iter__(self):
        """Yield a batch of data after moving it to device"""
        for b in self.dl: 
            yield to_device(b, self.device)

    def __len__(self):
        """Number of batches"""
        return len(self.dl)

In [4]:
device = get_default_device()
device


device(type='cpu')

In [6]:
device=torch.device('cuda:2')
device


device(type='cuda', index=2)

In [10]:
# device=torch.device('cuda:2')
# print(f"Using device: {device}")
model.to(device)

batch_size=128
# Load the CIFAR-10 dataset with transforms
trainset = torchvision.datasets.CIFAR10(root='../data', train=True, download=True, transform=transform_train)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=2,pin_memory=True)

testset = torchvision.datasets.CIFAR10(root='../data', train=False, download=True, transform=transform_test)
testloader = torch.utils.data.DataLoader(testset, batch_size=2*batch_size, shuffle=False, num_workers=2,pin_memory=True)

# Define loss function and optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=0.01,weight_decay=5e-4,fused=True,betas=(0.9,0.95))
# optimizer = torch.optim.SGD(model.parameters(), lr=0.001,weight_decay=1e-4,momentum=0.9)

summary(model,input_size=(batch_size, 3, 32, 32))


Files already downloaded and verified
Files already downloaded and verified


Layer (type:depth-idx)                   Output Shape              Param #
ResNet                                   [128, 10]                 --
├─Conv2d: 1-1                            [128, 64, 16, 16]         9,408
├─BatchNorm2d: 1-2                       [128, 64, 16, 16]         128
├─ReLU: 1-3                              [128, 64, 16, 16]         --
├─MaxPool2d: 1-4                         [128, 64, 8, 8]           --
├─Sequential: 1-5                        [128, 64, 8, 8]           --
│    └─BasicBlock: 2-1                   [128, 64, 8, 8]           --
│    │    └─Conv2d: 3-1                  [128, 64, 8, 8]           36,864
│    │    └─BatchNorm2d: 3-2             [128, 64, 8, 8]           128
│    │    └─ReLU: 3-3                    [128, 64, 8, 8]           --
│    │    └─Conv2d: 3-4                  [128, 64, 8, 8]           36,864
│    │    └─BatchNorm2d: 3-5             [128, 64, 8, 8]           128
│    │    └─ReLU: 3-6                    [128, 64, 8, 8]           --
│

In [28]:
trainloader = DeviceDataLoader(trainloader, device)
testloader = DeviceDataLoader(testloader, device)


In [12]:
import time

# print("compiling the model... (takes a ~minute)")
# model = torch.compile(model) # requires PyTorch 2.0

num_epochs = 200  # Set the number of epochs
best_accuracy = 0.0
decay_rate = 10 ** (-2 / num_epochs)
# scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=200)
# scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, decay_rate)
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, 
                                        T_0 = 1563,# Number of iterations for the first restart (Total iterations are epochs*len(train_loader) then divide that by how many times you want a restart)
                                        T_mult = 1, # A factor increases TiTi​ after a restart
                                        eta_min = 1e-5) # Minimum learning rate


    


start_time = time.time()    
for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    running_loss = 0.0


    for i, data in enumerate(trainloader):
        inputs, labels = data[0].to(device), data[1].to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    avg_train_loss = running_loss / len(trainloader)

    # Validation loop
    model.eval()
    correct = 0
    total = 0
    running_val_loss = 0.0

    with torch.no_grad():
        for data in testloader:
            images, labels = data[0].to(device), data[1].to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)
            running_val_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    avg_val_loss = running_val_loss / len(testloader)
    accuracy = 100 * correct / total

    # Save best model
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        torch.save(model.state_dict(), 'best_model.pth')

    # Calculate and format runtime and expected time
    elapsed_time = time.time() - start_time
    expected_time = elapsed_time * num_epochs / (epoch + 1)
    elapsed_str = time.strftime("%H:%M:%S", time.gmtime(elapsed_time))
    expected_str = time.strftime("%H:%M:%S", time.gmtime(expected_time))

    status_message = f"Epoch: {epoch+1}/{num_epochs}\tTrain Loss: {avg_train_loss:.4f}\tTest Loss: {avg_val_loss:.4f}\tAccuracy: {accuracy:.2f}%\tElapsed Time: {elapsed_str}\tExpected Time: {expected_str}"
    print(f"\r{status_message}",end='')

    scheduler.step()  # Step the scheduler

print('Finished Training')


Epoch: 200/200	Train Loss: 0.9813	Test Loss: 0.8190	Accuracy: 72.38%	Elapsed Time: 00:46:36	Expected Time: 00:46:36Finished Training
