In [None]:
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
import torchvision.models as models
import torch.optim as optim

# Preprocessing pipeline, applied in the listed order: each image is first
# converted to a tensor, then every channel is normalized with a mean of 0.5
# and a standard deviation of 0.5.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
])

# CIFAR-10 training split (downloaded under ./data), preprocessed with the
# pipeline above and served in shuffled mini-batches of four images.
trainset = torchvision.datasets.CIFAR10(
    root='./data', train=True, download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(
    trainset, batch_size=4, shuffle=True, num_workers=0)

# CIFAR-10 test split, prepared the same way but iterated in a fixed order.
testset = torchvision.datasets.CIFAR10(
    root='./data', train=False, download=True, transform=transform)
testloader = torch.utils.data.DataLoader(
    testset, batch_size=4, shuffle=False, num_workers=0)

# Human-readable class names; position i in this tuple names the class
# whose integer label in the dataset is i.
classes = (
    'plane', 'car', 'bird', 'cat', 'deer',
    'dog', 'frog', 'horse', 'ship', 'truck',
)

# Start from a ResNet-50 initialised with pretrained weights.
net = models.resnet50(pretrained=True)

# Freeze the whole pretrained network: none of its existing weights should
# change, only the classification head that is attached below.
for weight in net.parameters():
    weight.requires_grad_(False)

# Attach a fresh fully connected head with one output per CIFAR-10 class.
# A newly constructed layer has trainable parameters, so this is the only
# part of the model that will be learned.
num_ftrs = net.fc.in_features
net.fc = nn.Linear(in_features=num_ftrs, out_features=10)

# Cross-entropy measures how far the predictions are from the true labels;
# SGD with momentum updates the parameters it is given, which here are
# only those of the new head.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.fc.parameters(), lr=0.001, momentum=0.9)

# Run on the first CUDA GPU when one is available, otherwise on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Place the model on the selected device.
net.to(device)

# Train for five full passes over the training set.
for epoch in range(5):

    # Loss accumulated since the last progress report.
    running_loss = 0.0
    for batch_no, data in enumerate(trainloader, start=1):
        # Each batch is an [inputs, labels] pair; put both on the same
        # device as the model.
        inputs, labels = (tensor.to(device) for tensor in data)

        # Forward pass, then the loss of the predictions against the
        # ground-truth labels.
        outputs = net(inputs)
        loss = criterion(outputs, labels)

        # Gradients accumulate across backward passes by default, so the
        # ones left from the previous step are cleared before computing
        # new ones and applying the parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Report the mean loss once every 2000 mini-batches, then restart
        # the accumulator.
        running_loss += loss.item()
        if batch_no % 2000 == 0:
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, batch_no, running_loss / 2000))
            running_loss = 0.0

print('Finished Training')

# After training, measure how well the model performs on unseen data by
# computing its classification accuracy over the test loader.

# Switch to evaluation mode before inference. ResNet-50 contains batch
# normalization layers: in training mode they normalize with the statistics
# of the current mini-batch (only four images here) and keep updating their
# running statistics, so test data would alter the model and the predictions
# would depend on batch composition. In evaluation mode the stored running
# statistics are used and left untouched.
net.eval()

correct = 0
total = 0
# Gradients are not needed for testing, so gradient tracking is disabled.
with torch.no_grad():
    for data in testloader:
        # Each batch is an [inputs, labels] pair; move both to the device
        # the model lives on.
        images, labels = data[0].to(device), data[1].to(device)
        # Forward pass: one score per class for every image in the batch.
        outputs = net(images)
        # The predicted class is the index of the highest score.
        predicted = outputs.argmax(dim=1)
        # Count the images seen and how many were classified correctly.
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print('Accuracy of the network on the 10000 test images: %d %%' % (
    100 * correct / total))

# Collect the network's output vectors for the training images in a
# dictionary whose keys are integer class labels and whose values are
# NumPy arrays.
# NOTE(review): because the key is the class label, every new sample
# replaces the vector stored earlier for the same class. After the loop the
# dictionary therefore holds at most one vector per class -- the last one
# encountered, which varies between runs since the training loader shuffles.
# The stored vectors are the outputs of the complete network (the scores of
# the replaced `fc` layer), not activations of an earlier layer. Confirm
# that both points are intended.
vectors = {}

# Run the extraction on the CPU so the outputs can be handed to NumPy
# directly; NumPy cannot wrap CUDA tensors.
net = net.to('cpu')

# Use evaluation mode for inference. In training mode the batch
# normalization layers would normalize each image using the statistics of
# its four-image mini-batch and would keep updating their running
# statistics, making the extracted vectors depend on batch composition and
# silently modifying the model.
net.eval()

# Gradients are not needed for extraction, so gradient tracking is disabled.
with torch.no_grad():
    for data in trainloader:
        # Each batch is an [inputs, labels] pair, already on the CPU.
        images, labels = data
        # Forward pass: one output vector per image in the batch.
        outputs = net(images)
        # Store each image's vector under its label (overwriting any
        # previous entry for that label, see the note above).
        for label, vector in zip(labels, outputs):
            vectors[label.item()] = vector.numpy()
