# **Dataset Setup**

### **Download / Prepare Dataset**
Use the Countries Flags Dataset (24 countries, ~48.5 MB).  
The dataset is stored on Google Drive and will be downloaded and extracted into Colab, preparing it for loading and preprocessing.


In [None]:
# Install gdown (command-line downloader for Google Drive files); -q keeps pip quiet
!pip install -q gdown

# Download the dataset archive from Google Drive and save it as flags.zip
!gdown https://drive.google.com/uc?id=11jbRhBOS0cofCYv6HaB0IpKI8RzFk40z -O flags.zip

# Extract quietly into the 'images/' folder; later cells read from
# /content/images/Flags, so the archive presumably contains a top-level
# 'Flags' directory -- verify after extraction
!unzip -q flags.zip -d images


### **Load Images & Labels**
Load all flag images from the 'images/Flags' folder into memory and create corresponding labels for each image.  
Display a sample image with its label to verify that the data has loaded correctly.



In [None]:
# Load and test the dataset
from PIL import Image
import matplotlib.pyplot as plt
import os

# Functions to list directories and find all JPG images
def list_files_in_directory(folder_path):
    """Return the names of every entry found directly inside ``folder_path``.

    The result is exactly what ``os.listdir`` reports: bare names (not full
    paths) of both files and sub-directories, in the order the operating
    system returns them.
    """
    entry_names = os.listdir(folder_path)
    return entry_names

def find_jpg_paths(folder_path):
    """Recursively collect the paths of all JPEG files under ``folder_path``.

    Files are matched by extension, case-insensitively, so ``.jpg``, ``.JPG``
    and ``.jpeg`` are all picked up.

    Args:
        folder_path (str): Directory to search, including all sub-directories.

    Returns:
        list[str]: One path per matching file, each built by joining the
        directory it was found in with its file name. The list is empty when
        nothing matches or when ``folder_path`` is not a directory (``os.walk``
        yields nothing in that case).
    """
    jpeg_suffixes = (".jpg", ".jpeg")
    return [
        os.path.join(root, file_name)
        for root, _dirs, files in os.walk(folder_path)
        for file_name in files
        # Lower-case first so the extension check ignores letter case.
        if file_name.lower().endswith(jpeg_suffixes)
    ]

# Load images and create labels
flags = []   # decoded PIL images, aligned index-for-index with `labels`
labels = []  # name of the sub-folder (country) each image was found in

# Folder containing country subfolders
dataset_path = "/content/images/Flags"

for country in list_files_in_directory(dataset_path):
    flag_paths = find_jpg_paths(os.path.join(dataset_path, country))
    for path in flag_paths:
        # Image.open() is lazy and keeps the file handle open until the pixel
        # data is read. Decode immediately and let the context manager release
        # the handle so the loop does not accumulate one open file per image.
        with Image.open(path) as image:
            image.load()
        flags.append(image)
        labels.append(country)

# Display a sample image with its label as a sanity check
if not flags:
    raise RuntimeError(f"No JPG images found under {dataset_path}")
index = min(146, len(flags) - 1)  # example index, clamped to the dataset size
print(f"Label: {labels[index]}")
plt.imshow(flags[index])
plt.axis('off')
plt.show()


### **Transform, Split, and Augment**
Apply image transformations and AutoAugment to the training dataset, and standard transformations to the test dataset.  
Split the dataset into training and testing sets and create DataLoaders to efficiently feed images to the model during training and testing.



In [None]:
# Data transforms, loading, and splitting
import torch
from torchvision import transforms
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader
from torchvision.transforms import AutoAugmentPolicy

def split_dataset(display=False, batch_size=8, root='/content/images/Flags', seed=None):
    """
    Load the Flags dataset, apply transformations, split into train/test sets,
    and return DataLoaders.

    The training subset is augmented with AutoAugment (ImageNet policy); the
    test subset only gets the deterministic resize/normalize pipeline. The two
    subsets are backed by two separate ImageFolder objects over the same
    directory, because a transform set on a shared underlying dataset would
    apply to both subsets at once.

    Args:
        display (bool): If True, prints dataset info.
        batch_size (int): Number of images per batch in DataLoader.
        root (str): Directory containing one sub-folder per class.
        seed (int, optional): Seed for the random 80/20 split. When None, the
            split uses PyTorch's global random state and differs between calls.

    Returns:
        train_loader, test_loader (DataLoader): PyTorch DataLoaders for training and testing.
    """

    # Normalization shared by both pipelines
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    # Deterministic pipeline used for evaluation
    transform_no_augment = transforms.Compose([
        transforms.Resize([224, 224]),
        transforms.ToTensor(),
        normalize,
    ])

    # Randomized pipeline used for training only
    transform_augment = transforms.Compose([
        transforms.AutoAugment(policy=AutoAugmentPolicy.IMAGENET),
        transforms.Resize([224, 224]),
        transforms.ToTensor(),
        normalize,
    ])

    # Two views of the same folder (ImageFolder expects subfolders for each
    # class). Each view carries its own transform, so assigning one cannot
    # overwrite the other.
    train_source = ImageFolder(root=root, transform=transform_augment)
    test_source = ImageFolder(root=root, transform=transform_no_augment)

    # Split into train/test sets (80% train, 20% test) using one shared random
    # permutation so the two subsets never overlap.
    total_images = len(train_source)
    train_size = int(0.8 * total_images)
    generator = None
    if seed is not None:
        generator = torch.Generator().manual_seed(seed)
    shuffled_indices = torch.randperm(total_images, generator=generator).tolist()
    train_dataset = torch.utils.data.Subset(train_source, shuffled_indices[:train_size])
    test_dataset = torch.utils.data.Subset(test_source, shuffled_indices[train_size:])

    # Create DataLoaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Display dataset info if requested
    if display:
        print(f"Total images: {total_images}")
        print(f"Training set size: {len(train_dataset)}")
        print(f"Testing set size: {len(test_dataset)}")
        print(f"Number of classes: {len(train_source.classes)}")
        print(f"Classes: {train_source.classes}")

    return train_loader, test_loader


# Test the function: build both loaders once and print the dataset summary
# (display=True). The loaders are rebuilt later before the full training run.
train_load, test_load = split_dataset(display=True)


# **Neural Network**

### **Network Architecture**
Define a convolutional neural network with two convolutional layers and three fully connected layers.  
Move the network to the GPU if one is available, and verify the model's output shape with a random input.

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    """
    Simple CNN for flag classification.
    Input: 3x224x224 images
    Output: one raw score (logit) per class; 24 classes by default

    Args:
        num_classes (int): Size of the output layer. Defaults to 24, the
            number of countries in the Flags dataset.
    """
    def __init__(self, num_classes=24):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 8, kernel_size=3)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(8, 16, kernel_size=3)
        # Spatial size per side: 224 -> conv 222 -> pool 111 -> conv 109 -> pool 54
        self.fc1 = nn.Linear(16 * 54 * 54, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, num_classes)

    def forward(self, x):
        """Map a batch of images (N, 3, 224, 224) to class scores (N, num_classes)."""
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        # Flatten everything except the batch dimension. Unlike view(-1, ...),
        # this can never fold a wrong-sized input into extra batch rows; a
        # size mismatch surfaces as a shape error in fc1 instead.
        x = x.flatten(1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print("Using device:", device)

# Build the network, then move its parameters onto the chosen device
model = Net()
model = model.to(device)

# Sanity check: push one random 3x224x224 "image" through the network and
# report the shape of what comes out
inputs = torch.randn(1, 3, 224, 224)
inputs = inputs.to(device)
outputs = model(inputs)
print("Output shape:", outputs.size())


### **Training Function**
Define a function to train the convolutional neural network using cross-entropy loss and stochastic gradient descent (SGD).  
Train for a set number of epochs or until a target loss is reached, and display loss statistics after each epoch.

In [None]:
import torch.optim as optim
from torch import nn

def train(model: nn.Module, dataloader: DataLoader, device: torch.device,
          max_epochs: int = 15, target_loss: float = 1,
          lr: float = 0.001, momentum: float = 0.9):
    """
    Train a neural network using cross-entropy loss and SGD.

    Args:
        model (nn.Module): The neural network to train. It is switched to
            training mode and updated in place.
        dataloader (DataLoader): DataLoader yielding (inputs, labels) batches.
        device (torch.device): CPU or GPU device the batches are moved to;
            the model is expected to already live on this device.
        max_epochs (int): Upper bound on the number of epochs. Default 15.
        target_loss (float): Training stops early once the average epoch loss
            is no longer above this value. Default 1.
        lr (float): SGD learning rate. Default 0.001.
        momentum (float): SGD momentum. Default 0.9.

    Raises:
        ValueError: If the dataloader yields no batches.

    Loss statistics are printed after each epoch.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=momentum)

    # Make sure the model is in training mode even if a previous cell called
    # model.eval() on it.
    model.train()

    epoch = 0
    avg_loss = target_loss + 1  # start above the target so the first epoch always runs

    while avg_loss > target_loss and epoch < max_epochs:
        running_loss = 0.0
        num_batches = 0
        for inputs, labels in dataloader:
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            num_batches += 1

        if num_batches == 0:
            # Without this guard the average below would divide by zero.
            raise ValueError("dataloader produced no batches; nothing to train on")

        avg_loss = running_loss / num_batches
        print(f"[Epoch {epoch+1}, Batch {num_batches}] Loss: {avg_loss:.5f}")
        epoch += 1

    print("\nTraining Complete")
    print(f"Final Loss: {avg_loss:.5f}")
    print(f"Total Epochs: {epoch}\n")


### **Testing Function**
Define a function to evaluate the trained network on a test dataset.  
The function computes accuracy as the percentage of correctly predicted labels.  
Optionally, limit the number of samples used for faster testing.

In [None]:
def test(model: nn.Module, dataloader: DataLoader, max_samples=None) -> float:
    """
    Evaluate the model on a dataset and return accuracy (%).

    The model is evaluated in eval mode on the device its parameters live on,
    and its previous train/eval mode is restored afterwards.

    Args:
        model (nn.Module): Trained neural network.
        dataloader (DataLoader): DataLoader yielding (images, labels) batches.
        max_samples (int, optional): Max number of samples to evaluate. None
            (the default) or 0 evaluates every sample. When the limit falls
            inside a batch, only the leading part of that batch is used.

    Returns:
        float: Accuracy in percentage.

    Raises:
        ValueError: If max_samples is negative or no samples were evaluated.
    """
    if max_samples is not None and max_samples < 0:
        raise ValueError("max_samples must be non-negative")

    # Use the device the model actually lives on rather than a notebook-level
    # global, which other cells reassign (e.g. when forcing CPU benchmarks).
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    correct, total = 0, 0
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for images, labels in dataloader:
                if max_samples:
                    # Trim the batch so the sample budget is never exceeded.
                    remaining = max_samples - total
                    images, labels = images[:remaining], labels[:remaining]

                images, labels = images.to(model_device), labels.to(model_device)

                predicted = model(images).argmax(dim=1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

                # Stop as soon as the budget is used up, before the loader
                # prepares another batch.
                if max_samples and total >= max_samples:
                    break
    finally:
        # Leave the model in whatever mode the caller had it in.
        model.train(was_training)

    if total == 0:
        raise ValueError("no samples were evaluated; the dataloader is empty")

    return 100 * correct / total


# The helper's name matches pytest's default "test*" collection pattern; mark
# it as a non-test so that importing it into a test module does not make
# pytest try to run it as a test case.
test.__test__ = False


### **Full Training & Testing**
Resplit the dataset, reinitialize the network, and run training followed by evaluation.  
Prints the final test accuracy of the model.

In [None]:
"""
Run a full training and evaluation cycle.

- Resplits the dataset for a fresh run.
- Reinitializes the network.
- Trains the network using `train()`.
- Evaluates the network using `test()` and prints final accuracy.
"""

# Start from scratch: draw a new random train/test split and build a brand-new
# network with freshly initialized weights
train_load, test_load = split_dataset()
model = Net()
model = model.to(device)

# Fit the network on the training loader
train(model, train_load, device)

# Measure accuracy on the held-out loader and report it
score = test(model, test_load)
print(f'Accuracy of the network on the test images: {score:.2f}%')

# **Evaluate on Different Hardware**

**Install Benchmark Dependencies and Download the Test Image**

In [None]:
# fvcore provides FlopCountAnalysis, used by the benchmark cells to count the
# operations of one forward pass; -q keeps pip quiet
!pip install -q fvcore
import time
import psutil
import subprocess
from fvcore.nn import FlopCountAnalysis
import logging
# Only let ERROR-level messages through from fvcore's JIT analysis logger
logging.getLogger("fvcore.nn.jit_analysis").setLevel(logging.ERROR)

# Download the sample test image from Google Drive to /content/Italy_Flag.png
!gdown https://drive.google.com/uc?id=1jqWHMlXisqgsYqSO5oy2VcpGY3hS6iyQ -O /content/Italy_Flag.png

### **CPU Testing**
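Benchmark inference of the network on a CPU to verify functionality and measure performance.  
The model is re-initialized in this cell, so the measurement reflects the architecture's runtime cost (FLOPs, throughput, CPU load) rather than the accuracy of the trained weights. This allows comparison with GPU results and ensures the model runs on systems without a GPU.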

In [None]:
# Force device to CPU
device = torch.device("cpu")
print("Using device:", device)

# Re-initialize model on CPU
# NOTE: this is a fresh Net with newly initialized weights, not the network
# trained above, so the outputs produced here are not meaningful predictions;
# only the FLOP count and runtime are measured.
model = Net().to(device)
model.eval()  # Evaluation mode (no dropout, batchnorm frozen, etc.)

# Path to test image (upload or place in /content/)
test_image_path = '/content/Italy_Flag.png'  # <-- Change if using a different image
# PNG files may be palette-based or carry an alpha channel; the network and the
# 3-channel Normalize below need plain RGB.
image = Image.open(test_image_path).convert("RGB")

# Apply the same deterministic preprocessing used for the test split
# (no AutoAugment: random augmentation is a training-time technique)
transform = transforms.Compose([
    transforms.Resize([224, 224]),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

# Transform + add batch dimension
image = transform(image).unsqueeze(0).to(device)  # shape: [1, 3, 224, 224]

# Compute FLOPs (per forward pass)
flop = FlopCountAnalysis(model, image)
total_flops = flop.total()

# Run inference multiple times to benchmark runtime
number_of_runs = 1000  # Reduced from 10000 to keep runtime practical
warmup_runs = 10       # untimed passes so one-time setup costs are not measured
with torch.no_grad():
    for _ in range(warmup_runs):
        model(image)

    # Reset psutil's baseline: the next cpu_percent(interval=None) call then
    # reports utilization over the benchmark itself, not over an idle period
    # after it has finished.
    psutil.cpu_percent(interval=None)
    time_before = time.perf_counter()
    for _ in range(number_of_runs):
        outputs = model(image)
    time_after = time.perf_counter()

# Average system-wide CPU utilization since the baseline reset above
cpu_usage = psutil.cpu_percent(interval=None)

# Runtime results
total_time = time_after - time_before
gops = (total_flops * number_of_runs / 1e9) / total_time  # GOPs/sec

# Display results
print(f"Runtime: {total_time:.2f} s")
print(f"GOPs/sec: {gops:.2f}")
print(f"CPU Usage: {cpu_usage}%")


### **GPU Testing**
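Benchmark inference of the network on a GPU (falling back to the CPU when no GPU is available) to measure the speed-up from hardware acceleration.  
As in the CPU test, the model is re-initialized in this cell, so the results describe inference throughput, not training speed or accuracy. This highlights the efficiency gains from hardware acceleration compared to CPU execution.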

In [None]:
# Use the GPU when available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Re-initialize model on the selected device
# NOTE: this is a fresh Net with newly initialized weights, not the network
# trained above; only the FLOP count and runtime are measured here.
model = Net().to(device)
model.eval()  # Evaluation mode disables dropout, batchnorm updates, etc.

# Path to test image (upload or place in /content/)
test_image_path = '/content/Italy_Flag.png'  # <-- Change if using a different image
# PNG files may be palette-based or carry an alpha channel; the network and the
# 3-channel Normalize below need plain RGB.
image = Image.open(test_image_path).convert("RGB")

# Apply the same deterministic preprocessing used for the test split
# (no AutoAugment: random augmentation is a training-time technique)
transform = transforms.Compose([
    transforms.Resize([224, 224]),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

# Transform + add batch dimension for inference
image = transform(image).unsqueeze(0).to(device)  # shape: [1, 3, 224, 224]

# Compute FLOPs (per forward pass)
flop = FlopCountAnalysis(model, image)
total_flops = flop.total()

# Run inference multiple times to benchmark runtime
number_of_runs = 1000  # adjust for longer/shorter test
warmup_runs = 10       # untimed passes so one-time setup costs are not measured
with torch.no_grad():
    for _ in range(warmup_runs):
        model(image)

    # CUDA kernels are launched asynchronously: wait for pending work before
    # starting the clock, and again before stopping it, so the measured time
    # covers the actual GPU execution rather than just the kernel launches.
    if device.type == "cuda":
        torch.cuda.synchronize()
    time_before = time.perf_counter()
    for _ in range(number_of_runs):
        outputs = model(image)
    if device.type == "cuda":
        torch.cuda.synchronize()
    time_after = time.perf_counter()

# Runtime results
total_time = time_after - time_before
gops = (total_flops * number_of_runs / 1e9) / total_time  # GOPs/sec

# Query GPU utilization (%)
if device.type == "cuda":
    gpu_process = subprocess.run(
        ['nvidia-smi',
         '--query-gpu=utilization.gpu',
         '--format=csv,noheader,nounits'],
        stdout=subprocess.PIPE,
        check=True  # fail loudly if nvidia-smi reports an error
    )
    gpu_output = gpu_process.stdout
    # nvidia-smi prints one line per GPU; use the first line (presumably the
    # GPU the model runs on -- verify on multi-GPU machines).
    gpu_usage = float(gpu_output.decode('utf-8').strip().splitlines()[0])
else:
    # No GPU in use: there is nothing to query, so report "not a number"
    gpu_usage = float("nan")

# Helper function to grab extra GPU stats
def get_gpu_info(raw_output=None):
    """
    Return name, memory, temperature and power statistics for each GPU.

    Args:
        raw_output (str, optional): Text previously captured from
            ``nvidia-smi --query-gpu=name,memory.total,memory.used,memory.free,
            temperature.gpu,power.draw --format=csv,noheader,nounits``.
            When None (the default), nvidia-smi is executed to obtain it.

    Returns:
        list[dict]: One dictionary per output line with the keys 'GPU Name',
        'Memory (Total MB)', 'Memory (Used MB)', 'Memory (Free MB)',
        'Temperature (C)' and 'Power Draw (W)'. Integer fields that are not
        numeric (e.g. '[N/A]') become None; a non-numeric power draw becomes
        NaN so that arithmetic on it still works.

    Raises:
        FileNotFoundError: If nvidia-smi has to be run but is not installed.
        subprocess.CalledProcessError: If nvidia-smi exits with an error.
        ValueError: If a line does not contain the six expected fields.
    """
    if raw_output is None:
        result = subprocess.run(
            ['nvidia-smi',
             '--query-gpu=name,memory.total,memory.used,memory.free,temperature.gpu,power.draw',
             '--format=csv,noheader,nounits'],
            stdout=subprocess.PIPE,
            check=True  # do not try to parse the output of a failed query
        )
        raw_output = result.stdout.decode('utf-8')

    def _number_or(text, cast, fallback):
        """Convert ``text`` with ``cast``; return ``fallback`` if it is not numeric."""
        try:
            return cast(text)
        except ValueError:
            return fallback

    gpu_info = []
    for line in raw_output.strip().splitlines():
        if not line.strip():
            continue
        # Split from the right so that only the last five commas act as
        # separators; anything before them is kept as the GPU name.
        info = [field.strip() for field in line.rsplit(',', 5)]
        if len(info) != 6:
            raise ValueError(f"Unexpected nvidia-smi output line: {line!r}")
        gpu_info.append({
            'GPU Name': info[0],
            'Memory (Total MB)': _number_or(info[1], int, None),
            'Memory (Used MB)': _number_or(info[2], int, None),
            'Memory (Free MB)': _number_or(info[3], int, None),
            'Temperature (C)': _number_or(info[4], int, None),
            'Power Draw (W)': _number_or(info[5], float, float("nan"))
        })
    return gpu_info

# Collect the per-GPU statistics and estimate the energy spent on the
# benchmark from the first GPU's reported power draw (Joules = W * s)
gpu_info = get_gpu_info()
power_usage = gpu_info[0]['Power Draw (W)'] * total_time

# Report the benchmark figures followed by the first GPU's statistics
print(f"Runtime: {total_time:.2f} s")
print(f"GOPs/sec: {gops:.2f}")
print(f"GPU Usage: {gpu_usage}%")
print(f"Power Usage Estimate: {power_usage:.2f} J")
print("GPU Info:", gpu_info[0])

