# Google Scraped Image Dataset

From this [Kaggle dataset](https://www.kaggle.com/datasets/duttadebadri/image-classification), the problem is clear: we need to develop an image classification model that classifies travel and photography images into four categories:

1. Architecture

2. Art and Culture

3. Food and Drinks

4. Travel and Adventure

The goal is to train a robust model that can accurately classify images into these categories while maintaining efficiency in training and deployment.

## Kaggle Setup

In [None]:
# Install the Kaggle command-line client used below to download the dataset.
!pip install kaggle

Before downloading the dataset from Kaggle, we need to set up our Kaggle API key: generate and download it from https://www.kaggle.com/settings, then put the `kaggle.json` file in the current directory (the same directory as this notebook).

In [None]:
import os

# Point the Kaggle client at the notebook's working directory, where
# kaggle.json was placed.
os.environ["KAGGLE_CONFIG_DIR"] = os.getcwd()

## Download the Dataset

In [None]:
# Download the dataset archive from Kaggle into the current directory.
!kaggle datasets download -d duttadebadri/image-classification

In [None]:
# Extract the archive quietly (-q) into ./image-classification-dataset.
!unzip -q image-classification.zip -d image-classification-dataset

## Rename the Misnamed Class Folder

In [None]:
import os

old_folder_name = "./image-classification-dataset/images/images/food and d rinks"
new_folder_name = "./image-classification-dataset/images/images/food"

# Give the awkwardly named class folder a clean name. The rename is guarded so
# that re-running this cell after it already succeeded is a no-op instead of a
# FileNotFoundError.
if os.path.isdir(old_folder_name):
  os.rename(old_folder_name, new_folder_name)
elif not os.path.isdir(new_folder_name):
  # Neither name exists: the dataset has not been downloaded/extracted yet.
  raise FileNotFoundError(
    f"Neither '{old_folder_name}' nor '{new_folder_name}' exists; "
    "download and unzip the dataset first."
  )

## Data Exploration and Analysis

### Data Distribution Across Classes

In [None]:
import os
import matplotlib.pyplot as plt

# Root of the training split; every sub-folder is one class
dataset_path = "./image-classification-dataset/images/images"

# Class names are the sub-folder names, sorted alphabetically
class_names = sorted(
  entry for entry in os.listdir(dataset_path)
  if os.path.isdir(os.path.join(dataset_path, entry))
)

# Number of directory entries inside each class folder
class_counts = dict(
  (name, len(os.listdir(os.path.join(dataset_path, name))))
  for name in class_names
)

# Bar chart of the per-class counts
plt.figure(figsize=(6, 4))
plt.bar(class_counts.keys(), class_counts.values(), color="skyblue")
plt.title("[Train] Data Distribution Across Classes", fontsize=14)
plt.xlabel("Class Name", fontsize=12)
plt.ylabel("Number of Images", fontsize=12)
plt.xticks(rotation=45, ha="right")
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()

Looks like we have a nearly balanced data distribution across classes, so a simple metric like accuracy can be used as our main metric in this case.

In [None]:
import os
import matplotlib.pyplot as plt

# Root of the validation split; every sub-folder is one class
dataset_path = "./image-classification-dataset/validation/validation"

# Class names are the sub-folder names, sorted alphabetically
class_names = sorted(
  entry for entry in os.listdir(dataset_path)
  if os.path.isdir(os.path.join(dataset_path, entry))
)

# Number of directory entries inside each class folder
class_counts = dict(
  (name, len(os.listdir(os.path.join(dataset_path, name))))
  for name in class_names
)

# Bar chart of the per-class counts
plt.figure(figsize=(6, 4))
plt.bar(class_counts.keys(), class_counts.values(), color="skyblue")
plt.title("[Validation] Data Distribution Across Classes", fontsize=14)
plt.xlabel("Class Name", fontsize=12)
plt.ylabel("Number of Images", fontsize=12)
plt.xticks(rotation=45, ha="right")
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()

However, since we only have a small validation sample, we will check whether accuracy alone is sufficient for this dataset.

### Image Quality Check

Check the image blurriness

In [None]:
import cv2
import numpy as np
import random
import os

def calculate_blurriness(image_path: str) -> "tuple[str, float | None]":
  """Score an image's sharpness with the Laplacian variance method.

  Args:
    image_path: Path of the image file to score.

  Returns:
    ``(image_path, variance)``, where ``variance`` is the variance of the
    Laplacian of the grayscale image, or ``(image_path, None)`` when OpenCV
    cannot read the file.
  """
  image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
  if image is None:
    # cv2.imread reports an unreadable file by returning None
    return image_path, None
  # Compute the Laplacian in float64 so negative responses are not clipped
  laplacian = cv2.Laplacian(image, cv2.CV_64F)
  return image_path, float(laplacian.var())

# Define the image folder
dataset_directory = "./image-classification-dataset/images/images"

# Keep only sub-directories (one per class): a stray file in the dataset root
# would otherwise make os.listdir() below raise NotADirectoryError.
image_folders = [
  os.path.join(dataset_directory, image_folder)
  for image_folder in os.listdir(dataset_directory)
  if os.path.isdir(os.path.join(dataset_directory, image_folder))
]

# Get only 10% image samples for each class
image_path_samples = []
for image_folder in image_folders:
  image_paths = [
    os.path.join(image_folder, f)
    for f in os.listdir(image_folder)
    if f.lower().endswith(('.jpg', '.jpeg', '.png'))
  ]

  # 10% of the class, rounded down (0 for classes with fewer than 10 images)
  sample_size = int(0.1 * len(image_paths))

  # Randomly sample 10% of images, without replacement
  image_path_samples += random.sample(image_paths, sample_size)

# Check the images' blurriness: list of (path, score-or-None) pairs
blurriness_results = [
  calculate_blurriness(img_path) for img_path in image_path_samples
]

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Scores of the images that could be read (unreadable ones carry None)
blurriness_scores = [
  result[1] for result in blurriness_results if result[1] is not None
]

# Histogram of the scores with the blurriness threshold marked at 100
plt.figure(figsize=(6, 4))
plt.hist(
  blurriness_scores,
  bins=50,
  color='skyblue',
  edgecolor='black',
  alpha=0.7,
)
plt.axvline(
  x=100, color='red', linestyle='dashed', label="Blurriness Threshold"
)
plt.title("Blurriness Distribution of Images", fontsize=14)
plt.xlabel("Blurriness Score (Laplacian Variance)", fontsize=12)
plt.ylabel("Number of Images", fontsize=12)
plt.legend()
plt.grid(axis='y', linestyle="--", alpha=0.7)
plt.show()

Now, I would like to see the distribution for these blurriness categories:
- Very Blurry: 0 - 50  
- Blurry: 50 - 100  
- Slightly Blurry: 100 - 500  
- Sharp: 500 - 2000  
- Very Sharp: 2,000 - 15,000  
- Extreme (15,000+): > 15,000


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# (exclusive upper bound, label) of every bounded category, in ascending order
blur_bins = (
  (50, "Very Blurry (0-50)"),
  (100, "Blurry (50-100)"),
  (500, "Slightly Blurry (100-500)"),
  (2000, "Sharp (500-2000)"),
  (15000, "Very Sharp (2000-15000)"),
)
overflow_label = "Extreme (15K+)"

# One counter per category, in display order
categories = {label: 0 for _, label in blur_bins}
categories[overflow_label] = 0

for _, score in blurriness_results:
  if score is None:
    continue
  # First bin whose upper bound exceeds the score; otherwise the open-ended one
  label = next(
    (name for bound, name in blur_bins if score < bound), overflow_label
  )
  categories[label] += 1

# Plot bar
plt.figure(figsize=(6, 4))
plt.bar(
  categories.keys(),
  categories.values(),
  color=["red", "orange", "yellow", "green", "blue", "purple"],
)
plt.title("Image Blurriness Distribution", fontsize=14)
plt.xlabel("Blurriness Type", fontsize=12)
plt.ylabel("Number of Images", fontsize=12)
plt.xticks(rotation=45, ha="right")
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()

Looks like we have a lot of sharp images and a small number of blurry ones. Most of the images can be used for the modeling phase. Now, let's randomly sample and plot images from each blurriness category.

In [None]:
import cv2
import matplotlib.pyplot as plt
import random

def get_sample_images(blurriness_results, num_samples=1):
  """Pick one random image path for each blurriness category.

  Args:
    blurriness_results: Iterable of ``(image_path, score)`` pairs; pairs whose
      score is ``None`` are ignored.
    num_samples: Accepted but not used; exactly one image is drawn per
      category.

  Returns:
    Dict mapping every category label to a randomly chosen image path, or to
    ``None`` when no image fell into that category.
  """
  # (exclusive upper bound, label) of every bounded category, ascending
  bounded = (
    (50, "Very Blurry (0-50)"),
    (100, "Blurry (50-100)"),
    (500, "Slightly Blurry (100-500)"),
    (2000, "Sharp (500-2000)"),
    (15000, "Very Sharp (2000-15000)"),
  )
  overflow = "Extreme (15K+)"

  buckets = {label: [] for _, label in bounded}
  buckets[overflow] = []

  for path, score in blurriness_results:
    if score is None:
      continue
    for upper, label in bounded:
      if score < upper:
        buckets[label].append(path)
        break
    else:
      # No bounded category matched: the score is in the open-ended range
      buckets[overflow].append(path)

  picks = {}
  for label, paths in buckets.items():
    picks[label] = random.choice(paths) if paths else None
  return picks

sample_images = get_sample_images(blurriness_results)

# One panel per category on a 2x3 grid; a category without images keeps its
# slot empty
plt.figure(figsize=(8, 6))

for position, (category, img_path) in enumerate(sample_images.items(), start=1):
  if img_path is not None:
    # OpenCV loads BGR; matplotlib expects RGB
    rgb = cv2.cvtColor(cv2.imread(img_path), cv2.COLOR_BGR2RGB)

    plt.subplot(2, 3, position)
    plt.imshow(rgb)
    plt.title(category, fontsize=10)
    plt.axis("off")

plt.tight_layout()
plt.show()

Check for overexposed and underexposed images

In [None]:
import cv2
import numpy as np

def compute_brightness(image_path):
  """Return the mean grayscale pixel value of an image.

  Returns ``None`` when OpenCV cannot read the file.
  """
  gray = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
  return None if gray is None else np.mean(gray)

# (path, brightness) for every sampled image that could be read
exposure_results = []
for img_path, _ in blurriness_results:
  brightness = compute_brightness(img_path)
  if brightness is None:
    continue
  exposure_results.append((img_path, brightness))

In [None]:
import matplotlib.pyplot as plt

# Image paths grouped by exposure category
categories = {
  label: []
  for label in ("Underexposed (<50)", "Normal (50-200)", "Overexposed (>200)")
}

for img_path, brightness in exposure_results:
  if brightness < 50:
    label = "Underexposed (<50)"
  elif brightness > 200:
    label = "Overexposed (>200)"
  else:
    label = "Normal (50-200)"
  categories[label].append(img_path)

brightness_scores = [pair[1] for pair in exposure_results]

# Histogram of mean brightness with both exposure thresholds marked
plt.figure(figsize=(10, 6))
plt.hist(
  brightness_scores,
  bins=50,
  color='skyblue',
  edgecolor='black',
  alpha=0.7,
)
for threshold, threshold_label in (
  (50, "Underexposed Threshold"),
  (200, "Overexposed Threshold"),
):
  plt.axvline(
    x=threshold, color='red', linestyle='dashed', label=threshold_label
  )
plt.title("Image Exposure Distribution", fontsize=14)
plt.xlabel("Brightness Level (Mean Pixel Value)", fontsize=12)
plt.ylabel("Number of Images", fontsize=12)
plt.legend()
plt.grid(axis='y', linestyle="--", alpha=0.7)
plt.show()

We have some underexposed images; let's visualize a sample from each exposure category.

In [None]:
import random

# One random example per exposure category (None for an empty category)
sample_images = {}
for category, paths in categories.items():
  sample_images[category] = random.choice(paths) if paths else None

# One panel per category, side by side; empty categories keep a blank slot
plt.figure(figsize=(10, 4))

for position, (category, img_path) in enumerate(sample_images.items(), start=1):
  if img_path is not None:
    # OpenCV loads BGR; matplotlib expects RGB
    rgb = cv2.cvtColor(cv2.imread(img_path), cv2.COLOR_BGR2RGB)

    plt.subplot(1, 3, position)
    plt.imshow(rgb)
    plt.title(category, fontsize=10)
    plt.axis("off")

plt.tight_layout()
plt.show()

For these under- and overexposed images, we can fix them by applying image processing techniques. But for now, let's see how the model handles this data. Later, we will correct these images and evaluate the significance of the improvement made by addressing this issue.

### Check the Image Size

In [None]:
import cv2

def get_image_size(image_path):
  """Return the image's ``(width, height)``, or ``None`` if it cannot be read."""
  img = cv2.imread(image_path)
  if img is None:
    return None
  # The array shape starts with (rows, columns), i.e. (height, width)
  height, width = img.shape[:2]
  return width, height

# Measure every image in the blurriness sample, dropping the unreadable ones
sampled_images = []
for img_path, _ in blurriness_results:
  size = get_image_size(img_path)
  if size is not None:
    sampled_images.append((img_path, size))

# Separate width and height lists for the histograms
widths = [width for _, (width, _height) in sampled_images]
heights = [height for _, (_width, height) in sampled_images]

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 6))

# (values, bar colour, x-axis label, title) for the left and right panels
panels = (
  (widths, 'blue', "Image Width", "Distribution of Image Widths"),
  (heights, 'green', "Image Height", "Distribution of Image Heights"),
)

for position, (values, bar_color, x_label, panel_title) in enumerate(panels, start=1):
  plt.subplot(1, 2, position)
  plt.hist(values, bins=50, color=bar_color, alpha=0.7, edgecolor='black')
  plt.xlabel(x_label)
  plt.ylabel("Number of Images")
  plt.title(panel_title)
  plt.grid(axis='y', linestyle="--", alpha=0.7)

plt.tight_layout()
plt.show()

In [None]:
from collections import Counter

def _pixel_area(entry):
  """Return width * height for a ``(path, (width, height))`` entry."""
  width, height = entry[1]
  return width * height

# How often each (width, height) occurs among the sampled images
size_counts = Counter(entry[1] for entry in sampled_images)

# The ten most frequent sizes
print("Most common image sizes:")
for size, count in size_counts.most_common(10):
  print(f"{size}: {count} images")

# Entries with the smallest and largest pixel area
smallest_img_path, smallest_img_size = min(sampled_images, key=_pixel_area)
largest_img_path, largest_img_size = max(sampled_images, key=_pixel_area)

print(f"\nSmallest image size: {smallest_img_size}")
print(f"Largest image size: {largest_img_size}")

In [None]:
def display_image(image_path, title):
  """Draw the image on the current matplotlib axes with the given title.

  Prints a message and draws nothing when OpenCV cannot read the file.
  """
  img = cv2.imread(image_path)
  if img is None:
    print(f"Could not load image: {image_path}")
    return

  # OpenCV loads BGR; matplotlib expects RGB
  plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
  plt.title(title)
  plt.axis("off")

# Smallest image on the left, largest on the right
extremes = (
  (smallest_img_path, f"Smallest Image: {smallest_img_size}"),
  (largest_img_path, f"Largest Image: {largest_img_size}"),
)

plt.figure(figsize=(6, 4))
for position, (path, panel_title) in enumerate(extremes, start=1):
  plt.subplot(1, 2, position)
  display_image(path, panel_title)
plt.tight_layout()
plt.show()

From the sampled images, we can see that most images are small to medium in size (ranging from 128x128 to 512x512). This suggests that a deeper model may not provide a significant improvement, while its added complexity would increase training and deployment costs. A small to medium-sized model should be sufficient.

## Data Pre-processing

The only data preprocessing I applied is standardizing the image size to 512x512. Reducing the size to 128x128 may lead to a significant loss of details and limitations in accuracy.

In [None]:
import os
import cv2
import numpy as np
from tqdm import tqdm

# Target size
TARGET_SIZE = 512

def resize_with_padding(image, target_size):
  """Resize an image onto a square canvas while keeping its aspect ratio.

  The longer side is scaled to ``target_size`` and the result is centred on a
  black ``target_size`` x ``target_size`` canvas.

  Args:
    image: 3-channel image array (rows, columns, channels), e.g. as returned
      by ``cv2.imread``.
    target_size: Side length, in pixels, of the square output.

  Returns:
    ``uint8`` array of shape ``(target_size, target_size, 3)``.
  """
  h, w = image.shape[:2]

  # Scale so that the longer side becomes target_size
  scale = target_size / max(h, w)
  # int() truncates, so the short side of an extremely elongated image could
  # become 0 pixels, which cv2.resize rejects; keep at least one pixel.
  new_w = max(1, int(w * scale))
  new_h = max(1, int(h * scale))

  # Resize image
  resized = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_AREA)

  # Black canvas that receives the resized image
  padded = np.zeros((target_size, target_size, 3), dtype=np.uint8)

  # Offsets that centre the resized image on the canvas
  top = (target_size - new_h) // 2
  left = (target_size - new_w) // 2

  # Place resized image on canvas
  padded[top:top+new_h, left:left+new_w] = resized

  return padded

def preprocess_data(input_folder, output_folder, target_size=TARGET_SIZE):
  """Resize every image of a class-per-folder dataset to a square size.

  Each sub-directory of ``input_folder`` is mirrored under ``output_folder``;
  every ``.jpg``/``.jpeg``/``.png`` file in it is resized with
  ``resize_with_padding`` and written under the same file name.

  Args:
    input_folder: Directory whose sub-directories hold the source images.
    output_folder: Directory that receives the resized images (created if
      missing).
    target_size: Side length, in pixels, of the square output images.
  """
  # Ensure output directory exists
  os.makedirs(output_folder, exist_ok=True)

  for image_dir in os.listdir(input_folder):
    class_input_dir = os.path.join(input_folder, image_dir)
    # Skip stray files in the dataset root: os.listdir() on them would raise
    if not os.path.isdir(class_input_dir):
      continue

    # Create output directory
    output_dir = os.path.join(output_folder, image_dir)
    os.makedirs(output_dir, exist_ok=True)

    for filename in tqdm(
      os.listdir(class_input_dir),
      desc=f"Processing {image_dir}"
    ):
      if not filename.lower().endswith(('.jpg', '.jpeg', '.png')):
        continue

      image = cv2.imread(os.path.join(class_input_dir, filename))
      if image is None:
        print(f"Skipping unreadable image: {filename}")
        continue

      # Save the processed image; cv2.imwrite returns False on failure
      # instead of raising, so report it rather than lose the image silently
      output_path = os.path.join(output_dir, filename)
      resized_image = resize_with_padding(image, target_size=target_size)
      if not cv2.imwrite(output_path, resized_image):
        print(f"Failed to write image: {output_path}")

  print("Resizing complete! Images saved in:", output_folder)

Process Training images

In [None]:
# Source (extracted training split) and destination (resized copies) folders
input_folder = "./image-classification-dataset/images/images"
output_folder = "./processed-dataset/images"

# Resize every training image and mirror the class folders under output_folder
preprocess_data(input_folder=input_folder, output_folder=output_folder)

Process Validation images

In [None]:
# Source (extracted validation split) and destination (resized copies) folders
input_folder = "./image-classification-dataset/validation/validation"
output_folder = "./processed-dataset/validation"

# Resize every validation image and mirror the class folders under output_folder
preprocess_data(input_folder=input_folder, output_folder=output_folder)

In [None]:
import matplotlib.pyplot as plt
import random

output_folder = "./processed-dataset/images"

# Paths of all processed images, class folder by class folder
processed_images = []
for image_dir in os.listdir(output_folder):
  class_dir = os.path.join(output_folder, image_dir)
  if not os.path.isdir(class_dir):
    continue
  for f in os.listdir(class_dir):
    if f.lower().endswith(('.jpg', '.jpeg', '.png')):
      processed_images.append(os.path.join(output_folder, image_dir, f))

# Up to five random images for a visual check
num_samples = min(5, len(processed_images))
sampled_images = random.sample(processed_images, num_samples)

# One panel per sampled image, titled with its file name
plt.figure(figsize=(15, 5))

for position, img_path in enumerate(sampled_images, start=1):
  # OpenCV loads BGR; matplotlib expects RGB
  rgb = cv2.cvtColor(cv2.imread(img_path), cv2.COLOR_BGR2RGB)

  plt.subplot(1, num_samples, position)
  plt.imshow(rgb)
  plt.title(f"{os.path.basename(img_path)}")
  plt.axis("off")

plt.tight_layout()
plt.show()


For now, I believe our data preprocessing is sufficient. The model should be able to handle slightly blurry images as well as under and overexposed images. Next, I will proceed with the modeling phase. If some models, including deeper ones, struggle to achieve good performance, then refining the data will be the next step.

## Modelling

Looking back at the problem, the goal is to classify travel and photography pictures. Having a small yet powerful model is a significant advantage. One model I want to try is MobileNetV4 -- an efficient, lightweight model that offers high accuracy, fast training, and low deployment costs.

### Vanilla Model -- MobilenetV4 Small, No Data Augmentation

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import timm
import matplotlib.pyplot as plt

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pre-processed (padded, resized) training and validation splits
train_dir = "./processed-dataset/images"
val_dir = "./processed-dataset/validation"

# Baseline transform, shared by both splits: resize to 224x224, convert to a
# tensor and normalize each channel (no augmentation)
transform = transforms.Compose([
  transforms.Resize((224, 224)),
  transforms.ToTensor(),
  transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# ImageFolder derives each image's label from its class sub-folder
train_dataset = datasets.ImageFolder(root=train_dir, transform=transform)
val_dataset = datasets.ImageFolder(root=val_dir, transform=transform)

# Training batches are shuffled each epoch; validation keeps a fixed order
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=4)

# The number of classes sizes the model's classification head
num_classes = len(train_dataset.classes)
print(f"Number of classes: {num_classes}")

In [None]:
# MobileNetV4 Small from timm, pretrained, with a num_classes-way classifier
model = timm.create_model(
  "mobilenetv4_conv_small.e2400_r224_in1k",
  pretrained=True,
  num_classes=num_classes,
)
model.to(device)

# Cross-entropy loss optimised with Adam
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Per-epoch history
train_losses, train_accuracies = [], []
val_losses, val_accuracies = [], []

num_epochs = 10
best_val_acc = 0.0

for epoch in range(num_epochs):
  print(f"\n[Epoch {epoch+1}/{num_epochs}]")

  # -------------------- Training --------------------
  model.train()
  train_loss, train_correct, train_total = 0.0, 0, 0

  for inputs, labels in train_loader:
    inputs = inputs.to(device)
    labels = labels.to(device)

    optimizer.zero_grad()
    outputs = model(inputs)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()

    train_loss += loss.item()
    predicted = outputs.argmax(dim=1)
    train_total += labels.size(0)
    train_correct += (predicted == labels).sum().item()

  # Loss is averaged over batches, accuracy over samples
  train_epoch_loss = train_loss / len(train_loader)
  train_epoch_acc = train_correct / train_total
  train_losses.append(train_epoch_loss)
  train_accuracies.append(train_epoch_acc)

  print(f"Train Loss: {train_epoch_loss:.4f}, Accuracy: {train_epoch_acc:.4f}")

  # -------------------- Validation --------------------
  model.eval()
  val_loss, val_correct, val_total = 0.0, 0, 0

  with torch.no_grad():
    for inputs, labels in val_loader:
      inputs = inputs.to(device)
      labels = labels.to(device)

      outputs = model(inputs)
      loss = criterion(outputs, labels)

      val_loss += loss.item()
      predicted = outputs.argmax(dim=1)
      val_total += labels.size(0)
      val_correct += (predicted == labels).sum().item()

  val_epoch_loss = val_loss / len(val_loader)
  val_epoch_acc = val_correct / val_total
  val_losses.append(val_epoch_loss)
  val_accuracies.append(val_epoch_acc)

  print(f"Validation Loss: {val_epoch_loss:.4f}, Accuracy: {val_epoch_acc:.4f}")

  # Checkpoint whenever validation accuracy improves
  if val_epoch_acc > best_val_acc:
    best_val_acc = val_epoch_acc
    torch.save(model.state_dict(), "mobilenetv4_best.pth")
    print("Best model saved!")

print("\nTraining complete!")

# Start the cross-experiment results list with this run's history
training_results = [{
  "model_name": "mobilenetv4_conv_small.e2400_r224_in1k",
  "train_losses": train_losses,
  "train_accuracies": train_accuracies,
  "val_losses": val_losses,
  "val_accuracies": val_accuracies
}]

### With Data Augmentation

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import timm
import matplotlib.pyplot as plt
from PIL import Image

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pre-processed (padded, resized) training and validation splits
train_dir = "./processed-dataset/images"
val_dir = "./processed-dataset/validation"

# -------------------- Data Augmentation --------------------
# Training-only transform: random crop/rotation/flip, then tensor + normalize
train_transform_aug = transforms.Compose([
  transforms.RandomResizedCrop(224, scale=(0.8, 1.0)),  # Random crop of 80-100% of the area, resized to 224x224
  transforms.RandomRotation(20),  # Rotate ±20 degrees
  transforms.RandomHorizontalFlip(0.5),  # 50% chance to flip
  transforms.ToTensor(),
  transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Validation transform: plain resize + normalize, no augmentation
val_transform = transforms.Compose([
  transforms.Resize((224, 224)),
  transforms.ToTensor(),
  transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# ImageFolder derives each image's label from its class sub-folder
train_dataset_aug = datasets.ImageFolder(root=train_dir, transform=train_transform_aug)
val_dataset = datasets.ImageFolder(root=val_dir, transform=val_transform)

# train_loader_aug is the loader that serves the augmented training images
train_loader_aug = DataLoader(train_dataset_aug, batch_size=32, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=4)

# The number of classes sizes the model's classification head
num_classes = len(train_dataset_aug.classes)

# Load a fresh MobileNetV4 Small from timm for this experiment
model = timm.create_model("mobilenetv4_conv_small.e2400_r224_in1k", pretrained=True, num_classes=num_classes)
model.to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Per-epoch history and best-checkpoint tracking, reset for this experiment
train_losses, val_losses = [], []
train_accuracies, val_accuracies = [], []
best_val_acc = 0.0
num_epochs = 10

for epoch in range(num_epochs):
  print(f"\n[Epoch {epoch+1}/{num_epochs}]")

  # -------------------- Training --------------------
  model.train()
  train_loss = 0.0
  train_correct = 0
  train_total = 0

  # Iterate the augmented loader built for this experiment. `train_loader`
  # is the no-augmentation baseline loader; using it here would silently
  # train without any augmentation.
  for inputs, labels in train_loader_aug:
    inputs, labels = inputs.to(device), labels.to(device)

    optimizer.zero_grad()
    outputs = model(inputs)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()

    train_loss += loss.item()
    _, predicted = torch.max(outputs, 1)
    train_total += labels.size(0)
    train_correct += (predicted == labels).sum().item()

  # Loss is averaged over batches, accuracy over samples
  train_epoch_loss = train_loss / len(train_loader_aug)
  train_epoch_acc = train_correct / train_total
  train_losses.append(train_epoch_loss)
  train_accuracies.append(train_epoch_acc)

  print(f"Train Loss: {train_epoch_loss:.4f}, Accuracy: {train_epoch_acc:.4f}")

  # -------------------- Validation --------------------
  model.eval()
  val_loss = 0.0
  val_correct = 0
  val_total = 0

  with torch.no_grad():
    for inputs, labels in val_loader:
      inputs, labels = inputs.to(device), labels.to(device)

      outputs = model(inputs)
      loss = criterion(outputs, labels)

      val_loss += loss.item()
      _, predicted = torch.max(outputs, 1)
      val_total += labels.size(0)
      val_correct += (predicted == labels).sum().item()

  val_epoch_loss = val_loss / len(val_loader)
  val_epoch_acc = val_correct / val_total
  val_losses.append(val_epoch_loss)
  val_accuracies.append(val_epoch_acc)

  print(f"Validation Loss: {val_epoch_loss:.4f}, Accuracy: {val_epoch_acc:.4f}")

  # Checkpoint whenever validation accuracy improves
  if val_epoch_acc > best_val_acc:
    best_val_acc = val_epoch_acc
    torch.save(model.state_dict(), "mobilenetv4_aug_best.pth")
    print("Best model saved!")

# Record this run's history next to the earlier experiments
training_results.append({
  "model_name": "mobilenetv4_conv_small.e2400_r224_in1k-data_augment",
  "train_losses": train_losses,
  "train_accuracies": train_accuracies,
  "val_losses": val_losses,
  "val_accuracies": val_accuracies
})

### Larger Model -- MobilenetV4 Large

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import timm
import matplotlib.pyplot as plt
from PIL import Image

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pre-processed (padded, resized) training and validation splits
train_dir = "./processed-dataset/images"
val_dir = "./processed-dataset/validation"

# -------------------- Data Augmentation --------------------
# Training-only transform at 384x384: random crop/rotation/flip, then
# tensor + normalize
train_transform_aug = transforms.Compose([
  transforms.RandomResizedCrop(384, scale=(0.8, 1.0)),  # Random crop of 80-100% of the area, resized to 384x384
  transforms.RandomRotation(20),  # Rotate ±20 degrees
  transforms.RandomHorizontalFlip(0.5),  # 50% chance to flip
  transforms.ToTensor(),
  transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Validation transform: plain resize to 384x384 + normalize, no augmentation
val_transform = transforms.Compose([
  transforms.Resize((384, 384)),
  transforms.ToTensor(),
  transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# ImageFolder derives each image's label from its class sub-folder
train_dataset_aug = datasets.ImageFolder(root=train_dir, transform=train_transform_aug)
val_dataset = datasets.ImageFolder(root=val_dir, transform=val_transform)

# train_loader_aug is the loader that serves the augmented 384x384 training images
train_loader_aug = DataLoader(train_dataset_aug, batch_size=32, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=4)

# The number of classes sizes the model's classification head
num_classes = len(train_dataset_aug.classes)

# Load MobileNetV4 Large from timm
model = timm.create_model("mobilenetv4_conv_large.e600_r384_in1k", pretrained=True, num_classes=num_classes)
model.to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Per-epoch history and best-checkpoint tracking, reset for this experiment
train_losses, val_losses = [], []
train_accuracies, val_accuracies = [], []
best_val_acc = 0.0
num_epochs = 10

for epoch in range(num_epochs):
  print(f"\n[Epoch {epoch+1}/{num_epochs}]")

  # -------------------- Training --------------------
  model.train()
  train_loss = 0.0
  train_correct = 0
  train_total = 0

  # Iterate the augmented 384x384 loader built for this experiment.
  # `train_loader` is the baseline loader (224x224, no augmentation); using
  # it here would train this model on the wrong resolution and without
  # augmentation.
  for inputs, labels in train_loader_aug:
    inputs, labels = inputs.to(device), labels.to(device)

    optimizer.zero_grad()
    outputs = model(inputs)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()

    train_loss += loss.item()
    _, predicted = torch.max(outputs, 1)
    train_total += labels.size(0)
    train_correct += (predicted == labels).sum().item()

  # Loss is averaged over batches, accuracy over samples
  train_epoch_loss = train_loss / len(train_loader_aug)
  train_epoch_acc = train_correct / train_total
  train_losses.append(train_epoch_loss)
  train_accuracies.append(train_epoch_acc)

  print(f"Train Loss: {train_epoch_loss:.4f}, Accuracy: {train_epoch_acc:.4f}")

  # -------------------- Validation --------------------
  model.eval()
  val_loss = 0.0
  val_correct = 0
  val_total = 0

  with torch.no_grad():
    for inputs, labels in val_loader:
      inputs, labels = inputs.to(device), labels.to(device)

      outputs = model(inputs)
      loss = criterion(outputs, labels)

      val_loss += loss.item()
      _, predicted = torch.max(outputs, 1)
      val_total += labels.size(0)
      val_correct += (predicted == labels).sum().item()

  val_epoch_loss = val_loss / len(val_loader)
  val_epoch_acc = val_correct / val_total
  val_losses.append(val_epoch_loss)
  val_accuracies.append(val_epoch_acc)

  print(f"Validation Loss: {val_epoch_loss:.4f}, Accuracy: {val_epoch_acc:.4f}")

  # Checkpoint whenever validation accuracy improves
  if val_epoch_acc > best_val_acc:
    best_val_acc = val_epoch_acc
    torch.save(model.state_dict(), "mobilenetv4-large_aug_best.pth")
    print("Best model saved!")

# Record this run's history next to the earlier experiments
training_results.append({
  "model_name": "mobilenetv4_conv_large.e600_r384_in1k-data_augment",
  "train_losses": train_losses,
  "train_accuracies": train_accuracies,
  "val_losses": val_losses,
  "val_accuracies": val_accuracies
})

### Visualize Training Metrics

In [None]:
# X axis: one point per epoch of the first recorded run
epochs = range(1, len(training_results[0]["train_losses"]) + 1)

plt.figure(figsize=(12, 5))

# (train key, validation key, y-axis label, title) for the left/right panels
metric_panels = (
  ("train_losses", "val_losses", "Loss", "Training & Validation Loss Comparison"),
  ("train_accuracies", "val_accuracies", "Accuracy", "Training & Validation Accuracy Comparison"),
)

for position, (train_key, val_key, y_label, panel_title) in enumerate(metric_panels, start=1):
  plt.subplot(1, 2, position)
  # Only the first two recorded runs are compared here
  for res in training_results[:2]:
    plt.plot(epochs, res[train_key], label=f"{res['model_name']} - Train", linestyle="--", marker="o")
    plt.plot(epochs, res[val_key], label=f"{res['model_name']} - Val", linestyle="-", marker="s")
  plt.xlabel("Epoch")
  plt.ylabel(y_label)
  plt.title(panel_title)
  plt.legend()
  plt.grid()

plt.tight_layout()
plt.show()

In [None]:
# For every run, report the metrics of the epoch with the best validation accuracy
for res in training_results:
  best_idx = np.argmax(res.get('val_accuracies'))
  print(f"Result for {res.get('model_name')}")
  for metric_label, metric_key in (
    ("Train accuracy", 'train_accuracies'),
    ("Val accuracy", 'val_accuracies'),
    ("Train loss", 'train_losses'),
    ("Val loss", 'val_losses'),
  ):
    print(f"{metric_label}: {res.get(metric_key)[best_idx]}")
  # Blank line between runs
  print()

## Final Verdict on Image Classification Exercise

A small model (MobileNetV4-small) achieved ~88% validation accuracy with data augmentation. This suggests that data augmentation effectively helps the model learn unseen features from the training data, improving its ability to generalize and accurately classify images in the validation set.
The larger model requires more training time to achieve good results. It seems that 10 epochs are not sufficient, and additional training epochs may be needed for better convergence and performance.

**Improvement Possibility**
- **Model-Centric Approach**: Training the model for more epochs could help it converge better. Additionally, experimenting with a deeper model might improve accuracy. However, to prevent overfitting, techniques such as dropout, weight regularization, or data augmentation should be applied. Fine-tuning a pre-trained model on our dataset can also be an effective way to boost performance while maintaining efficiency.  

- **Data-Centric Approach**: Improving image quality can significantly enhance model performance. This can be done by omitting excessively blurry images, correcting under- and overexposed images through image processing techniques, and ensuring a more balanced and diverse dataset. Applying advanced augmentation techniques, such as adaptive histogram equalization or noise reduction, can also help the model learn more robust features.

## Run on Test Data

In [None]:
import cv2
import numpy as np

# Target size
TARGET_SIZE = 512

def resize_with_padding(image, target_size):
  """Resize an image onto a square canvas while keeping its aspect ratio.

  The longer side is scaled to ``target_size`` and the result is centred on a
  black ``target_size`` x ``target_size`` canvas.

  Args:
    image: 3-channel image array (rows, columns, channels), e.g. as returned
      by ``cv2.imread``.
    target_size: Side length, in pixels, of the square output.

  Returns:
    ``uint8`` array of shape ``(target_size, target_size, 3)``.
  """
  h, w = image.shape[:2]

  # Scale so that the longer side becomes target_size
  scale = target_size / max(h, w)
  # int() truncates, so the short side of an extremely elongated image could
  # become 0 pixels, which cv2.resize rejects; keep at least one pixel.
  new_w = max(1, int(w * scale))
  new_h = max(1, int(h * scale))

  # Resize image
  resized = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_AREA)

  # Black canvas that receives the resized image
  padded = np.zeros((target_size, target_size, 3), dtype=np.uint8)

  # Offsets that centre the resized image on the canvas
  top = (target_size - new_h) // 2
  left = (target_size - new_w) // 2

  # Place resized image on canvas
  padded[top:top+new_h, left:left+new_w] = resized

  return padded


In [None]:
import os
import pandas as pd
import torch
from torchvision import transforms
import timm

# Pad/resize to the 224x224 network input, then convert to a normalized tensor
transform = transforms.Compose([
  transforms.Lambda(lambda img: resize_with_padding(img, target_size=224)),
  transforms.ToTensor(),
  transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Index -> label. The model was trained on an ImageFolder dataset, which
# numbers classes in alphabetical order of their folder names, so the
# architecture folder ("arch...") comes before "art and culture".
# NOTE(review): confirm against train_dataset.class_to_idx.
classes = {
  0: "Architecture",
  1: "Art & Culture",
  2: "Food and Drinks",
  3: "Travel and Adventure"
}

# Test images, in file-name order
image_dir = "./image-classification-dataset/test/test/classify"
image_paths = sorted(
  os.path.join(image_dir, f)
  for f in os.listdir(image_dir)
  if f.lower().endswith(('.png', '.jpg', '.jpeg'))
)

# All weights come from the checkpoint, so there is no need to download the
# ImageNet-pretrained ones first.
model = timm.create_model("mobilenetv4_conv_small.e2400_r224_in1k", pretrained=False, num_classes=4)
model.load_state_dict(torch.load("mobilenetv4_best.pth", map_location=torch.device('cpu')))
model.eval()

prediction_results = {
  "filename": [],
  "predicted": [],
  "confidence": []
}

# Process and run inference
for img_path in image_paths:
  image = cv2.imread(img_path)
  if image is None:
    # cv2.imread returns None for unreadable files; cvtColor would crash
    print(f"Skipping unreadable image: {img_path}")
    continue
  # OpenCV loads BGR; the training pipeline fed RGB images
  image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
  input_tensor = transform(image).unsqueeze(0)  # add the batch dimension

  with torch.no_grad():
    outputs = model(input_tensor)
    probabilities = torch.softmax(outputs, dim=1)
    conf, predicted = torch.max(probabilities, 1)

  prediction_results["filename"].append(os.path.basename(img_path))
  prediction_results["predicted"].append(classes[predicted.item()])
  prediction_results["confidence"].append(conf.item()*100)  # percent

df = pd.DataFrame(prediction_results)
df.to_csv("image_classification_prediction.csv", index=False)