In [None]:
import set_path
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import cv2
from PIL import Image

In [None]:
# Locations of the annotation CSV files, one file per dataset split
main_annotations_dir = "data/annotations"
train_annotations_path = "/".join((main_annotations_dir, "annotations_train.csv"))
val_annotations_path = "/".join((main_annotations_dir, "annotations_val.csv"))
test_annotations_path = "/".join((main_annotations_dir, "annotations_test.csv"))

In [None]:
# Read the annotation CSV of every split into its own dataframe
train_annotations, val_annotations, test_annotations = map(
    pd.read_csv,
    (train_annotations_path, val_annotations_path, test_annotations_path),
)

# Report (rows, columns) per split as a quick sanity check
print(f"Train annotations: {train_annotations.shape}")
print(f"Val annotations: {val_annotations.shape}")
print(f"Test annotations: {test_annotations.shape}")

In [None]:
# Give all three splits the same descriptive column names.
# NOTE(review): assumes every CSV has exactly these 8 columns in this order,
# and that its first line is a header row rather than data — confirm against the files.
columns = ["image_name", "x1", "y1", "x2", "y2", "class", "image_width", "image_height"]
for split_annotations in (train_annotations, val_annotations, test_annotations):
    split_annotations.columns = columns

In [None]:
# Preview the first rows of the training annotations with the renamed columns
train_annotations.head()

In [None]:
# Combine the three splits into a single dataframe.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it every
# split keeps its own row labels, so the same label appears once per split and
# label-based lookups (.loc) on the combined frame become ambiguous.
all_annotations = pd.concat(
    [train_annotations, val_annotations, test_annotations],
    ignore_index=True,
)
print(f"All annotations: {all_annotations.shape}")

## Utility functions

In [None]:
def print_statistics(data, title):
    """Print the minimum, maximum, mean and standard deviation of `data` on one line.

    Args:
        data: Object exposing `min()`, `max()`, `mean()` and `std()` methods
            (for example a pandas Series).
        title: Text placed at the start of the printed line.
    """
    lowest, highest = data.min(), data.max()
    average, spread = data.mean(), data.std()
    print(f"{title} | Min: {lowest} | Max: {highest} | Mean: {average} | Std: {spread}")

def plot_histogram(data, title, x_label, y_label, num_bins=100, use_log_scale=True):
    """Plot a histogram of `data` and print its summary statistics.

    Args:
        data: One-dimensional numeric values (for example a pandas Series).
            It must also offer the min/max/mean/std methods that
            `print_statistics` calls.
        title: Figure title; also the prefix of the printed statistics line.
        x_label: Label for the x-axis. When `use_log_scale` is True,
            " (log)" is appended because the plotted values are natural
            logarithms of the data, not the data itself.
        y_label: Label for the y-axis.
        num_bins: Number of histogram bins.
        use_log_scale: If True, plot the natural logarithm of the values
            (easier to read for data spanning several orders of magnitude);
            otherwise plot the raw values.

    The printed statistics always describe the raw, untransformed `data`.
    """
    if use_log_scale:
        values = np.asarray(data, dtype=float)
        # log() is finite only for positive, finite inputs. Anything else turns
        # into -inf/inf/nan, which makes plt.hist fail on a non-finite range.
        plottable = values[np.isfinite(values) & (values > 0)]
        num_skipped = values.size - plottable.size
        if num_skipped:
            print(f"{title}: skipped {num_skipped} non-positive or non-finite value(s) in the log-scale plot")
        plot_values = np.log(plottable)
        x_label = f"{x_label} (log)"
    else:
        # The logarithm is only computed when it is actually plotted
        plot_values = data

    plt.hist(plot_values, bins=num_bins)
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.show()
    print_statistics(data, title)

## Finding the areas of all of the bounding boxes

In [None]:
# Derive each box's width, height and area from its corner coordinates
all_annotations["bbox_width"] = all_annotations["x2"].sub(all_annotations["x1"])
all_annotations["bbox_height"] = all_annotations["y2"].sub(all_annotations["y1"])
all_annotations["bbox_area"] = all_annotations["bbox_width"].mul(all_annotations["bbox_height"])

In [None]:
# Preview the combined annotations, now including the bbox width/height/area columns
all_annotations.head()

In [None]:
# Distribution of bounding box areas (plot_histogram uses a log scale by default)
plot_histogram(
    data=all_annotations["bbox_area"],
    title="Bounding Box Area Histogram",
    x_label="Area",
    y_label="Frequency",
)

- On the log scale used for the histogram, the distribution of bounding box areas shows no sign of heavy skew.

## Finding the aspect ratios of the images and each bounding box

In [None]:
# Aspect ratio = width / height, for the whole image and for each bounding box
all_annotations["image_aspect_ratio"] = all_annotations["image_width"].div(all_annotations["image_height"])
all_annotations["bbox_aspect_ratio"] = all_annotations["bbox_width"].div(all_annotations["bbox_height"])

In [None]:
# Preview the combined annotations, now including the two aspect-ratio columns
all_annotations.head()

In [None]:
# Distribution of image aspect ratios (plot_histogram uses a log scale by default)
plot_histogram(
    data=all_annotations["image_aspect_ratio"],
    title="Image Aspect Ratio Histogram",
    x_label="Aspect Ratio",
    y_label="Frequency",
)

In [None]:
# Distribution of bounding box aspect ratios (plot_histogram uses a log scale by default)
plot_histogram(
    data=all_annotations["bbox_aspect_ratio"],
    title="Bounding Box Aspect Ratio Histogram",
    x_label="Aspect Ratio",
    y_label="Frequency",
)

- On the log scale used for the histogram, the distribution of bounding box aspect ratios shows no sign of heavy skew.

## Inspecting class frequencies

In [None]:
# Count how many annotations carry each value of the "class" column
class_value_counts = all_annotations["class"].value_counts()

In [None]:
# Show the per-class annotation counts computed in the previous cell
print(class_value_counts)

- The dataset contains only a single class, so class imbalance is not a concern.

## More utility functions (for image visualisation)

In [None]:
def load_image(image_path):
    """Load an image file and return its pixels as an RGB NumPy array.

    Args:
        image_path: Path of the image file to open.

    Returns:
        np.ndarray: Pixel data of the image after conversion to RGB mode.
    """
    # The context manager releases the underlying file handle as soon as the
    # block exits, instead of leaving that to garbage collection. The array is
    # built inside the block, so it does not depend on the closed file.
    with Image.open(image_path) as image:
        return np.array(image.convert("RGB"))

def get_bboxes(image_name, annotations):
    """Return the bounding boxes annotated for a single image.

    Args:
        image_name: Value to match against the "image_name" column.
        annotations: DataFrame with "image_name", "x1", "y1", "x2" and "y2" columns.

    Returns:
        A 2-D array with one [x1, y1, x2, y2] row per matching annotation
        (zero rows when the image has no annotations).
    """
    corner_columns = ["x1", "y1", "x2", "y2"]
    belongs_to_image = annotations["image_name"] == image_name
    return annotations.loc[belongs_to_image, corner_columns].values

def visualise_image(image, bboxes):
    """Display an image with its bounding boxes drawn as green rectangles.

    Args:
        image: Image as a NumPy array in RGB channel order (as returned by
            `load_image`). The array itself is left unmodified.
        bboxes: Iterable of (x_min, y_min, x_max, y_max) boxes in pixel
            coordinates.
    """
    # cv2.rectangle draws directly into the array it receives, so work on a
    # copy to keep the caller's image free of rectangles.
    annotated = image.copy()
    for bbox in bboxes:
        # OpenCV expects plain integer pixel coordinates; round so that float
        # annotations are drawn at the nearest pixel.
        x_min, y_min, x_max, y_max = (int(round(float(value))) for value in bbox)
        cv2.rectangle(annotated, (x_min, y_min), (x_max, y_max), (0, 255, 0), 2)
    plt.figure(figsize=(10, 10))
    plt.imshow(annotated)
    plt.axis("off")
    plt.show()

## Visualisation of images with their corresponding bounding boxes

In [None]:
# Sample random images.
# Keep one row per image first: all_annotations has one row per bounding box,
# so sampling it directly could pick the same image several times.
unique_images = all_annotations.drop_duplicates(subset="image_name")
# Cap the sample size so small datasets do not raise, and fix the seed so that
# Restart & Run All shows the same images every time.
random_images = unique_images.sample(n=min(10, len(unique_images)), random_state=42)

In [None]:
# Show each sampled image with all of its annotated boxes overlaid
for image_name in random_images["image_name"]:
    image_path = f"data/images/{image_name}"
    image = load_image(image_path)
    # Look the boxes up in the full table so every box of the image is drawn
    bboxes = get_bboxes(image_name, all_annotations)
    visualise_image(image, bboxes)