In [1]:
import utils
import numpy as np
from sklearn.model_selection import train_test_split
import cv2

In [2]:
# Load the preprocessed train/test images and labels through the project helper.
(
    train_images,
    train_labels,
    test_images,
    test_labels,
) = utils.load_processed_data('preprocessed_data.pkl')

# Carve a 25% validation set out of the training data; the fixed random_state
# keeps the split reproducible across kernel restarts.
# NOTE(review): the split is not stratified, so rare classes may be unevenly
# represented in the validation set — consider `stratify=train_labels`.
train_images, val_images, train_labels, val_labels = train_test_split(
    train_images,
    train_labels,
    test_size=0.25,
    train_size=0.75,
    random_state=1234,
    shuffle=True,
)


In [3]:

def adjust_brightness_contrast(img, brightness=0.2, contrast=0.2):
    """Randomly adjust the brightness and contrast of an image.

    The result is ``alpha * img + beta`` rounded to the nearest integer and
    saturated to the ``uint8`` range, where the gain ``alpha`` is drawn
    uniformly from ``[1 - contrast, 1 + contrast]`` and the offset ``beta`` is
    drawn uniformly from ``[-brightness * 255, brightness * 255]``.

    Parameters:
    img: Image array; intensities are interpreted on a 0-255 scale.
    brightness: Maximum brightness shift, as a fraction of 255.
    contrast: Maximum relative change of the contrast gain.

    Returns:
    A new ``uint8`` array with the same shape as ``img``.
    """
    alpha = 1.0 + np.random.uniform(-contrast, contrast)  # Contrast control
    beta = np.random.uniform(-brightness * 255, brightness * 255)  # Brightness control

    # Scale and clip explicitly instead of calling cv2.convertScaleAbs: that
    # routine takes the absolute value before saturating, so a dark pixel
    # pushed below zero by a negative beta would come back *brighter* instead
    # of being clamped to black.
    scaled = np.asarray(img, dtype=np.float64) * alpha + beta
    return np.clip(np.rint(scaled), 0, 255).astype(np.uint8)

def add_gaussian_noise(img, mean=0, std=10):
    """Return a copy of ``img`` with additive Gaussian noise.

    Noise is sampled independently per element from a normal distribution
    with the given ``mean`` and ``std``, added in float32, and the sum is
    clamped to [0, 255] before being converted to ``uint8``.
    """
    noise_field = np.random.normal(loc=mean, scale=std, size=img.shape)
    perturbed = np.add(img.astype(np.float32), noise_field.astype(np.float32))
    # Clamp before the cast so out-of-range values saturate rather than wrap.
    clamped = np.clip(perturbed, 0, 255)
    return clamped.astype(np.uint8)

def horizontal_flip(img):
    """Mirror an image left-to-right by reversing its second axis."""
    mirrored = np.fliplr(img)
    return mirrored

def augment_data_for_balance(images, labels, target_counts=None, augmentation_funcs=None):
    """
    Augment under-represented classes with per-image transformations.

    Every original image is kept. For each class, the same number of
    transformations is applied to every image of that class, taken in order
    from ``augmentation_funcs``. Each transformation is applied at most once
    per image, so a class can grow to at most
    ``(1 + len(augmentation_funcs))`` times its original size. Because the
    number of transformations per image is a whole number, a class may end up
    below its target; classes already at or above their target are left
    unchanged.

    Parameters:
    images: Array of images, indexable along its first axis.
    labels: Array of class labels, one per image.
    target_counts: Desired number of examples per class. ``None`` uses the
        size of the largest class; a number applies to every class; a mapping
        gives a target per label (labels missing from it are not augmented).
    augmentation_funcs: Optional sequence of callables ``f(img) -> img``.
        Defaults to horizontal flip, brightness/contrast jitter and Gaussian
        noise, in that order.

    Returns:
    Tuple ``(augmented_images, augmented_labels)`` of arrays, shuffled with a
    fixed seed.
    """
    images = np.asarray(images)
    labels = np.asarray(labels)

    # Count existing examples per class
    unique_labels, counts = np.unique(labels, return_counts=True)

    # Determine target counts per class
    if target_counts is None:
        max_count = int(counts.max()) if counts.size else 0
        target_counts = {label: max_count for label in unique_labels}
    elif isinstance(target_counts, (int, float, np.integer, np.floating)):
        # NumPy scalars (e.g. np.int64) are accepted as well as Python numbers.
        target_count_value = int(target_counts)
        target_counts = {label: target_count_value for label in unique_labels}

    # Define transformations
    if augmentation_funcs is None:
        augmentation_funcs = [
            horizontal_flip,
            adjust_brightness_contrast,
            add_gaussian_noise,
        ]
    else:
        augmentation_funcs = list(augmentation_funcs)
    max_transforms = len(augmentation_funcs)

    augmented_images = []
    augmented_labels = []

    for label, count in zip(unique_labels, counts):
        class_indices = np.where(labels == label)[0]
        current_images = images[class_indices]
        current_count = int(count)

        try:
            target_count = int(target_counts[label])
        except (KeyError, IndexError):
            # No target given for this class: keep it as it is.
            target_count = current_count

        # Number of examples emitted per original image (the original itself
        # plus transforms), capped by the number of available transforms.
        per_image_limit = min(
            (target_count - current_count) // current_count + 1,
            1 + max_transforms,
        )

        for img in current_images:
            # Always include original
            augmented_images.append(img)
            augmented_labels.append(label)

            # Apply transformations up to the per-image limit
            for transform in augmentation_funcs[:max(per_image_limit - 1, 0)]:
                augmented_images.append(transform(img))
                augmented_labels.append(label)

    # Convert to arrays and shuffle
    augmented_images = np.array(augmented_images)
    augmented_labels = np.array(augmented_labels)

    # A private generator yields the same permutation as seeding the global
    # RNG with 0, without resetting the global state for the rest of the
    # notebook.
    shuffle_rng = np.random.RandomState(0)
    indices = shuffle_rng.permutation(len(augmented_labels))

    return augmented_images[indices], augmented_labels[indices]





def print_class_distribution(labels, class_names=None):
    """
    Print how many examples each class has and its share of the dataset.

    Parameters:
    labels: Array of labels
    class_names: Optional dictionary mapping class IDs to display names;
        classes without an entry are shown as "Class <id>"
    """
    classes, tallies = np.unique(labels, return_counts=True)
    n_total = len(labels)

    print("Class distribution:")
    for cls, tally in zip(classes, tallies):
        has_name = class_names is not None and cls in class_names
        display_name = class_names[cls] if has_name else f"Class {cls}"
        share = (tally / n_total) * 100
        print(f"{display_name}: {tally} examples ({share:.1f}%)")

In [4]:
# Augment the training split only, aiming for 2000 examples per class;
# validation and test data are left untouched.
train_images_aug, train_labels_aug = augment_data_for_balance(
    train_images,
    train_labels,
    target_counts=2000,
)

In [5]:
# Class balance of the training split as produced by train_test_split.
print("Before augmentation")
print_class_distribution(train_labels)

# Class balance of the same split after augment_data_for_balance.
print("\nAfter augmentation")
print_class_distribution(train_labels_aug)

Before augmentation
Class distribution:
Class 1: 939 examples (10.2%)
Class 2: 208 examples (2.3%)
Class 3: 535 examples (5.8%)
Class 4: 3624 examples (39.4%)
Class 5: 1504 examples (16.3%)
Class 6: 512 examples (5.6%)
Class 7: 1881 examples (20.4%)

After augmentation
Class distribution:
Class 1: 1878 examples (14.6%)
Class 2: 832 examples (6.5%)
Class 3: 1605 examples (12.5%)
Class 4: 3624 examples (28.2%)
Class 5: 1504 examples (11.7%)
Class 6: 1536 examples (11.9%)
Class 7: 1881 examples (14.6%)


In [6]:
# Persist the augmented training split together with the untouched
# validation and test splits through the project helper.
utils.save_processed_data(
    'all_augmented_preprocessed_data.pkl',
    train_images_aug,
    train_labels_aug,
    val_images,
    val_labels,
    test_images,
    test_labels,
)

Data saved to ./281_final_project_data/all_augmented_preprocessed_data.pkl
