## Imports

In [1]:
import os
import shutil

from pathlib import Path

import numpy as np
import pandas as pd
import cv2
import imgaug.augmenters as iaa

import tensorflow as tf
print("Tensorflow version " + tf.__version__)

2024-04-20 23:28:09.547919: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-20 23:28:09.579038: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Tensorflow version 2.16.1


## Constants

In [2]:
# Root folders of the pipeline stages (relative paths):
#   original     - input images, one sub-folder per class
#   preprocessed - resized copies of the originals
#   augmented    - augmented copies of the training images
original_dataset_path = Path("original")
preprocesed_dataset_path = Path("preprocessed")
augmented_dataset_path = Path("augmented")

# Target image size handed to cv2.resize; the augmentation pipeline reads
# dims[0] as the width and dims[1] as the height
dims = (64, 64)

## Create the preprocessed directory

In [3]:
# exist_ok=True keeps this cell re-runnable when the folder already exists
preprocesed_dataset_path.mkdir(parents=True, exist_ok=True)

## Resize functions

In [4]:
def resize_center_box(org_img, dims):
    """Crop the largest centered square out of ``org_img`` and resize it to ``dims``.

    Args:
        org_img: Image array whose first two axes are (height, width); a
            trailing channel axis is optional.
        dims: Target size, forwarded unchanged to ``cv2.resize``.

    Returns:
        The resized center crop as returned by ``cv2.resize``.
    """
    # Only the first two axes are needed, so 2-D (grayscale) arrays work too
    org_height, org_width = org_img.shape[:2]
    side = min(org_height, org_width)

    # Slice to ``offset + side`` rather than ``size - offset``: the crop then
    # stays exactly square even when the width/height difference is odd
    top = (org_height - side) // 2
    left = (org_width - side) // 2
    org_roi = org_img[top:top + side, left:left + side]
    return cv2.resize(org_roi, dims)

def resize_strech(org_img, dims):
    """Resize the whole image to ``dims`` without cropping.

    The aspect ratio is not preserved: the full frame is handed to
    ``cv2.resize`` together with the requested size.
    """
    stretched_img = cv2.resize(org_img, dims)
    return stretched_img

## Load classes

In [5]:
# Class ids follow the alphabetical order of the class folder names, so the
# animal <-> id mapping is the same on every machine (directory listings come
# back in arbitrary order). Only directories count as classes; stray files in
# the dataset root are ignored.
animal_names = sorted(entry.name for entry in original_dataset_path.iterdir() if entry.is_dir())

animal_to_klass = {animal: klass for klass, animal in enumerate(animal_names)}
klass_to_animal = {klass: animal for animal, klass in animal_to_klass.items()}

## Original class distribution

In [6]:
# Number of entries found in each class folder of the original dataset, keyed by class id
klass_initial_count = {
    klass: sum(1 for _ in (original_dataset_path / animal).iterdir())
    for klass, animal in klass_to_animal.items()
}
klass_initial_count

{0: 4821, 1: 1820, 2: 3098, 3: 2623, 4: 4319}

## Resize every image to the target dims and create df_dataset_meta

In [7]:
# Resize every original image to `dims`, store it as preprocessed/<animal>/<idx>.jpeg
# and record its class id keyed by the new file path.
dataset_meta = {
    "klass": [],
    "img_path": [],
}
unreadable_files = []  # originals that OpenCV could not decode

for animal, klass in animal_to_klass.items():
    original_klass_path = original_dataset_path.joinpath(animal)
    preprocessed_klass_path = preprocesed_dataset_path.joinpath(animal)

    # Create dir for class if not exists
    preprocessed_klass_path.mkdir(parents=True, exist_ok=True)

    # Sorted listing: <idx>.jpeg then always refers to the same original file.
    # os.listdir order is arbitrary, which would make the seeded split below
    # pick different images from one run/machine to the next.
    for idx, file in enumerate(sorted(os.listdir(original_klass_path))):
        original_file_path = original_klass_path.joinpath(file)
        preprocessed_file_path = preprocessed_klass_path.joinpath(f"{idx}.jpeg")

        # cv2.imread signals failure by returning None instead of raising
        org_img = cv2.imread(str(original_file_path))
        if org_img is None:
            unreadable_files.append(str(original_file_path))
            continue

        # Provide resize_center_box() or resize_strech()
        resized_img = resize_center_box(org_img, dims)

        # cv2.imwrite returns False on failure; stop here rather than record
        # a path in the metadata that does not exist on disk
        if not cv2.imwrite(str(preprocessed_file_path), resized_img):
            raise OSError(f"Could not write {preprocessed_file_path}")

        dataset_meta["klass"].append(klass)
        dataset_meta["img_path"].append(str(preprocessed_file_path))

if unreadable_files:
    print(f"Skipped {len(unreadable_files)} unreadable file(s), e.g. {unreadable_files[:5]}")

# One row per preprocessed image: index = file path, single column = class id
df_dataset_meta = pd.DataFrame(dataset_meta["klass"], index=dataset_meta["img_path"], columns=["klass"])

## 9.1 Split original into train/val/test

In [8]:
seed_value = 43  # Seed value for reproducibility

# Fixed hold-out sizes; every other image ends up in the training set
val_size = 250
test_size = 250
train_size = len(df_dataset_meta) - val_size - test_size

# Draw the training rows first, take the validation rows out of what is left,
# and keep the remainder as the test set
df_train_meta = df_dataset_meta.sample(n=train_size, random_state=seed_value)
df_remainging_meta = df_dataset_meta.drop(index=df_train_meta.index)
df_val_meta = df_remainging_meta.sample(n=val_size, random_state=seed_value)
df_test_meta = df_remainging_meta.drop(index=df_val_meta.index)

## 9.2 Uniform class distribution with normalized and augmented data

In [9]:
# Target number of training images per class (originals + augmented copies)
augment_to_count = 10_000

### Distribution after val/train split

In [10]:
# Training images per class id, as a plain dict
klass_train_count = df_train_meta["klass"].value_counts().to_dict()
klass_train_count

{0: 4672, 4: 4194, 2: 3003, 3: 2543, 1: 1769}

In [11]:
# Build the augmented training set: every training image is kept, and enough
# augmented copies are written to augmented/<animal>/ to bring each class close
# to `augment_to_count` images.
dataset_augmnented_meta = {
    "klass": [],
    "img_path": [],
}

# Define augmentation pipeline
seq = iaa.Sequential([
    # iaa.Affine(scale=(1, 1.1)), # Scale from 1 to 1.1
    iaa.Fliplr(0.5),  # horizontally flip 50% of the images
    iaa.Affine(rotate=(-15, 15)),  # rotate images by -15 to 15 degrees
    iaa.Resize({"height": dims[1], "width": dims[0]}),
    iaa.AdditiveGaussianNoise(scale=(0, 0.05*255)),
    iaa.Crop(px=(5, 15)),  # crop images from each side by 5 to 15px (randomly chosen)
])
# Seed the pipeline and all of its children so re-running the cell
# reproduces the same augmented images
seq.seed_(seed_value)

avg_img_augment_count = {}         # animal -> average augmented copies needed per training image
how_many_should_be_augmented = {}  # animal -> fractional copies carried over to the next image
augmented = {}                     # animal -> augmented files written so far (next file index)

for animal, klass in animal_to_klass.items():
    # Create dir for class if not exists
    augmented_dataset_path.joinpath(animal).mkdir(parents=True, exist_ok=True)

    # A class missing from the training split, or one already at/above the
    # target, needs no augmented copies
    train_count = klass_train_count.get(klass, 0)
    if train_count:
        avg_img_augment_count[animal] = max(augment_to_count / train_count - 1, 0)
    else:
        avg_img_augment_count[animal] = 0
    how_many_should_be_augmented[animal] = 0

    augmented[animal] = 0


for preprocessed_file_path, row in df_train_meta.iterrows():
    animal = klass_to_animal[row["klass"]]
    augmentend_klass_path = augmented_dataset_path.joinpath(animal)

    # Accumulate the (fractional) per-image quota and spend its integer part on
    # this image; the remainder carries over to the next image of the class
    how_many_should_be_augmented[animal] += avg_img_augment_count[animal]
    current_augment_count = int(how_many_should_be_augmented[animal])  # this many augmentations of this img need to be added
    how_many_should_be_augmented[animal] -= current_augment_count

    # The untouched training image is always part of the augmented set
    dataset_augmnented_meta["klass"].append(animal_to_klass[animal])
    dataset_augmnented_meta["img_path"].append(str(preprocessed_file_path))

    if current_augment_count == 0:
        continue

    # Read img; cv2.imread returns None instead of raising when it fails
    org_img = cv2.imread(str(preprocessed_file_path))
    if org_img is None:
        raise OSError(f"Could not read {preprocessed_file_path}")

    # Augment the image: each copy in the batch gets its own random transform
    augmented_images = seq(images=[org_img] * current_augment_count)

    # Save augmented images
    for augmented_image in augmented_images:
        augmented_file_path = augmentend_klass_path.joinpath(f"{augmented[animal]}.jpeg")
        cv2.imwrite(str(augmented_file_path), augmented_image)
        dataset_augmnented_meta["klass"].append(animal_to_klass[animal])
        dataset_augmnented_meta["img_path"].append(str(augmented_file_path))
        augmented[animal] += 1

# Same layout as df_dataset_meta: index = file path, single column = class id
df_dataset_augmented_meta = pd.DataFrame(dataset_augmnented_meta["klass"], index=dataset_augmnented_meta["img_path"], columns=["klass"])

In [13]:
# Independent copy of the augmented metadata, used as the training set of the second split
df_train_augmented_meta = df_dataset_augmented_meta.copy()

## 9.3 Val is part of train

In [14]:
seed_value = 43  # Seed value for reproducibility (re-declared; not used by the statement below)

# Training metadata for the third variant: augmented training rows followed by the validation rows
df_train_augmented_plus_val_meta = pd.concat([df_dataset_augmented_meta, df_val_meta])

## Save the splits to disk for training

In [15]:
# Output roots, one per dataset variant:
#   SPLIT_1 - plain train/val/test split
#   SPLIT_2 - augmented training set
#   SPLIT_3 - augmented training set plus the validation rows
split1_dir = Path("SPLIT_1")
split2_dir = Path("SPLIT_2")
split3_dir = Path("SPLIT_3")

In [16]:
def save_df_to_dir(dir, df):
    """Copy every image listed in ``df`` into ``dir/<animal>/<n>.jpeg``.

    Args:
        dir: Destination root (``Path``); one sub-folder per animal in
            ``animal_to_klass`` is created inside it.
        df: Metadata frame whose index holds the source file paths and whose
            ``klass`` column holds the class id of each file.

    Files are numbered per class, starting at 0, in the row order of ``df``.
    """
    target_dirs = {}  # class id -> destination folder
    next_number = {}  # class id -> number for the next copied file
    for animal, klass in animal_to_klass.items():
        klass_dir = dir / animal
        klass_dir.mkdir(parents=True, exist_ok=True)
        target_dirs[klass] = klass_dir
        next_number[klass] = 0

    for src_path, klass in zip(df.index, df["klass"]):
        dst_path = target_dirs[klass] / f"{next_number[klass]}.jpeg"
        next_number[klass] += 1
        shutil.copyfile(Path(src_path), dst_path)
        

def save_splits(split_dir, df_train_meta, df_val_meta, df_test_meta):
    """Write one dataset variant to ``split_dir``.

    Creates the ``training``, ``validation`` and ``testing`` sub-folders and
    fills each of them through ``save_df_to_dir`` with the matching metadata
    frame.
    """
    subsets = [
        (split_dir / "training", df_train_meta),
        (split_dir / "validation", df_val_meta),
        (split_dir / "testing", df_test_meta),
    ]

    # Create dirs
    for subset_dir, _ in subsets:
        subset_dir.mkdir(parents=True, exist_ok=True)

    for subset_dir, subset_meta in subsets:
        save_df_to_dir(subset_dir, subset_meta)

In [18]:
# Write the three dataset variants; they differ only in their training
# metadata and share the same validation and test rows
for split_dir, split_train_meta in (
    (split1_dir, df_train_meta),
    (split2_dir, df_train_augmented_meta),
    (split3_dir, df_train_augmented_plus_val_meta),
):
    save_splits(split_dir, split_train_meta, df_val_meta, df_test_meta)