In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Extra non "data science / image processing" libraries
from pathlib import Path

# Global variables
from global_variables import (
    DATA_FOLDER,
    CLASS_TO_OBJECT_NAME,
    MAP_COLUMN_TO_CONVERTER,
    OBJECT_CODE_TO_CLASS,
)

In [None]:
def create_images_metadata(folder: Path) -> pd.DataFrame:
    """Build a metadata table with one row per (image file, object code) pair.

    Every sub-directory of ``folder`` whose name has the form
    ``<background>_<illumination>_<environment>`` is scanned for ``*.jpg``
    files. The stem of each image is split on ``-`` into object codes and each
    code is translated through ``OBJECT_CODE_TO_CLASS``.

    Entries of ``folder`` that are not directories are ignored. Directories
    whose name does not split into exactly three ``_``-separated parts, and
    images containing a code missing from ``OBJECT_CODE_TO_CLASS``, are
    skipped with a printed message.

    Args:
        folder: Root directory holding one sub-directory per capture condition.

    Returns:
        A DataFrame with the columns ``file_path``, ``height``, ``width``,
        ``channels``, ``background``, ``illumination``, ``environment``,
        ``objects_in_image_code``, ``objects_in_image_numeric``,
        ``label_code`` and ``label_numeric``. The columns are present even
        when no image was found.
    """
    columns = [
        "file_path",
        "height",
        "width",
        "channels",
        "background",
        "illumination",
        "environment",
        "objects_in_image_code",
        "objects_in_image_numeric",
        "label_code",
        "label_numeric",
    ]
    records = []

    # Sorted so the row order does not depend on the file system's listing order.
    for subfolder in sorted(folder.glob("*")):
        if not subfolder.is_dir():
            continue

        subfolder_metadata = subfolder.stem.split("_")
        if len(subfolder_metadata) != 3:
            print(
                f"{subfolder} does not follow the "
                "<background>_<illumination>_<environment> naming; skipped."
            )
            continue
        background, illumination, environment = subfolder_metadata

        for file in sorted(subfolder.glob("*.jpg")):
            file_object_codes = file.stem.split("-")

            # Translate all codes once per file. A single unknown code
            # invalidates the whole file, so report it once and skip the file
            # before paying for decoding the image.
            try:
                file_object_classes = [
                    OBJECT_CODE_TO_CLASS[code] for code in file_object_codes
                ]
            except KeyError:
                print(f"{file} has an invalid object code.")
                continue

            img = plt.imread(file)
            height, width = img.shape[:2]
            # A 2-D array has no channel axis; record it as one channel.
            channels = img.shape[2] if img.ndim == 3 else 1

            for object_code, object_class in zip(
                file_object_codes, file_object_classes
            ):
                records.append(
                    {
                        "file_path": file,
                        "height": height,
                        "width": width,
                        "channels": channels,
                        "background": background,
                        "illumination": illumination,
                        "environment": environment,
                        # Copies, so rows of the same file do not share lists.
                        "objects_in_image_code": list(file_object_codes),
                        "objects_in_image_numeric": list(file_object_classes),
                        "label_code": object_code,
                        "label_numeric": object_class,
                    }
                )

    return pd.DataFrame(records, columns=columns)

# Scan DATA_FOLDER and preview the first rows of the resulting metadata table.
df = create_images_metadata(DATA_FOLDER)
df.head()

In [None]:
# Sanity check: every image must share one height, one width and one channel
# count. On failure, the message lists the distinct values that were found.
assert df.height.nunique() == 1, (
    f"Expected a single image height, found: {df.height.unique().tolist()}"
)
assert df.width.nunique() == 1, (
    f"Expected a single image width, found: {df.width.unique().tolist()}"
)
assert df.channels.nunique() == 1, (
    f"Expected a single channel count, found: {df.channels.unique().tolist()}"
)

In [None]:
def display_counts_for(df, column):
    """Show how many rows of ``df`` fall under each value of ``column``.

    The counts are ordered by the raw column value, relabelled through
    ``MAP_COLUMN_TO_CONVERTER[column]``, displayed as a table and finally
    drawn as a bar chart.
    """
    value_totals = df[column].value_counts().sort_index()
    # Relabel only after sorting, so the order follows the raw values.
    to_label = MAP_COLUMN_TO_CONVERTER[column]
    value_totals.index = value_totals.index.map(to_label)
    display(value_totals)
    value_totals.plot(kind="bar", title=f"Contagens de {column}")
    plt.show()

In [None]:
# Number of metadata rows for each object label code.
display_counts_for(df, "label_code")

In [None]:
# Number of metadata rows for each background value.
display_counts_for(df, "background")

In [None]:
# Number of metadata rows for each illumination value.
display_counts_for(df, "illumination")

In [None]:
# Number of metadata rows for each environment value.
display_counts_for(df, "environment")

In [None]:
def display_sample_images(df, n_images=3, random_state=None):
    """Show up to ``n_images`` randomly chosen images, one figure per image.

    Args:
        df: DataFrame with a ``file_path`` column pointing at image files.
        n_images: Maximum number of rows to sample. It is capped at
            ``len(df)`` so a small (or empty) frame no longer raises.
        random_state: Forwarded to ``DataFrame.sample``; pass a seed to get
            the same images on every run.

    Returns:
        None. The images are rendered through ``plt.show()``.
    """
    n_shown = min(n_images, len(df))
    if n_shown == 0:
        # Nothing to draw: empty frame or zero images requested.
        return

    sample = df.sample(n_shown, random_state=random_state)
    for file_path in sample["file_path"]:
        img = plt.imread(file_path, format="jpg")
        # cmap only takes effect for single-channel images.
        plt.imshow(img, cmap="gray")
        plt.xticks([])
        plt.yticks([])
        plt.show()

# Show a few randomly sampled images (default: 3).
display_sample_images(df)

In [None]:
def plot_database_sample(df, images_per_class=12, n_classes=10, random_state=None):
    """Plot a grid of sample images with one row per class.

    Row ``i`` holds up to ``images_per_class`` images sampled from the rows of
    ``df`` whose ``label_numeric`` equals ``i``. The first cell of the row is
    labelled with ``CLASS_TO_OBJECT_NAME[i]``. When a class has fewer rows than
    ``images_per_class``, the remaining cells of its row are left blank
    instead of raising.

    Args:
        df: DataFrame with ``label_numeric`` and ``file_path`` columns.
        images_per_class: Number of columns in the grid.
        n_classes: Number of rows in the grid; classes ``0 .. n_classes - 1``
            are plotted.
        random_state: Forwarded to the pandas sampling call; pass a seed to
            get the same grid on every run.

    Returns:
        None. The figure is rendered through ``plt.show()``.
    """
    # squeeze=False keeps ``axs`` two-dimensional even for a single row or
    # column, so ``axs[i, j]`` is always valid.
    _, axs = plt.subplots(
        n_classes, images_per_class, figsize=(20, 20), dpi=200, squeeze=False
    )
    for i in range(n_classes):
        class_df = df[df["label_numeric"] == i]
        n_shown = min(images_per_class, len(class_df))
        if n_shown:
            file_paths = (
                class_df["file_path"]
                .sample(n_shown, random_state=random_state)
                .tolist()
            )
        else:
            file_paths = []

        for j in range(images_per_class):
            ax = axs[i, j]
            ax.set_xticks([])
            ax.set_yticks([])
            if j == 0:
                ax.set_ylabel(CLASS_TO_OBJECT_NAME[i], fontsize=14)
            if j < n_shown:
                ax.imshow(plt.imread(file_paths[j]), cmap="gray")
            else:
                # No image for this cell: drop the frame, keep any row label.
                ax.set_frame_on(False)
    plt.subplots_adjust(wspace=0, hspace=0)
    plt.tight_layout()
    plt.show()


# Overview grid: one row per class, using the function's default sample size.
plot_database_sample(df)

In [None]:
# Write the metadata table to "images_metadata.csv" (relative path) without
# the row index.
# NOTE(review): the list-valued columns are presumably written as their string
# representation — confirm how downstream readers parse them.
df.to_csv("images_metadata.csv", index=False)