# Data Profiling
This notebook shows a preliminary exploration of the dataset, which is composed of images of foosball matches taken from various angles.

For the analysis we will check the images' shapes, ratios, resolutions, brightness and saturation.

## Setup
Execute this cell before the other ones.

In [None]:
import pandas as pd
from pathlib import Path
import sys
import ipywidgets as widgets
from IPython.display import display
sys.path.append(str(Path("../src").resolve()))
from utility import *
from config import *


# (label, dataframe path) pairs offered by the dropdown. The path constants are
# not defined in this notebook; presumably they come from the config star import.
clustering_options = [
    ("Default", DEFAULT_IMAGES_DATAFRAME_DIRECTORY),
    ("Added", ADDED_IMAGES_DATAFRAME_DIRECTORY),
    ("Augmented", AUGMENTED_IMAGES_DATAFRAME_DIRECTORY),
]

# Build every widget up front: the selector, the trigger button and the
# output area that collects the messages printed while loading.
dropdown = widgets.Dropdown(description='Clustering:', options=clustering_options)
button = widgets.Button(description="Load Data")
out = widgets.Output()

# Render the selector followed by the output area (the button is not rendered here).
display(dropdown, out)

def on_load_data_button_clicked(b):
    '''
    Function executed when the 'Load Data' button is clicked.

    Reads the parquet file currently selected in the dropdown and publishes it
    through two module-level globals:
      - df: the loaded dataframe
      - df_chosen: the label of the selected entry in clustering_options

    Progress messages are printed inside the `out` output widget.

    Parameters:
        b: argument supplied by the click machinery; it is not used here.
    '''
    global df
    global df_chosen
    df_path = dropdown.value
    # Map the selected value back to its human-readable label.
    chosen_label = next(key for key, val in clustering_options if val == df_path)

    with out:
        out.clear_output()
        print(f"Dataframe selected: {df_path}")
        loaded_df = pd.read_parquet(df_path)
        # Publish both globals together and only after a successful read, so a
        # failed load cannot leave df_chosen naming a dataset that df does not hold.
        df, df_chosen = loaded_df, chosen_label
        print("Dataframe loaded.")

# Run the loader whenever the button is clicked, then render the button.
button.on_click(on_load_data_button_clicked)
display(button)

# Load the currently selected dataframe once right away, so that `df` and
# `df_chosen` already exist when the following cells run without a manual
# click (e.g. "Restart Kernel -> Run All"). Clicking the button reloads.
on_load_data_button_clicked(button)


## Shape Analysis
An image is considered a square when its ratio (width/height) is between 0.95 and 1.05 (not necessarily 1.0).

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart with the number of images per value of the "shape" column.
fig, ax = plt.subplots()
sns.countplot(x="shape", data=df, hue="shape", ax=ax)

# Write the count above every bar. bar_label places the text with a padding
# expressed in points, so the gap between bar and label stays the same
# whatever the magnitude of the counts (a fixed offset in data units does not).
for container in ax.containers:
    ax.bar_label(container, fmt="%d", padding=3)

# Leave some headroom above the tallest bar for its label.
ax.margins(y=0.1)

ax.set_title(f"Shape ({df_chosen})")
ax.set_xlabel("")
ax.set_ylabel("Number of images")
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram of the "ratio" column drawn as step outlines, coloured by "shape".
ratio_ax = sns.histplot(data=df, x="ratio", hue="shape", element="step", stat="count")

# Title and axis labels are applied in one call on the axes returned by seaborn.
ratio_ax.set(
    title=f"Shape ({df_chosen})",
    xlabel="Ratio width/height",
    ylabel="Number of images",
)
plt.show()

## Resolution Analysis
An image is considered high resolution when its longest side is greater than 640 pixels, since an image of that size is downscaled for YOLO.

We need to distinguish between images that are downscaled and images that are upscaled, because the upscaled ones require interpolation,<br>
which might introduce noise, artifacts or loss of information.

**Warning:** Some images have a resolution that classifies them as high resolution, but they might already have been upscaled with interpolation.<br>
So we can't fully trust these numbers to discriminate between interpolated and non-interpolated images.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart with the number of images per value of the "resolution" column.
fig, ax = plt.subplots()
sns.countplot(x="resolution", data=df, ax=ax)

# Write the count above every bar. bar_label places the text with a padding
# expressed in points, so the gap between bar and label stays the same
# whatever the magnitude of the counts (a fixed offset in data units does not).
for container in ax.containers:
    ax.bar_label(container, fmt="%d", padding=3)

# Leave some headroom above the tallest bar for its label.
ax.margins(y=0.1)

ax.set_title(f"Resolution ({df_chosen})")
ax.set_xlabel("")
ax.set_ylabel("Number of images")
plt.show()

## Brightness and Saturation Analysis

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# One distribution plot per column, drawn in the order listed below.
# Each entry is (dataframe column, plot colour, label used in title and x axis).
for column, colour, label in (
    ("brightness_mean", "gray", "Brightness"),
    ("saturation_mean", "orange", "Saturation"),
):
    sns.displot(df[column], color=colour)
    plt.title(f"{label} ({df_chosen})")
    plt.xlabel(f"{label} mean")
    plt.ylabel("Number of images")
    plt.show()
