# Dataset Size

In [1]:
%load_ext autoreload
%autoreload 2

import re

import pandas as pd
import plotly.express as px

from htc.models.data.DataSpecification import DataSpecification
from htc.models.data.run_size_dataset import label_mapping_dataset_size
from htc.models.image.DatasetImage import DatasetImage
from htc.tivita.DataPath import DataPath
from htc.utils.Config import Config
from htc.utils.sqldf import sqldf

## Total number of pigs seen during training

In [2]:
# One record (n_pigs, seed, subject_name) per training subject of every fold in the specification
specs = DataSpecification("pigs_semantic-only_dataset-size_repetitions=5V2.json")

rows = []
for fold, dataset in specs:
    # The fold name carries the training set size as "pigs=<N>" ...
    pigs_match = re.search(r"pigs=(\d+)", fold)
    assert pigs_match is not None
    n_pigs = int(pigs_match.group(1))

    # ... and the repetition seed as "seed=<N>"
    seed_match = re.search(r"seed=(\d+)", fold)
    assert seed_match is not None
    seed = int(seed_match.group(1))

    # Every subject is listed only once per fold (sorted for a stable row order)
    subjects = sorted({p.subject_name for p in dataset["train_semantic"]})
    rows.extend([n_pigs, seed, subject] for subject in subjects)

df = pd.DataFrame(rows, columns=["n_pigs", "seed", "subject_name"])
df

Unnamed: 0,n_pigs,seed,subject_name
0,1,0,P044
1,1,1,P047
2,1,2,P061
3,1,3,P069
4,1,4,P049
...,...,...,...
535,15,0,P060
536,15,0,P061
537,15,0,P069
538,15,0,P070


In [3]:
# For every training set size, count the distinct subjects which occur in any of its rows (i.e. across all seeds)
query = """
    SELECT n_pigs, COUNT(DISTINCT subject_name) AS n_pigs_total
    FROM df
    GROUP BY n_pigs
    ORDER BY n_pigs
"""
df_counts = sqldf(query)
df_counts

Unnamed: 0,n_pigs,n_pigs_total
0,1,5
1,2,8
2,3,10
3,4,12
4,5,14
5,6,14
6,7,14
7,8,15
8,9,15
9,10,15


In [4]:
# Bar chart of the distinct-subject count per training set size.
# Explicit labels and a title are set because the raw column names (n_pigs vs. n_pigs_total) are easy to confuse.
px.bar(
    df_counts,
    x="n_pigs",
    y="n_pigs_total",
    labels={
        "n_pigs": "Number of pigs per training set",
        "n_pigs_total": "Distinct pigs seen across all seeds",
    },
    title="Total number of pigs seen during training",
)

## Number of pixels per label

In [5]:
# All images which occur in the training split of at least one fold
image_names = sorted({p.image_name() for _, dataset in specs for p in dataset["train_semantic"]})

# Loop invariants are computed once instead of for every image
mapping = label_mapping_dataset_size()
background_index = mapping.name_to_index("background")
# Width of the image border which is excluded below
# (presumably half of the patch edge length; confirm against DatasetPatchStream)
patch_size_half = 32

# Relevant pixels for each image
rows = []
for image_name in image_names:
    path = DataPath.from_image_name(image_name)
    sample = DatasetImage(
        [path], train=False, config=Config({"input/no_features": True, "label_mapping": mapping.mapping_name_index})
    )[0]

    # Similar to DatasetPatchStream: start from the valid pixels, then drop background and border pixels
    relevant_pixels = sample["valid_pixels"]
    relevant_pixels[sample["labels"] == background_index] = False
    relevant_pixels[:patch_size_half, :] = False
    relevant_pixels[:, :patch_size_half] = False
    relevant_pixels[-patch_size_half:, :] = False
    relevant_pixels[:, -patch_size_half:] = False

    rows.append([path.subject_name, path.timestamp, relevant_pixels.sum().item()])

# NOTE: this rebinds `df`, which previously held the per-fold subject table read by the SQL query cell.
# Re-create that table before re-running the query.
# sort_values() returns a new frame (original row index is kept), so no in-place mutation is needed
df = pd.DataFrame(rows, columns=["subject_name", "timestamp", "n_pixels"]).sort_values(by=["n_pixels"])
df

Unnamed: 0,subject_name,timestamp,n_pixels
143,P058,2020_05_13_19_04_00,31380
142,P058,2020_05_13_19_03_10,31522
147,P058,2020_05_13_19_06_54,43765
145,P058,2020_05_13_19_06_03,46472
268,P070,2020_07_24_19_55_51,47176
...,...,...,...
27,P045,2020_02_05_10_56_52,227857
37,P045,2020_02_05_11_01_20,228394
68,P049,2020_02_11_19_11_32,230012
78,P049,2020_02_11_19_16_11,230105


In [6]:
# Distribution of the number of relevant pixels per image (small gap between the bars for readability)
fig = px.histogram(df, x="n_pixels").update_layout(bargap=0.2)
fig

In [7]:
# Used to find images which do not have enough pixels for a label (e.g. only small skin annotation;
# cf. filter_min_pixels function in run_size_dataset.py)
# df is sorted by n_pixels, so the fresh index from reset_index() is the rank of each image
df_ranked = df.reset_index()
fig_ranked = px.line(df_ranked, x="n_pixels")
fig_ranked.update_traces(mode="lines+markers")