# K-Fold Selection

In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import plotly.graph_objects as go

from htc.cpp import kfold_combinations
from htc.models.data.DataSpecification import DataSpecification
from htc.settings_seg import settings_seg

In [2]:
# Build a long table with one row per (image, annotated label) pair, restricted to the labels
# listed in settings_seg.labels
paths = DataSpecification("pigs_semantic-only_5foldsV2.json").paths()
pigs = sorted({p.subject_name for p in paths})

# Position of every subject in the sorted pig list (names are unique because they come from a set)
pig_to_index = {subject_name: position for position, subject_name in enumerate(pigs)}

rows = []
for path in paths:
    for label in path.annotated_labels():
        if label not in settings_seg.labels:
            # Only labels of the segmentation task are relevant for the fold selection
            continue

        rows.append([
            path.subject_name,
            pig_to_index[path.subject_name],
            path.timestamp,
            settings_seg.label_mapping.name_to_index(label),
        ])

df = pd.DataFrame(rows, columns=["subject_name", "pig_index", "timestamp", "label_index"])
df

Unnamed: 0,subject_name,pig_index,timestamp,label_index
0,P041,0,2019_12_14_12_00_16,9
1,P041,0,2019_12_14_12_00_16,4
2,P041,0,2019_12_14_12_00_16,13
3,P041,0,2019_12_14_12_00_16,5
4,P041,0,2019_12_14_12_01_09,4
...,...,...,...,...
2228,P071,14,2020_08_05_11_57_30,10
2229,P071,14,2020_08_05_11_57_30,14
2230,P071,14,2020_08_05_11_57_30,11
2231,P071,14,2020_08_05_11_57_30,12


## Find all $k$-fold combinations with the maximum number of labels

In [3]:
# Distinct pig indices in order of first appearance in df
unique_pig_indices = df["pig_index"].unique()
pig_indices = unique_pig_indices.tolist()

# For every pig: sorted list of the distinct label indices which occur in its images
pig_labels = {
    pig_index: sorted(df.loc[df["pig_index"] == pig_index, "label_index"].unique().tolist())
    for pig_index in unique_pig_indices
}

# NOTE(review): presumably the minimal number of label classes summed over all folds (the text at the
# end of this notebook mentions "88 of 90") -- confirm against the kfold_combinations implementation
min_labels = 88
folds = kfold_combinations(pig_indices, pig_labels, min_labels)
len(folds)

2015

## Additional statistics of the remaining folds

In [4]:
# Collect balance statistics for every candidate fold combination
rows = []
for fold in folds:
    # A combination is stored as flat sequence of pig indices: 5 splits with 3 consecutive pigs each
    splits = [[fold[3 * g + k] for k in range(3)] for g in range(5)]

    std_pigs_sum = 0
    std_images_sum = 0
    n_pigs = 0
    n_labels = []
    for split in splits:
        # All annotations of the pigs which belong to this split
        df_test = df[df["pig_index"].isin(split)]
        per_label = df_test.groupby("label_index")

        # Per label: number of distinct pigs and number of distinct images (timestamps)
        label_dist_pigs = per_label["subject_name"].nunique().values
        label_dist_images = per_label["timestamp"].nunique().values

        n_labels.append(len(label_dist_pigs))
        n_pigs += np.sum(label_dist_pigs)
        std_pigs_sum += np.std(label_dist_pigs)
        std_images_sum += np.std(label_dist_images)

    # np.std(n_labels) measures how much the number of label classes differs between the splits
    rows.append([splits, std_pigs_sum, np.std(n_labels), std_images_sum, n_pigs])

df_folds = pd.DataFrame(rows, columns=["splits", "std_pigs", "n_labels_std", "std_images", "n_pigs"])

In [5]:
# Rank the combinations by the summed per-split standard deviation of pigs per label (ascending).
# The sorted frame is rebound instead of using inplace=True so that no existing object is mutated;
# the following cells rely on df_folds being sorted when they access df_folds.iloc[0].
df_folds = df_folds.sort_values(by="std_pigs")
df_folds

Unnamed: 0,splits,std_pigs,n_labels_std,std_images,n_pigs
1100,"[[0, 6, 10], [1, 4, 9], [2, 11, 14], [3, 8, 12...",4.399190,1.019804,89.640749,188
1741,"[[0, 10, 12], [1, 6, 9], [2, 11, 14], [3, 4, 8...",4.400032,1.019804,88.449107,188
1075,"[[0, 6, 10], [1, 3, 9], [2, 11, 14], [4, 8, 12...",4.402463,1.019804,88.771202,188
1696,"[[0, 10, 12], [1, 4, 9], [2, 11, 14], [3, 6, 8...",4.403905,1.019804,89.405186,188
1851,"[[0, 10, 14], [1, 6, 9], [2, 11, 12], [3, 4, 8...",4.407481,1.019804,89.608142,188
...,...,...,...,...,...
396,"[[0, 1, 14], [2, 6, 11], [3, 8, 12], [4, 5, 10...",4.766993,1.019804,90.078500,188
0,"[[0, 1, 7], [2, 3, 11], [4, 5, 8], [6, 10, 12]...",4.788821,1.019804,90.474055,188
1367,"[[0, 8, 12], [1, 6, 7], [2, 3, 11], [4, 5, 10]...",4.795013,1.019804,90.445178,188
342,"[[0, 1, 14], [2, 3, 11], [4, 5, 8], [6, 10, 12...",4.805158,1.019804,90.012832,188


In [6]:
# Number of distinct pigs per label for each split of the top-ranked combination in df_folds
top_splits = df_folds.iloc[0]["splits"]
for split in top_splits:
    df_test = df[df["pig_index"].isin(split)]
    pigs_per_label = df_test.groupby("label_index")["subject_name"].nunique()
    print(pigs_per_label.values)

[2 1 1 3 3 3 3 3 2 2 3 3 3 3 1 1 1 1]
[1 1 1 3 3 3 3 3 2 2 2 3 3 2 1 1 1 1 1]
[2 1 1 3 3 3 3 3 1 2 3 3 3 3 2 1 1]
[1 1 1 3 3 3 3 3 2 1 3 3 3 3 1 1 1 1]
[2 1 1 3 3 3 3 3 2 2 2 3 3 3 2 1]


In [7]:
# Number of distinct images (timestamps) per label for each split of the top-ranked combination in df_folds
top_splits = df_folds.iloc[0]["splits"]
for split in top_splits:
    df_test = df[df["pig_index"].isin(split)]
    images_per_label = df_test.groupby("label_index")["timestamp"].nunique()
    print(images_per_label.values)

[ 3 14 14 35 35 45 33 31 11 17 23 25 20 11  8  2  3  6]
[ 4  6  6 32 48 51 18 29 15 19  8 37 38 37  5  1  3  3  3]
[ 4  4  4 36 37 50 40 39  5 10 24 34 50 44  8  4  6]
[ 2 13 13 58 62 59 31 46 23  1 38 47 44 44 14  8  8 15]
[13 14 14 49 61 65 50 50 20 12 33 65 68 68 14 13]


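Minimizing only the standard deviation of the number of pigs per label is not enough: the folds of the top-ranked combination still contain different numbers of label classes (between 16 and 19 in the output above). We therefore sort by the standard deviation of the number of labels per fold first and use the other criteria as tie-breakers.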

In [8]:
# Lexicographic ranking: the spread of the number of labels per fold comes first, ties are resolved
# by the pig-per-label spread and then by the image-per-label spread
sort_keys = ["n_labels_std", "std_pigs", "std_images"]
df_folds_best = df_folds.sort_values(by=sort_keys)
df_folds_best

Unnamed: 0,splits,std_pigs,n_labels_std,std_images,n_pigs
1100,"[[0, 6, 10], [1, 4, 9], [2, 11, 14], [3, 8, 12...",4.399190,1.019804,89.640749,188
1741,"[[0, 10, 12], [1, 6, 9], [2, 11, 14], [3, 4, 8...",4.400032,1.019804,88.449107,188
1075,"[[0, 6, 10], [1, 3, 9], [2, 11, 14], [4, 8, 12...",4.402463,1.019804,88.771202,188
1696,"[[0, 10, 12], [1, 4, 9], [2, 11, 14], [3, 6, 8...",4.403905,1.019804,89.405186,188
1851,"[[0, 10, 14], [1, 6, 9], [2, 11, 12], [3, 4, 8...",4.407481,1.019804,89.608142,188
...,...,...,...,...,...
843,"[[0, 5, 10], [1, 6, 9], [2, 4, 11], [3, 12, 13...",4.729709,1.019804,89.810726,188
72,"[[0, 1, 9], [2, 3, 11], [4, 5, 13], [6, 10, 12...",4.733115,1.019804,89.950701,188
421,"[[0, 1, 14], [2, 7, 11], [3, 9, 13], [4, 5, 8]...",4.734246,1.019804,90.656024,188
341,"[[0, 1, 14], [2, 3, 11], [4, 5, 8], [6, 9, 13]...",4.746013,1.019804,89.861880,188


In [9]:
# Number of distinct pigs per label for each split of the best combination
best_splits = df_folds_best.iloc[0]["splits"]
for split in best_splits:
    df_test = df[df["pig_index"].isin(split)]
    pigs_per_label = df_test.groupby("label_index")["subject_name"].nunique()
    print(pigs_per_label.values)

[2 1 1 3 3 3 3 3 2 2 3 3 3 3 1 1 1 1]
[1 1 1 3 3 3 3 3 2 2 2 3 3 2 1 1 1 1 1]
[2 1 1 3 3 3 3 3 1 2 3 3 3 3 2 1 1]
[1 1 1 3 3 3 3 3 2 1 3 3 3 3 1 1 1 1]
[2 1 1 3 3 3 3 3 2 2 2 3 3 3 2 1]


In [10]:
# Number of distinct images (timestamps) per label for each split of the best combination
best_splits = df_folds_best.iloc[0]["splits"]
for split in best_splits:
    df_test = df[df["pig_index"].isin(split)]
    images_per_label = df_test.groupby("label_index")["timestamp"].nunique()
    print(images_per_label.values)

[ 3 14 14 35 35 45 33 31 11 17 23 25 20 11  8  2  3  6]
[ 4  6  6 32 48 51 18 29 15 19  8 37 38 37  5  1  3  3  3]
[ 4  4  4 36 37 50 40 39  5 10 24 34 50 44  8  4  6]
[ 2 13 13 58 62 59 31 46 23  1 38 47 44 44 14  8  8 15]
[13 14 14 49 61 65 50 50 20 12 33 65 68 68 14 13]


In [11]:
# Translate the pig indices of the best combination back to subject names
best_splits = df_folds_best.iloc[0]["splits"]
[[pigs[pig_index] for pig_index in split] for split in best_splits]

[['P041', 'P050', 'P060'],
 ['P044', 'P048', 'P059'],
 ['P045', 'P061', 'P071'],
 ['P047', 'P058', 'P069'],
 ['P049', 'P057', 'P070']]

In the end, we optimized three criteria to get a balanced label distribution:
- Each fold should have as many label classes as possible
- The number of pigs per label should be similar
- The number of images per label should be similar (has only a minor effect)

In [12]:
# Grouped bar chart: number of distinct pigs per label for every split of the best combination
fig = go.Figure()
colors = ["rgb(20, 20, 20)", "rgb(80, 80, 80)", "rgb(120, 120, 120)", "rgb(160, 160, 160)", "rgb(200, 200, 200)"]

# Label indices in the same order as the x-axis categories so that counts can be aligned by label
label_indices = [settings_seg.label_mapping.name_to_index(label) for label in settings_seg.labels]

for color, split in zip(colors, df_folds_best.iloc[0]["splits"]):
    df_test = df.query("pig_index in @split")
    pigs_per_label = df_test.groupby("label_index")["subject_name"].nunique()

    # The groupby result only contains labels which occur in this split. Passing it positionally
    # would shift the bars to wrong labels as soon as a label is missing, hence we align the counts
    # explicitly with the x-axis labels and use 0 for labels without any pig in this split
    values = pigs_per_label.reindex(label_indices, fill_value=0).values

    fold_name = "fold_" + ",".join(df_test["subject_name"].unique().tolist())
    fig.add_trace(go.Bar(x=settings_seg.labels, y=values, marker_color=color, name=fold_name))

fig.update_layout(
    title_x=0.5,
    title_text="KFold Label Distribution",
    xaxis_title="label",
    yaxis_title="number of pigs",
)
fig.show()

## Existing validation folds
The best fold combination from above does not match the existing folds because this notebook was not updated after the new dataset was introduced. However, the existing folds still fulfil the criterion of a maximal number of organs across folds (88 of 90).

In [13]:
from htc.models.data.run_pig_dataset import folds_pigs

# Search the ranked combinations for the one whose subject names are identical to the existing folds;
# existing_id is the position of this combination in df_folds_best
existing_id = None
for position, candidate_splits in enumerate(df_folds_best["splits"]):
    val_pigs = [[pigs[s] for s in split] for split in candidate_splits]
    if val_pigs == folds_pigs:
        existing_id = position
        break

# The existing folds must be among the candidates
assert existing_id is not None
df_folds_best.iloc[existing_id]

splits          [[0, 10, 12], [1, 6, 9], [2, 11, 14], [3, 5, 1...
std_pigs                                                 4.419619
n_labels_std                                             1.019804
std_images                                              88.570138
n_pigs                                                        188
Name: 1744, dtype: object