# Experiment Analysis

In [1]:
%load_ext autoreload
%autoreload 2

from pathlib import Path

import numpy as np
import pandas as pd
import plotly.graph_objects as go

from htc.evaluation.analyze_tfevents import read_tfevent_losses
from htc.evaluation.metrics.scores import normalize_grouped_cm
from htc.models.common.MetricAggregation import MetricAggregation
from htc.settings import settings
from htc.utils.Config import Config
from htc.utils.helper_functions import utilization_table
from htc.utils.LabelMapping import LabelMapping
from htc.utils.sqldf import sqldf
from htc.utils.visualization import (
    create_class_scores_figure,
    create_confusion_figure,
    create_ece_figure,
    create_running_metric_plot,
    create_spec_labels_figure,
    create_surface_dice_plot,
    create_training_stats_figure,
    create_training_stats_label_figure,
    show_class_scores_epoch,
    show_loss_chart,
    visualize_dict,
)

In [2]:
# Parameter for papermill: the run directory to analyze (model_name/run_name below the training directory).
# The value below is only the default; the next cell also accepts a plain string and converts it to a Path.
run_dir = settings.training_dir / "image/2022-02-03_22-58-44_generated_default_model_comparison"

In [3]:
# Validate the notebook parameter and normalize it to a Path, because the cells
# below rely on the `/` operator and on `.name` / `.parent`.
assert run_dir != "", "No run_dir specified"
if isinstance(run_dir, str):
    # isinstance() instead of an exact type comparison, so that str subclasses are converted as well
    run_dir = Path(run_dir)

# The parent directory names the model type, the directory itself names the experiment
print(f"Model: {run_dir.parent.name}")
print(f"Experiment: {run_dir.name}")

Model: image
Experiment: 2022-02-03_22-58-44_generated_default_model_comparison


In [4]:
# Read everything the analysis needs from the run directory.
# NOTE: read_pickle unpickles the file, so only analyze run directories you trust.
df_val = pd.read_pickle(run_dir / "validation_table.pkl.xz")
# Keep only the rows with dataset_index 0 (presumably the main validation dataset — confirm)
df_val = df_val.query("dataset_index == 0")
df_train = read_tfevent_losses(run_dir)
config = Config(run_dir / "config.json")
mapping = LabelMapping.from_config(config)

# The surface distance is optional and only aggregated if the validation table contains it
if "surface_distance_metric" in df_val:
    metrics = ["dice_metric", "surface_distance_metric"]
else:
    metrics = ["dice_metric"]

# Aggregate the metrics along the data hierarchy; mode="image_level" yields one score per subject
agg = MetricAggregation(df_val, config, metrics=metrics)
df_grouped = agg.grouped_metrics(mode="image_level")

In [5]:
# Preview (first rows only) of the training table returned by read_tfevent_losses()
df_train.head()

Unnamed: 0,fold_name,epoch_index,step,lr-Adam,train/ce_loss_step,train/dice_loss_step,dice_metric,train/ce_loss_epoch,train/dice_loss_epoch
0,"fold_P041,P060,P069",0,0,0.001,,,,,
1,"fold_P041,P060,P069",0,49,,0.814283,0.676449,,,
2,"fold_P041,P060,P069",0,99,,0.610165,0.570003,0.101304,1.109594,0.730523
3,"fold_P041,P060,P069",0,100,0.00099,,,,,
4,"fold_P041,P060,P069",1,149,,0.358511,0.425239,,,


In [6]:
# Preview (first rows only) of the validation table, already restricted to dataset_index == 0
df_val.head()

Unnamed: 0,epoch_index,best_epoch_index,dataset_index,fold_name,image_name,ce_loss,ece,dice_metric,used_labels,dice_metric_image,confusion_matrix,surface_distance_metric,surface_distance_metric_image,surface_dice_metric_mean,surface_dice_metric_image_mean
0,0,46,0,"fold_P041,P060,P069",P041#2019_12_14_12_00_16,2.426057,"{'error': 0.39524492621421814, 'accuracies': [...","[0.68173087, 0.0, 0.0, 0.0, 0.0]","[0, 4, 5, 9, 13]",0.136346,"[[158785, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",,,,
1,0,46,0,"fold_P041,P060,P069",P041#2019_12_14_12_01_09,3.976027,"{'error': 0.648522138595581, 'accuracies': [0,...","[0.39084372, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0...","[0, 3, 4, 5, 6, 7, 10, 11, 13]",0.043427,"[[74615, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0...",,,,
2,0,46,0,"fold_P041,P060,P069",P041#2019_12_14_12_01_39,3.574123,"{'error': 0.6113175749778748, 'accuracies': [0...","[0.4529424, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[0, 3, 4, 5, 6, 7, 10, 11, 12, 13]",0.045294,"[[89941, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0...",,,,
3,0,46,0,"fold_P041,P060,P069",P041#2019_12_14_13_33_30,3.801623,"{'error': 0.7022789120674133, 'accuracies': [0...","[0.3317932, 0.0, 0.0, 0.0, 0.0, 0.0]","[0, 4, 5, 9, 12, 13]",0.055299,"[[60967, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0...",,,,
4,0,46,0,"fold_P041,P060,P069",P060#2020_05_14_19_14_12,3.052603,"{'error': 0.4414898157119751, 'accuracies': [0...","[0.64125204, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]","[0, 3, 4, 5, 6, 7, 10, 11]",0.080157,"[[144981, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",,,,


In [7]:
# Preview (first rows only) of the aggregated metrics: one row per subject
df_grouped.head()

Unnamed: 0,subject_name,dice_metric,surface_distance_metric
0,P041,0.909151,4.997667
1,P044,0.883229,4.668928
2,P045,0.911744,2.455003
3,P047,0.881175,10.478194
4,P048,0.881856,10.048947


## Config

In [8]:
# Visualize the configuration that was loaded from config.json in the run directory
visualize_dict(config.data)

## Metrics
Visualization of the metrics stored in the TensorBoard events file (everything which is logged via `self.log()`).

In [9]:
# Chart based on both the training table (TensorBoard events) and the validation table
show_loss_chart(df_train, df_val)

For each fold, the plot shows the running maximum of the main metric (the checkpoint metric, `validation/checkpoint_metric` in the config), i.e. the best value reached up to each epoch.

In [10]:
# Built from the training table only.
# NOTE(review): presumably returns a figure that is rendered because it is the cell's last expression — confirm.
create_running_metric_plot(df_train)

## Training Statistics
The following heatmaps visualize which images the network saw during training. On the $y$-axis is the sorted (by image name) list of images and on the $x$-axis we have the training epochs. For each fold and epoch, we can see how often the network saw a particular image. Usually, we want a random order of images, so if you have the feeling you are looking at a noise image, everything is fine :-)

In [11]:
# Training statistics are optional: only create the heatmap if at least one stats file exists
# somewhere below the run directory. any() stops at the first match instead of collecting
# and sorting every match just to check for existence.
if any(run_dir.rglob("trainings_stats.npz")):
    fig = create_training_stats_figure(run_dir)
    fig.show()

In [12]:
# Same guard as for the image statistics: the label statistics figure is only created if at
# least one stats file exists below the run directory (any() stops at the first match).
if any(run_dir.rglob("trainings_stats.npz")):
    fig = create_training_stats_label_figure(run_dir)
    fig.show()

## Hardware Utilization
GPU and CPU utilization for each fold.
> Note: You will only get reasonable numbers if the training was long enough. For very short training times, you might get warnings or NaN values.

In [13]:
# One row per fold: compute node, training hours and mean/std of the GPU and CPU utilization
utilization_table(run_dir)

Unnamed: 0,fold,node,hours,gpu_util_mean,gpu_util_std,cpu_util_mean,cpu_util_std
0,"fold_P041,P060,P069",e230-dgxa100-4,3.661637,0.887035,0.250468,0.061282,0.025481
1,"fold_P044,P050,P059",e230-dgxa100-4,3.644604,0.889473,0.254434,0.061238,0.025452
2,"fold_P045,P061,P071",hdf19-gpu11,1.796746,0.490188,0.35375,0.586911,0.161596
3,"fold_P047,P049,P070",hdf19-gpu12,1.635331,0.536253,0.352392,0.365345,0.103208
4,"fold_P048,P057,P058",hdf19-gpu11,1.846824,0.492464,0.34265,0.581556,0.16346


## Best Run

In [14]:
# Per-subject scores (df_grouped has one row per subject): without a surface distance only
# the dice values can be shown, otherwise both metrics go into the combined plot.
if "surface_distance_metric" not in df_grouped:
    fig = go.Figure(
        data=[
            go.Box(
                y=df_grouped["dice_metric"],
                text=df_grouped["subject_name"],  # Subject name for every point
                name="dice",
                boxmean="sd",
                boxpoints="all",
            )
        ]
    )
    fig.update_yaxes(title_text="Dice metric")
    fig.update_layout(
        title_x=0.5,
        title_text=(
            f'Image scores<br>μ_dice={np.mean(df_grouped["dice_metric"]):.03f} ±'
            f' {np.std(df_grouped["dice_metric"]):.03f}'
        ),
        width=500,
        autosize=False,
    )
else:
    fig = create_surface_dice_plot(df_grouped["dice_metric"], df_grouped["surface_distance_metric"])

fig.show()

In [15]:
# List the distinct (fold_name, best_epoch_index) pairs, ordered by the best epoch.
# The column selection is stored under its own name because the SQL query refers to the
# data frame via the variable name `df_val_selection`.
df_val_selection = df_val[["epoch_index", "best_epoch_index", "fold_name", "dice_metric_image"]]
sqldf("SELECT DISTINCT fold_name, best_epoch_index FROM df_val_selection ORDER BY best_epoch_index")

Unnamed: 0,fold_name,best_epoch_index
0,"fold_P041,P060,P069",46
1,"fold_P047,P049,P070",52
2,"fold_P044,P050,P059",70
3,"fold_P045,P061,P071",75
4,"fold_P048,P057,P058",79


## Class Scores (best run)

In [16]:
# Aggregation with the default mode; the result has a label_name column which annotates the points
df_label = agg.grouped_metrics()

fig = go.Figure(
    data=[go.Box(y=df_label["dice_metric"], name="dice", boxmean="sd", boxpoints="all", text=df_label["label_name"])]
)
fig.update_yaxes(title_text="Dice metric")
# Must stay the last expression of the cell: update_layout() returns the figure, which is what gets rendered
fig.update_layout(
    title_x=0.5,
    title_text=(
        f"Dice across classes<br>μ_dice={df_label['dice_metric'].mean():.03f} ± {np.std(df_label['dice_metric']):.03f}"
    ),
    width=500,
    autosize=False,
)

Comparison of class dice scores. Each boxplot shows the distribution of dice values across subjects. The image and pixel counts are the sum over all images in the validation sets.

In [17]:
# The class scores figure is only created if confusion matrices were stored in the validation table
if "confusion_matrix" in df_val:
    # An expression inside an `if` block is not rendered automatically (only the last top-level
    # expression of a cell is), so the figure has to be shown explicitly.
    # NOTE(review): assumes create_class_scores_figure() returns a figure with a show() method,
    # like the other create_*_figure helpers used in this notebook — confirm.
    create_class_scores_figure(agg).show()

## Class Imbalances (training time)

In [18]:
# Depends only on the config, not on the validation results.
# NOTE(review): presumably returns a figure that is rendered because it is the cell's last expression — confirm.
create_spec_labels_figure(config)

In [19]:
# Only available if confusion matrices were stored in the validation table.
# NOTE(review): the call sits inside an `if` block, so the notebook does not render a return value;
# presumably show_class_scores_epoch() displays its figure itself — confirm.
if "confusion_matrix" in df_val:
    show_class_scores_epoch(df_val, mapping)

## Confusion Matrix (best run)
Normalized confusion matrix. For each row, you can see how many of the annotated images were classified as the respective class.

In [20]:
# The confusion matrix can only be shown if the matrices were stored in the validation table
if "confusion_matrix" in df_val:
    # Stack the grouped confusion matrices into one array before normalizing them
    cm_stack = np.stack(agg.grouped_cm()["confusion_matrix"].values)
    cm_rel, cm_std = normalize_grouped_cm(cm_stack)

    # Only the relative values are plotted (cm_std is not used in this cell)
    fig_cm = create_confusion_figure(cm_rel, labels=mapping.label_names())
    fig_cm.show()

## ECE Error (best run)

In [21]:
# The ECE figure is only created if the validation table contains an "ece" column
if "ece" in df_val:
    create_ece_figure(df_val).show()