In [None]:
import sys
import os
import json
from tqdm import tqdm

import numpy as np
import pandas as pd
import SimpleITK as sitk

import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Which split to analyse; swap the two assignments to evaluate the test split instead.
test_or_val = "validation"
# test_or_val = "test"

datafolder = f"./contours_compare_{test_or_val}"

# os.walk yields nothing for a missing folder, so a bare next() would raise an
# uninformative StopIteration; fail with a clear message instead.
walk_result = next(os.walk(datafolder), None)
if walk_result is None:
    raise FileNotFoundError(f"Data folder not found: {datafolder}")
root, dirs, files = walk_result

# The order returned by os.walk is filesystem dependent. Sort so that the indices
# printed here (and used to pick a dataset by index) are stable between runs and machines.
dirs.sort()
for i, dirname in enumerate(dirs):
    print(f"{i}: {dirname}")



In [None]:
# Index of the dataset to analyse, as printed by the directory listing cell.
set_number = 1

# Name of the selected dataset folder and its path inside the data folder.
dataset = dirs[set_number]
dataset_path = "/".join([datafolder, dataset])

In [None]:
# Load the dataset's summary statistics when they are available.
summary_path = f"{dataset_path}/summary.json"
try:
    with open(summary_path, encoding="utf-8") as f:
        output_statistics = json.load(f)
except FileNotFoundError:
    # Only a missing file is tolerated here. Any other failure (e.g. malformed JSON,
    # permission problems) is a real error and is allowed to propagate.
    # Define the name anyway so later references fail predictably instead of with a NameError.
    output_statistics = None
    print(f"{dataset} doesn't have a summary json. Continuing without it.")

### Defining a range of dice thresholds

Helper functions that compute the detection metrics for a single Dice threshold. \
The metric used is the F1-score:

$F_1 = \frac{2\,TP}{2\,TP + FP + FN}$


In [None]:
def f1_score(dict_with_cm):
    """Return the F1-score, 2*TP / (2*TP + FP + FN), of a confusion-count dict.

    Parameters
    ----------
    dict_with_cm : dict
        Mapping that holds the counts under the keys "TP", "FP" and "FN".
        Other keys (such as "TN") are ignored.

    Returns
    -------
    float
        The F1-score, or 0.0 when TP, FP and FN are all zero (the score is
        undefined in that case; 0.0 avoids a ZeroDivisionError).
    """
    TP, FP, FN = dict_with_cm["TP"], dict_with_cm["FP"], dict_with_cm["FN"]

    denominator = 2 * TP + FP + FN
    if denominator == 0:
        return 0.0
    return 2 * TP / denominator

def sensitivity(dict_with_cm):
    """Return the sensitivity (recall), TP / (TP + FN), of a confusion-count dict.

    Parameters
    ----------
    dict_with_cm : dict
        Mapping that holds the counts under the keys "TP" and "FN".
        Other keys are ignored.

    Returns
    -------
    float
        The sensitivity, or 0.0 when there are no positive cases (TP + FN == 0),
        where the ratio is undefined.
    """
    TP, FN = dict_with_cm["TP"], dict_with_cm["FN"]

    positives = TP + FN
    if positives == 0:
        return 0.0
    return TP / positives

def specificity(dict_with_cm):
    """Return the specificity, TN / (TN + FP), of a confusion-count dict.

    Parameters
    ----------
    dict_with_cm : dict
        Mapping that holds the counts under the keys "TN" and "FP".
        Other keys are ignored.

    Returns
    -------
    float
        The specificity, or 0.0 when there are no negative cases (TN + FP == 0),
        where the ratio is undefined.
    """
    FP, TN = dict_with_cm["FP"], dict_with_cm["TN"]

    negatives = TN + FP
    if negatives == 0:
        return 0.0
    return TN / negatives

def dice(gt, seg, label=1):
    """Dice score of a segmentation with respect to the ground truth.

    Only elements equal to ``label`` are treated as the region of interest, both
    for the overlap and for the two volumes. Any other values in the arrays
    therefore do not influence the score.

    Parameters
    ----------
    gt : array-like
        Ground-truth mask.
    seg : array-like
        Segmentation mask, same shape as ``gt``.
    label : int, optional
        Value that marks the region to score (default 1).

    Returns
    -------
    float
        2 * |gt ∩ seg| / (|gt| + |seg|), or NaN when neither array contains
        ``label`` (the score is undefined for two empty regions).
    """
    gt_mask = np.asarray(gt) == label
    seg_mask = np.asarray(seg) == label

    sum_of_volumes = np.count_nonzero(gt_mask) + np.count_nonzero(seg_mask)
    if sum_of_volumes == 0:
        # Explicit NaN instead of a 0/0 division that triggers a RuntimeWarning.
        return float("nan")

    twice_the_intersection = 2.0 * np.count_nonzero(gt_mask & seg_mask)
    return twice_the_intersection / sum_of_volumes


In [None]:
# Dice thresholds to evaluate: a regular grid that starts at one step and stops before 0.91.
THRESHOLD_STEP = 0.025
dice_thresholds = np.arange(THRESHOLD_STEP, 0.91, THRESHOLD_STEP)
print(dice_thresholds)

In [None]:
# Per-threshold F1-scores, one entry per value in dice_thresholds.
gg3_f1_scores = []
gg4_f1_scores = []
cribriform_f1_scores = []
foreground_f1_scores = []

path_to_gt = f"{dataset_path}/ground_truth"
path_to_mp = f"{dataset_path}/model_output"

# Label values found in the masks, and the pseudo-label under which the combined
# foreground-vs-background result is stored.
PATTERN_LABELS = [1, 2, 3]
FOREGROUND_LABEL = 4

# Which result list receives the F1-score of which label.
f1_scores_by_label = {
    1: gg3_f1_scores,
    2: gg4_f1_scores,
    3: cribriform_f1_scores,
    FOREGROUND_LABEL: foreground_f1_scores,
}


def _classify_region(gt_mask, seg_mask):
    """Classify one case for one label.

    Returns "TN", "FP" or "FN" when at least one of the two binary masks is empty
    (the outcome then does not depend on the Dice threshold), otherwise the Dice
    score of the two masks.
    """
    gt_present = bool(np.any(gt_mask))
    seg_present = bool(np.any(seg_mask))

    if not gt_present and not seg_present:
        return "TN"
    if not gt_present:
        return "FP"
    if not seg_present:
        return "FN"
    return dice(gt_mask, seg_mask)


# Pass 1: read every case once. Neither the masks nor the Dice scores depend on the
# threshold, so there is no need to reload the images for each threshold.
case_outcomes = []
for patient_filename in tqdm(sorted(os.listdir(path_to_gt)), desc="Reading cases"):
    gt_array = sitk.GetArrayFromImage(sitk.ReadImage(f"{path_to_gt}/{patient_filename}"))
    mp_array = sitk.GetArrayFromImage(sitk.ReadImage(f"{path_to_mp}/{patient_filename}"))

    # For every label except background, consider only the regions of that label.
    outcomes = {
        label: _classify_region(
            np.where(gt_array == label, 1, 0),
            np.where(mp_array == label, 1, 0),
        )
        for label in PATTERN_LABELS
    }

    # The same once more without distinguishing between the patterns:
    # foreground vs background only.
    outcomes[FOREGROUND_LABEL] = _classify_region(
        np.where(gt_array != 0, 1, 0),
        np.where(mp_array != 0, 1, 0),
    )
    case_outcomes.append(outcomes)

# Pass 2: turn the stored outcomes into confusion counts for every threshold.
for dice_threshold in tqdm(dice_thresholds, desc="Evaluating thresholds"):
    stats_dicts = {
        label: {"TP": 0, "FP": 0, "TN": 0, "FN": 0}
        for label in f1_scores_by_label
    }

    for outcomes in case_outcomes:
        for label, outcome in outcomes.items():
            if isinstance(outcome, str):
                # "TN", "FP" or "FN": already decided by the emptiness of the masks.
                stats_dicts[label][outcome] += 1
            elif outcome > dice_threshold:
                stats_dicts[label]["TP"] += 1
            else:
                # Both masks contain the label but overlap too little: counted as a miss.
                stats_dicts[label]["FN"] += 1

    print(stats_dicts)
    # Saving the resulting f1_scores
    for label, label_scores in f1_scores_by_label.items():
        score = f1_score(stats_dicts[label])
        print(f"{label} : {score}")
        label_scores.append(score)



### Saving outputs to file

In [None]:
# Bundle the evaluated thresholds (rounded to 4 decimals) with the F1-scores per pattern
# and write them as JSON into the dataset folder.
rounded_thresholds = [round(threshold, 4) for threshold in dice_thresholds]
dataset_f1scores = dict(
    dice_thresholds=rounded_thresholds,
    GG3=gg3_f1_scores,
    GG4=gg4_f1_scores,
    Cribriform=cribriform_f1_scores,
)

f1scores_file = f"{dataset_path}/{dataset}_f1scores.json"
with open(f1scores_file, mode="w", encoding="utf-8") as f:
    json.dump(dataset_f1scores, f, ensure_ascii=False, indent=4)

In [None]:
# NOTE(review): replaces the first GG3 F1-score (the entry for the first Dice threshold)
# with a hard-coded value. In top-to-bottom order this runs after the JSON file is written,
# so later cells that read gg3_f1_scores see this value rather than the computed one.
# The origin of the number is not visible in this notebook — confirm or remove.
gg3_f1_scores[0] = 0.6129032258064516

In [None]:
# Second name for the same list object (an alias, not a copy); changes to
# gg3_f1_scores are visible through `test` and vice versa.
test = gg3_f1_scores

In [None]:
# Show the per-threshold F1-scores of every pattern, one list per line.
for pattern_scores in (gg3_f1_scores, gg4_f1_scores, cribriform_f1_scores):
    print(pattern_scores)

# Cell output: the first three entries of `test`.
test[:3]


In [None]:
# NOTE(review): `dataset_005_scores` is not assigned anywhere in the visible part of this
# notebook; on a fresh kernel this cell presumably raises NameError — confirm where the
# name comes from, or remove the cell.
dataset_005_scores

In [None]:
# Global look of all figures below: seaborn dark grid plus explicit font sizes.
sns.set_style("darkgrid")

SMALL_SIZE = 8
MEDIUM_SIZE = 12
BIGGER_SIZE = 16

# (rc group, settings) pairs, applied in order.
rc_font_settings = [
    ("font", {"size": MEDIUM_SIZE}),         # default text size
    ("axes", {"titlesize": 24}),             # axes title
    ("axes", {"labelsize": 14}),             # x and y labels
    ("xtick", {"labelsize": MEDIUM_SIZE}),   # x tick labels
    ("ytick", {"labelsize": MEDIUM_SIZE}),   # y tick labels
    ("legend", {"fontsize": BIGGER_SIZE}),   # legend
    ("figure", {"titlesize": BIGGER_SIZE}),  # figure title
]
for rc_group, rc_params in rc_font_settings:
    plt.rc(rc_group, **rc_params)

In [None]:
# F1-score as a function of the Dice threshold, one line per pattern.
fig, axis = plt.subplots(figsize=(7, 6))

f1_series = [
    ("GG3", gg3_f1_scores),
    ("GG4", gg4_f1_scores),
    ("Cribriform", cribriform_f1_scores),
]
for series_label, series_scores in f1_series:
    sns.lineplot(x=dice_thresholds, y=series_scores, ax=axis, label=series_label)

# Vertical marker at a Dice threshold of 0.1.
axis.axvline(0.1, color="red", linestyle="dashed", linewidth=1, label="Dice == 0.1")
axis.legend()
axis.set_xlabel("Dice threshold")
axis.set_ylabel("F1-score")
axis.set_title("F1-scores for region detection\n")


In [None]:
# Look up the F1-scores at the usual Dice threshold.
USUAL_DICE_THRESHOLD = 0.1

# The thresholds are floats produced by np.arange, so an exact `==` comparison can miss
# the intended entry through rounding error; compare with a tolerance instead.
matching_indices = np.flatnonzero(np.isclose(dice_thresholds, USUAL_DICE_THRESHOLD))
if matching_indices.size == 0:
    raise ValueError(
        f"{USUAL_DICE_THRESHOLD} is not one of the evaluated Dice thresholds"
    )
index_of_usual_threshold = int(matching_indices[0])

print(gg3_f1_scores[index_of_usual_threshold])
print(gg4_f1_scores[index_of_usual_threshold])
print(cribriform_f1_scores[index_of_usual_threshold])

In [None]:
# Sensitivity and specificity as a function of the Dice threshold, one line per
# pattern and metric.
fig, axis = plt.subplots(figsize=(8, 6))

# NOTE(review): the *_sensitivity_scores and *_specificity_scores lists are not assigned
# in the visible part of this notebook — confirm where they are computed.
metric_series = [
    ("GG3 sensitivity", gg3_sensitivity_scores),
    ("GG4 sensitivity", gg4_sensitivity_scores),
    ("Cribriform sensitivity", cribriform_sensitivity_scores),
    ("GG3 specificity", gg3_specificity_scores),
    ("GG4 specificity", gg4_specificity_scores),
    ("Cribriform specificity", cribriform_specificity_scores),
]
for series_label, series_values in metric_series:
    sns.lineplot(x=dice_thresholds, y=series_values, ax=axis, label=series_label)

axis.legend()
axis.set_xlabel("Dice threshold")
axis.set_ylabel("Value of metric")
axis.set_title("All-modalities dataset sensitivity and specificity for different TP-thresholds of Volumetric Dice score")

In [None]:
# print(len(os.listdir(path_to_simplified_gt)))
# print(len(foreground_background_dices))
# print(len(data))
# data.loc[16]
# # for dice in foreground_background_dices:
# #     print(dice)

In [None]:
# Attach the first 42 foreground/background Dice scores as a column and show it next
# to the per-structure columns.
# NOTE(review): `data` and `foreground_background_dices` are not assigned in the visible
# part of this notebook, and the reason for the cut-off at 42 is not shown — confirm.
fgbg_column = "fgbg_dices"
data[fgbg_column] = foreground_background_dices[:42]
data[["filename", "structure", "volumetric_dice", fgbg_column]]


In [None]:
# Show the two previously saved F1-score figures side by side.
path_to_imgs = "../../exploratory-data-analysis"
all_modalities_fig = plt.imread(f"{path_to_imgs}/region_detection_f1_scores_allmodalities.png")
t2adc_fig = plt.imread(f"{path_to_imgs}/region_detection_f1_scores_t2_adc.png")

fig, axs = plt.subplots(1, 2, figsize=(16, 10))
for ax, image in zip(axs, (all_modalities_fig, t2adc_fig)):
    ax.imshow(image)
    # The images are complete figures with their own axes; hide the pixel-coordinate
    # ticks and frame of the enclosing axes. (The earlier plt.xticks(None) call only
    # queried the ticks and hid nothing.)
    ax.set_axis_off()