# Dataset analysis 

In [1]:
import numpy as np
import math
import pandas as pd
import os.path
import matplotlib.pyplot as plt
from typing import List, Dict
import seaborn as sns
import json
from matplotlib import cm
from pprint import pformat
import json


# Matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8"
# and later releases no longer accept the bare "seaborn" name, so use the
# new name when this Matplotlib ships it and the legacy one otherwise.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")
# Last expression of the cell: the notebook displays the returned palette.
# NOTE(review): color_palette() only returns the palette; if the intent was
# to make it the default colour cycle, sns.set_palette("colorblind") is needed.
sns.color_palette("colorblind")

In [None]:
def load_json(filename : str) -> Dict:
    """Read a JSON file and return its decoded content.

    The file is opened explicitly as UTF-8 so that decoding does not depend
    on the locale of the machine the notebook runs on.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The decoded JSON document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        return json.load(f)

In [None]:
# Template of the per-dimension results directory; the 'X' placeholder is
# replaced by the dimension index below.
base_path_raw = r'/media/jan/DataStorage/ProjectData/temp/results_weighted_dataset_X_contrast_3'
# Absolute results directory for each of the three dimensions, indexed by dimension.
base_path = list(map(
    os.path.abspath,
    (base_path_raw.replace('X', str(dim_index)) for dim_index in range(3)),
))

In [None]:
def get_results_frame(dim : int, csv_name = 'parameter_overview') -> pd.DataFrame:
    """Collect the final scores of all experiments of one dimension in a table.

    Every non-hidden sub-directory of ``base_path[dim]`` is treated as one
    experiment. Its ``score_dict_final.json`` provides the scores and its
    ``exp_dict.json`` provides the configuration columns. Folders without a
    readable score dict are reported and skipped.

    Args:
        dim: Index into ``base_path`` selecting the results directory.
        csv_name: Prefix of the file the table is written to
            (``f'{csv_name}_dim{dim}'``).

    Returns:
        One row per experiment, sorted by ``test_weighted_dice`` and then
        ``test_dice`` in descending order with a fresh integer index. The
        frame is empty when no experiment could be read.
    """
    root = base_path[dim]
    model_folders = [
        os.path.join(root, entry)
        for entry in os.listdir(root)
        if not entry.startswith('.') and os.path.isdir(os.path.join(root, entry))
    ]

    results = []
    for folder in model_folders:
        try:
            r = load_json(os.path.join(folder, 'score_dict_final.json'))
        except (OSError, ValueError):
            # Missing/unreadable file (OSError) or invalid JSON (JSONDecodeError
            # is a ValueError): the experiment has no usable final scores.
            print(f'No score dict for folder {folder}.')
            continue
        exp_dict = load_json(os.path.join(folder, 'exp_dict.json'))
        dataset_cfg = exp_dict['dataset']
        loss = exp_dict['model']['loss']
        r.update({
            'model_base' : exp_dict['model']['base'],
            'context_span' : dataset_cfg['context_span'],
            'blob_points' : dataset_cfg['blob_points'],
            'bg_points' : dataset_cfg['bg_points'],
            'loss' : loss,
            'foldername' : os.path.basename(folder),
            'sources' : dataset_cfg['sources'],
            # Flags telling which loss terms are part of the configured loss.
            'separation_loss' : ('separation_loss' in loss),
            'prior_extend' : ('prior_extend' in loss),
            'weighted_point_loss' : ('rot_point_loss_multi_weighted' in loss)
        })
        results.append(r)

    results = pd.DataFrame(results)
    if not results.empty:
        # Sorting an empty frame would fail because the score columns do not exist.
        results = results.sort_values(['test_weighted_dice', 'test_dice'], ascending = False).reset_index(drop=True)
    # NOTE(review): the file name carries no '.csv' extension; kept unchanged
    # so that existing consumers of the file keep working.
    results.to_csv(f'{csv_name}_dim{dim}')
    return results

In [None]:
# One results table per dimension, keyed by the dimension index
# (building each table also writes its overview file).
results_dict = dict((dim, get_results_frame(dim)) for dim in range(3))
# Overview of the dimension-0 experiments, restricted to the columns of interest.
results_dict[0].loc[:, ['context_span', 'blob_points', 'bg_points', 'loss', 'separation_loss', 'weighted_point_loss', 'prior_extend', 'sources', 'test_weighted_dice', 'foldername']]

In [None]:
# Overview of the dimension-1 experiments, restricted to the columns of interest.
results_dict[1].loc[:, ['context_span', 'blob_points', 'bg_points', 'loss', 'separation_loss', 'weighted_point_loss', 'prior_extend', 'sources', 'test_weighted_dice', 'foldername']]

In [None]:
# Overview of the dimension-2 experiments, restricted to the columns of interest.
results_dict[2].loc[:, ['context_span', 'blob_points', 'bg_points', 'loss', 'separation_loss', 'weighted_point_loss', 'prior_extend', 'sources', 'test_weighted_dice', 'foldername']]

Investigate the influence of leaving out a loss term.

In [None]:
def _plot_weighted_panel(frame, title, show_legend):
    """Draw both dice scores of `frame` as horizontal bars on the current axes.

    The y tick labels are derived from each row's `weighted_point_loss` flag.
    The rows are sorted by score, so a fixed position-to-label mapping could
    attach the wrong label to a bar.
    """
    frame.plot.barh(x = 'weighted_point_loss', y = ['test_weighted_dice', 'test_dice'], ax=plt.gca(), legend=show_legend)
    plt.title(title)
    tick_labels = ['weighted point loss' if flag else 'non-weighted point loss'
                   for flag in frame['weighted_point_loss']]
    # pandas draws the bar of row k at y position k.
    plt.yticks(ticks = list(range(len(tick_labels))), labels = tick_labels)
    plt.ylabel('')


# Three stacked panels (one per dimension) sharing the score axis.
plt.figure(figsize=(8,12))
ax = plt.subplot(3,1,1)
# NOTE(review): this panel selects blob_points == 3 & bg_points == 10 while
# the other two use 5 / 5 - confirm that this is intended.
res_select = results_dict[0].query('context_span == 1 & blob_points == 3 & bg_points == 10')
_plot_weighted_panel(res_select, 'Transverse slice model performance', False)

plt.subplot(3,1,2, sharex = ax)
res_select = results_dict[1].query('context_span == 1 & blob_points == 5 & bg_points == 5')
_plot_weighted_panel(res_select, 'dim 1 slice model performance', False)

plt.subplot(3,1,3, sharex = ax)
res_select = results_dict[2].query('context_span == 1 & blob_points == 5 & bg_points == 5 & separation_loss')
_plot_weighted_panel(res_select, 'Dim 2 slice model performance', True)
# Only the last panel carries the legend; it is placed below its axes.
plt.legend(loc='lower center', bbox_to_anchor=(0.5, -0.20),
          ncol=2)

plt.suptitle('weighted point loss vs non-weighted point loss')
plt.tight_layout()

plt.savefig('weightedvsnonweighted.png')

In [None]:
def _label_loss_variants(frame):
    """Label the bars on the current axes by the loss terms each row leaves out.

    The labels are derived from the `separation_loss` and `prior_extend`
    flags of each row. The rows are sorted by score, so a fixed
    position-to-label mapping could attach the wrong label to a bar.
    """
    tick_labels = []
    for has_separation, has_prior in zip(frame['separation_loss'], frame['prior_extend']):
        left_out = []
        if not has_separation:
            left_out.append('no separation loss')
        if not has_prior:
            left_out.append('no prior extend')
        tick_labels.append(', '.join(left_out) if left_out else '4 loss components')
    # pandas draws the bar of row k at y position k.
    plt.yticks(ticks = list(range(len(tick_labels))), labels = tick_labels)
    plt.ylabel('')


# Two panels (dimension 0 and dimension 2) sharing the score axis; the figure
# grid has three rows, of which only the first two are used.
plt.figure(figsize=(8,8))
ax = plt.subplot(3,1,1)
res_select = results_dict[0].query('context_span == 1 & blob_points == 5 & bg_points == 5')
res_select.plot.barh(x = 'loss', y = ['test_weighted_dice', 'test_dice'], ax=plt.gca(), legend=False)
plt.title('Transverse slice model performance')
_label_loss_variants(res_select)

plt.subplot(3,1,2, sharex = ax)
res_select = results_dict[2].query('context_span == 1 & blob_points == 5 & bg_points == 5 & not weighted_point_loss')
res_select.plot.barh(x = 'loss', y = ['test_weighted_dice', 'test_dice'], ax=plt.gca())
plt.title('Sagittal slice model performance')
_label_loss_variants(res_select)

# The legend of the last panel is placed below its axes.
plt.legend(loc='lower center', bbox_to_anchor=(0.5, -0.50),
          ncol=2)

plt.suptitle('Influence of loss components')
plt.tight_layout()

plt.savefig('Losscomponents.png')

In [None]:
def _select_and_pivot(frame):
    """Return the filtered experiments of `frame` and their blob/bg pivot table.

    Keeps rows with context_span == 1, a non-weighted point loss and a
    separation loss whose folder name contains no underscore. The pivot has
    `blob_points` as index, `bg_points` as columns and `test_weighted_dice`
    as values.
    """
    selection = frame.query('context_span == 1 & weighted_point_loss == False & separation_loss')
    # `~` is the boolean negation of the mask.
    selection = selection[~selection.foldername.str.contains('_')]
    table = selection.pivot(index = 'blob_points', columns = 'bg_points', values = 'test_weighted_dice')
    return selection, table


_, res_1 = _select_and_pivot(results_dict[1])
# res_select ends up holding the dimension-2 selection.
res_select, res_2 = _select_and_pivot(results_dict[2])

In [None]:
# Weighted dice score over the number of blob points, one line per bg_points
# value: res_1 in the upper panel, res_2 in the lower one.
plt.figure(figsize=(8,8))
plt.subplot(2,1,1)
ax = res_1.plot(ax = plt.gca(), legend=False, linestyle = ':', marker = '*')
# The x axis shows the pivot index (blob_points); the score is on the y axis.
plt.xlabel('blob_points')
plt.title('Coronal slice model performance')
ax.set_ylabel("weighted dice score")
# The lower panel shares both axes with the upper one so the scores are comparable.
plt.subplot(2,1,2, sharex = ax, sharey = ax)
ax = res_2.plot(ax = plt.gca(), linestyle = ':', marker = '*')
plt.title('Sagittal slice model performance')
ax.set_ylabel("weighted dice score")

plt.suptitle('Number of blob_points')

plt.savefig('BlobPoints_influence.png')

In [None]:
# For each dimension pick the first matching experiment (the tables are sorted
# by score, best first) and export its experiment dict for reconstruction.
# NOTE(review): the dimension-0 query has no bg_points filter, unlike the
# other two - confirm that this is intended.
reconstruct_queries = (
    'context_span == 1 & weighted_point_loss == False & separation_loss & blob_points == 1',
    'context_span == 1 & weighted_point_loss == False & separation_loss & bg_points == 3 & blob_points == 1',
    'context_span == 1 & weighted_point_loss == False & separation_loss & bg_points == 3 & blob_points == 1',
)

reconstruct_foldernames = []
for dim, selection_query in enumerate(reconstruct_queries):
    res_select = results_dict[dim].query(selection_query)
    # Drop experiments whose folder name contains an underscore
    # (`~` is the boolean negation of the mask).
    res_select = res_select[~res_select.foldername.str.contains('_')]
    print(res_select[['foldername', 'test_weighted_dice']])
    if res_select.empty:
        raise IndexError(f'No experiment of dimension {dim} matches: {selection_query}')
    reconstruct_foldernames.append(res_select.foldername.iloc[0])


exp_dict_reconstruct = dict()
for i, foldername in enumerate(reconstruct_foldernames):
    folder = os.path.join(base_path[i], foldername)
    print(folder)
    exp_dict = load_json(os.path.join(folder, 'exp_dict.json'))
    # Remember which experiment folder the configuration came from.
    exp_dict['hash'] = foldername
    exp_dict_reconstruct[i] = exp_dict

# The integer dimension keys are written as strings by the JSON encoder.
with open('exp_dict_reconstruct.json', 'w') as f:
    json.dump(exp_dict_reconstruct, f)
    

In [None]:

# Folder name of the first row of the current selection.
res_select['foldername'].iloc[0]

In [None]:
# Test-set weighted dice per context span, one column per (loss, model_base) pair.
test_wd = result.pivot(index = 'context_span', columns = ['loss', 'model_base'], values = 'test_weighted_dice')
# Order the columns so the subplots appear in a reproducible order.
test_wd = test_wd.reindex(sorted(test_wd.columns), axis=1)
print(test_wd.head())
# DataFrame.round replaces the deprecated element-wise applymap(round).
test_wd.round(2).to_html('test_wd.html')
test_wd.plot(kind='bar', subplots = True, layout = (2, 3), legend = False, figsize = (12, 6), ylim = (0.5, 1))
plt.suptitle('weighted dice score - Test')
plt.savefig('full_test_wd.png')

In [None]:
# Test-set dice per context span, one column per (loss, model_base) pair.
test_d = result.pivot(index = 'context_span', columns = ['loss', 'model_base'], values = 'test_dice')
# Order the columns so the subplots appear in a reproducible order.
test_d = test_d.reindex(sorted(test_d.columns), axis=1)
print(test_d.head())
# DataFrame.round replaces the deprecated element-wise applymap(round).
test_d.round(2).to_html('test_d.html')
test_d.plot(kind='bar', subplots = True, layout = (2, 3), legend = False, figsize = (12, 6), ylim = (0.5, 1))
plt.suptitle('dice score - Test')
plt.savefig('full_test_d.png')

In [None]:
# Train-set weighted dice per context span, one column per (loss, model_base) pair.
train_wd = result.pivot(index = 'context_span', columns = ['loss', 'model_base'], values = 'train_weighted_dice')
# Order the columns so the subplots appear in a reproducible order.
train_wd = train_wd.reindex(sorted(train_wd.columns), axis=1)
# DataFrame.round replaces the deprecated element-wise applymap(round).
train_wd.round(2).to_html('train_wd.html')
print(train_wd)
train_wd.plot(kind='bar', subplots = True, layout = (2, 3), legend = False, figsize = (12, 6), ylim = (0.5, 1))
plt.suptitle('weighted dice score - Train')
plt.savefig('full_train_wd.png')

In [None]:
# Train-set dice per context span, one column per (loss, model_base) pair.
# The original pivoted 'test_dice' and then reindexed train_wd, so the
# "Train dice" table and figure actually showed the weighted train dice.
# NOTE(review): assumes the score dict provides a 'train_dice' entry parallel
# to 'test_dice' - confirm against score_dict_final.json.
train_d = result.pivot(index = 'context_span', columns = ['loss', 'model_base'], values = 'train_dice')
# Order the columns so the subplots appear in a reproducible order.
train_d = train_d.reindex(sorted(train_d.columns), axis=1)
# DataFrame.round replaces the deprecated element-wise applymap(round).
train_d.round(2).to_html('train_d.html')
print(train_d)
train_d.plot(kind='bar', subplots = True, layout = (2, 3), legend = False, figsize = (12, 6), ylim = (0.5, 1))
plt.suptitle('dice score - Train')
plt.savefig('train_d.png')

In [None]:
# Pick the experiment trained with weighted cross entropy on fcn8_vgg16 at
# context span 3 and load its per-epoch score table from the dimension-2 results.
matching_runs = result.query("context_span == 3 & loss == 'weighted_cross_entropy' & model_base == 'fcn8_vgg16'")
foldername = matching_runs.foldername.iloc[0]
print(foldername)
score_csv = os.path.join(base_path[2], foldername, 'score_df.csv')
score_df = pd.read_csv(score_csv, index_col=0)

# Upper axes: validation and training weighted dice; lower axes: training
# loss on a logarithmic y scale.
f, ax = plt.subplots(2)
score_df.plot(x = 'epoch', y = 'val_weighted_dice', ax = ax[0])
score_df.plot(x = 'epoch', y = 'train_weighted_dice', ax = ax[0])
score_df.plot(x = 'epoch', y = 'train_loss', logy = True, ax = ax[1])

plt.savefig('full_learning_curve.png')



In [None]:
# Load the test metrics of the selected experiment: one table per data source
# plus the table over all sources.
Myo_df, USieg_df, xVert_df, all_df = (
    pd.read_csv(os.path.join(base_path[2], foldername, metrics_file), index_col=0)
    for metrics_file in (
        'test_metrics_MyoSegmenTUM_df.csv',
        'test_metrics_USiegen_df.csv',
        'test_metrics_xVertSeg_df.csv',
        'test_metrics_df.csv',
    )
)

# Gather the 'dice' column of every table side by side, one column per source.
temp = pd.DataFrame()
for name, df in (('MyoSegmentum', Myo_df), ('USiegen', USieg_df), ('xVertSeg', xVert_df), ('total', all_df)):
    temp[name] = df['dice']

temp.plot(kind = 'barh', xlim = (.5, 1), figsize=(12, 6))
plt.savefig('full_perSource.png')

In [None]:
# Results directory per dimension; the 'X' placeholder is the dimension index.
base_path_raw = r'/media/jan/DataStorage/ProjectData/temp/results_weighted_dataset_X_contrast_3'
base_path = [os.path.abspath(base_path_raw.replace('X', str(i))) for i in range(3)]
# One model folder per dimension.
folders = []
for bp in base_path:
    # os.listdir returns entries in arbitrary order; sort them so that the
    # folder picked below is the same on every run.
    model_folders = sorted(
        os.path.join(bp, o)
        for o in os.listdir(bp)
        if os.path.isdir(os.path.join(bp, o)) and not o.startswith('.')
    )
    if not model_folders:
        raise IndexError(f'No model folder found in {bp}.')
    folders.append(model_folders[0])

folders

In [None]:
# One row per dimension: final scores plus the main configuration values of
# the model folder selected for that dimension.
result = []
for i in range(3):
    r = load_json(os.path.join(folders[i], 'score_dict_final.json'))
    exp_dict = load_json(os.path.join(folders[i], 'exp_dict.json'))
    r.update({
            'model_base' : exp_dict['model']['base'],
            'context_span' : exp_dict['dataset']['context_span'],
            'loss' : exp_dict['model']['loss'],
            # Name of this row's own folder; the original read a stale global
            # `folder` left over from an earlier cell.
            'foldername' : os.path.basename(folders[i]),
            'dim' : i
        })
    result.append(r)

# Best model first.
result = pd.DataFrame(result).sort_values(['test_weighted_dice', 'test_dice'], ascending = False)
result

In [None]:
# For every dimension, compare the dice scores of the individual data sources
# with the score over all sources and save one bar chart per dimension.
for i in range(3):
    Myo_df, USieg_df, xVert_df, all_df = (
        pd.read_csv(os.path.join(folders[i], metrics_file), index_col=0)
        for metrics_file in (
            'test_metrics_MyoSegmenTUM_df.csv',
            'test_metrics_USiegen_df.csv',
            'test_metrics_xVertSeg_df.csv',
            'test_metrics_df.csv',
        )
    )

    # Gather the 'dice' column of every table side by side, one column per source.
    temp = pd.DataFrame()
    for name, df in (('MyoSegmentum', Myo_df), ('USiegen', USieg_df), ('xVertSeg', xVert_df), ('total', all_df)):
        temp[name] = df['dice']

    temp.plot(kind = 'barh', title=f'dice score model for dimension {i}', figsize=(8, 6))
    plt.savefig(f'dim_{i}_perSource.png')

In [None]:
# Save one learning-curve figure per dimension: weighted dice (validation and
# training) on the upper axes, training loss on the lower axes (linear scale).
for i in range(3):
    score_csv = os.path.join(folders[i], 'score_df.csv')
    score_df = pd.read_csv(score_csv, index_col=0)

    f, ax = plt.subplots(2)
    score_df.plot(x = 'epoch', y = 'val_weighted_dice', ax = ax[0])
    score_df.plot(x = 'epoch', y = 'train_weighted_dice', ax = ax[0])
    score_df.plot(x = 'epoch', y = 'train_loss', logy = False, ax = ax[1])
    plt.suptitle(f'learning curve dimension {i}')

    plt.savefig(f'weakly_dim{i}_learning_curve.png')