# Dataset analysis 

In [1]:
import SimpleITK as sitk
import numpy as np
import pandas as pd
import os.path
import re
import matplotlib.pyplot as plt
from typing import List
import seaborn as sns
import json
from matplotlib import cm
from pprint import pformat

import tikzplotlib

plt.style.use("seaborn")
sns.color_palette("colorblind")

In [2]:
base_path_raw = os.path.abspath('/media/jan/USB1/')
# Make list with subdirectories
tests = [os.path.join(base_path_raw, o) for o in os.listdir(base_path_raw) if os.path.isdir(os.path.join(base_path_raw,o)) and not o.startswith('.')]

reference = tests[0]
reference_selected = {split : pd.read_csv(os.path.join(reference, f'{split}_selected.csv')) for split in ['train', 'test', 'val']}

for test in tests:
    candidate_selected = {split : pd.read_csv(os.path.join(test, f'{split}_selected.csv')) for split in ['train', 'test', 'val']}
    correspondance = {split : candidate_selected[split].equals(reference_selected[split]) for split in ['train', 'test', 'val']}
    print(f'For test {test}, the correspondance between reference and candidate is : \n\t{pformat(correspondance)}')

For test /media/jan/USB1/test1, the correspondance between reference and candidate is : 
	{'test': True, 'train': True, 'val': True}
For test /media/jan/USB1/test2, the correspondance between reference and candidate is : 
	{'test': True, 'train': True, 'val': True}


In [3]:
reference_selected['train'].head()

Unnamed: 0.1,Unnamed: 0,img,tgt,scan_id,slice_id,patient,source,crop_nr
0,755,/root/space/output/dataset_2_contrast_3/xVertS...,/root/space/output/dataset_2_contrast_3/xVertS...,xVertSeg_005,9,xVertSeg_005,xVertSeg,4
1,756,/root/space/output/dataset_2_contrast_3/xVertS...,/root/space/output/dataset_2_contrast_3/xVertS...,xVertSeg_005,397,xVertSeg_005,xVertSeg,0
2,757,/root/space/output/dataset_2_contrast_3/xVertS...,/root/space/output/dataset_2_contrast_3/xVertS...,xVertSeg_005,192,xVertSeg_005,xVertSeg,3
3,758,/root/space/output/dataset_2_contrast_3/xVertS...,/root/space/output/dataset_2_contrast_3/xVertS...,xVertSeg_005,183,xVertSeg_005,xVertSeg,3
4,759,/root/space/output/dataset_2_contrast_3/xVertS...,/root/space/output/dataset_2_contrast_3/xVertS...,xVertSeg_005,355,xVertSeg_005,xVertSeg,4


In [4]:
counts_slices = pd.concat([df.groupby(['source']).scan_id.count().rename(name) for name, df in reference_selected.items()], axis= 1)
counts_slices.loc['total'] = counts_slices.sum(axis=0)
counts_slices.loc[:,'total'] = counts_slices.sum(axis=1)
counts_slices

Unnamed: 0_level_0,train,test,val,total
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
MyoSegmenTUM,3170,723,861,4754
USiegen,906,145,103,1154
xVertSeg,3995,1251,1019,6265
total,8071,2119,1983,12173


In [5]:
counts_patients = pd.concat([df.drop_duplicates('patient', keep='first').groupby(['source']).scan_id.count().rename(name) for name, df in reference_selected.items()], axis= 1)
counts_patients.loc['total'] = counts_patients.sum(axis=0)
counts_patients.loc[:,'total'] = counts_patients.sum(axis=1)
counts_patients

Unnamed: 0_level_0,train,test,val,total
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
MyoSegmenTUM,36,9,9,54
USiegen,7,2,1,10
xVertSeg,9,3,3,15
total,52,14,13,79


In [6]:
cols = [col for col in reference_selected['train'].columns if col != 'crop_nr']

In [7]:
test = reference_selected['train'][cols].head(10)
crop_nr = pd.DataFrame({'crop_nr': [i for i in range(5)]})

In [17]:
for groupname, group in reference_selected['train'].groupby('scan_id'):
    print(groupname)
    print(type(group))
    print(group.head())

et_2_contrast_3/USiege...   

                                                    tgt      scan_id  \
4256  /root/space/output/dataset_2_contrast_3/USiege...  USiegen_007   
4257  /root/space/output/dataset_2_contrast_3/USiege...  USiegen_007   
4258  /root/space/output/dataset_2_contrast_3/USiege...  USiegen_007   
4259  /root/space/output/dataset_2_contrast_3/USiege...  USiegen_007   
4260  /root/space/output/dataset_2_contrast_3/USiege...  USiegen_007   

      slice_id      patient   source  crop_nr  
4256         9  USiegen_015  USiegen        4  
4257        11  USiegen_015  USiegen        0  
4258        39  USiegen_015  USiegen        0  
4259        28  USiegen_015  USiegen        0  
4260        24  USiegen_015  USiegen        4  
USiegen_008
<class 'pandas.core.frame.DataFrame'>
      Unnamed: 0                                                img  \
4531        6863  /root/space/output/dataset_2_contrast_3/USiege...   
4532        6864  /root/space/output/dataset_2_contrast_3

In [30]:
crop_nr

Unnamed: 0,crop_nr
0,0
1,1
2,2
3,3
4,4


In [31]:
test.merge(crop_nr, how='cross')

Unnamed: 0.1,Unnamed: 0,img,tgt,scan_id,slice_id,patient,source,crop_nr
0,755,/root/space/output/dataset_2_contrast_3/xVertS...,/root/space/output/dataset_2_contrast_3/xVertS...,xVertSeg_005,9,xVertSeg_005,xVertSeg,0
1,755,/root/space/output/dataset_2_contrast_3/xVertS...,/root/space/output/dataset_2_contrast_3/xVertS...,xVertSeg_005,9,xVertSeg_005,xVertSeg,1
2,755,/root/space/output/dataset_2_contrast_3/xVertS...,/root/space/output/dataset_2_contrast_3/xVertS...,xVertSeg_005,9,xVertSeg_005,xVertSeg,2
3,755,/root/space/output/dataset_2_contrast_3/xVertS...,/root/space/output/dataset_2_contrast_3/xVertS...,xVertSeg_005,9,xVertSeg_005,xVertSeg,3
4,755,/root/space/output/dataset_2_contrast_3/xVertS...,/root/space/output/dataset_2_contrast_3/xVertS...,xVertSeg_005,9,xVertSeg_005,xVertSeg,4
5,756,/root/space/output/dataset_2_contrast_3/xVertS...,/root/space/output/dataset_2_contrast_3/xVertS...,xVertSeg_005,397,xVertSeg_005,xVertSeg,0
6,756,/root/space/output/dataset_2_contrast_3/xVertS...,/root/space/output/dataset_2_contrast_3/xVertS...,xVertSeg_005,397,xVertSeg_005,xVertSeg,1
7,756,/root/space/output/dataset_2_contrast_3/xVertS...,/root/space/output/dataset_2_contrast_3/xVertS...,xVertSeg_005,397,xVertSeg_005,xVertSeg,2
8,756,/root/space/output/dataset_2_contrast_3/xVertS...,/root/space/output/dataset_2_contrast_3/xVertS...,xVertSeg_005,397,xVertSeg_005,xVertSeg,3
9,756,/root/space/output/dataset_2_contrast_3/xVertS...,/root/space/output/dataset_2_contrast_3/xVertS...,xVertSeg_005,397,xVertSeg_005,xVertSeg,4
