In [1]:
import pandas as pd
import ismrmrd
from glob import glob
import os
import h5py
import numpy as np

  from ._conv import register_converters as _register_converters


In [None]:
def get_df(path, datasetName, start_slice, end_slice):
    files = glob(os.path.join(path, '*.h5'))
    df = pd.DataFrame()
    end_slice_is_last_slice = True if end_slice == -1 else False 
    for f in files:
        with h5py.File(f, 'r') as hf:
            folderDirectory = path
            filename = f.split('/')[-1]
            protocolName = hf.attrs['acquisition']
            numSlices, receiverChannels, encodeX, encodeY = hf['kspace'].shape
            start_slice = start_slice
            end_slice = numSlices if end_slice_is_last_slice else end_slice
            _, reconX, reconY = hf['reconstruction_rss'].shape
            xml_header = hf['ismrmrd_header'][()]
            header = ismrmrd.xsd.CreateFromDocument(xml_header)
            systemVendor = header.acquisitionSystemInformation.systemVendor
            systemModel =  header.acquisitionSystemInformation.systemModel
            systemFieldStrength_T =  header.acquisitionSystemInformation.systemFieldStrength_T
            institutionName =  header.acquisitionSystemInformation.institutionName
            sequence_type = header.sequenceParameters.sequence_type
            new_row = {
                'folderDirectory': folderDirectory,
                'filename': filename,
                'start_slice': start_slice,
                'end_slice': end_slice,
                'numSlices': numSlices,
                'datasetName': datasetName,
                'protocolName': protocolName,
                'receiverChannels': receiverChannels,
                'encodeX': encodeX,
                'encodeY': encodeY,
                'reconX': reconX,
                'reconY': reconY,
                'systemVendor': systemVendor,
                'systemModel': systemModel,
                'systemFieldStrength_T': systemFieldStrength_T,
                'institutionName': institutionName,
                'sequence_type': sequence_type

            }
            new_row = pd.DataFrame(new_row, index=[0])
            df = pd.concat((df, new_row)).reset_index(drop=True)

    return df


datasetName = 'fastmri_knee'
path = '/media/ssd2/fastMRIdata/knee/multicoil_train/'
df_knee_train = get_df(path, datasetName, start_slice=0, end_slice=-1)
datasetName = 'fastmri_knee'
# path = '/media/ssd2/fastMRIdata/knee/multicoil_val/'
# df_knee_val = get_df(path, datasetName, start_slice=0, end_slice=-1)
# datasetName = 'fastmri_brain'
# path = '/media/ssd2/fastMRIdata/brain/multicoil_train/'
# df_brain_train = get_df(path, datasetName, start_slice=0, end_slice=-1)
# datasetName = 'fastmri_brain'
# path = '/media/ssd2/fastMRIdata/brain/multicoil_val/'
# df_brain_val = get_df(path, datasetName, start_slice=0, end_slice=-1)
# datasetName = 'fastmri_prostate_T2'
# path = '/media/ssd2/fastMRIdata/prostate/T2_converted/train/'
df_prostate_train = get_df(path, datasetName, start_slice=0, end_slice=-1)

In [7]:
df_knee_train['folderDirectory'] = np.nan
df_knee_train['folderDirectory'] = np.nan

In [6]:
df_knee_train

Unnamed: 0,folderDirectory,filename,start_slice,end_slice,numSlices,datasetName,protocolName,receiverChannels,encodeX,encodeY,reconX,reconY,systemVendor,systemModel,systemFieldStrength_T,institutionName,sequence_type
0,,file1000298.h5,0,30,30,fastmri_knee,CORPDFS_FBK,15,640,372,320,320,SIEMENS,Aera,1.49400,NYU,TurboSpinEcho
1,,file1000349.h5,0,35,35,fastmri_knee,CORPD_FBK,15,640,368,320,320,SIEMENS,Skyra,2.89362,NYU,TurboSpinEcho
2,,file1000265.h5,0,34,34,fastmri_knee,CORPD_FBK,15,640,372,320,320,SIEMENS,Aera,1.49400,HJD,TurboSpinEcho
3,,file1002371.h5,0,35,35,fastmri_knee,CORPDFS_FBK,15,640,368,320,320,SIEMENS,Prisma_fit,2.89362,NYU,TurboSpinEcho
4,,file1001307.h5,0,38,38,fastmri_knee,CORPD_FBK,15,640,368,320,320,SIEMENS,Biograph_mMR,2.89362,NYU,TurboSpinEcho
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
968,,file1002235.h5,0,30,30,fastmri_knee,CORPD_FBK,15,640,372,320,320,SIEMENS,Aera,1.49400,NYU,TurboSpinEcho
969,,file1001963.h5,0,30,30,fastmri_knee,CORPD_FBK,15,640,372,320,320,SIEMENS,Aera,1.49400,HJD,TurboSpinEcho
970,,file1001975.h5,0,30,30,fastmri_knee,CORPDFS_FBK,15,640,372,320,320,SIEMENS,Aera,1.49400,NYU,TurboSpinEcho
971,,file1000094.h5,0,33,33,fastmri_knee,CORPDFS_FBK,15,640,368,320,320,SIEMENS,Skyra,2.89362,NYU,TurboSpinEcho


In [11]:
# df.to_csv('./dataset_csv/full_datasets/trainset/fastmri_knee.csv', index=False)

### Contruct training sets for experiments using fastMRI subsets

In [5]:
from math import ceil, floor
seed = 4
rng = np.random.default_rng(seed)

df = pd.concat([df_brain_train, df_knee_train])
df = df.iloc[rng.permutation(len(df))]


# extract distribution fulling certain criteria
protocol_list = ['AXFLAIR', 'AXT1', 'AXT1PRE', 'AXT1POST', 'AXT2', 'CORPDFS_FBK', 'CORPD_FBK']
machine_list = ['Aera', 'Avanto', 'Skyra', 'Prisma_fit', 'TrioTim', 'Biograph_mMR']

df_new = pd.DataFrame()

for protocol in protocol_list:
    for machine in machine_list:
        recon_shape = [384, 384] if (machine in ['Skyra', 'Prisma_fit', 'TrioTim', 'Biograph_mMR']) and protocol == 'AXT2' else [320, 320]
        reconX, reconY = recon_shape[0], recon_shape[1]
        
        q = df[
            (df['systemModel'] == machine)
            & (df['protocolName'] == protocol)
            & (df['reconX'] == reconX)
            & (df['reconY'] == reconY)
            ]
            
        if not q.empty and  q.numSlices.sum() > 512:
            if q.numSlices.sum() > 2000: # extract around 2048 slices
                num_samples = 56 if protocol in ['CORPDFS_FBK', 'CORPD_FBK'] else 128
                num_samples += 4 if machine == 'Aera' else 0
                num_samples -= 2 if machine == 'Biograph_mMR' and protocol == 'CORPDFS_FBK' else 0
                    
                rows = rng.choice(len(q), num_samples, replace=False)

            elif q.numSlices.sum() < 1800: # extract around 512 slices
                num_samples = 14 if protocol in ['CORPDFS_FBK', 'CORPD_FBK'] else 34
                rows = rng.choice(len(q), num_samples, replace=False)
            else:
                rows = range(0,len(q))
 
            q = q.iloc[rows]

            
            df_new = pd.concat([df_new, q])
            

# construct distributions from Figure 1
df_train = pd.DataFrame()

for protocol in protocol_list:
    for machine in machine_list:
        q = df_new[
            (df_new['systemModel'] == machine)
            & (df_new['protocolName'] == protocol)
            ]
        
        if not q.empty and q.numSlices.sum() > 1500:
            df_train = pd.concat([df_train, q])

# construct distributions from Figure 1 and 9 additional distributions, all with 512 slices
df_robust = pd.DataFrame()
for protocol in protocol_list:
    for machine in machine_list:
        
        q = df_new[
            (df_new['systemModel'] == machine)
            & (df_new['protocolName'] == protocol)
            ]
        
        if not q.empty:
            if q.numSlices.sum() > 1500: # extract around 512 slices
                num_samples = 14 if protocol in ['CORPDFS_FBK', 'CORPD_FBK'] else 34
                num_samples += 1 if machine == 'Aera' else 0
                rows = rng.choice(len(q), num_samples, replace=False) 
                q = q.iloc[rows]

            df_robust = pd.concat([df_robust, q])
            
# construct subsets
def get_subset(df, factor):
    df_new = pd.DataFrame()
    for protocol in protocol_list:
        for machine in machine_list:

            q = df[
                (df['systemModel'] == machine)
                & (df['protocolName'] == protocol)
                ]

            if not q.empty:
                if factor in [8, 16, 32]:
                    q = q.iloc[:max(1, ceil(len(q)/factor))]
                else:
                    q = q.iloc[:max(1, len(q)//factor)]
                print(q.numSlices.sum(), protocol, machine)
                df_new = pd.concat([df_new, q])
    return df_new

for i in [1, 2, 4, 8, 16, 32, 64]:
    print('Factor', i)
    globals()['df_train_'+str(i)] = get_subset(df_train, i)

for i in [1, 2, 4]:
    print('Factor', i)
    globals()['df_robust_'+str(i)] = get_subset(df_robust, i)
    

Factor 1
2020 AXFLAIR Skyra
2068 AXT1 Aera
2064 AXT1POST Aera
2036 AXT1POST Avanto
2026 AXT1POST Skyra
2090 AXT2 Aera
2036 AXT2 Avanto
2028 AXT2 Skyra
2044 AXT2 Prisma_fit
2026 AXT2 Biograph_mMR
2040 CORPDFS_FBK Aera
2063 CORPDFS_FBK Skyra
2066 CORPDFS_FBK Biograph_mMR
2040 CORPD_FBK Aera
2068 CORPD_FBK Skyra
1895 CORPD_FBK Biograph_mMR
Factor 2
1016 AXFLAIR Skyra
1036 AXT1 Aera
1036 AXT1POST Aera
1014 AXT1POST Avanto
1014 AXT1POST Skyra
1038 AXT2 Aera
1022 AXT2 Avanto
1016 AXT2 Skyra
1022 AXT2 Prisma_fit
1010 AXT2 Biograph_mMR
1026 CORPDFS_FBK Aera
1017 CORPDFS_FBK Skyra
1029 CORPDFS_FBK Biograph_mMR
1002 CORPD_FBK Aera
1023 CORPD_FBK Skyra
923 CORPD_FBK Biograph_mMR
Factor 4
510 AXFLAIR Skyra
518 AXT1 Aera
516 AXT1POST Aera
508 AXT1POST Avanto
510 AXT1POST Skyra
522 AXT2 Aera
510 AXT2 Avanto
506 AXT2 Skyra
512 AXT2 Prisma_fit
506 AXT2 Biograph_mMR
510 CORPDFS_FBK Aera
503 CORPDFS_FBK Skyra
492 CORPDFS_FBK Biograph_mMR
501 CORPD_FBK Aera
519 CORPD_FBK Skyra
457 CORPD_FBK Biograph_mMR


In [6]:
df_train_1

Unnamed: 0,folder,filename,start_slice,end_slice,numSlices,datasetName,protocolName,receiverChannels,encodeX,encodeY,reconX,reconY,systemVendor,systemModel,systemFieldStrength_T,institutionName,sequence_type
1465,/media/ssd2/fastMRIdata/brain/multicoil_train/,file_brain_AXFLAIR_201_6002928.h5,0,16,16,fastmri_brain,AXFLAIR,16,640,320,320,320,SIEMENS,Skyra,2.89360,TH RADIOLOGY,TurboSpinEcho
1352,/media/ssd2/fastMRIdata/brain/multicoil_train/,file_brain_AXFLAIR_201_6003004.h5,0,16,16,fastmri_brain,AXFLAIR,20,640,320,320,320,SIEMENS,Skyra,2.89360,TH RADIOLOGY,TurboSpinEcho
2153,/media/ssd2/fastMRIdata/brain/multicoil_train/,file_brain_AXFLAIR_200_6002503.h5,0,16,16,fastmri_brain,AXFLAIR,20,640,320,320,320,SIEMENS,Skyra,2.89360,NYU LANGONE CBI,TurboSpinEcho
392,/media/ssd2/fastMRIdata/brain/multicoil_train/,file_brain_AXFLAIR_200_6002616.h5,0,16,16,fastmri_brain,AXFLAIR,20,640,320,320,320,SIEMENS,Skyra,2.89360,NYU LANGONE CBI,TurboSpinEcho
195,/media/ssd2/fastMRIdata/brain/multicoil_train/,file_brain_AXFLAIR_201_6002951.h5,0,16,14,fastmri_brain,AXFLAIR,16,640,320,320,320,SIEMENS,Skyra,2.89360,TH RADIOLOGY,TurboSpinEcho
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
522,/media/ssd2/fastMRIdata/knee/multicoil_train/,file1002066.h5,0,30,39,fastmri_knee,CORPD_FBK,15,640,368,320,320,SIEMENS,Biograph_mMR,2.89362,NYU,TurboSpinEcho
174,/media/ssd2/fastMRIdata/knee/multicoil_train/,file1001385.h5,0,30,38,fastmri_knee,CORPD_FBK,15,640,368,320,320,SIEMENS,Biograph_mMR,2.89362,NYU,TurboSpinEcho
205,/media/ssd2/fastMRIdata/knee/multicoil_train/,file1001879.h5,0,30,38,fastmri_knee,CORPD_FBK,15,640,368,320,320,SIEMENS,Biograph_mMR,2.89362,NYU,TurboSpinEcho
6,/media/ssd2/fastMRIdata/knee/multicoil_train/,file1001528.h5,0,30,43,fastmri_knee,CORPD_FBK,15,640,368,320,320,SIEMENS,Biograph_mMR,2.89362,NYU,TurboSpinEcho


In [222]:
df_train_1.to_csv('../dataset_csv/fastmri_sliced/for_debug/train/seed_'+str(seed)+'/trainset_01.csv', index=False)
df_train_2.to_csv('../dataset_csv/fastmri_sliced/for_debug/train/seed_'+str(seed)+'/trainset_02.csv', index=False)
df_train_4.to_csv('../dataset_csv/fastmri_sliced/for_debug/train/seed_'+str(seed)+'/trainset_04.csv', index=False)
df_train_8.to_csv('../dataset_csv/fastmri_sliced/for_debug/train/seed_'+str(seed)+'/trainset_08.csv', index=False)
df_train_16.to_csv('../dataset_csv/fastmri_sliced/for_debug/train/seed_'+str(seed)+'/trainset_16.csv', index=False)
df_train_32.to_csv('../dataset_csv/fastmri_sliced/for_debug/train/seed_'+str(seed)+'/trainset_32.csv', index=False)
df_train_64.to_csv('../dataset_csv/fastmri_sliced/for_debug/train/seed_'+str(seed)+'/trainset_64.csv', index=False)

df_robust_1.to_csv('../dataset_csv/fastmri_sliced/for_debug/train/seed_'+str(seed)+'/robustset_01.csv', index=False)
df_robust_2.to_csv('../dataset_csv/fastmri_sliced/for_debug/train/seed_'+str(seed)+'/robustset_02.csv', index=False)
df_robust_4.to_csv('../dataset_csv/fastmri_sliced/for_debug/train/seed_'+str(seed)+'/robustset_04.csv', index=False)

df_train_1.to_csv('../dataset_csv/fastmri_sliced/train/seed_'+str(seed)+'/trainset_01.csv', index=False)
df_train_2.to_csv('../dataset_csv/fastmri_sliced/train/seed_'+str(seed)+'/trainset_02.csv', index=False)
df_train_4.to_csv('../dataset_csv/fastmri_sliced/train/seed_'+str(seed)+'/trainset_04.csv', index=False)
df_train_8.to_csv('../dataset_csv/fastmri_sliced/train/seed_'+str(seed)+'/trainset_08.csv', index=False)
df_train_16.to_csv('../dataset_csv/fastmri_sliced/train/seed_'+str(seed)+'/trainset_16.csv', index=False)
df_train_32.to_csv('../dataset_csv/fastmri_sliced/train/seed_'+str(seed)+'/trainset_32.csv', index=False)
df_train_64.to_csv('../dataset_csv/fastmri_sliced/train/seed_'+str(seed)+'/trainset_64.csv', index=False)

df_robust_1.to_csv('../dataset_csv/fastmri_sliced/train/seed_'+str(seed)+'/robustset_01.csv', index=False)
df_robust_2.to_csv('../dataset_csv/fastmri_sliced/train/seed_'+str(seed)+'/robustset_02.csv', index=False)
df_robust_4.to_csv('../dataset_csv/fastmri_sliced/train/seed_'+str(seed)+'/robustset_04.csv', index=False)


### Contruct test sets for experiments using fastMRI subsets

In [7]:
# testset
rng = np.random.default_rng(0)

df = pd.concat([df_brain_val, df_knee_val])
df = df.iloc[rng.permutation(len(df))]

combs = [
    {'protocolName':'AXFLAIR', 'systemModel':'Skyra'},
    {'protocolName':'AXFLAIR', 'systemModel':'Prisma_fit'},
    {'protocolName':'AXT1', 'systemModel':'Aera'},
    {'protocolName':'AXT1', 'systemModel':'Skyra'},
    {'protocolName':'AXT1PRE', 'systemModel':'Skyra'},
    {'protocolName':'AXT1PRE', 'systemModel':'TrioTim'},
    {'protocolName':'AXT1PRE', 'systemModel':'Biograph_mMR'},
    {'protocolName':'AXT1POST', 'systemModel':'Aera'},
    {'protocolName':'AXT1POST', 'systemModel':'Avanto'},
    {'protocolName':'AXT1POST', 'systemModel':'Skyra'},
    {'protocolName':'AXT1POST', 'systemModel':'Prisma_fit'},
    {'protocolName':'AXT1POST', 'systemModel':'TrioTim'},
    {'protocolName':'AXT1POST', 'systemModel':'Biograph_mMR'},
    {'protocolName':'AXT2', 'systemModel':'Aera'},
    {'protocolName':'AXT2', 'systemModel':'Avanto'},
    {'protocolName':'AXT2', 'systemModel':'Skyra'},
    {'protocolName':'AXT2', 'systemModel':'Prisma_fit'},
    {'protocolName':'AXT2', 'systemModel':'TrioTim'},
    {'protocolName':'AXT2', 'systemModel':'Biograph_mMR'},
    {'protocolName':'CORPDFS_FBK', 'systemModel':'Aera'},
    {'protocolName':'CORPDFS_FBK', 'systemModel':'Skyra'},
    {'protocolName':'CORPDFS_FBK', 'systemModel':'Prisma_fit'},
    {'protocolName':'CORPDFS_FBK', 'systemModel':'Biograph_mMR'},
    {'protocolName':'CORPD_FBK', 'systemModel':'Aera'},
    {'protocolName':'CORPD_FBK', 'systemModel':'Skyra'},
    {'protocolName':'CORPD_FBK', 'systemModel':'Prisma_fit'},
    {'protocolName':'CORPD_FBK', 'systemModel':'Biograph_mMR'},
]

df_test = pd.DataFrame()

for comb in combs:
    protocol, machine = comb['protocolName'], comb['systemModel']
    recon_shape = [384, 384] if (machine in ['Skyra', 'Prisma_fit', 'TrioTim', 'Biograph_mMR']) and protocol == 'AXT2' else [320, 320]
    reconX, reconY = recon_shape[0], recon_shape[1]

    q = df[
        (df['systemModel'] == machine)
        & (df['protocolName'] == protocol)
        & (df['reconX'] == reconX)
        & (df['reconY'] == reconY)
        ]

    if q.numSlices.sum() > 100:
        num_samples = 4 if protocol in ['CORPDFS_FBK', 'CORPD_FBK'] else 8
        num_samples -= 1 if machine == 'Prisma_fit' and protocol in ['CORPDFS_FBK', 'CORPD_FBK'] else 0
        rows = rng.choice(len(q), num_samples, replace=False)

        q = q.iloc[rows]

    print(q.numSlices.sum(), protocol, machine)

    df_test = pd.concat([df_test, q])

128 AXFLAIR Skyra
126 AXFLAIR Prisma_fit
128 AXT1 Aera
124 AXT1 Skyra
128 AXT1PRE Skyra
126 AXT1PRE TrioTim
128 AXT1PRE Biograph_mMR
122 AXT1POST Aera
128 AXT1POST Avanto
124 AXT1POST Skyra
128 AXT1POST Prisma_fit
128 AXT1POST TrioTim
128 AXT1POST Biograph_mMR
128 AXT2 Aera
126 AXT2 Avanto
128 AXT2 Skyra


128 AXT2 Prisma_fit
128 AXT2 TrioTim
128 AXT2 Biograph_mMR
132 CORPDFS_FBK Aera
155 CORPDFS_FBK Skyra
110 CORPDFS_FBK Prisma_fit
146 CORPDFS_FBK Biograph_mMR
137 CORPD_FBK Aera
148 CORPD_FBK Skyra
110 CORPD_FBK Prisma_fit
153 CORPD_FBK Biograph_mMR


In [8]:
df_test

Unnamed: 0,folder,filename,start_slice,end_slice,numSlices,datasetName,protocolName,receiverChannels,encodeX,encodeY,reconX,reconY,systemVendor,systemModel,systemFieldStrength_T,institutionName,sequence_type
651,/media/ssd2/fastMRIdata/brain/multicoil_val/,file_brain_AXFLAIR_201_6002979.h5,0,16,16,fastmri_brain,AXFLAIR,20,640,320,320,320,SIEMENS,Skyra,2.89360,TH RADIOLOGY,TurboSpinEcho
552,/media/ssd2/fastMRIdata/brain/multicoil_val/,file_brain_AXFLAIR_200_6002562.h5,0,16,16,fastmri_brain,AXFLAIR,20,640,320,320,320,SIEMENS,Skyra,2.89360,NYU LANGONE CBI,TurboSpinEcho
178,/media/ssd2/fastMRIdata/brain/multicoil_val/,file_brain_AXFLAIR_200_6002605.h5,0,16,16,fastmri_brain,AXFLAIR,20,640,320,320,320,SIEMENS,Skyra,2.89360,NYU LANGONE CBI,TurboSpinEcho
791,/media/ssd2/fastMRIdata/brain/multicoil_val/,file_brain_AXFLAIR_201_6002949.h5,0,16,16,fastmri_brain,AXFLAIR,16,640,320,320,320,SIEMENS,Skyra,2.89360,TH RADIOLOGY,TurboSpinEcho
1338,/media/ssd2/fastMRIdata/brain/multicoil_val/,file_brain_AXFLAIR_201_6002974.h5,0,16,16,fastmri_brain,AXFLAIR,20,640,320,320,320,SIEMENS,Skyra,2.89360,TH RADIOLOGY,TurboSpinEcho
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165,/media/ssd2/fastMRIdata/knee/multicoil_val/,file1001077.h5,0,37,35,fastmri_knee,CORPD_FBK,15,640,368,320,320,SIEMENS,Prisma_fit,2.89362,NYU,TurboSpinEcho
47,/media/ssd2/fastMRIdata/knee/multicoil_val/,file1001763.h5,0,37,45,fastmri_knee,CORPD_FBK,15,640,368,320,320,SIEMENS,Biograph_mMR,2.89362,NYU,TurboSpinEcho
16,/media/ssd2/fastMRIdata/knee/multicoil_val/,file1001331.h5,0,37,36,fastmri_knee,CORPD_FBK,15,640,368,320,320,SIEMENS,Biograph_mMR,2.89362,NYU,TurboSpinEcho
12,/media/ssd2/fastMRIdata/knee/multicoil_val/,file1001444.h5,0,37,35,fastmri_knee,CORPD_FBK,15,640,368,320,320,SIEMENS,Biograph_mMR,2.89362,NYU,TurboSpinEcho


In [9]:
# df_testset.to_csv('./dataset_csv/testset.csv', index=False)