#### This script extracts metadata from fastMRI HDF5 (`.h5`) files. For each file, the acquisition type is read from the file's `acquisition` attribute, while the magnetic field strength and the system model are parsed from the XML ISMRMRD header stored inside the file. The script reads the `.h5` files from the specified directories, saves the extracted values to CSV files, and then groups the files by acquisition type, magnetic field strength, and system model, counting the number of files in each group to summarize the dataset's composition. The same procedure is applied to the training and validation splits of both the knee and the brain data.

In [1]:
%matplotlib inline
import h5py
import numpy as np
from matplotlib import pyplot as plt

Knee data

In [4]:
import h5py
import os
import glob
import pandas as pd
import xml.etree.ElementTree as ET

def _element_text(root, xpath, namespaces, default='Unknown'):
    """Return the text of the first element matching *xpath*, or *default*.

    *default* is used both when no element matches and when the element is
    present but has no text (``element.text`` is ``None`` or empty).
    """
    element = root.find(xpath, namespaces)
    if element is None or not element.text:
        return default
    return element.text


def process_hdf5_files(folder_path, csv_file_path):
    """Collect per-file metadata from the ``.h5`` files in a folder.

    For every ``*.h5`` file directly inside *folder_path* this reads the
    ``acquisition`` file attribute and parses the XML stored in the
    ``ismrmrd_header`` dataset to obtain ``systemFieldStrength_T`` and
    ``systemModel``. One row per file is written to *csv_file_path*.

    Args:
        folder_path: Directory containing the ``.h5`` files. A trailing path
            separator is accepted but not required.
        csv_file_path: Destination of the per-file CSV (overwritten).

    Returns:
        A DataFrame with columns ``acquisition``, ``magnetic_field``,
        ``system_model`` and ``file_count``, holding the number of files for
        each combination of the first three.
    """
    # os.path.join works with or without a trailing separator; sorting makes
    # the CSV row order reproducible (glob returns files in arbitrary order).
    h5_files = sorted(glob.glob(os.path.join(folder_path, "*.h5")))
    namespaces = {'ns': 'http://www.ismrm.org/ISMRMRD'}
    system_info = './/ns:acquisitionSystemInformation'
    data = []

    for file_path in h5_files:
        # Only the two reads need the file open; parsing happens afterwards.
        with h5py.File(file_path, 'r') as hf:
            acquisition = hf.attrs.get('acquisition', 'Unknown')
            header = hf['ismrmrd_header'][()]

        # Depending on how the string was stored/read it may arrive as bytes
        # or as str; only decode when it is actually bytes.
        if isinstance(acquisition, bytes):
            acquisition = acquisition.decode('utf-8')
        if isinstance(header, bytes):
            header = header.decode('utf-8')
        root = ET.fromstring(header)

        magnetic_field = _element_text(
            root, f'{system_info}/ns:systemFieldStrength_T', namespaces)
        system_model = _element_text(
            root, f'{system_info}/ns:systemModel', namespaces)

        data.append([os.path.basename(file_path), acquisition,
                     magnetic_field, system_model])

    df = pd.DataFrame(data, columns=['filename', 'acquisition', 'magnetic_field', 'system_model'])
    df.to_csv(csv_file_path, index=False)
    print(f"Data saved to {csv_file_path}")

    # Group on the CSV as re-read from disk so the summary has the dtypes
    # that pandas infers from the file (e.g. numeric field strengths).
    df = pd.read_csv(csv_file_path)
    grouped = df.groupby(['acquisition', 'magnetic_field', 'system_model']).size().reset_index(name='file_count')

    return grouped

# Knee multicoil splits: where the .h5 volumes live and where each metadata CSV is written.
train_folder_path = '/vol/datasets/cil/2021_11_23_fastMRI_data/knee/unzipped/multicoil_train/'
train_csv_file_path = '/vol/ideadata/fa51puco/datasets/knee_multicoil_acquisition_magnetic_field_system_model_data.csv'
val_folder_path = '/vol/datasets/cil/2021_11_23_fastMRI_data/knee/unzipped/multicoil_val/'
val_csv_file_path = '/vol/ideadata/fa51puco/datasets/knee_multicoil_acquisition_magnetic_field_system_model_data_val.csv'

# Extract the metadata (training first, then validation) and keep the per-group file counts.
train_grouped = process_hdf5_files(train_folder_path, train_csv_file_path)
val_grouped = process_hdf5_files(val_folder_path, val_csv_file_path)

# Report both summaries, one "acquisition - field - model - count" line per group.
for heading, summary in (("Training Data:", train_grouped), ("\nValidation Data:", val_grouped)):
    print(heading)
    print("Acquisition - Magnetic Field - System Model - File Count")
    for _, row in summary.iterrows():
        print(f"{row['acquisition']} - {row['magnetic_field']} - {row['system_model']} - {row['file_count']}")

Data saved to /vol/ideadata/fa51puco/datasets/knee_multicoil_acquisition_magnetic_field_system_model_data.csv
Data saved to /vol/ideadata/fa51puco/datasets/knee_multicoil_acquisition_magnetic_field_system_model_data_val.csv
Training Data:
Acquisition - Magnetic Field - System Model - File Count
CORPDFS_FBK - 1.494 - Aera - 206
CORPDFS_FBK - 2.89362 - Biograph_mMR - 54
CORPDFS_FBK - 2.89362 - Prisma_fit - 19
CORPDFS_FBK - 2.89362 - Skyra - 210
CORPD_FBK - 1.494 - Aera - 205
CORPD_FBK - 2.89362 - Biograph_mMR - 50
CORPD_FBK - 2.89362 - Prisma_fit - 22
CORPD_FBK - 2.89362 - Skyra - 207

Validation Data:
Acquisition - Magnetic Field - System Model - File Count
CORPDFS_FBK - 1.494 - Aera - 47
CORPDFS_FBK - 2.89362 - Biograph_mMR - 10
CORPDFS_FBK - 2.89362 - Prisma_fit - 3
CORPDFS_FBK - 2.89362 - Skyra - 39
CORPD_FBK - 1.494 - Aera - 47
CORPD_FBK - 2.89362 - Biograph_mMR - 10
CORPD_FBK - 2.89362 - Prisma_fit - 3
CORPD_FBK - 2.89362 - Skyra - 40


Brain data

In [5]:
import h5py
import os
import glob
import pandas as pd
import xml.etree.ElementTree as ET

def _element_text(root, xpath, namespaces, default='Unknown'):
    """Return the text of the first element matching *xpath*, or *default*.

    *default* is used both when no element matches and when the element is
    present but has no text (``element.text`` is ``None`` or empty).
    """
    element = root.find(xpath, namespaces)
    if element is None or not element.text:
        return default
    return element.text


def process_hdf5_files(folder_path, csv_file_path):
    """Collect per-file metadata from the ``.h5`` files in a folder.

    For every ``*.h5`` file directly inside *folder_path* this reads the
    ``acquisition`` file attribute and parses the XML stored in the
    ``ismrmrd_header`` dataset to obtain ``systemFieldStrength_T`` and
    ``systemModel``. One row per file is written to *csv_file_path*.

    Args:
        folder_path: Directory containing the ``.h5`` files. A trailing path
            separator is accepted but not required.
        csv_file_path: Destination of the per-file CSV (overwritten).

    Returns:
        A DataFrame with columns ``acquisition``, ``magnetic_field``,
        ``system_model`` and ``file_count``, holding the number of files for
        each combination of the first three.
    """
    # os.path.join works with or without a trailing separator; sorting makes
    # the CSV row order reproducible (glob returns files in arbitrary order).
    h5_files = sorted(glob.glob(os.path.join(folder_path, "*.h5")))
    namespaces = {'ns': 'http://www.ismrm.org/ISMRMRD'}
    system_info = './/ns:acquisitionSystemInformation'
    data = []

    for file_path in h5_files:
        # Only the two reads need the file open; parsing happens afterwards.
        with h5py.File(file_path, 'r') as hf:
            acquisition = hf.attrs.get('acquisition', 'Unknown')
            header = hf['ismrmrd_header'][()]

        # Depending on how the string was stored/read it may arrive as bytes
        # or as str; only decode when it is actually bytes.
        if isinstance(acquisition, bytes):
            acquisition = acquisition.decode('utf-8')
        if isinstance(header, bytes):
            header = header.decode('utf-8')
        root = ET.fromstring(header)

        magnetic_field = _element_text(
            root, f'{system_info}/ns:systemFieldStrength_T', namespaces)
        system_model = _element_text(
            root, f'{system_info}/ns:systemModel', namespaces)

        data.append([os.path.basename(file_path), acquisition,
                     magnetic_field, system_model])

    df = pd.DataFrame(data, columns=['filename', 'acquisition', 'magnetic_field', 'system_model'])
    df.to_csv(csv_file_path, index=False)
    print(f"Data saved to {csv_file_path}")

    # Group on the CSV as re-read from disk so the summary has the dtypes
    # that pandas infers from the file (e.g. numeric field strengths).
    df = pd.read_csv(csv_file_path)
    grouped = df.groupby(['acquisition', 'magnetic_field', 'system_model']).size().reset_index(name='file_count')

    return grouped

# Brain multicoil splits: where the .h5 volumes live and where each metadata CSV is written.
train_folder_path = '/vol/datasets/cil/2021_11_23_fastMRI_data/brain/unzipped/multicoil_train/'
train_csv_file_path = '/vol/ideadata/fa51puco/datasets/brain_multicoil_acquisition_magnetic_field_system_model_data.csv'
val_folder_path = '/vol/datasets/cil/2021_11_23_fastMRI_data/brain/unzipped/multicoil_val/'
val_csv_file_path = '/vol/ideadata/fa51puco/datasets/brain_multicoil_acquisition_magnetic_field_system_model_data_val.csv'

# Extract the metadata (training first, then validation) and keep the per-group file counts.
train_grouped = process_hdf5_files(train_folder_path, train_csv_file_path)
val_grouped = process_hdf5_files(val_folder_path, val_csv_file_path)

# Report both summaries, one "acquisition - field - model - count" line per group.
for heading, summary in (("Training Data:", train_grouped), ("\nValidation Data:", val_grouped)):
    print(heading)
    print("Acquisition - Magnetic Field - System Model - File Count")
    for _, row in summary.iterrows():
        print(f"{row['acquisition']} - {row['magnetic_field']} - {row['system_model']} - {row['file_count']}")

Data saved to /vol/ideadata/fa51puco/datasets/brain_multicoil_acquisition_magnetic_field_system_model_data.csv
Data saved to /vol/ideadata/fa51puco/datasets/brain_multicoil_acquisition_magnetic_field_system_model_data_val.csv
Training Data:
Acquisition - Magnetic Field - System Model - File Count
AXFLAIR - 1.494 - Aera - 79
AXFLAIR - 2.8936 - Biograph_mMR - 23
AXFLAIR - 2.8936 - Prisma_fit - 47
AXFLAIR - 2.8936 - Skyra - 138
AXFLAIR - 2.8936 - TrioTim - 57
AXT1 - 1.494 - Aera - 204
AXT1 - 2.8936 - Skyra - 44
AXT1POST - 1.494 - Aera - 181
AXT1POST - 1.494 - Avanto - 375
AXT1POST - 2.8936 - Biograph_mMR - 65
AXT1POST - 2.8936 - Prisma_fit - 52
AXT1POST - 2.8936 - Skyra - 186
AXT1POST - 2.8936 - TrioTim - 90
AXT1PRE - 1.494 - Aera - 25
AXT1PRE - 2.8936 - Biograph_mMR - 39
AXT1PRE - 2.8936 - Prisma_fit - 31
AXT1PRE - 2.8936 - Skyra - 69
AXT1PRE - 2.8936 - TrioTim - 86
AXT2 - 1.494 - Aera - 443
AXT2 - 1.494 - Avanto - 587
AXT2 - 2.8936 - Biograph_mMR - 369
AXT2 - 2.8936 - Prisma_fit - 349
A