In [1]:
# Placeholder paths: replace both values with real locations before running.
# NOTE(review): DATA_FOLDER is not referenced by any cell visible in this part
# of the notebook — confirm it is used further down.
DATA_FOLDER = "<PATH_TO_DATA_FOLDER>"
# Directory holding the .tsv tables that the cells below list and load.
STRUCTURED_DATA_DIR = "<PATH_TO_STRUCTURED_DATA>"

## __Display directory tree of the structured data folder__

In [9]:
import os
import argparse

def generate_directory_tree(startpath, prefix=''):
    """Print an ASCII tree of the non-hidden contents of ``startpath``.

    Entries are printed in sorted order and names starting with ``.`` are
    skipped. Real sub-directories are walked recursively. Symbolic links that
    point at directories are listed but not followed, so a link back to an
    ancestor cannot make the walk re-enter a directory it is already printing.

    Args:
        startpath: Directory whose contents are printed.
        prefix: Text prepended to every printed line; recursive calls use it
            to carry the indentation of the enclosing levels.

    Returns:
        None. All output is written to stdout with ``print``.
    """
    if not os.path.isdir(startpath):
        print(f"Error: Provided path '{startpath}' is not a valid directory.")
        return
    try:
        items = sorted(item for item in os.listdir(startpath) if not item.startswith('.'))
    except PermissionError:
        print(f"{prefix}|-- [Permission Denied]")
        return

    last_index = len(items) - 1
    for index, item in enumerate(items):
        is_last = index == last_index
        # The final entry of a directory gets the closing-corner connector.
        print(prefix + ('`-- ' if is_last else '|-- ') + item)
        path = os.path.join(startpath, item)
        # isdir() follows symlinks; the islink() guard keeps link cycles from
        # being expanded over and over.
        if os.path.isdir(path) and not os.path.islink(path):
            # Below the last entry no vertical rule is needed any more.
            generate_directory_tree(path, prefix + ('    ' if is_last else '|   '))
# Print the tree of the directory named by STRUCTURED_DATA_DIR.
generate_directory_tree(STRUCTURED_DATA_DIR)    

|-- annotation_file_RSNA_20250321.tsv
|-- case_RSNA_20250321.tsv
|-- clinical_manifest_RSNA_20250321.tsv
|-- image_manifest_RSNA_20250321.tsv
|-- imaging_study_RSNA_20250321.tsv
`-- mr_series_RSNA_20250321.tsv


## __Read .tsv data tables into pandas DataFrames__

In [10]:
import pandas as pd
import numpy as np

import os

### __Demographics data__

In [12]:
# Load the tab-separated case table from STRUCTURED_DATA_DIR into a DataFrame.
demo_df = pd.read_csv(os.path.join(STRUCTURED_DATA_DIR, "case_RSNA_20250321.tsv"), sep='\t')

In [13]:
# Show row count plus each column's non-null count and dtype for the case table.
demo_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1232 entries, 0 to 1231
Data columns (total 21 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   type                            1232 non-null   object 
 1   submitter_id                    1232 non-null   object 
 2   datasets                        0 non-null      float64
 3   age_at_index                    1220 non-null   float64
 4   age_at_index_gt89               1232 non-null   object 
 5   case_ids                        1232 non-null   object 
 6   country_of_residence            1232 non-null   object 
 7   covid19_positive                1232 non-null   object 
 8   ethnicity                       1232 non-null   object 
 9   gen3_linked_subjects_available  0 non-null      float64
 10  icu_indicator                   1232 non-null   bool   
 11  index_event                     1232 non-null   object 
 12  linked_external_data            0 

In [14]:
# Preview the first rows of the case table.
demo_df.head()

Unnamed: 0,type,submitter_id,datasets,age_at_index,age_at_index_gt89,case_ids,country_of_residence,covid19_positive,ethnicity,gen3_linked_subjects_available,...,index_event,linked_external_data,long_covid_diagnosis,race,sex,site_id,token_record_id,ventilator_indicator,treatment_info,zip
0,case,593973-000769,,80.0,No,593973-000769,US,Not Reported,Not Hispanic or Latino,,...,Study Enrollment,,Not Reported,White,Male,593973,,False,,
1,case,593973-000015,,54.0,No,593973-000015,US,Not Reported,Not Hispanic or Latino,,...,Study Enrollment,,Not Reported,Black or African American,Female,593973,,False,,
2,case,593973-000752,,55.0,No,593973-000752,US,Not Reported,Not Hispanic or Latino,,...,Study Enrollment,,Not Reported,Black or African American,Female,593973,,False,,
3,case,593973-000155,,42.0,No,593973-000155,US,Not Reported,Not Hispanic or Latino,,...,Study Enrollment,,Not Reported,White,Male,593973,,False,,
4,case,593973-001082,,55.0,No,593973-001082,US,Not Reported,Not Hispanic or Latino,,...,Study Enrollment,,Not Reported,White,Female,593973,,False,,


In [15]:
# Count cases per race category (value_counts leaves out missing values by default).
demo_df.race.value_counts()

White                                        795
Black or African American                    342
Not Reported                                  33
Other                                         28
Asian                                         24
American Indian or Alaska Native               9
Native Hawaiian or other Pacific Islander      1
Name: race, dtype: int64

In [16]:
# Count cases per sex category (value_counts leaves out missing values by default).
demo_df.sex.value_counts()

Female    675
Male      557
Name: sex, dtype: int64

In [17]:
# Count cases per ethnicity category (value_counts leaves out missing values by default).
demo_df.ethnicity.value_counts()

Not Hispanic or Latino    1141
Hispanic or Latino          58
Not Reported                33
Name: ethnicity, dtype: int64

In [18]:
# Report the mean and sample standard deviation (pandas default ddof=1) of
# age_at_index. Both skip missing values by default; the info() output above
# shows 1220 of 1232 rows with a non-null age.
# NOTE(review): the age_at_index_gt89 column suggests ages above 89 may be
# recorded differently — confirm before interpreting these statistics.
print(f""" Mean age: {demo_df.age_at_index.mean()} with standard deviation: {demo_df.age_at_index.std()}""")

 Mean age: 54.515573770491805 with standard deviation: 16.36646324135574


### __Imaging parameters__

In [20]:
# Load the tab-separated MR series table from STRUCTURED_DATA_DIR into a DataFrame.
para_df = pd.read_csv(os.path.join(STRUCTURED_DATA_DIR, "mr_series_RSNA_20250321.tsv"), sep='\t')

In [21]:
# Show row count plus each column's non-null count and dtype for the MR series table.
para_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1255 entries, 0 to 1254
Data columns (total 40 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   imaging_studies.submitter_id    1255 non-null   object 
 1   angio_flag                      1255 non-null   object 
 2   case_ids                        1255 non-null   object 
 3   contrast_bolus_agent            11 non-null     object 
 4   diffusion_b_value               0 non-null      float64
 5   diffusion_gradient_orientation  0 non-null      float64
 6   echo_number                     1252 non-null   float64
 7   echo_train_length               1255 non-null   int64  
 8   echo_time                       1255 non-null   float64
 9   image_type                      1255 non-null   object 
 10  imaged_nucleus                  1255 non-null   object 
 11  imager_pixel_spacing            0 non-null      float64
 12  lossy_image_compression         3 

In [22]:
# List every column name of the MR series table.
para_df.columns

Index(['imaging_studies.submitter_id', 'angio_flag', 'case_ids',
       'contrast_bolus_agent', 'diffusion_b_value',
       'diffusion_gradient_orientation', 'echo_number', 'echo_train_length',
       'echo_time', 'image_type', 'imaged_nucleus', 'imager_pixel_spacing',
       'lossy_image_compression', 'magnetic_field_strength',
       'mr_acquisition_type', 'number_of_temporal_positions', 'manufacturer',
       'manufacturer_model_name', 'modality', 'pixel_spacing',
       'series_description', 'series_uid', 'spatial_resolution',
       'repetition_time', 'scan_options', 'scanning_sequence', 'sequence_name',
       'sequence_variant', 'slice_thickness', 'software_version',
       'spacing_between_slices', 'type', 'data_type', 'data_format',
       'data_category', 'submitter_id', 'image_data_modification_method',
       'image_data_modification_name', 'image_data_modified',
       'number_of_instances'],
      dtype='object')