In [1]:
import re, json, os
from pathlib import Path
import subprocess
from datetime import datetime
from tqdm import tqdm
import pandas as pd

def get_sub_ses_acq_run_t1(t1_path):
    pattern = r'(sub-\w+)(?:_(ses-\w+))?(?:_(acq-\w+))?(?:_(run-\d{1,2}))?_T1w'
    matches = re.findall(pattern, t1_path.name)
    sub, ses, acq, run = matches[0][0], matches[0][1], matches[0][2], matches[0][3]
    return sub, ses, acq, run

In [88]:
find_cmd = 'find /nfs2/harmonization/BIDS/ABVIB -mindepth 4 -maxdepth 4 \( -type l -o -type f \) -name "*T1w.nii.gz"'
t1s = subprocess.run(find_cmd, shell=True, capture_output=True, text=True).stdout.strip().splitlines()

In [4]:
#l = t1s[:10]

def create_json_dict(filepaths):
    """
    Given a list of filenames, create the initial BIDS json dictionary
    """

    user = os.getlogin()
    date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    nested_d = {}
    for t1 in tqdm(filepaths):
        current_d = nested_d
        sub, ses, acq, run = get_sub_ses_acq_run_t1(Path(t1))
        for tag in [sub, ses, acq, run]:
            if tag:
                current_d = current_d.setdefault(tag, {})
        #set the default values
        row = {'QA_status': 'yes', 'reason': '', 'user': user, 'date': date}
        current_d.update(row)
        current_d = nested_d

    return nested_d

    #print(json.dumps(nested_d, indent=4))

def convert_json_to_csv(json_dict): ### THERE IS A BUG HERE, NOT SURE WHAT IT IS
    """
    Given a QA JSON dictionary, convert it to a CSV file
    """

    def get_tag_type(d):
        tag_types = {
            'sub': 'sub',
            'ses': 'ses',
            'acq': 'acq',
            'run': 'run'
        }
        for key, value in tag_types.items():
            if d.startswith(key):
                return value
        assert False, f"Unknown tag type: {d}"

    def get_leaf_dicts(d, path=None, curr_dict=None):
        if path is None:
            path = []
        if curr_dict is None:
            curr_dict = {}
        leaf_dicts = []
        for key, value in d.items():
            #print(key)
            if isinstance(value, dict):
                new_path = path + [key]
                curr_dict[get_tag_type(key)] = key  #### For some reason, curr_dict is carrying over previous values
                leaf_dicts.extend(get_leaf_dicts(value, new_path, curr_dict))
            else:
                #curr_dict.update(d)
                #d.update(curr_dict)
                leaf_dicts.append((path, d))
                break
        return leaf_dicts

    #get the leaf dictionaries
    leaf_dicts = get_leaf_dicts(json_dict)

    #make sure that the paths are included in the leaf dictionaries
    for paths,ds in leaf_dicts:
        for path in paths:
            ds[path[:3]] = path
            assert path in ds.values(), f"Path {path} not in dict {ds}"
        if 'run' not in ds:
            ds['run'] = ''
        if 'acq' not in ds:
            ds['acq'] = ''
        if 'ses' not in ds:
            ds['ses'] = ''
    #now get a list of only the leaf dictionaries
    leaf_dicts = [ds for paths,ds in leaf_dicts]
    #finally, convert to a csv
    header = ['sub', 'ses', 'acq', 'run', 'QA_status', 'reason', 'user', 'date']
    df = pd.DataFrame(leaf_dicts)
    #reorder the columns accroding to the header
    df = df[header]
    #replace NaN with empty string
    df = df.fillna('')

    df.to_csv('QA.csv', index=False)

    return df

def read_csv_to_json(df):
    """
    Given a QA CSV dataframe, convert it to a QA JSON dictionary
    """

    json_data = {}

    for index, row in df.iterrows():
        #sub, ses, acq, run = row['sub'], row['ses'], row['acq'], row['run']
        qa_status, reason, user, date = row['QA_status'], row['reason'], row['user'], row['date']
        current_d = json_data
        has_d = {}
        for tag in ['sub', 'ses', 'acq', 'run']:
            if row[tag]:
                current_d = current_d.setdefault(row[tag], {})
                has_d[tag] = row[tag]
        #set the values
        add_row = {'QA_status': qa_status, 'reason': reason, 'user': user, 'date': date}
        if 'run' not in has_d:
            add_row.update({'run': ''})
        if 'acq' not in has_d:
            add_row.update({'acq': ''})
        if 'ses' not in has_d:
            add_row.update({'ses': ''})
        add_row.update(has_d)
        current_d.update(add_row)
        current_d = json_data
    
    #print(json.dumps(json_data, indent=4))

    return json_data

def compare_dicts(d1, d2):
    """
    Compare two dictionaries
    """
    
    #assert len(d1) == len(d2), "Dictionaries have different lengths"
    for key in d1:
        #print(key)
        #print(d1)
        #print(d2)
        assert key in d2, f"Key {key} not in d2. d1: {d1} \n d2: {d2}"
        if isinstance(d1[key], dict):
            compare_dicts(d1[key], d2[key])
        else:
            assert d1[key] == d2[key], f"Values for key {key} are different: {d1[key]} vs {d2[key]}"


nested_d = create_json_dict(t1s)
df = convert_json_to_csv(nested_d)
converted_json = read_csv_to_json(df)

# for x in converted_json:
#     print(nested_d[x])
#     print(converted_json[x])

#print(nested_d['sub-2007'])
#print(converted_json['sub-2007'])
compare_dicts(nested_d, converted_json)
compare_dicts(converted_json, nested_d)

#dump the dictionary to a json file
# with open('qa1.json', 'w') as f:
#     json.dump(nested_d, f, indent=4)

# with open('qa2.json', 'w') as f:
#     json.dump(converted_json, f, indent=4)

print(nested_d)

100%|██████████| 75/75 [00:00<00:00, 43861.24it/s]

{'sub-7003': {'ses-20100408': {'acq-SPGR': {'QA_status': 'yes', 'reason': '', 'user': 'root', 'date': '2024-07-10 00:06:29', 'sub': 'sub-7003', 'ses': 'ses-20100408', 'acq': 'acq-SPGR', 'run': ''}}}, 'sub-201': {'ses-20090310': {'acq-MPRAGE': {'QA_status': 'yes', 'reason': '', 'user': 'root', 'date': '2024-07-10 00:06:29', 'sub': 'sub-201', 'ses': 'ses-20090310', 'acq': 'acq-MPRAGE', 'run': ''}}}, 'sub-3831': {'ses-20120508': {'acq-MPRAGE': {'QA_status': 'yes', 'reason': '', 'user': 'root', 'date': '2024-07-10 00:06:29', 'sub': 'sub-3831', 'ses': 'ses-20120508', 'acq': 'acq-MPRAGE', 'run': ''}}}, 'sub-61040': {'ses-20120110': {'acq-MPRAGE': {'QA_status': 'yes', 'reason': '', 'user': 'root', 'date': '2024-07-10 00:06:29', 'sub': 'sub-61040', 'ses': 'ses-20120110', 'acq': 'acq-MPRAGE', 'run': ''}}}, 'sub-257': {'ses-20111013': {'acq-MPRAGE': {'run-1': {'QA_status': 'yes', 'reason': '', 'user': 'root', 'date': '2024-07-10 00:06:29', 'sub': 'sub-257', 'ses': 'ses-20111013', 'acq': 'acq-MPR




In [80]:
for x in t1s[:10]:
    print(x)

/nfs2/harmonization/BIDS/ABVIB/sub-7003/ses-20100408/anat/sub-7003_ses-20100408_acq-SPGR_T1w.nii.gz
/nfs2/harmonization/BIDS/ABVIB/sub-201/ses-20090310/anat/sub-201_ses-20090310_acq-MPRAGE_T1w.nii.gz
/nfs2/harmonization/BIDS/ABVIB/sub-3831/ses-20120508/anat/sub-3831_ses-20120508_acq-MPRAGE_T1w.nii.gz
/nfs2/harmonization/BIDS/ABVIB/sub-61040/ses-20120110/anat/sub-61040_ses-20120110_acq-MPRAGE_T1w.nii.gz
/nfs2/harmonization/BIDS/ABVIB/sub-257/ses-20111013/anat/sub-257_ses-20111013_acq-MPRAGE_run-1_T1w.nii.gz
/nfs2/harmonization/BIDS/ABVIB/sub-257/ses-20111013/anat/sub-257_ses-20111013_acq-MPRAGE_run-2_T1w.nii.gz
/nfs2/harmonization/BIDS/ABVIB/sub-2007/ses-20120320/anat/sub-2007_ses-20120320_acq-SPGR_T1w.nii.gz
/nfs2/harmonization/BIDS/ABVIB/sub-7055/ses-20120712/anat/sub-7055_ses-20120712_acq-SPGR_T1w.nii.gz
/nfs2/harmonization/BIDS/ABVIB/sub-60024/ses-20100909/anat/sub-60024_ses-20100909_acq-MPRAGE_T1w.nii.gz
/nfs2/harmonization/BIDS/ABVIB/sub-218/ses-20121108/anat/sub-218_ses-20121108_

In [3]:
## test read in the json
import json

def get_tag_type(d):
    tag_types = {
        'sub': 'sub',
        'ses': 'ses',
        'acq': 'acq',
        'run': 'run'
    }
    for key, value in tag_types.items():
        if d.startswith(key):
            return value
    assert False, f"Unknown tag type: {d}"

def get_leaf_dicts(d, path=None, curr_dict=None):
    if path is None:
        path = []
    if curr_dict is None:
        curr_dict = {}
    leaf_dicts = []
    for key, value in d.items():
        #print(key)
        if isinstance(value, dict):
            new_path = path + [key]
            curr_dict[get_tag_type(key)] = key  #### For some reason, curr_dict is carrying over previous values
            leaf_dicts.extend(get_leaf_dicts(value, new_path, curr_dict))
        else:
            leaf_dicts.append((path, d))
            break
    return leaf_dicts

def are_unique_qa_dicts(dict_list):
    """
    Given a list of qa dictionaries, check that no two dictionaries are the same

    Only considers the sub, ses, acq, and run elements
    """

    def add_items(curr_set, elt):
        curr_set.add(elt)
        return len(curr_set)

    seen = set()
    for d in tqdm(dict_list):
        sub, ses, acq, run = d['sub'], d['ses'], d['acq'], d['run']
        if len(seen) == add_items(seen, (sub, ses, acq, run)):
            return False
    return True

def _create_pngs(t1s):
    pngs = []
    for t1 in t1s:
        sub, ses, acq, run = get_sub_ses_acq_run_t1(Path(t1))
        png = f'{sub}_'
        if ses:
            png += f"{ses}_"
        png += "WMAtlas"
        if acq:
            png += f"{acq}"
        if run:
            png += f"{run}"
        png += ".png"
        pngs.append(png)

    return pngs

def assert_tags_in_dict(paths, leaf_dicts):
    """
    For given lists of paths and leaf dictionaries, assert that the paths are in the dictionaries
    """
    for paths,ds in zip(paths, leaf_dicts):
        for path in paths:
            assert path in ds.values(), f"Path {path} not in dict {ds}"

def get_BIDS_fields_from_png(filename, return_pipeline=False):
    """
    Given a QA png filename, return the BIDS fields.
    """
    #pattern = r"sub-(?P<sub>\d+)_ses-(?P<ses>\d+)_\w+acq-(?P<acq>\d+)run-(?P<run>\d+)\.png"
    pattern = r'(sub-\w+)(?:_(ses-\w+))?(?:_(\w+))(?:(acq-\w+))?(?:(run-\d{1,2}))?.png'
    match = re.match(pattern, filename)
    assert match, f"Filename {filename} does not match the expected pattern."
    tags = {'sub': match.group(1), 'ses': match.group(2), 'acq': match.group(4), 'run': match.group(5)}
    if return_pipeline:
        tags['pipeline'] = match.group(3)
    return tags


def check_png_for_json(dicts, pngs):
    """
    Given a list of QA json leaf dictionaries and list of pngs, make sure that every single json entry has a corresponding png file
    """

    #get the pipeline
    pipeline = get_BIDS_fields_from_png(pngs[0], return_pipeline=True)['pipeline']

    for dic in dicts:
        sub, ses, acq, run = dic['sub'], dic['ses'], dic['acq'], dic['run']
        png = f'{sub}_'
        if ses:
            png += f"{ses}_"
        png += f"{pipeline}"
        if acq:
            png += f"{acq}"
        if run:
            png += f"{run}"
        png += ".png"
        assert png in pngs, f"PNG {png} from {dic} not in list of pngs"

def check_json_for_png(nested, pngs):
    """
    Given a nested json and list of pngs, make sure that every single png file has a corresponding json entry.

    If it does not, then add the default values to the json file.
    """

    user = os.getlogin()
    date = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    for png in pngs:
        tags = get_BIDS_fields_from_png(png)
        sub, ses, acq, run = tags['sub'], tags['ses'], tags['acq'], tags['run']
        current_d = nested
        for tag in [sub, ses, acq, run]:
            if tag:
                try:
                    current_d = current_d[tag]
                except KeyError:
                    print(f"PNG {png} has no corresponding json entry. Adding to json file.")
                    current_d = current_d.setdefault(tag, {})
        #if current_d is blank, then we need to add the default values
        if not current_d:
            row = {'QA_status': 'yes', 'reason': '', 'user': os.getlogin(), 'date': date}
            current_d.update(row)
            current_d.update(tags)
    
    return nested
                    

jsonf = "qa1.json"
with open(jsonf, 'r') as f:
    data = json.load(f)

#check to make sure that every single json entry has a corresponding png file
t1s_f = "t1s.txt"
with open(t1s_f, 'r') as f:
    t1s = f.readlines()
    t1s = [x.strip() for x in t1s]

paths, leaf_dicts = zip(*get_leaf_dicts(data))
print(are_unique_qa_dicts(leaf_dicts))

#now, go through every single path and ds combination to make sure that the path is represented in the ds
assert_tags_in_dict(paths, leaf_dicts)

#now make sure that every single json entry has a corresponding png file
pngs = _create_pngs(t1s)
check_png_for_json(leaf_dicts, pngs)

#now, make sure that every single png file has a corresponding json entry. If it does not, then we need to add to the json file
pngs += ['sub-2007_ses-2019_WMAtlasacq-MPRrun-1.png']
data = check_json_for_png(data, pngs)
print(data['sub-2007']['ses-2019']['acq-MPR']['run-1'])

100%|██████████| 75/75 [00:00<00:00, 93400.48it/s]

True
PNG sub-2007_ses-2019_WMAtlasacq-MPRrun-1.png has no corresponding json entry. Adding to json file.
PNG sub-2007_ses-2019_WMAtlasacq-MPRrun-1.png has no corresponding json entry. Adding to json file.
PNG sub-2007_ses-2019_WMAtlasacq-MPRrun-1.png has no corresponding json entry. Adding to json file.
{'QA_status': 'yes', 'reason': '', 'user': 'root', 'date': '2024-07-10 00:06:25', 'sub': 'sub-2007', 'ses': 'ses-2019', 'acq': 'acq-MPR', 'run': 'run-1'}





In [18]:
#check that no two dictionaries in a list are the same
def are_unique_dicts(dict_list):
    """
    Given a list of dictionaries, check that no two dictionaries are the same

    First, only considers the sub, ses, acq, and run elements
    """

    def add_items(curr_set, elt):
        curr_set.add(elt)
        return len(curr_set)

    seen = set()
    for d in dict_list:
        sub, ses, acq, run = d['sub'], d['ses'], d['acq'], d['run']
        if len(seen) == add_items(seen, (sub, ses, acq, run)):
            return False
    return True

frozenset({2, 4})
{frozenset({2, 4})}


In [31]:
#'sub-1_ses-1_WMAtlasacq-1run-1.png'
def _create_pngs(t1s):
    pngs = []
    for t1 in t1s:
        sub, ses, acq, run = get_sub_ses_acq_run_t1(Path(t1))
        png = f'{sub}_'
        if ses:
            png += f"{ses}_"
        png += "WMAtlas"
        if acq:
            png += f"{acq}"
        if run:
            png += f"{run}"
        png += ".png"
        pngs.append(png)

    return pngs


['sub-7003_ses-20100408_WMAtlasacq-SPGR.png', 'sub-201_ses-20090310_WMAtlasacq-MPRAGE.png', 'sub-3831_ses-20120508_WMAtlasacq-MPRAGE.png', 'sub-61040_ses-20120110_WMAtlasacq-MPRAGE.png', 'sub-257_ses-20111013_WMAtlasacq-MPRAGErun-1.png', 'sub-257_ses-20111013_WMAtlasacq-MPRAGErun-2.png', 'sub-2007_ses-20120320_WMAtlasacq-SPGR.png', 'sub-7055_ses-20120712_WMAtlasacq-SPGR.png', 'sub-60024_ses-20100909_WMAtlasacq-MPRAGE.png', 'sub-218_ses-20121108_WMAtlasacq-MPRAGE.png', 'sub-262_ses-20111123_WMAtlasacq-MPRAGE.png', 'sub-268_ses-20111129_WMAtlasacq-MPRAGE.png', 'sub-286_ses-20120531_WMAtlasacq-MPRAGE.png', 'sub-159_ses-20090519_WMAtlasacq-MPRAGE.png', 'sub-159_ses-20110520_WMAtlasacq-MPRAGE.png', 'sub-60008_ses-20120321_WMAtlasacq-MPRAGE.png', 'sub-7004_ses-20120322_WMAtlasacq-SPGR.png', 'sub-7004_ses-20100322_WMAtlasacq-SPGR.png', 'sub-1031_ses-20110623_WMAtlasacq-SPGR.png', 'sub-3592_ses-20110104_WMAtlasacq-MPRAGE.png', 'sub-3471_ses-20100526_WMAtlasacq-MPRAGE.png', 'sub-7045_ses-201201

In [None]:
len