# Get all XNAT subjects that have been tracked
Filter out any subjects whose source project is unknown.

In [1]:
import os
import pandas as pd
from datetime import datetime, timedelta
import re

In [2]:
# Per-project xnat CSV exports and the project label attached to each one.
# xnat_files and projects are parallel lists: entry k of one belongs to entry k of the other.
xnat_dir = '/home/local/VANDERBILT/litz/data/ajrccm/xnat20221201/'
xnat_files = [
    'MCL_CT.csv', 'Atwater_CT.csv', 'CANARY2a_CT.csv', 'HealthMyne_CT.csv',
    'LTP2_CT.csv', 'Moffitt_CT.csv', 'Pitt_CT.csv', 'VLRVUVA_CT.csv',
    'CTDNA_CT.csv', 'healthmyneGGO_CT.csv', 'MafeCANARY_CT.csv',
    'OptellumAneri_CT.csv', 'Optellum_CT.csv', 'TMA34_CT.csv', 'UW_CT.csv',
    'VLR_CT.csv',
]
xnat_paths = [os.path.join(xnat_dir, name) for name in xnat_files]
projects = [
    'MCL', 'Atwater', 'Canary', 'HealthMyne', 'LTP2', 'Moffitt', 'Pitt',
    'VLRVUV', 'CTDNA', 'healthmyneGGO', 'MafeCANARY', 'OptellumAneri',
    'Optellum', 'TMA34', 'UW', 'VLR',
]
ajr_path = "/home/local/VANDERBILT/litz/data/ajrccm/AJRCCM Dataset MCL and Date.xlsx"

# Read identifier-like columns as strings so pandas does not reinterpret them as numbers.
id_dtypes = {
    'XNAT_CTSESSIONDATA ID': str,
    'Subject': str,
    'dcmPatientName': str,
    'dcmPatientId': str,
    'Date': str,
}

# Load every export, tag its rows with the project label, and stack them into one frame.
xnat = pd.concat(
    [
        pd.read_csv(csv_path, dtype=id_dtypes).assign(xnat_project=project)
        for csv_path, project in zip(xnat_paths, projects)
    ],
    axis=0,
    ignore_index=True,
)
xnat['Date'] = pd.to_datetime(xnat['Date'], format='%Y-%m-%d')
# try to impute date from ID
def impute_date(x):
    """Return the date of one xnat row, falling back to the session ID.

    Parameters
    ----------
    x : row-like (e.g. a pandas Series)
        Must provide the entries 'Date' and 'XNAT_CTSESSIONDATA ID'.

    Returns
    -------
    The row's existing 'Date' when it is not null. Otherwise a
    pandas.Timestamp parsed from the second '_'-separated token of
    'XNAT_CTSESSIONDATA ID' (expected as YYYYMMDD), or None when the ID is
    missing, has no second token, or the token is not a valid date.
    """
    if not pd.isnull(x['Date']):
        return x['Date']
    try:
        token = x['XNAT_CTSESSIONDATA ID'].split('_')[1]
        # Return a Timestamp rather than a formatted string so imputed values
        # have the same type as dates in a datetime-typed 'Date' column.
        return pd.to_datetime(token, format='%Y%m%d')
    except (AttributeError, IndexError, TypeError, ValueError):
        # AttributeError: ID is not a string (e.g. NaN/None).
        # IndexError: ID has no second '_'-separated token.
        # TypeError/ValueError: token cannot be parsed as YYYYMMDD.
        return None
# Row-wise pass: each row's 'Date' is replaced by whatever impute_date returns for that row.
xnat['Date'] = xnat.apply(impute_date, axis=1)
# parse mcl_id from MCLID_MCLID_CTDate and MCL_SPORE_MCLID formats
def parse_mclid(x):
    """Return the first all-digit '_'-separated token of x, or None if there is none.

    x is split on '_' and the tokens are scanned left to right; the first
    token for which str.isdigit() is true is returned unchanged (as a string).
    """
    digit_tokens = (token for token in x.split('_') if token.isdigit())
    return next(digit_tokens, None)

# Keep only xnat records that have a Subject. The Date filter below is
# currently commented out, so rows with a missing Date are retained.
# .copy() makes the filtered frame independent of the frame it was sliced
# from, so adding the 'mcl_id' column is not an assignment onto a slice
# (the pattern that triggers pandas' SettingWithCopyWarning).
xnat = xnat[xnat['Subject'].notna()].copy()
# xnat = xnat[~xnat['Date'].isnull()]
# mcl_id is None for any Subject without an all-digit '_'-separated token.
xnat['mcl_id'] = xnat['Subject'].apply(parse_mclid)

  xnat['Date'] = xnat.apply(lambda x: impute_date(x), axis=1)


In [3]:
# Collapse to one row per mcl_id by taking the maximum of every other column within the group.
# NOTE(review): each column's max is computed independently, so a resulting row can
# combine values that came from different sessions of the same subject - confirm this is intended.
xnat_subj = xnat.groupby('mcl_id', as_index=False).max()

# Subjects listed in the unmatched file are removed; the remainder is saved as the matched cohort.
unmatched_csv = '/home/local/VANDERBILT/litz/github/MASILab/DeepLungScreening/cohorts/xnat/xnat_unmatched_v2.csv'
matched_csv = '/home/local/VANDERBILT/litz/github/MASILab/DeepLungScreening/cohorts/xnat/xnat_matched_v2.csv'
unmatched = pd.read_csv(unmatched_csv, dtype={'mcl_id':str})
is_unmatched = xnat_subj['mcl_id'].isin(unmatched['mcl_id'])
matched = xnat_subj[~is_unmatched]
matched.to_csv(matched_csv, index=False)
# NOTE(review): the three counts only satisfy the subtraction exactly when every
# unmatched id occurs exactly once in xnat_subj; otherwise the printed numbers will not add up.
print(f"Total xnat (n={len(xnat_subj)}) - unmatched (n={len(unmatched)}) = matched (n={len(matched)})")

Total xnat (n=4861) - unmatched (n=755) = matched (n=4108)


  xnat_subj = xnat.groupby('mcl_id', as_index=False).max()


In [4]:
# Write the matched mcl_ids as a single comma-separated line for xnatdownload.
dst = '/home/local/VANDERBILT/litz/github/MASILab/DeepLungScreening/cohorts/xnat/xnat_matched_v2.txt'
idstring = ','.join(matched['mcl_id'])  # comma sep ids
with open(dst, 'w') as f:
    f.write(idstring)

# Write the xnat project ids in the same comma-separated form.
projectids = [
    'MCL', 'Atwater', 'CANARY', 'HealthMyne', 'THO1292', 'Moffitt', 'Pitt',
    'VLR-VUVA', 'CTDNA', 'GGO', 'MafeCANARY', 'OptellumAneri', 'Optellum',
    'TMA34', 'UW', 'VLR',
]
pstring = ','.join(projectids)
project_dst = '/home/local/VANDERBILT/litz/github/MASILab/DeepLungScreening/cohorts/xnat/project_ids.txt'
with open(project_dst, 'w') as f:
    f.write(pstring)

In [8]:
# skip xnatdownload done
success2 = pd.read_csv('/nfs/masi/MCL/xnat/xnat20230505_matchedv2/download_report_v2.csv', dtype={'subject_label':str})
success3 = pd.read_csv('/nfs/masi/MCL/xnat/xnat20230505_matchedv2/download_report_v3.csv', dtype={'subject_label':str})
success4 = pd.read_csv('/nfs/masi/MCL/xnat/xnat20230505_matchedv2/download_report_v4.csv', dtype={'subject_label':str})
success5 = pd.read_csv('/nfs/masi/MCL/xnat/xnat20230505_matchedv2/download_report_v5.csv', dtype={'subject_label':str})
success6 = pd.read_csv('/nfs/masi/MCL/xnat/xnat20230505_matchedv2/download_report_v6.csv', dtype={'subject_label':str})
success = pd.concat([success2, success3, success4, success5, success6])
print(len(matched))
not_done = matched[~matched['mcl_id'].isin(success['subject_label'])]
not_done = not_done[~not_done['mcl_id'].isin(['27471483422', '18375153884'])]
not_done_idstring = ','.join(not_done['mcl_id'].tolist())
with open('/home/local/VANDERBILT/litz/github/MASILab/DeepLungScreening/cohorts/xnat/xnat_matched_v2_job6.txt', 'w') as f:
    f.write(not_done_idstring)

4108


In [11]:
# Sanity check: read the job file back and count its comma-separated entries.
job6_path = '/home/local/VANDERBILT/litz/github/MASILab/DeepLungScreening/cohorts/xnat/xnat_matched_v2_job6.txt'
with open(job6_path, 'r') as f:
    a = f.read()
# Number of entries = number of separators + 1.
a.count(',') + 1

2345

In [66]:
# Look up a single subject id among the rows still to be downloaded.
not_done.loc[not_done['mcl_id'].eq('18375153884')]

Unnamed: 0,mcl_id,Subject,Date,Age,xnat_project,Hand,YOB,CT Sessions


In [60]:
# Look up the same subject id among the rows of the download reports.
success.loc[success['subject_label'].eq('18375153884')]

Unnamed: 0,object_type,project_id,subject_label,session_type,session_label,as_label,as_type,as_description,quality,resource,fpath
