# Get all xnat subjects that have been tracked
Filter out any subjects that we don't know which project they came from.

In [2]:
import os
import pandas as pd
from datetime import datetime, timedelta
import re

In [11]:
xnat_files = ['MCL_CT.csv', 'Atwater_CT.csv','CANARY2a_CT.csv', 'HealthMyne_CT.csv', 'LTP2_CT.csv', 'Moffitt_CT.csv', 
    'Pitt_CT.csv', 'VLRVUVA_CT.csv', 'CTDNA_CT.csv', 'healthmyneGGO_CT.csv', 'MafeCANARY_CT.csv',
    'OptellumAneri_CT.csv', 'Optellum_CT.csv', 'TMA34_CT.csv', 'UW_CT.csv', 'VLR_CT.csv']
xnat_paths = [os.path.join('/home/local/VANDERBILT/litz/data/ajrccm/xnat20221201/', f) for f in xnat_files] 
# projects = ['MCL', 'Atwater', 'Canary', 'HealthMyne', 'LTP2', 'Moffitt', 'Pitt', 'VLRVUV', 'CTDNA', 
#     'healthmyneGGO', 'MafeCANARY', 'OptellumAneri', 'Optellum', 'TMA34', 'UW', 'VLR']
projects = ['MCL', 'Atwater', 'CANARY', 'HealthMyne','THO1292','Moffitt','Pitt', 'VLR-VUVA', 'CTDNA', 'GGO',  'MafeCANARY', 'OptellumAneri', 'Optellum', 'TMA34', 'UW', 'VLR']
ajr_path = "/home/local/VANDERBILT/litz/data/ajrccm/AJRCCM Dataset MCL and Date.xlsx"
xnat = []
for i, xnat_path in enumerate(xnat_paths):
    df = pd.read_csv(xnat_path, dtype={'XNAT_CTSESSIONDATA ID': str, 'Subject': str, 'dcmPatientName': str, 'dcmPatientId': str, 'Date': str})
    df['xnat_project'] = projects[i]
    xnat.append(df)
xnat = pd.concat(xnat, axis=0, ignore_index=True)
xnat['Date'] = pd.to_datetime(xnat['Date'], format='%Y-%m-%d')
# try to impute date from ID
def impute_date(x):
    if pd.isnull(x['Date']):
        try:
            date = pd.to_datetime(x['XNAT_CTSESSIONDATA ID'].split('_')[1], format='%Y%m%d')
            return date.strftime("%Y-%m-%d")
        except:
            return None
    else:
        return x['Date']
xnat['Date'] = xnat.apply(lambda x: impute_date(x), axis=1)
# parse mcl_id from MCLID_MCLID_CTDate and MCL_SPORE_MCLID formats
def parse_mclid(x):
    split = x.split('_')
    for s in split:
        if s.isdigit():
            return s

# xnat record must have Subject and Date
xnat = xnat[~xnat['Subject'].isnull()]
# xnat = xnat[~xnat['Date'].isnull()]
xnat['mcl_id'] = xnat['Subject'].apply(lambda x: parse_mclid(x))

  xnat['Date'] = xnat.apply(lambda x: impute_date(x), axis=1)


In [7]:
xnat_subj = xnat.groupby('mcl_id', as_index=False).max()

unmatched = pd.read_csv('/home/local/VANDERBILT/litz/github/MASILab/DeepLungScreening/cohorts/xnat/xnat_unmatched_v2.csv', dtype={'mcl_id':str})
matched = xnat_subj[~xnat_subj['mcl_id'].isin(unmatched['mcl_id'])]
# matched.to_csv('/home/local/VANDERBILT/litz/github/MASILab/DeepLungScreening/cohorts/xnat/xnat_matched_v2.csv', index=False)
print(f"Total xnat (n={len(xnat_subj)}) - unmatched (n={len(unmatched)}) = matched (n={len(matched)})")

Total xnat (n=4861) - unmatched (n=755) = matched (n=4108)


  xnat_subj = xnat.groupby('mcl_id', as_index=False).max()


In [5]:
# write mcl_ids as comma separated list for xnatdownload

idstring = ','.join(matched['mcl_id'].tolist()) # comma sep ids
dst = '/home/local/VANDERBILT/litz/github/MASILab/DeepLungScreening/cohorts/xnat/xnat_matched_v2.txt'
with open(dst, 'w') as f:
    f.write(idstring)

# write project list from xnat
projectids = ['MCL', 'Atwater', 'CANARY', 'HealthMyne','THO1292','Moffitt','Pitt', 'VLR-VUVA', 'CTDNA', 'GGO',  'MafeCANARY', 'OptellumAneri', 'Optellum', 'TMA34', 'UW', 'VLR']
pstring = ','.join(projectids)
with open('/home/local/VANDERBILT/litz/github/MASILab/DeepLungScreening/cohorts/xnat/project_ids.txt', 'w') as f:
    f.write(pstring)

In [6]:
# skip xnatdownload done
success2 = pd.read_csv('/nfs/masi/MCL/xnat/xnat20230505_matchedv2/download_report_v2.csv', dtype={'subject_label':str})
success3 = pd.read_csv('/nfs/masi/MCL/xnat/xnat20230505_matchedv2/download_report_v3.csv', dtype={'subject_label':str})
success4 = pd.read_csv('/nfs/masi/MCL/xnat/xnat20230505_matchedv2/download_report_v4.csv', dtype={'subject_label':str})
success5 = pd.read_csv('/nfs/masi/MCL/xnat/xnat20230505_matchedv2/download_report_v5.csv', dtype={'subject_label':str})
success6 = pd.read_csv('/nfs/masi/MCL/xnat/xnat20230505_matchedv2/download_report_v6.csv', dtype={'subject_label':str})
success7 = pd.read_csv('/nfs/masi/MCL/xnat/xnat20230505_matchedv2/download_report_v7.csv', dtype={'subject_label':str})
success = pd.concat([success2, success3, success4, success5, success6, success7])
print(len(matched))
not_done = matched[~matched['mcl_id'].isin(success['subject_label'])]
not_done = not_done[~not_done['mcl_id'].isin(['27471483422', '18375153884'])]
not_done_idstring = ','.join(not_done['mcl_id'].tolist())
with open('/home/local/VANDERBILT/litz/github/MASILab/DeepLungScreening/cohorts/xnat/xnat_matched_v2_job7.txt', 'w') as f:
    f.write(not_done_idstring)

4108


In [7]:
with open('/home/local/VANDERBILT/litz/github/MASILab/DeepLungScreening/cohorts/xnat/xnat_matched_v2_job7.txt', 'r') as f:
    a = f.read()
len(a.split(','))

501

## Unmatched data

In [12]:
unmatched = pd.read_csv('/home/local/VANDERBILT/litz/github/MASILab/DeepLungScreening/cohorts/xnat/xnat_unmatched_v2.csv', dtype={'mcl_id':str})
xnat_unmatched = xnat.merge(unmatched, on='XNAT_CTSESSIONDATA ID') # merge xnat records with unmatched list
xnat_sess_unmatched = xnat[xnat['mcl_id'].isin(xnat_unmatched['mcl_id_x'])]  # xnat session IDs of all unmatched
xnat_sess_unmatched = xnat_sess_unmatched[xnat_sess_unmatched['XNAT_CTSESSIONDATA ID'].notnull()]
xnat_sess_unmatched
xnat_sess_unmatched.to_csv('/home/local/VANDERBILT/litz/github/MASILab/DeepLungScreening/cohorts/xnat/xnat_sess_unmatched.csv', index=False)

In [9]:
unmatched = pd.read_csv('/home/local/VANDERBILT/litz/github/MASILab/DeepLungScreening/cohorts/xnat/xnat_unmatched_v2.csv', dtype={'mcl_id':str})
xnat_unmatched = xnat.merge(unmatched, on='XNAT_CTSESSIONDATA ID') # merge xnat records with unmatched list
unmatched_ids = xnat[xnat['mcl_id'].isin(xnat_unmatched['mcl_id_x'])]['XNAT_CTSESSIONDATA ID'] # xnat session IDs of all unmatched
unmatched_ids = unmatched_ids.apply(lambda x: str(x))

# comma separated list of unmatched IDs
dst = '/home/local/VANDERBILT/litz/github/MASILab/DeepLungScreening/cohorts/xnat/xnat_unmatched_v2.txt'
with open(dst, 'w') as f:
    f.write(','.join(unmatched_ids.tolist()))
# check txt file contains ids
with open(dst, 'r') as f:
    a = f.read()
    print(len(a.split(',')))

1635


download script:
`cat /home/local/VANDERBILT/litz/github/MASILab/DeepLungScreening/cohorts/xnat/xnat_unmatched_v2.txt | xargs -n 1 -I {} Xnatdownload -p MCL,Atwater,CANARY,HealthMyne,THO1292,Moffitt,Pitt,VLR-VUVA,CTDNA,GGO,MafeCANARY,OptellumAneri,Optellum,TMA34,UW,VLR -d /nfs/masi/MCL/xnat/xnat20230616_unmatched --rs DICOM --sess {} -s all`

In [44]:
# split xnatdownload into n jobs
dst_root = "/home/local/VANDERBILT/litz/github/MASILab/DeepLungScreening/cohorts/xnat/"
njobs = 4
for n in range(njobs):
    jobids = unmatched_ids[n::njobs]
    jobfname = os.path.join(dst_root, f"xnat_unmatched_v2_job{n}.txt")
    with open(jobfname, 'w') as f:
        f.write(','.join(jobids.tolist()))

    with open(jobfname, 'r') as f:
        a = f.read()
        print(len(a.split(',')))

409
409
409
408


  jobids = unmatched_ids[n::njobs]


### remove unmatched data from XNAT

In [24]:
xnat_sess_unmatched.iloc[1423:]
remainder = xnat_sess_unmatched.iloc[1423:] # xnat delete failed after this row
remainder
# remainder = xnat_sess_unmatched.iloc[1423:].to_csv('/home/local/VANDERBILT/litz/github/MASILab/DeepLungScreening/cohorts/xnat/xnat_sess_unmatched_remainder1.csv')

Unnamed: 0,XNAT_CTSESSIONDATA ID,Subject,Date,Age,dcmAccessionNumber,dcmPatientId,dcmPatientName,UID,Scans,xnat_project,M/F,Hand,YOB,CT Sessions,mcl_id
10486,10000001-01,10000001,2016-06-24,,,,,,Series0003(1),CANARY,,,,,10000001
10487,10000005-01,10000005,2016-06-24,,,,,,_Series0106(1),CANARY,,,,,10000005
10488,10000006-01,10000006,2016-06-24,,,,,,"Dose Info_Series80228(1), NO IV_Series0003(1),...",CANARY,,,,,10000006
10489,10000009-01,10000009,2016-06-24,,,,,,Series0106(1),CANARY,,,,,10000009
10490,10000010-01,10000010,2016-06-24,,,,,,"2X 1.8_Series0002(1), AXIAL CHEST MIP_Series80...",CANARY,,,,,10000010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12047,10000075-01,10000075,2016-06-24,,,,,,1X.5 THINS_Series0002(1),VLR,,,,,10000075
12048,10000076-01,10000076,2016-06-24,,,,,,THIN CHEST W_O_Series0002(1),VLR,,,,,10000076
12049,10000077-01,10000077,2016-06-24,,,,,,Series0002(1),VLR,,,,,10000077
12050,10000078-01,10000078,2016-06-24,,,,,,CT 3_Series0003(1),VLR,,,,,10000078


In [37]:
project_count = xnat_sess_unmatched.groupby(['Subject', 'XNAT_CTSESSIONDATA ID'], as_index=False)['xnat_project'].count().rename(columns={'xnat_project':'project_count'})
multi_project = project_count[project_count['project_count'] > 1]
vlr = xnat[xnat['xnat_project']=='VLR']
multi_vlr = vlr.merge(multi_project, on=['Subject', 'XNAT_CTSESSIONDATA ID']) # subjects in vlr also in another project

single_project = project_count[project_count['project_count'] == 1]
single_remainder = remainder.merge(single_project, on=['Subject', 'XNAT_CTSESSIONDATA ID']) # subjects only in a single project
remainder = pd.concat([multi_vlr, single_remainder])
remainder.to_csv('/home/local/VANDERBILT/litz/github/MASILab/DeepLungScreening/cohorts/xnat/xnat_sess_unmatched_remainder1.csv', index=False)

In [7]:
remainder = pd.read_csv('/home/local/VANDERBILT/litz/github/MASILab/DeepLungScreening/cohorts/xnat/xnat_sess_unmatched_remainder1.csv')
remainder.iloc[2:].to_csv('/home/local/VANDERBILT/litz/github/MASILab/DeepLungScreening/cohorts/xnat/xnat_sess_unmatched_remainder2.csv', index=False)


In [8]:
remainder2 = pd.read_csv('/home/local/VANDERBILT/litz/github/MASILab/DeepLungScreening/cohorts/xnat/xnat_sess_unmatched_remainder2.csv')
remainder2

Unnamed: 0,XNAT_CTSESSIONDATA ID,Subject,Date,Age,dcmAccessionNumber,dcmPatientId,dcmPatientName,UID,Scans,xnat_project,M/F,Hand,YOB,CT Sessions,mcl_id,project_count,project_count_x,project_count_y
0,10000004-01,10000004,2016-06-24,,,,,,Series0006(1),VLR,,,,,10000004,2.0,,
1,10000005-01,10000005,2016-06-24,,,,,,_Series0106(1),VLR,,,,,10000005,3.0,,
2,10000006-01,10000006,2016-06-24,,,,,,"Dose Info_Series80228(1), NO IV_Series0003(1),...",VLR,,,,,10000006,3.0,,
3,10000009-01,10000009,2016-06-24,,,,,,Series0106(1),VLR,,,,,10000009,4.0,,
4,10000010-01,10000010,2016-06-24,,,,,,"2X 1.8_Series0002(1), AXIAL CHEST MIP_Series80...",VLR,,,,,10000010,3.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76,10000058-01,10000058,2016-06-24,,,,,,Series0005(1),VLR,,,,,10000058,,1.0,1.0
77,10000059-01,10000059,2016-06-24,,,,,,Series0002(1),VLR,,,,,10000059,,1.0,1.0
78,10000061-01,10000061,2016-06-24,,,,,,CT 3_Series0003(1),VLR,,,,,10000061,,1.0,1.0
79,10000072-01,10000072,2016-06-24,,,,,,Series0002(1),VLR,,,,,10000072,,1.0,1.0
