# Extra scans in XNAT


In [8]:
import os
import pandas as pd
from datetime import datetime, timedelta
import re

In [9]:
xnat_files = ['MCL_CT.csv', 'Atwater_CT.csv','CANARY2a_CT.csv', 'HealthMyne_CT.csv', 'LTP2_CT.csv', 'Moffitt_CT.csv', 
    'Pitt_CT.csv', 'VLRVUVA_CT.csv', 'CTDNA_CT.csv', 'healthmyneGGO_CT.csv', 'MafeCANARY_CT.csv',
    'OptellumAneri_CT.csv', 'Optellum_CT.csv', 'TMA34_CT.csv', 'UW_CT.csv', 'VLR_CT.csv']
xnat_paths = [os.path.join('/home/local/VANDERBILT/litz/data/ajrccm/xnat20221201/', f) for f in xnat_files] 
projects = ['MCL', 'Atwater', 'Canary', 'HealthMyne', 'LTP2', 'Moffitt', 'Pitt', 'VLRVUV', 'CTDNA', 
    'healthmyneGGO', 'MafeCANARY', 'OptellumAneri', 'Optellum', 'TMA34', 'UW', 'VLR']
ajr_path = "/home/local/VANDERBILT/litz/data/ajrccm/AJRCCM Dataset MCL and Date.xlsx"
xnat = []
for i, xnat_path in enumerate(xnat_paths):
    df = pd.read_csv(xnat_path, dtype={'XNAT_CTSESSIONDATA ID': str, 'Subject': str, 'dcmPatientName': str, 'dcmPatientId': str, 'Date': str})
    df['xnat_project'] = projects[i]
    xnat.append(df)
xnat = pd.concat(xnat, axis=0, ignore_index=True)
xnat['Date'] = pd.to_datetime(xnat['Date'], format='%Y-%m-%d')
# try to impute date from ID
def impute_date(x):
    if pd.isnull(x['Date']):
        try:
            date = pd.to_datetime(x['XNAT_CTSESSIONDATA ID'].split('_')[1], format='%Y%m%d')
            return date.strftime("%Y-%m-%d")
        except:
            return None
    else:
        return x['Date']
xnat['Date'] = xnat.apply(lambda x: impute_date(x), axis=1)
# parse mcl_id from MCLID_MCLID_CTDate and MCL_SPORE_MCLID formats
def parse_mclid(x):
    split = x.split('_')
    for s in split:
        if s.isdigit():
            return s

# xnat record must have Subject and Date
xnat = xnat[~xnat['Subject'].isnull()]
# xnat = xnat[~xnat['Date'].isnull()]
xnat['mcl_id'] = xnat['Subject'].apply(lambda x: parse_mclid(x))

  xnat['Date'] = xnat.apply(lambda x: impute_date(x), axis=1)


In [None]:
xnat[xnat['Date'].isnull()]

Unnamed: 0,XNAT_CTSESSIONDATA ID,Subject,Date,Age,dcmAccessionNumber,dcmPatientId,dcmPatientName,UID,Scans,xnat_project,M/F,Hand,YOB,CT Sessions,mcl_id
2765,19669946331__093734_859000,19669946331__093734_859000,NaT,,2819497684894126,,19669946331,1.2.840.113654.2.70.1.151947200095643505920966...,"AX LUNG NCE(1), AX NCE(1), AX NCE Z(1), COR MP...",MCL,,,,,19669946331
5588,30009657087-20151124,30009657087,NaT,,,,,,1.25(1),MCL,,,,,30009657087
5599,30041657825__130053_000000,30041657825__130053_000000,NaT,,2819497684894126,,30041657825,1.2.840.113654.2.70.1.172306694182546639368758...,"B30 SOFT 3.0 SPO cor(1), B30 SOFT 3.0 SPO...",MCL,,,,,30041657825
5842,31041894607__171930_606000,31041894607__171930_606000,NaT,,2819497684894126,,31041894607,1.2.840.113654.2.70.1.327286486989233862855957...,"AX LUNG NCE(1), AX NCE(1), AX NCE Z(1), COR MP...",MCL,,,,,31041894607
6395,33400829042__101902,33400829042__101902,NaT,,2819497684894126,,33400829042,1.2.840.113654.2.70.1.323074360637228262517743...,"CHEST(1), CT Chest1 Cor 3 Avg(1), CT Chest2 Sa...",MCL,,,,,33400829042
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10871,0514-052,0514-052,NaT,,,,,,CT(1),Moffitt,,,,,
10872,0514-060,0514-060,NaT,,,,,,CT(1),Moffitt,,,,,
10873,0514-061,0514-061,NaT,,,,,,CT(1),Moffitt,,,,,
10874,0514-064,0514-064,NaT,,,,,,CT(1),Moffitt,,,,,


: 

In [10]:
# redcap subjects
redcap = "/home/local/VANDERBILT/litz/data/mcl/biorepository_mcls.csv"
redcap = pd.read_csv(redcap, dtype={'MCL.ID': str})
redcap = redcap.rename(columns={'MCL.ID': 'mcl_id'})

In [11]:
# duplicates - sessions can be duplicated by using different MCLID naming or uploading same session across projects
dups = xnat.groupby(['mcl_id', 'Date'], as_index=False).size()
dups = dups[dups['size']>1]
dups = dups.merge(xnat, on=['mcl_id', 'Date']).rename(columns={'size':'num_duplicates'})
dups.to_csv("/home/local/VANDERBILT/litz/github/MASILab/DeepLungScreening/cohorts/extra/xnat_duplicates.csv", index=False)

# account for duplicate sessions
xnat_grp = xnat.groupby(['mcl_id', 'Date'], as_index=False).max()

  xnat_grp = xnat.groupby(['mcl_id', 'Date'], as_index=False).max()


In [12]:
both = redcap.merge(xnat_grp, on='mcl_id', how='inner') # both 
nomcl = xnat[xnat['mcl_id'].isnull()] # xnat records without MCL ID (LUNG-###, DECAMP ID, etc.)
redcap_only = redcap[~redcap['mcl_id'].isin(xnat_grp['mcl_id'])]# in redcap but not xnat
xnat_only = xnat_grp[~xnat_grp['mcl_id'].isin(redcap['mcl_id'])] # in xnat but not redcap

print(len(both['mcl_id'].unique()))
print(len(nomcl))
print(len(redcap_only['mcl_id'].unique()))
print(len(xnat_only['Subject'].unique()))

1421
146
7230
3446


In [13]:
# concatenate redcap-xnat intersection and nomcl
cols = ['mcl_id', 'Date', 'XNAT_CTSESSIONDATA ID', 'xnat_project']
both_grp = both[cols].groupby(['mcl_id', 'XNAT_CTSESSIONDATA ID', 'Date'], as_index=False).max()

nomcl['other_id'] = nomcl['Subject']
nomcl_grp = nomcl[cols + ['other_id']].groupby(['other_id', 'XNAT_CTSESSIONDATA ID', 'Date'], as_index=False).max()
extra_cohort = pd.concat([both_grp, nomcl_grp])
extra_cohort

# redcap_only = redcap_only['mcl_id']
# xnat_only = xnat_only[['mcl_id', 'Date', 'XNAT_CTSESSIONDATA ID', 'xnat_project']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nomcl['other_id'] = nomcl['Subject']


Unnamed: 0,mcl_id,XNAT_CTSESSIONDATA ID,Date,xnat_project,other_id
0,10179873529,10179873529-20120508,2012-05-08,MCL,
1,10250356363,10250356363-20130327,2013-03-27,MCL,
2,10250356363,10250356363-20130923,2013-09-23,MCL,
3,10310069205,10310069205_20181005,2018-10-05,MCL,
4,10321528884,10321528884-20160629,2016-06-29,MafeCANARY,
...,...,...,...,...,...
45,,LUNG-0103_20150622,2017-06-23,Canary,LUNG-0103
46,,LUNG-0104_20150917,2017-06-23,Canary,LUNG-0104
47,,LUNG-0105_20150628,2017-06-23,Canary,LUNG-0105
48,,LUNG-0106_20170108,2017-06-23,Canary,LUNG-0106


## Subjects in XNAT/Redcap but not in any other cohorts

In [14]:
# vumc cohorts from csv file
vumc_cohort_paths = ["../cohorts/bronch/bronch_v1.csv", "../cohorts/multi_mcl/multi_mcl_v1.csv", "../cohorts/vlsp/vlsp_cohort_v1.csv"]
cohorts=['bronch', 'multi_mcl', 'vlsp']
vumc_cohorts = []
for i, path in enumerate(vumc_cohort_paths):
    cohort = pd.read_csv(path)
    cohort['cohort'] = cohorts[i]
    vumc_cohorts.append(cohort[['pid', 'id', 'session', 'scan_date', 'cohort']])
vumc_cohorts = pd.concat(vumc_cohorts, axis=0, ignore_index=True)
vumc_cohorts['scan_date'] = vumc_cohorts['scan_date'].apply(lambda x: pd.to_datetime(x))

# livu cohort
img_dir = "/nfs/masi/MCL/xnat_upload/xnat20210825_ImageVU/passed_qa"
rows = []
mcl_ids = os.listdir(img_dir)
for mclid in mcl_ids:
    dates = []
    for i, sessionid in enumerate(os.listdir(os.path.join(img_dir, str(mclid)))):
        regex = f'(?<={mclid})[\d]*$'
        session_date_str = re.search(regex, sessionid).group()
        session_date = datetime.strptime(session_date_str, '%Y%m%d')
        dates.append(session_date)
        
    # sort list by earliest to latest
    dates.sort()
    
    for i, date in enumerate(dates):
        rows.append({'pid':str(mclid), 'session': int(i), 'scan_date': date})

livu = pd.DataFrame(rows)
livu['cohort'] = 'livu'
def get_scanid(x):
    datestr = x['scan_date'].strftime('%Y%m%d')
    return f"{x['pid']}time{datestr}"
livu['id'] = livu.apply(lambda x: get_scanid(x), axis=1)

vumc_cohorts = pd.concat([vumc_cohorts, livu])

### Extra XNAT-Redcap on the subject level

In [15]:
vumc_cohorts['pid']

0       27054566930
1       20802811825
2       20802811825
3       24540308812
4       29479908093
           ...     
4105     2251425869
4106     2251425869
4107    18673442340
4108    18673442340
4109    18673442340
Name: pid, Length: 7239, dtype: object

In [16]:
both_grp_grp = both_grp.groupby('mcl_id', as_index=False).max()
minus_cohorts = both_grp_grp[~both_grp_grp['mcl_id'].isin(vumc_cohorts['pid'])]

nomcl_grp_grp = nomcl_grp.groupby('other_id', as_index=False).max()
both_df = pd.concat([
    minus_cohorts[['mcl_id', 'xnat_project', 'XNAT_CTSESSIONDATA ID']], 
    nomcl[['other_id', 'xnat_project', 'XNAT_CTSESSIONDATA ID']]
])
# both_df.to_csv('/home/local/VANDERBILT/litz/github/MASILab/DeepLungScreening/cohorts/extra/xnat_redcap_ids.csv', index=False)


### Extra XNAT but not Redcap on the subject level

In [17]:
xnat_only = xnat_only[['mcl_id', 'Date', 'XNAT_CTSESSIONDATA ID', 'xnat_project']]
xnat_only_grp = xnat_only.groupby('mcl_id', as_index=False).max()
xnat_minus_cohorts = xnat_only_grp[~xnat_only_grp['mcl_id'].isin(vumc_cohorts['pid'])]
print(f"Xnat only ({len(xnat_only_grp)}) minus vumc cohorts ({len(vumc_cohorts['pid'].unique())}) = {len(xnat_minus_cohorts)}")
xnat_minus_cohorts = xnat_minus_cohorts[['mcl_id', 'xnat_project','XNAT_CTSESSIONDATA ID']]


Xnat only (3435) minus vumc cohorts (3786) = 1529


In [18]:
# all xnat records that are not in vumc cohorts
both_df['in_biorepo'] = True
xnat_minus_cohorts['in_biorepo'] = False
extra_xnat = pd.concat([both_df, xnat_minus_cohorts])
# extra_xnat.to_csv('/home/local/VANDERBILT/litz/github/MASILab/DeepLungScreening/cohorts/extra/extra_xnat_ids.csv', index=False)

### Extra XNAT - CANARY intersection

In [48]:
canary = "/home/local/VANDERBILT/litz/data/mcl/Identified CANARY2 NODULE DATA_2017JUN21 (002).csv"
canary = pd.read_csv(canary, dtype={'MCL ID': str}).rename(columns={'DE-IDENTIFIED #': 'other_id', 'MCL ID': 'mcl_id'})
otherid = both_df[~both_df['other_id'].isnull()]
xnat_canary = canary.merge(otherid, on='other_id', how='inner')
# extra_xnat['in_canary2'] = extra_xnat['other_id'].isin()
extra_xnat['in_canary2'] = extra_xnat['other_id'].isin(xnat_canary['other_id'])
extra_xnat
extra_xnat.to_csv('/home/local/VANDERBILT/litz/github/MASILab/DeepLungScreening/cohorts/extra/extra_xnat_ids.csv', index=False)

### Extra XNAT on the scan level

In [104]:
merged = both_grp.merge(vumc_cohorts, left_on=['mcl_id', 'Date'], right_on=['pid', 'scan_date'], how='outer', indicator=True)
anti_merge = merged[(merged['_merge']=='left_only')].drop('_merge', axis=1)[both_grp.columns]
# anti_merge = anti_merge[~anti_merge['mcl_id'].isnull()][both_grp.columns]
print(len(anti_merge['mcl_id'].unique()))
anti_merge['unreferenced_subject'] = ~anti_merge['mcl_id'].isin(vumc_cohorts['pid'])
anti_merge

965


Unnamed: 0,mcl_id,XNAT_CTSESSIONDATA ID,Date,xnat_project,unreferenced_subject
0,10179873529,10179873529-20120508,2012-05-08,MCL,True
3,10310069205,10310069205_20181005,2018-10-05,MCL,True
4,10321528884,10321528884-20160629,2016-06-29,MafeCANARY,True
6,1035612163,1035612163-20120423,2012-04-23,MCL,True
8,1039114794,1039114794-20030604,2003-06-04,MCL,True
...,...,...,...,...,...
2889,9824474782,9824474782_20150521,2015-05-21,MCL,True
2890,9824474782,9824474782_20150819,2015-08-19,MCL,True
2891,9824474782,9824474782_20160210,2016-02-10,MCL,True
2892,9824474782,9824474782_20160225,2016-02-25,MCL,True


In [9]:
merged = xnat_only.merge(vumc_cohorts, left_on=['mcl_id', 'Date'], right_on=['pid', 'scan_date'], how='outer', indicator=True)
anti_merge_xnat = merged[(merged['_merge']=='left_only')].drop('_merge', axis=1)[xnat_only.columns]
print(len(anti_merge_xnat['mcl_id'].unique()))
anti_merge_xnat['unreferenced_subject'] = ~anti_merge_xnat['mcl_id'].isin(vumc_cohorts['pid'])
anti_merge_xnat

1103


Unnamed: 0,mcl_id,Date,XNAT_CTSESSIONDATA ID,xnat_project,unreferenced_subject
0,00000087,2017-01-13,SPORE_00000087-20170113,MCL,True
1,00000929,2018-01-19,SPORE_00000929-20180119,MCL,True
2,00001089,2018-01-19,SPORE_00001089-20180119,MCL,True
3,00001124,2018-01-05,SPORE_00001124-20180105,MCL,True
4,03930575362,2009-02-06,03930575362_03930575362-20090206,MCL,True
...,...,...,...,...,...
6276,9999999999,2016-07-21,9999999999-20160721,MCL,True
6277,9999999999,2018-11-28,9999999999-20181128,MCL,True
6278,9999999999,2019-03-04,9999999999-20190304,MCL,True
6279,9999999999,2019-11-11,9999999999-20191111,MCL,True


In [135]:
# exclude duplicates sessions
# anti_merge.to_csv("/home/local/VANDERBILT/litz/github/MASILab/DeepLungScreening/cohorts/extra/extra_xnat.csv", index=False)
# anti_merge_xnat.to_csv("/home/local/VANDERBILT/litz/github/MASILab/DeepLungScreening/cohorts/extra/extra_xnat_noredcap.csv", index=False)

In [141]:
a = pd.read_csv("/home/local/VANDERBILT/litz/github/MASILab/DeepLungScreening/cohorts/extra/extra_xnat_noredcap.csv")
a.groupby('mcl_id').max()['unreferenced_subject'].value_counts()

True     1053
False      49
Name: unreferenced_subject, dtype: int64

In [30]:
moffitt = pd.read_excel("/home/local/VANDERBILT/litz/data/mcl/moffitt.xlsx", dtype={'MCL ID': str})
moffitt_both = anti_merge.groupby('mcl_id', as_index=False).max().merge(moffitt, left_on='mcl_id', right_on='MCL ID')
moffitt_xnat_only = anti_merge_xnat.groupby('mcl_id', as_index=False).max().merge(moffitt, left_on='mcl_id', right_on='MCL ID')
moffitt_both.to_csv("/home/local/VANDERBILT/litz/github/MASILab/DeepLungScreening/cohorts/extra/extra_xnat_moffitt.csv", index=False)
moffitt_xnat_only.to_csv("/home/local/VANDERBILT/litz/github/MASILab/DeepLungScreening/cohorts/extra/extra_xnat_noredcap_moffitt.csv", index=False)

In [27]:
len(moffitt_xnat_only)

56