# Bronchoscopy dataset

In [1]:
import pandas as pd
import os
from datetime import datetime

In [2]:
cli = "/home/local/VANDERBILT/litz/data/bronch/bronch_raw.xlsm"
dates = "/home/local/VANDERBILT/litz/data/bronch/bronch_dates.xlsx"
cli = pd.read_excel(cli, dtype={'MRN':str}) 
dates = pd.read_excel(dates, dtype={'MRN':str, 'MCL ID':str})
print(f"subjects in clinical data: {len(cli['MRN'].unique())}")
print(f"subjects in dates data: {len(dates['MRN'].unique())}")

# subjects may have multiple nodules - group by largest nodule
largest_nodule = cli.iloc[cli.groupby('MRN')['Size of Nodule (Specific) (cm)'].idxmax()]

sessions = largest_nodule.merge(dates, on='MRN')
sessions

subjects in clinical data: 387
subjects in dates data: 391


Unnamed: 0,Record ID_x,MRN,Final nodule diagnosis,Age,Sex,Size of Nodule (Specific) (cm),Smoking Status,Does the patient have a history of extrathoracic cancer?,History of cancer within the past five years?,Location of nodule/mass,...,How was the nodule detected? Ensure you check the original reference scan in PACS,Primary purpose of reference bronchoscopy,Record ID_y,MCL ID,Date of Procedure,Size of Nodule (cm),HM size,Location,Density_y,Ref CT date
0,407,10053866,Benign,62,female,0.8,Current,No,No,RUL,...,screen-detected,diagnostic,407,27054566930,2018-10-12 00:00:00,0.8,8.5,RUL,Solid,2018-10-04
1,209,10133197,Malignant,61,male,2.5,Former,No,No,LUL,...,detected during active surveillance,tissue sampling for molecular analysis (alread...,209,20802811825,2018-07-27 00:00:00,2.5,2.6,LUL,Solid,2018-06-12
2,209,10133197,Malignant,61,male,2.5,Former,No,No,LUL,...,detected during active surveillance,tissue sampling for molecular analysis (alread...,220001,20802811825,2018-06-29 00:00:00,2.3,2.5,LUL,Solid,2019-06-10
3,209,10133197,Malignant,61,male,2.5,Former,No,No,LUL,...,detected during active surveillance,tissue sampling for molecular analysis (alread...,220002,20802811825,2018-06-29 00:00:00,1.2,1.2,RUL,Solid,2019-06-10
4,376,10227932,Benign,57,female,2.4,Former,No,No,LLL,...,incidentally detected,diagnostic,376,24540308812,2018-11-01 00:00:00,2.4,2.6,LLL,Solid,2018-10-25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
443,285,9317355,Benign,79,male,2.1,Former,No,No,LLL,...,incidentally detected,diagnostic,285,34087001117,2019-03-15 00:00:00,2.1,2.3,LLL,Solid,2019-02-20
444,107,9608969,Malignant,59,male,1.5,Former,No,No,LUL,...,incidentally detected,diagnostic,107,5434231673,2018-05-16 00:00:00,1.5,1.4,LUL,Solid,2018-04-17
445,287,9683517,Benign,56,male,3.6,Former,No,No,LLL,...,incidentally detected,diagnostic,287,450204236,2019-03-13 00:00:00,3.6,3.9,LLL,Solid,2019-03-01
446,69,9822032,Benign,75,female,1.9,Former,No,No,RML,...,incidentally detected,diagnostic,69,41666093991,2018-03-01 00:00:00,1.9,1.9,RML,Solid,2018-02-21


In [3]:
xnat_path = '/home/local/VANDERBILT/litz/data/ajrccm/xnat20221201/MCL_CT.csv'
xnat = pd.read_csv(xnat_path)
def impute_date(x):
    if pd.isnull(x['Date']):
        try:
            return pd.to_datetime(x['XNAT_CTSESSIONDATA ID'].split('_')[1], format='%Y-%m-%d')
        except:
            return None
    else:
        return pd.to_datetime(x['Date'], format='%Y-%m-%d')
xnat['Date'] = xnat.apply(lambda x: impute_date(x), axis=1)
xnat['Date'] = pd.to_datetime(xnat['Date'], format='%Y-%m-%d')
sessions['Ref CT date'] = pd.to_datetime(sessions['Ref CT date'], format='%Y-%m-%d')


In [36]:
bronch_xnat = xnat.merge(sessions, left_on=['Subject', 'Date'], right_on=['MCL ID', 'Ref CT date'])
print(f"num sessions: {len(bronch_xnat)}")
print(f"num subjects: {len(bronch_xnat['MCL ID'].unique())}")
sess_str =','.join(bronch_xnat['XNAT_CTSESSIONDATA ID'].tolist())
download = "/home/local/VANDERBILT/litz/data/bronch/bronch_xnat_sessions.txt"
with open(download, 'w') as f:
    f.write(sess_str)


num sessions: 475
num subjects: 379


### Cases and controls
1. match scan with nodule record via Record ID
2. group by subject 

In [4]:
print(len(dates))
print(len(dates['MCL ID'].unique()))
print(len(cli))

454
391
450


In [5]:
cli_dates = cli.merge(dates, on=['MRN','Record ID'])

# cross sectional cohort
cli_dates['lung_cancer'] = cli_dates['Final nodule diagnosis'].apply(lambda x: int(x=='Malignant'))
cs = cli_dates.iloc[cli_dates.groupby(['MRN'])['lung_cancer'].idxmax()]
cs

Unnamed: 0,Record ID,MRN,Final nodule diagnosis,Age,Sex,Size of Nodule (Specific) (cm),Smoking Status,Does the patient have a history of extrathoracic cancer?,History of cancer within the past five years?,Location of nodule/mass,...,How was the nodule detected? Ensure you check the original reference scan in PACS,Primary purpose of reference bronchoscopy,MCL ID,Date of Procedure,Size of Nodule (cm),HM size,Location,Density_y,Ref CT date,lung_cancer
339,407,10053866,Benign,62,female,0.8,Current,No,No,RUL,...,screen-detected,diagnostic,27054566930,2018-10-12 00:00:00,0.8,8.5,RUL,Solid,2018-10-04,0
167,209,10133197,Malignant,61,male,2.5,Former,No,No,LUL,...,detected during active surveillance,tissue sampling for molecular analysis (alread...,20802811825,2018-07-27 00:00:00,2.5,2.6,LUL,Solid,2018-06-12,1
311,376,10227932,Benign,57,female,2.4,Former,No,No,LLL,...,incidentally detected,diagnostic,24540308812,2018-11-01 00:00:00,2.4,2.6,LLL,Solid,2018-10-25,0
298,363,10244135,Malignant,71,female,3.2,Former,No,No,LLL,...,incidentally detected,diagnostic,29479908093,2018-12-06 00:00:00,3.2,3.0,LLL,Solid,2018-12-03,1
304,369,10567634,Benign,74,female,3.4,Never,Yes,Yes,RUL,...,incidentally detected,diagnostic,14583998995,2018-12-28 00:00:00,3.4,3.4,RUL,Solid,2018-12-07,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
228,285,9317355,Benign,79,male,2.1,Former,No,No,LLL,...,incidentally detected,diagnostic,34087001117,2019-03-15 00:00:00,2.1,2.3,LLL,Solid,2019-02-20,0
82,107,9608969,Malignant,59,male,1.5,Former,No,No,LUL,...,incidentally detected,diagnostic,5434231673,2018-05-16 00:00:00,1.5,1.4,LUL,Solid,2018-04-17,1
230,287,9683517,Benign,56,male,3.6,Former,No,No,LLL,...,incidentally detected,diagnostic,450204236,2019-03-13 00:00:00,3.6,3.9,LLL,Solid,2019-03-01,0
50,69,9822032,Benign,75,female,1.9,Former,No,No,RML,...,incidentally detected,diagnostic,41666093991,2018-03-01 00:00:00,1.9,1.9,RML,Solid,2018-02-21,0


### Radiologic features

In [6]:
def get_spiculation(x):
    map = {'Checked':1, 'Unchecked':0}
    return map[x]
def get_upper_lobe(x):
    map = {
        'RUL':1,
        'RML':0,
        'RLL':0,
        'LUL':1,
        'Lingula':0,
        'LLL':0
    }
    return map[x]
def get_nodule_type(x):
    map = {
        'Pure GGO': 0,
        'Part-solid': 1,
        'Solid': 2
    }

cs['spiculation'] = cs['Nodule edge characteristics (choice=Spiculation)'].apply(lambda x: get_spiculation(x))
cs['upper_lobe'] = cs['Location'].apply(lambda x: get_upper_lobe(x))
cs['nodule_size'] = cs['Size of Nodule (Specific) (cm)'].astype(float)
cs['nodule_type'] = cs['Density_x'].apply(lambda x: get_nodule_type(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cs['spiculation'] = cs['Nodule edge characteristics (choice=Spiculation)'].apply(lambda x: get_spiculation(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cs['upper_lobe'] = cs['Location'].apply(lambda x: get_upper_lobe(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cs['nodule_size'] = cs['S

### Clinical features

In [7]:
def get_sex(x):
    map = {
        'male':1,
        'female':0
    }
    return map[x]
def parse_yesno(x):
    map = {
        'Yes':True,
        'No':False
    }
    return map[x]

def get_smo_status(x):
    map = {
        'Former': 0,
        'Current':1,
        'Never':2
    }
    return map[x]

cs['age'] = cs['Age']
cs['sex'] = cs['Sex'].apply(lambda x: get_sex(x))
cs['bmi'] = cs['BMI:'].astype(float)
cs['phist_extrathoracic'] = cs['Does the patient have a history of extrathoracic cancer?'].apply(lambda x: parse_yesno(x))
cs['phist_primary'] = cs['Does the patient have a history of prior primary lung cancer?'].apply(lambda x: parse_yesno(x))
cs['phist'] = (cs['phist_extrathoracic'] | cs['phist_primary']).astype(int)
cs['fhist'] = cs['Family history of lung cancer?'].apply(lambda x: parse_yesno(x)).astype(int)
cs['emphysema'] = cs['Radiographic emphysema present?'].apply(lambda x: parse_yesno(x)).astype(int)
cs['copd'] = cs['History of COPD?'].apply(lambda x: parse_yesno(x)).astype(int)
cs['smo_status'] = cs['Smoking Status'].apply(lambda x: get_smo_status(x))
cs['pkyr'] = cs['Pack year history of smoking:']
cs['quit_time'] = cs['Years since quitting smoking: Enter 0 if still active smoker at time of bronchoscopy']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cs['age'] = cs['Age']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cs['sex'] = cs['Sex'].apply(lambda x: get_sex(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cs['bmi'] = cs['BMI:'].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,co

In [8]:
clinical_ft = ['pid', 'age', 'sex', 'bmi', 'emphysema', 'copd', 'phist', 'fhist', 'smo_status', 'quit_time', 'pkyr', 'lung_cancer']
cs['pid'] = cs['MCL ID']
cs = cs[clinical_ft]
cs

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cs['pid'] = cs['MCL ID']


Unnamed: 0,pid,age,sex,bmi,emphysema,copd,phist,fhist,smo_status,quit_time,pkyr,lung_cancer
339,27054566930,62,0,21.6,1,1,0,0,1,0.0,40.0,0
167,20802811825,61,1,20.9,1,1,1,0,0,3.0,66.0,1
311,24540308812,57,0,22.1,1,1,0,0,0,0.0,60.0,0
298,29479908093,71,0,22.7,1,1,0,1,0,10.0,40.0,1
304,14583998995,74,0,25.3,0,0,1,0,2,,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
228,34087001117,79,1,25.8,0,0,0,1,0,40.0,20.0,0
82,5434231673,59,1,24.4,0,1,0,0,0,0.0,15.0,1
230,450204236,56,1,38.3,0,0,0,0,0,38.0,,0
50,41666093991,75,0,18.0,0,0,0,0,0,35.0,,0


### Get nifti scans

In [9]:
scan_dir = "/nfs/masi/MCL/xnat/xnat20221207_bronch/bronch"
rows = []
for mcl in os.listdir(scan_dir):
    for scanid in os.listdir(os.path.join(scan_dir, mcl)):
        try:
            scan_date = datetime.strptime(scanid, "%Y%m%d")
            for fname in os.listdir(os.path.join(scan_dir, mcl, scanid)):
                rows.append({'pid': mcl, 'scan_date': scan_date, 'filename':fname})
        except:
            continue

# prep_dir = "/home/local/VANDERBILT/litz/data/bronch/DeepLungScreening/prep/"
# rows = []
# for p in glob.glob(os.path.join(prep_dir, "*_clean.nii.gz")):
#     scanid = os.path.basename(p).split('_clean.nii.gz')[0]
#     mcl, date = scanid.split('time')
#     scan_date = datetime.strptime(date, "%Y%m%d")
#     rows.append({'pid': mcl, 'scan_date': scan_date, 'filename': scanid})


scan_df = pd.DataFrame(rows)
scan_df = scan_df.sort_values(by=['pid', 'scan_date'])
scan_df['session'] = scan_df.groupby(['pid'])['scan_date'].rank('dense', ascending=True) # assign a T0, T1, or T2 for each scan
scan_df['session'] = scan_df['session'] - 1
scan_df['session'] = scan_df['session'].astype(int)
scan_df

Unnamed: 0,pid,scan_date,filename,session
345,10140118338,2018-10-19,10140118338time20181019.nii.gz,0
231,10232218755,2019-02-04,10232218755time20190204.nii.gz,0
144,10302863632,2019-02-16,10302863632time20190216.nii.gz,0
7,10310069205,2018-10-05,10310069205time20181005.nii.gz,0
368,10501496583,2019-01-08,10501496583time20190108.nii.gz,0
...,...,...,...,...
29,8666755991,2017-12-01,8666755991time20171201.nii.gz,0
15,9057763661,2018-08-10,9057763661time20180810.nii.gz,0
227,9345567287,2018-07-13,9345567287time20180713.nii.gz,0
232,9473800086,2018-08-13,9473800086time20180813.nii.gz,0


In [10]:
cohort = cs.merge(scan_df, on='pid')
cohort['id'] = cohort['filename'].apply(lambda x: x.split('.nii.gz')[0])
cohort
cohort_path = "/home/local/VANDERBILT/litz/github/MASILab/DeepLungScreening/cohorts/bronch/bronch_v1.csv"
cohort.to_csv(cohort_path, index=False)

In [11]:
cohort

Unnamed: 0,pid,age,sex,bmi,emphysema,copd,phist,fhist,smo_status,quit_time,pkyr,lung_cancer,scan_date,filename,session,id
0,27054566930,62,0,21.6,1,1,0,0,1,0.0,40.0,0,2018-10-04,27054566930time20181004.nii.gz,0,27054566930time20181004
1,20802811825,61,1,20.9,1,1,1,0,0,3.0,66.0,1,2018-06-12,20802811825time20180612.nii.gz,0,20802811825time20180612
2,20802811825,61,1,20.9,1,1,1,0,0,3.0,66.0,1,2019-06-10,20802811825time20190610.nii.gz,1,20802811825time20190610
3,24540308812,57,0,22.1,1,1,0,0,0,0.0,60.0,0,2018-10-25,24540308812time20181025.nii.gz,0,24540308812time20181025
4,29479908093,71,0,22.7,1,1,0,1,0,10.0,40.0,1,2018-12-03,29479908093time20181203.nii.gz,0,29479908093time20181203
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
386,34087001117,79,1,25.8,0,0,0,1,0,40.0,20.0,0,2019-02-20,34087001117time20190220.nii.gz,0,34087001117time20190220
387,5434231673,59,1,24.4,0,1,0,0,0,0.0,15.0,1,2018-04-17,5434231673time20180417.nii.gz,0,5434231673time20180417
388,450204236,56,1,38.3,0,0,0,0,0,38.0,,0,2019-03-01,450204236time20190301.nii.gz,0,450204236time20190301
389,41666093991,75,0,18.0,0,0,0,0,0,35.0,,0,2018-02-21,41666093991time20180221.nii.gz,0,41666093991time20180221


In [None]:
ft =['pid', 'id','session', 'age', 'sex', 'bmi', 'emphysema', 'copd', 'phist', 'fhist', 'smo_status', 'quit_time', 'pkyr', 'lung_cancer', 'with_image', 'with_marker']

### available features
* demo - age, sex, BMI
* risk factors - fhist, phist of any cancer or past 5 years, COPD, emphysema, pkyr, smoking quit time, smoking status, nodule spiculation, nodule location
* nodule pathology

missing: race, education

In [5]:
bronch['Final nodule diagnosis'].value_counts(dropna=False)

Malignant    273
Benign       177
Name: Final nodule diagnosis, dtype: int64

In [6]:
# group nodules
bronch['lung_cancer'] = bronch['Final nodule diagnosis'].apply(lambda x: int(x=='Malignant'))
idxmax = bronch.groupby(['MRN'])['lung_cancer'].idxmax()
cohort = bronch.loc[idxmax]
cohort

Unnamed: 0,Record ID,MRN,Final nodule diagnosis,Age,Sex,Size of Nodule (Specific) (cm),Smoking Status,Does the patient have a history of extrathoracic cancer?,History of cancer within the past five years?,Location of nodule/mass,...,Pack year history of smoking:,History of COPD?,Peripheral 1/3 of lung,Growth of primary nodule noted on surveillance imaging? Choose No if no prior imaging,"Presence of pre-procedure symptoms? Includes: unexplained weight loss >5kg, dyspnea, pneumothorax, fatigue, pain or hemoptypsis",Pre-bronch percent predicted FEV1 (%): Enter as an integer 1-110,Years since quitting smoking: Enter 0 if still active smoker at time of bronchoscopy,How was the nodule detected? Ensure you check the original reference scan in PACS,Primary purpose of reference bronchoscopy,lung_cancer
343,407,10053866,Benign,62,female,0.8,Current,No,No,RUL,...,40.0,Yes,Yes,No,No,,0.0,screen-detected,diagnostic,0
170,209,10133197,Malignant,61,male,2.5,Former,No,No,LUL,...,66.0,Yes,Yes,Yes,No,20.0,3.0,detected during active surveillance,tissue sampling for molecular analysis (alread...,1
315,376,10227932,Benign,57,female,2.4,Former,No,No,LLL,...,60.0,Yes,Yes,No,No,,0.0,incidentally detected,diagnostic,0
302,363,10244135,Malignant,71,female,3.2,Former,No,No,LLL,...,40.0,Yes,Yes,No,Yes,56.0,10.0,incidentally detected,diagnostic,1
308,369,10567634,Benign,74,female,3.4,Never,Yes,Yes,RUL,...,0.0,No,Yes,Yes,No,,,incidentally detected,diagnostic,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
232,285,9317355,Benign,79,male,2.1,Former,No,No,LLL,...,20.0,No,Yes,Yes,No,,40.0,incidentally detected,diagnostic,0
84,107,9608969,Malignant,59,male,1.5,Former,No,No,LUL,...,15.0,Yes,Yes,Yes,No,68.0,0.0,incidentally detected,diagnostic,1
234,287,9683517,Benign,56,male,3.6,Former,No,No,LLL,...,,No,Yes,Yes,No,75.0,38.0,incidentally detected,diagnostic,0
52,69,9822032,Benign,75,female,1.9,Former,No,No,RML,...,,No,Yes,Yes,No,,35.0,incidentally detected,diagnostic,0


In [7]:
cohort['lung_cancer'].value_counts()

1    236
0    151
Name: lung_cancer, dtype: int64

In [8]:
bronch_path = "/home/local/VANDERBILT/litz/data/bronch/Bronch MCL.csv"
bronch = pd.read_csv(bronch_path, dtype={'MCL ID': str})
bronch = bronch.rename(columns={'MCL ID': 'mcl_id'})