# NLST IPN cohort
* data pipeline for Mayo, Brock, and DeepLungIPN models
* cross-sectional cohort of subjects with IPNs on the latest scans
* feature set based off of: *Gao R, Li T, Tang Y, Xu K, Khan M, Kammer M, Antic SL, Deppen S, Huo Y, Lasko TA, Sandler KL, Maldonado F, Landman BA. Reducing uncertainty in cancer risk estimation for patients with indeterminate pulmonary nodules using an integrated deep learning model. Comput Biol Med. 2022 Sep 29;150:106113. doi: 10.1016/j.compbiomed.2022.106113. Epub ahead of print. PMID: 36198225.*

In [67]:
import pandas as pd
pd.options.mode.chained_assignment = None
import os
import glob
from os import path
import numpy as np
from math import isnan

### Radiology features

In [63]:
# Load the NLST spiral-CT abnormality table; pid is read as a string.
abnorm_csv = "/nfs/masi/NLST/package-nlst-7-2018.09.24/Spiral CT Abnormalities/sct_abnormalities.data.d100517.csv"
abnorm_df = pd.read_csv(abnorm_csv, dtype={'pid': str})
# keep only abnormality code 51 = "Non-calcified nodule or mass (opacity >= 4 mm diameter)"
is_nodule = abnorm_df['sct_ab_desc'] == 51
nodule_df = abnorm_df[is_nodule]
nodule_df

Unnamed: 0,dataset_version,pid,sct_ab_desc,sct_ab_num,sct_epi_loc,sct_found_after_comp,sct_long_dia,sct_margins,sct_perp_dia,sct_pre_att,sct_slice_num,study_yr
3,2011.02.03/10.05.17,100004,51,1,1.0,0.0,4.0,2.0,3.0,1.0,26.0,0
6,2011.02.03/10.05.17,100004,51,1,1.0,0.0,4.0,2.0,3.0,1.0,22.0,1
9,2011.02.03/10.05.17,100005,51,1,1.0,0.0,6.0,2.0,6.0,1.0,32.0,0
13,2011.02.03/10.05.17,100005,51,1,1.0,0.0,6.0,2.0,4.0,1.0,38.0,1
18,2011.02.03/10.05.17,100005,51,1,1.0,0.0,6.0,2.0,6.0,1.0,38.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...
177439,2011.02.03/10.05.17,218887,51,4,6.0,0.0,6.0,2.0,4.0,1.0,107.0,1
177442,2011.02.03/10.05.17,218887,51,1,6.0,0.0,6.0,2.0,4.0,1.0,106.0,2
177465,2011.02.03/10.05.17,218892,51,1,3.0,0.0,5.0,2.0,4.0,1.0,107.0,0
177467,2011.02.03/10.05.17,218892,51,1,3.0,0.0,5.0,2.0,5.0,1.0,104.0,1


In [64]:
# Index the images that passed preprocessing (one "*_clean.nii.gz" file per scan).
scan_dir = "/home/local/VANDERBILT/litz/data/nlst/DeepLungScreening/prep"
clean_suffix = "_clean.nii.gz"
scan_ids = [
    os.path.basename(scan_file).split(clean_suffix)[0]
    for scan_file in glob.glob(os.path.join(scan_dir, "*" + clean_suffix))
]

rows = []
for scanid in scan_ids:
    # scan ids have the form "<pid>time<year>"
    pid, year = scanid.split("time")
    rows.append(dict(pid=pid, year=year, id=scanid))

scan_df = pd.DataFrame(rows).sort_values(by=['pid', 'id'])
# Dense rank of the scan year within each subject, shifted to 0-based: assigns T0, T1, or T2 to each scan.
# NOTE(review): the rank is taken over the scans found on disk only, so a subject whose earliest scan is
# absent here gets session 0 for a later scan -- confirm this matches NLST `study_yr` before joining on it.
session_rank = scan_df.groupby(['pid'])['year'].rank('dense', ascending=True)
scan_df['session'] = (session_rank - 1).astype(int)
scan_df

Unnamed: 0,pid,year,id,session
13154,100004,1999,100004time1999,0
13245,100004,2000,100004time2000,1
5045,100004,2001,100004time2001,2
772,100012,1999,100012time1999,0
876,100012,2000,100012time2000,1
...,...,...,...,...
10414,218819,1999,218819time1999,0
808,218819,2000,218819time2000,1
8298,218819,2001,218819time2001,2
7533,218866,1999,218866time1999,0


In [65]:
# extract nodule features: attach each nodule finding to the preprocessed scan of the same subject/session
nodule_df = nodule_df.rename(columns={'study_yr': 'session'})
rads = pd.merge(scan_df, nodule_df, on=['pid', 'session'])

def binary_spiculation(x):
    """Return 1 when the margin code `x` equals 1, otherwise 0.

    Code 1 is presumably "spiculated" in the NLST data dictionary -- TODO confirm.
    Anything that does not compare equal to 1 (including NaN) yields 0.
    """
    return int(x == 1)

def is_upper_lobe(x):
    """Return 1 if location code `x` is an upper lobe (RUL or LUL), 0 for the other known codes.

    Raises:
        KeyError: if `x` is not one of the known location codes (1, 2, 3, 4, 5, 6, 8).
    """
    upper_codes = (1, 4)            # RUL, LUL
    other_codes = (2, 3, 5, 6, 8)   # RML, RLL, Lingula, LLL, Other
    lookup = {**dict.fromkeys(upper_codes, 1), **dict.fromkeys(other_codes, 0)}
    return lookup[x]

def nodule_type(x):
    """Map an NLST nodule attenuation code to a Brock nodule type.

    Brock types: 0 = nonsolid / ground-glass, 1 = part-solid, 2 = solid.
    Codes 7 ("Other") and 9 ("Unable to determine") map to NaN.

    Raises:
        KeyError: if `x` is not one of the known codes (1, 2, 3, 4, 6, 7, 9).
    """
    unknown = float("nan")
    brock_class = dict([
        (1, 2),        # Soft Tissue -> solid
        (2, 0),        # Ground glass -> nonsolid
        (3, 1),        # Mixed -> part-solid
        (4, 0),        # Fluid/water -> nonsolid
        (6, 1),        # Fat -> part-solid
        (7, unknown),  # Other
        (9, unknown),  # Unable to determine
    ])
    return brock_class[x]

# Derive the per-nodule radiology features from the raw NLST codes.
rads['spiculation'] = rads['sct_margins'].apply(binary_spiculation)
rads['upper_lobe'] = rads['sct_epi_loc'].apply(is_upper_lobe)
has_coded_type = rads['sct_pre_att'].notnull()
rads = rads[has_coded_type] # filter out nodule without a coded type
rads['nodule_type'] = rads['sct_pre_att'].apply(nodule_type)
rads['nodule_size'] = rads['sct_long_dia']

In [66]:
# Subjects may have multiple nodules. Nodule features are reported on the patient level, so the
# findings of all nodules on the subject's latest scan are aggregated into one row per pid.
latest_idx = rads.groupby('pid')['session'].idxmax() # row label of the latest session per subject
latest_scan = rads.loc[latest_idx, ['pid', 'session']]
rads_from_latest = pd.merge(rads, latest_scan, on=['pid', 'session']) # findings from latest scan only

rads = rads_from_latest.groupby('pid', as_index=False).agg(
    year=pd.NamedAgg(column='year', aggfunc='max'),
    id=pd.NamedAgg(column='id', aggfunc='max'),
    session=pd.NamedAgg(column='session', aggfunc='max'),
    nodule_count=pd.NamedAgg(column='sct_ab_desc', aggfunc='count'), # number of nodule rows
    nodule_size=pd.NamedAgg(column='nodule_size', aggfunc='max'), # largest nodule
    nodule_type=pd.NamedAgg(column='nodule_type', aggfunc='min'), # lowest class code wins: nonsolid < part-solid < solid
    spiculation=pd.NamedAgg(column='spiculation', aggfunc='max'), # 1 if at least one nodule spiculated
    upper_lobe=pd.NamedAgg(column='upper_lobe', aggfunc='max'), # 1 if at least one nodule in upper lobe
)
rads

Unnamed: 0,pid,year,id,session,nodule_count,nodule_size,nodule_type,spiculation,upper_lobe
0,100004,2000,100004time2000,1,1,4.0,2.0,0,1
1,100012,2000,100012time2000,1,1,15.0,0.0,1,1
2,100019,2000,100019time2000,1,1,14.0,2.0,1,1
3,100026,2001,100026time2001,2,2,5.0,2.0,0,0
4,100035,2001,100035time2001,2,1,5.0,2.0,0,0
...,...,...,...,...,...,...,...,...,...
5784,218391,2000,218391time2000,0,4,17.0,1.0,0,1
5785,218499,1999,218499time1999,0,1,12.0,1.0,1,1
5786,218510,2001,218510time2001,2,4,14.0,2.0,0,1
5787,218705,2001,218705time2001,2,2,6.0,2.0,0,1


### Clinical features

In [68]:
# Load the NLST participant table (pid read as a string).
root_dir = "/nfs/masi/NLST/package-nlst-7-2018.09.24"
raw_cli = path.join(root_dir, "participant.data.d100517.csv")
raw_df = pd.read_csv(raw_cli, dtype={"pid": str})

# columns to extract, grouped by theme
demo_ft = ['pid', 'age', 'gender', 'height', 'weight', 'rndgroup', 'educat', 'race', 'diagcopd']
# past medical history features
pmh_ft = [
    'cancblad', 'cancbrea', 'canccerv', 'canccolo', 'cancesop', 'canckidn', 'canclary',
    'cancnasa', 'cancoral', 'cancpanc', 'cancphar', 'cancstom', 'cancthyr', 'canctran',
]
# family history of lung cancer
fmh_ft = ['fambrother', 'famchild', 'famfather', 'fammother', 'famsister']
# prior lung cancer
plc_ft = ['canclung']
smoking_ft = ['cigsmok', 'pkyr', 'age_quit', 'smokeday', 'smokeage']
# lung cancer outcome columns
lc_ft = ['can_scr', 'conflc', 'cancyr', 'candx_days']
sc_ft = demo_ft+pmh_ft+fmh_ft+plc_ft+smoking_ft+lc_ft
sc_df = raw_df[sc_ft]

# apply inclusion/exclusion criteria
in_ct_arm = sc_df['rndgroup'] == 1            # CT arm
in_age_range = sc_df['age'].between(55, 74)   # 55 <= age <= 74
enough_pkyr = sc_df['pkyr'] >= 30             # at least 30 pack-years
sc_df = sc_df[in_ct_arm & in_age_range & enough_pkyr]
sc_df['canclung'] = sc_df['canclung'].fillna(0) # missing prior lung cancer is treated as "no"
sc_df = sc_df[sc_df['canclung'] == 0]

# demo
sc_df['sex'] = (sc_df['gender'] == 1).astype(int) # 1 when gender code is 1, else 0
def parse_edu(x):
    """Recode the NLST `educat` code into an ordinal education level (0 = other/missing).

    Raises:
        KeyError: if `x` is not one of the known codes (1-8, 95, 98, 99).
    """
    levels = {
        1: 1,  # less than high school
        2: 1,  # less than high school
        3: 2,  # high school
        4: 3,  # post high school training
        5: 4,  # some college
        6: 5,  # college degree
        7: 6,  # graduate degree
    }
    # other, missing, decline to answer
    levels.update(dict.fromkeys((8, 95, 98, 99), 0))
    return levels[x]
sc_df['education'] = sc_df['educat'].apply(parse_edu) # ordinal education level
def parse_race(x):
    """Recode the NLST `race` code into the cohort's race categories (0 = mixed/other/missing).

    Raises:
        KeyError: if `x` is not one of the known codes (1-7, 95, 96, 98, 99).
    """
    categories = {
        1: 1,  # White
        2: 2,  # Black
        3: 4,  # Asian
        4: 5,  # American Indian or Alaskan Native
        5: 6,  # Native Hawaiian or Other Pacific Islander
    }
    # Mixed, other, missing, unknown, decline to answer
    categories.update(dict.fromkeys((6, 7, 95, 96, 98, 99), 0))
    return categories[x]
sc_df['race'] = sc_df['race'].apply(parse_race)

# calculate bmi = kg/m^2
sc_df['weight'] = 0.45359237*sc_df['weight'] # lb to kg
sc_df['height'] = 0.0254*sc_df['height'] # in to m
sc_df['bmi'] = sc_df['weight'].div(np.power(sc_df['height'], 2))

# aggregate PMH and FH to single features:
# missing answers count as "no", any non-zero code counts as "yes"
hist_ft = pmh_ft + fmh_ft
sc_df[hist_ft] = sc_df[hist_ft].fillna(0).astype(bool)
# the columns are boolean at this point, so a row-wise any() is all that is needed
sc_df['phist'] = sc_df[pmh_ft].any(axis=1) # pmh true if any pmh feature true
sc_df['fhist'] = sc_df[fmh_ft].any(axis=1) # fh true if any fh feature true
print(f"PMH: \n {sc_df['phist'].value_counts()}")
print(f"FH: \n {sc_df['fhist'].value_counts()}")

#emphysema (Brock requires this)
emp_df = abnorm_df[abnorm_df['sct_ab_desc']==59]
# One row per subject (the finding from the latest study year). The groupby must keep its default
# as_index=True: idxmax() then returns row labels that .loc can consume.
emp_df = emp_df.loc[emp_df.groupby('pid')['study_yr'].idxmax()]
sc_df = sc_df.merge(emp_df, on=['pid'], how='left')
# subjects with no code-59 row get NaN in the merged columns -> emphysema False
sc_df['emphysema'] = sc_df['sct_ab_desc'].notnull()
sc_df['copd'] = sc_df['diagcopd']

# smoking quit time: calc yrs bw smoking cessation age and screening age
sc_df = sc_df[sc_df['cigsmok'].notnull()] # drop subjects with a NaN smoking status
sc_df['cigsmok'] = sc_df['cigsmok'].astype(int)
# negative gaps are floored at 0; a missing quit age (NaN) also becomes 0
sc_df['quit_time'] = (sc_df['age'] - sc_df['age_quit']).clip(lower=0).fillna(0)
sc_df['smo_intensity'] = sc_df['smokeday']
# smoking duration: number years smoked
def get_smo_duration(x):
    """Return the number of years smoked for one participant row, floored at 0.

    Args:
        x: mapping with 'age', 'age_quit' and 'smokeage' entries (e.g. a DataFrame row).

    Returns:
        'age_quit' - 'smokeage' when a quit age is recorded, otherwise 'age' - 'smokeage';
        negative differences are returned as 0.
    """
    # with no recorded quit age (NaN), smoking is counted up to the current age
    stop_age = x['age'] if isnan(x['age_quit']) else x['age_quit']
    delta = stop_age - x['smokeage']
    return max(delta, 0)
sc_df['smo_duration'] = sc_df.apply(get_smo_duration, axis=1)

# Identify Cases/Controls
# case: a lung cancer report (can_scr != 0) associated with a study year before year 5
# NOTE(review): `can_scr != 0` is also True for NaN; such rows only survive if `cancyr` is set -- confirm intended.
is_case = (sc_df['can_scr'] != 0) & (sc_df['cancyr'] < 5)
pos = sc_df.loc[is_case, ['pid']].assign(lung_cancer=1)
sc_df = sc_df.merge(pos, on=['pid'], how='left')
sc_df['lung_cancer'] = sc_df['lung_cancer'].fillna(0) # everyone not flagged above is a control
sc_df['lung_cancer'].value_counts()

  raw_df = pd.read_csv(raw_cli, dtype={"pid": str})


PMH: 
 False    25618
True      1080
Name: phist, dtype: int64
FH: 
 False    20889
True      5809
Name: fhist, dtype: int64
PMH: 
 False    25618
True      1080
Name: phist, dtype: int64


0.0    25807
1.0      891
Name: lung_cancer, dtype: int64

In [69]:
# keep only subjects that have both clinical features and radiology findings
merged = pd.merge(sc_df, rads, on=['pid'], how='inner')
merged

Unnamed: 0,pid,age,gender,height,weight,rndgroup,educat,race,diagcopd,cancblad,...,smo_duration,lung_cancer,year,id,session,nodule_count,nodule_size,nodule_type,spiculation,upper_lobe
0,100004,60,1,1.7780,92.986436,1,5,1,0.0,False,...,23.0,0.0,2000,100004time2000,1,1,4.0,2.0,0,1
1,100012,61,2,1.7018,64.410117,1,7,1,0.0,False,...,39.0,1.0,2000,100012time2000,1,1,15.0,0.0,1,1
2,100019,61,1,1.6510,65.317301,1,5,1,0.0,False,...,43.0,0.0,2000,100019time2000,1,1,14.0,2.0,1,1
3,100026,57,1,1.8034,114.305277,1,4,1,0.0,False,...,41.0,0.0,2001,100026time2001,2,2,5.0,2.0,0,0
4,100035,55,2,1.7780,69.853225,1,4,1,0.0,False,...,38.0,0.0,2001,100035time2001,2,1,5.0,2.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5784,218391,66,1,1.7780,74.842741,1,7,1,0.0,False,...,40.0,1.0,2000,218391time2000,0,4,17.0,1.0,0,1
5785,218499,63,1,1.8796,81.646627,1,3,1,0.0,False,...,48.0,1.0,1999,218499time1999,0,1,12.0,1.0,1,1
5786,218510,64,1,1.8288,92.986436,1,7,1,1.0,False,...,47.0,1.0,2001,218510time2001,2,4,14.0,2.0,0,1
5787,218705,68,1,1.7780,117.934016,1,5,1,0.0,False,...,43.0,0.0,2001,218705time2001,2,2,6.0,2.0,0,1


In [61]:
# derived / constant columns of the exported cohort
merged['smo_status'] = merged['cigsmok']
merged['with_image'] = True
merged['with_marker'] = True

# column order of the exported cohort
ft = [
    'pid', 'id', 'session', 'age', 'sex', 'race', 'education', 'bmi', 'phist', 'fhist',
    'emphysema', 'copd', 'smo_status', 'quit_time', 'pkyr', 'smo_intensity',
    'smo_duration', 'spiculation', 'upper_lobe', 'nodule_size', 'nodule_type', 'nodule_count',
    'with_image', 'with_marker', 'lung_cancer',
]
cohort_df = merged[ft]

# cross sectional cohort
# NOTE(review): the DeepLungIPN section later writes `dls_merge` to this same path -- confirm which file should win.
cs_path = "/home/local/VANDERBILT/litz/github/MASILab/DeepLungScreening/cohorts/nlst/nlst_ipn_v2.csv"
cohort_df.to_csv(cs_path, index=False)


## Longitudinal
Subjects with at least two preprocessed scans; the two most recent scans per subject are kept.

In [65]:
# Load the per-scan label table and keep only complete rows of the columns used below.
all_label_path = "/home/local/VANDERBILT/litz/data/nlst/nlst_t2e_analysis.csv"
label_cols = ['filename', 'PID', 'Age', 'Session', 'Cancer', 'Duration', 'screen_result']
duration = pd.read_csv(all_label_path, dtype={'PID': str})[label_cols].dropna()
# scan id = file name without the ".nii.gz" extension
duration['id'] = [fname.split('.nii.gz')[0] for fname in duration['filename']]
duration = duration.rename(columns={'PID': 'pid', 'Session': 'session'})
duration

Unnamed: 0,filename,pid,Age,session,Cancer,Duration,screen_result,id
0,100002time1999.nii.gz,100002,66.035616,0,0,2378.0,0.0,100002time1999
1,100002time2000.nii.gz,100002,66.956164,1,0,2042.0,0.0,100002time2000
2,100002time2001.nii.gz,100002,68.002740,2,0,1660.0,0.0,100002time2001
3,100004time1999.nii.gz,100004,60.021918,0,0,2680.0,1.0,100004time1999
4,100004time2000.nii.gz,100004,61.238356,1,0,2236.0,1.0,100004time2000
...,...,...,...,...,...,...,...,...
69864,218893time2000.nii.gz,218893,70.035616,1,0,1927.0,0.0,218893time2000
69865,218893time2001.nii.gz,218893,71.071233,2,0,1549.0,0.0,218893time2001
69866,218894time1999.nii.gz,218894,57.000000,0,0,2547.0,0.0,218894time1999
69867,218894time2000.nii.gz,218894,58.112329,1,0,2141.0,0.0,218894time2000


In [80]:
# spot-check the preprocessed scans of one subject
scan_df.loc[scan_df['pid'].eq('100004')]

Unnamed: 0,pid,year,id,session
13154,100004,1999,100004time1999,0
13245,100004,2000,100004time2000,1
5045,100004,2001,100004time2001,2


In [78]:
# subjects with at least 2 scans
long = pd.merge(cohort_df.drop(columns=['id', 'session']), duration, on='pid')
long = pd.merge(long, scan_df, on=['pid', 'id', 'session']) # filter out scans that were not preprocessed
scans_per_pid = long.groupby('pid', as_index=False)['id'].count()
long_pids = scans_per_pid[scans_per_pid['id'] > 1]
long = pd.merge(long, long_pids['pid'], on='pid')

print(len(long['pid'].unique()))

# compute relative time distance between each scan:
# subtract each subject's smallest Duration so that scan sits at 0
max_date = (
    long.groupby('pid', as_index=False)['Duration']
    .min()
    .rename(columns={'Duration': 'latest_duration'})
)
long = long.merge(max_date, on='pid')
long['Duration'] = long['Duration'] - long['latest_duration']

# get 2 latest scan per subject
top_two = long.groupby('pid')['session'].nlargest(2)
# level 1 of the resulting MultiIndex holds the original row labels of `long`
two_scan = long.loc[top_two.index.get_level_values(1)]

5098


In [79]:
# Export the two-scan cohort: cohort features plus relative scan time and file name.
two_scan_path = "/home/local/VANDERBILT/litz/github/MASILab/DeepLungScreening/cohorts/nlst/nlst_ipn_2scan_v2.csv"
export_cols = ft+['Duration', 'filename']
two_scan = two_scan[export_cols]
print(len(two_scan['pid'].unique()))
two_scan.to_csv(two_scan_path, index=False)
two_scan

5098


Unnamed: 0,pid,id,session,age,sex,bmi,phist,fhist,emphysema,smo_status,pkyr,spiculation,upper_lobe,nodule_size,nodule_type,nodule_count,lung_cancer,Duration,filename
2,100004,100004time2001,2,60,1,29.414135,False,False,False,0,34.0,0,1,4.0,2.0,1,0.0,0.0,100004time2001.nii.gz
1,100004,100004time2000,1,60,1,29.414135,False,False,False,0,34.0,0,1,4.0,2.0,1,0.0,291.0,100004time2000.nii.gz
4,100012,100012time2000,1,61,0,22.240116,False,False,False,1,37.0,1,1,15.0,0.0,1,1.0,0.0,100012time2000.nii.gz
3,100012,100012time1999,0,61,0,22.240116,False,False,False,1,37.0,1,1,15.0,0.0,1,1.0,347.0,100012time1999.nii.gz
7,100019,100019time2001,2,61,1,23.962608,False,False,True,0,78.0,1,1,14.0,2.0,1,0.0,0.0,100019time2001.nii.gz
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14899,218510,218510time2000,1,64,1,27.802713,False,False,True,0,70.5,0,1,14.0,2.0,4,1.0,396.0,218510time2000.nii.gz
14903,218705,218705time2001,2,68,1,37.305733,False,False,True,1,84.0,0,1,6.0,2.0,2,0.0,0.0,218705time2001.nii.gz
14902,218705,218705time2000,1,68,1,37.305733,False,False,True,1,84.0,0,1,6.0,2.0,2,0.0,390.0,218705time2000.nii.gz
14905,218866,218866time2000,1,57,1,23.055908,False,True,False,0,33.3,0,1,6.0,2.0,1,0.0,0.0,218866time2000.nii.gz


## Brock
age, gender, family cancer history, emphysema, nodule size, nodule type, upper lobe, nodule count, spiculation

In [9]:
# Brock model inputs: identifiers, predictors and the outcome label.
brock_ft = [
    'pid', 'id', 'session', 'age', 'sex', 'fhist', 'emphysema',
    'spiculation', 'upper_lobe', 'nodule_size', 'nodule_type', 'nodule_count', 'lung_cancer',
]
brock_path = "/home/local/VANDERBILT/litz/github/MASILab/DeepLungScreening/cohorts/nlst_brock_v1.csv"
brock_df = cohort_df.loc[:, brock_ft]
brock_df.to_csv(brock_path, index_label=False)
# read the file back to check what was written
a = pd.read_csv(brock_path)
a

Unnamed: 0,pid,id,session,age,sex,fhist,emphysema,spiculation,upper_lobe,nodule_size,nodule_type,nodule_count,lung_cancer
0,100004,100004time2000,1,60,1,False,False,0,1,4.0,2.0,1,0.0
1,100012,100012time2000,1,61,0,False,False,1,1,15.0,0.0,1,1.0
2,100019,100019time2000,1,61,1,False,True,1,1,14.0,2.0,1,0.0
3,100026,100026time2001,2,57,1,False,False,0,0,5.0,2.0,2,0.0
4,100035,100035time2001,2,55,0,False,True,0,0,5.0,2.0,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5784,218391,218391time2000,0,66,1,False,True,0,1,17.0,1.0,4,1.0
5785,218499,218499time1999,0,63,1,False,False,1,1,12.0,1.0,1,1.0
5786,218510,218510time2001,2,64,1,False,True,0,1,14.0,2.0,4,1.0
5787,218705,218705time2001,2,68,1,False,True,0,1,6.0,2.0,2,0.0


## DeepLungIPN and DeepLungScreening
Reduce the cohort to the feature sets used by DeepLungIPN and DeepLungScreening:
* IPN feature set: age, BMI, smoking status, personal cancer history, pack years, nodule size, nodule spiculation, upper lobe
* Screening feature set: + sex, race, education, copd, family cancer history, smoking quit time

In [29]:
# Impute and normalize
scalars = ['age', 'bmi', 'pkyr', 'nodule_size']
dl_ipn = cohort_df.copy()
dl_ipn[scalars]  = cohort_df[scalars].astype(float)

# log nodule size
dl_ipn['nodule_size'] = np.log(dl_ipn['nodule_size'])

# min-max normalize before imputation
dl_ipn[scalars] = (dl_ipn[scalars] - dl_ipn[scalars].min())/(dl_ipn[scalars].max() - dl_ipn[scalars].min())

# iterative imputation over the DeepLungIPN feature set
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
features = ['age', 'bmi', 'smo_status', 'phist', 'pkyr', 'nodule_size', 'spiculation', 'upper_lobe']
dl_ipn['lung_cancer'] = dl_ipn['lung_cancer'].astype(int)
x = dl_ipn[features]
enc_features = x.columns
imp = IterativeImputer(max_iter=10, random_state=0)
imp.fit(x)
# transform() returns a bare array; carry x's index over so the index-based merge below
# lines rows up correctly even if cohort_df is not indexed 0..n-1
imp_x = pd.DataFrame(imp.transform(x), columns=enc_features, index=x.index)
# only these two columns are replaced by their imputed versions
imp_ft = ['bmi', 'nodule_size']
ipn_enc = pd.merge(dl_ipn.drop(columns=imp_ft), imp_x[imp_ft], left_index=True, right_index=True)

ipn_enc

Unnamed: 0,pid,id,session,age,sex,phist,fhist,emphysema,smo_status,pkyr,spiculation,upper_lobe,nodule_type,nodule_count,lung_cancer,bmi,nodule_size
0,100004,100004time2000,1,0.263158,1,False,False,False,0,0.017391,0,1,2.0,1,0,0.274569,0.076330
1,100012,100012time2000,1,0.315789,0,False,False,False,1,0.030435,1,1,0.0,1,1,0.128352,0.427029
2,100019,100019time2000,1,0.315789,1,False,False,True,0,0.208696,1,1,2.0,1,0,0.163459,0.408723
3,100026,100026time2001,2,0.105263,1,False,False,False,0,0.136957,0,0,2.0,2,0,0.391403,0.135536
4,100035,100035time2001,2,0.000000,0,False,False,True,1,0.034783,0,0,2.0,1,0,0.125424,0.135536
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5784,218391,218391time2000,0,0.578947,1,False,False,True,0,0.043478,0,1,1.0,4,1,0.157593,0.460238
5785,218499,218499time1999,0,0.421053,1,False,False,False,1,0.176087,1,1,1.0,1,1,0.146090,0.367822
5786,218510,218510time2001,2,0.473684,1,False,False,True,0,0.176087,0,1,2.0,4,1,0.241726,0.408723
5787,218705,218705time2001,2,0.684211,1,False,False,True,1,0.234783,0,1,2.0,2,0,0.435411,0.183911


In [32]:
# rename the normalized / nodule columns for the exported feature table
ipn_enc = ipn_enc.rename(columns={
    'age': 'norm_age',
    'bmi': 'norm_bmi',
    'pkyr':'norm_pky',
    'nodule_size': 'norm_logsize',
    'spiculation': 'spic',
    'upper_lobe': 'upper'
})
ipn_enc['subjwithkaggle'] = 1
ipn_enc['subjwithfactor'] = 1

# merge with DeepLungScreening feature set
dls_unique_ft = ['pid', 'id', 'with_image', 'with_marker', 'age', 'gender', 'race', 'education', 'bmi', 'copd','smo_duration', 'smo_intensity', 'quit_time', 'pkyr']
dls_path = "/home/local/VANDERBILT/litz/github/MASILab/DeepLungScreening/cohorts/nlst_cohort_prep_v2.csv"
dls = pd.read_csv(dls_path, dtype={'pid':str})
merge_keys = ['pid', 'id']
# Any non-key column present on both sides would come back as a `_x`/`_y` pair after the merge.
# Keep the DeepLungScreening copy of those columns and drop the duplicate from the IPN side.
# NOTE(review): confirm the DeepLungScreening values are the ones wanted for the shared columns.
shared_ft = [c for c in dls_unique_ft if c in ipn_enc.columns and c not in merge_keys]
dls_merge = dls[dls_unique_ft].merge(ipn_enc.drop(columns=shared_ft), on=merge_keys)
dls_merge

Unnamed: 0,pid,id,with_image,with_marker,age,gender,race,education,bmi,copd,...,norm_pky,spic,upper,nodule_type,nodule_count,lung_cancer,norm_bmi,norm_logsize,subjwithkaggle,subjwithfactor
0,100004,100004time2000,True,True,60,1,1,4,29.414135,0.0,...,0.017391,0,1,2.0,1,0,0.274569,0.076330,1,1
1,100012,100012time2000,True,True,61,2,1,6,22.240116,0.0,...,0.030435,1,1,0.0,1,1,0.128352,0.427029,1,1
2,100019,100019time2000,True,True,61,1,1,4,23.962608,0.0,...,0.208696,1,1,2.0,1,0,0.163459,0.408723,1,1
3,100026,100026time2001,True,True,57,1,1,3,35.146505,0.0,...,0.136957,0,0,2.0,2,0,0.391403,0.135536,1,1
4,100035,100035time2001,True,True,55,2,1,3,22.096473,0.0,...,0.034783,0,0,2.0,1,0,0.125424,0.135536,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5784,218391,218391time2000,True,True,66,1,1,6,23.674792,0.0,...,0.043478,0,1,1.0,4,1,0.157593,0.460238,1,1
5785,218499,218499time1999,True,True,63,1,1,2,23.110395,0.0,...,0.176087,1,1,1.0,1,1,0.146090,0.367822,1,1
5786,218510,218510time2001,True,True,64,1,1,6,27.802713,1.0,...,0.176087,0,1,2.0,4,1,0.241726,0.408723,1,1
5787,218705,218705time2001,True,True,68,1,1,4,37.305733,0.0,...,0.234783,0,1,2.0,2,0,0.435411,0.183911,1,1


In [34]:
# Write the merged DeepLungIPN / DeepLungScreening table.
# NOTE(review): this is the same path as `cs_path` in the cross-sectional cohort cell, so that file is
# overwritten here (and with index_label=False instead of index=False) -- confirm this is intended.
out_path = "/home/local/VANDERBILT/litz/github/MASILab/DeepLungScreening/cohorts/nlst/nlst_ipn_v2.csv"
dls_merge.to_csv(path_or_buf=out_path, index_label=False)


## Mayo
age, smoking status, personal cancer history, nodule size, spiculation, upper lobe

In [10]:
# Mayo model inputs: identifiers, predictors and the outcome label.
mayo_ft = [
    'pid', 'id', 'session', 'age', 'phist', 'smo_status',
    'spiculation', 'upper_lobe', 'nodule_size', 'lung_cancer',
]
mayo_path = "/home/local/VANDERBILT/litz/github/MASILab/DeepLungScreening/cohorts/nlst_mayo_v1.csv"
mayo_df = cohort_df.loc[:, mayo_ft]
mayo_df.to_csv(mayo_path, index_label=False)
# read the file back to check what was written
a = pd.read_csv(mayo_path)
a

Unnamed: 0,pid,id,session,age,phist,smo_status,spiculation,upper_lobe,nodule_size,lung_cancer
0,100004,100004time2000,1,60,False,0,0,1,4.0,0.0
1,100012,100012time2000,1,61,False,1,1,1,15.0,1.0
2,100019,100019time2000,1,61,False,0,1,1,14.0,0.0
3,100026,100026time2001,2,57,False,0,0,0,5.0,0.0
4,100035,100035time2001,2,55,False,1,0,0,5.0,0.0
...,...,...,...,...,...,...,...,...,...,...
5784,218391,218391time2000,0,66,False,0,0,1,17.0,1.0
5785,218499,218499time1999,0,63,False,1,1,1,12.0,1.0
5786,218510,218510time2001,2,64,False,0,0,1,14.0,1.0
5787,218705,218705time2001,2,68,False,1,0,1,6.0,0.0


### Characterize cohort

In [32]:
# display the cross-sectional cohort (one row per subject)
cohort_df

Unnamed: 0,pid,id,session,age,sex,education,bmi,phist,fhist,emphysema,...,quit_time,pkyr,spiculation,upper_lobe,nodule_size,nodule_type,nodule_count,with_image,with_marker,lung_cancer
0,100004,100004time2000,1,60,1,4,29.414135,False,False,False,...,15.0,34.0,0,1,4.0,2.0,1,True,True,0.0
1,100012,100012time2000,1,61,0,6,22.240116,False,False,False,...,0.0,37.0,1,1,15.0,0.0,1,True,True,1.0
2,100019,100019time2000,1,61,1,4,23.962608,False,False,True,...,3.0,78.0,1,1,14.0,2.0,1,True,True,0.0
3,100026,100026time2001,2,57,1,3,35.146505,False,False,False,...,0.0,61.5,0,0,5.0,2.0,2,True,True,0.0
4,100035,100035time2001,2,55,0,3,22.096473,False,False,True,...,0.0,38.0,0,0,5.0,2.0,1,True,True,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5784,218391,218391time2000,0,66,1,6,23.674792,False,False,True,...,10.0,40.0,0,1,17.0,1.0,4,True,True,1.0
5785,218499,218499time1999,0,63,1,2,23.110395,False,False,False,...,0.0,70.5,1,1,12.0,1.0,1,True,True,1.0
5786,218510,218510time2001,2,64,1,6,27.802713,False,False,True,...,0.0,70.5,0,1,14.0,2.0,4,True,True,1.0
5787,218705,218705time2001,2,68,1,4,37.305733,False,False,True,...,0.0,84.0,0,1,6.0,2.0,2,True,True,0.0


In [46]:
# Summary statistics for a single cohort column.
# Uses its own name (`col`): `ft` is the cohort feature *list* that the two-scan export cell
# concatenates with another list, so rebinding it to a string here would break that cell on re-run.
col = 'education'
print(cohort_df[col].mean())
print(cohort_df[col].std())
print(cohort_df[col].value_counts(dropna=False))

3.487648989462774
1.5534903354423604
2    1477
4    1320
5     917
3     880
6     726
1     375
0      94
Name: education, dtype: int64


In [38]:
# label counts at the scan level: every preprocessed scan of each cohort subject carries the subject's label
m = pd.merge(scan_df, cohort_df, on='pid')
m['lung_cancer'].value_counts()

0.0    14489
1.0     1323
Name: lung_cancer, dtype: int64