# NLST clinical data pipeline
Parse NLST data dictionaries to get labels and biomarkers

In [13]:
import pandas as pd
pd.options.mode.chained_assignment = None
import os
import glob
from os import path
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# patient dictionaries
root_dir = "/nfs/masi/NLST/package-nlst-7-2018.09.24"
raw_cli = path.join(root_dir, "participant.data.d100517.csv")
raw_df = pd.read_csv(raw_cli, dtype={"pid": str})
raw_df

  raw_df = pd.read_csv(raw_cli, dtype={"pid": str})


Unnamed: 0,cen,dataset_version,elig,ineligible,pid,rndgroup,study,age,educat,ethnic,...,progsite_pleura_ever,progsite_pleura_num,progsite_skin_1st,progsite_skin_days,progsite_skin_ever,progsite_skin_num,progsite_unk_1st,progsite_unk_days,progsite_unk_ever,progsite_unk_num
0,BG,2011.02.03/10.05.17,2,,100001,2,1,70,2,2,...,,,,,,,,,,
1,AF,2011.02.03/10.05.17,2,,100002,1,1,66,3,2,...,,,,,,,,,,
2,AR,2011.02.03/10.05.17,2,,100003,2,1,64,3,2,...,,,,,,,,,,
3,AF,2011.02.03/10.05.17,2,,100004,1,1,60,5,2,...,,,,,,,,,,
4,AA,2011.02.03/10.05.17,2,,100005,1,1,64,2,2,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53447,AE,2011.02.03/10.05.17,2,,218890,1,2,73,3,2,...,,,,,,,,,,
53448,BA,2011.02.03/10.05.17,2,,218891,2,3,66,2,2,...,,,,,,,,,,
53449,BF,2011.02.03/10.05.17,2,,218892,1,2,56,5,2,...,,,,,,,,,,
53450,AJ,2011.02.03/10.05.17,2,,218893,1,3,69,4,2,...,,,,,,,,,,


### Find images that pass QA or preprocessing
1. First step get all images from the nfs directory
2. After preprocessing, revise cohort with images that were able to be preprocessed

In [3]:
# QA'd imaging
qa_dirs = [
    "/nfs/masi/NLST/nifti/NIFTI_cancer/", "/nfs/masi/NLST/nifti/NIFTI_nocancer/",
    "/nfs/masi/NLST/nifti/NIFTI_notobtain", "/nfs/masi/NLST/nifti/NIFTI_pending"
]
rows = []
# recursively get all files in all subdirectories
for qa_dir in qa_dirs:  
    for pid in os.listdir(qa_dir):
        for year in os.listdir(os.path.join(qa_dir, pid)):
            for filename in os.listdir(os.path.join(qa_dir, pid, year)):
                if "time" in filename:
                    rows.append({"pid":pid, "year": year, "id": filename.split(".nii.gz")[0]})

# print(scans[:10])


In [18]:
# images that passed preprocessing
scan_dir = "/home/local/VANDERBILT/litz/data/nlst/DeepLungScreening/prep"
rows = []
for scan in glob.glob(os.path.join(scan_dir, "*_clean.nii.gz")):
    scanid = os.path.basename(scan).split("_clean.nii.gz")[0]
    pid, year = scanid.split("time")
    rows.append({"pid": pid, "year": year, "id": scanid})


In [19]:
scan_df = pd.DataFrame(rows)
scan_df = scan_df.sort_values(by=['pid', 'id'])
scan_df['session'] = scan_df.groupby(['pid'])['year'].rank('dense', ascending=True) # assign a T0, T1, or T2 for each scan
scan_df['session'] = scan_df['session'] - 1
scan_df['session'] = scan_df['session'].astype(int)

In [20]:
scan_df

Unnamed: 0,pid,year,id,session
13154,100004,1999,100004time1999,0
13245,100004,2000,100004time2000,1
5045,100004,2001,100004time2001,2
772,100012,1999,100012time1999,0
876,100012,2000,100012time2000,1
...,...,...,...,...
10414,218819,1999,218819time1999,0
808,218819,2000,218819time2000,1
8298,218819,2001,218819time2001,2
7533,218866,1999,218866time1999,0


## Lung screening cohort
Riqiang/PLCO biomarkers: age, education ,bmi, personal cancer history, family lung cancer history, tobacco use, tobacco use quit time, pack years

**Gao R, Tang Y, Khan MS, Xu K, Paulson AB, Sullivan S, Huo Y, Deppen S, Massion PP, Sandler KL, Landman BA. Cancer Risk Estimation Combining Lung Screening CT with Clinical Data Elements. Radiol Artif Intell. 2021 Oct 13;3(6):e210032. doi: 10.1148/ryai.2021210032. PMID: 34870220; PMCID: PMC8637232.**

In [21]:

# extract desired features - matching 
demo_ft = ['pid', 'age', 'educat', 'race', 'ethnic', 'height', 'weight',]
copd_ft = ['diagcopd']
pmh_ft = ['cancblad', 'cancbrea', 'canccerv', 'canccolo', 'cancesop', 'canckidn', 'canclary',
    'cancnasa', 'cancoral', 'cancpanc', 'cancphar', 'cancstom', 'cancthyr', 'canctran'] # past medical history features
plc_ft = ['canclung']
fmh_ft = ['fambrother', 'famchild', 'famfather', 'fammother', 'famsister'] # family history of lung cancer
smoking_ft = ['cigsmok', 'age_quit', 'pkyr']
lc_ft = ['conflc', 'cancyr', 'candx_days'] # biopsy-confirmed lung cancer
sc_ft = demo_ft + copd_ft+pmh_ft + plc_ft+fmh_ft+smoking_ft+lc_ft
sc_df = raw_df[sc_ft]
sc_df

Unnamed: 0,pid,age,educat,race,ethnic,height,weight,diagcopd,cancblad,cancbrea,...,famchild,famfather,fammother,famsister,cigsmok,age_quit,pkyr,conflc,cancyr,candx_days
0,100001,70,2,1,2,70.0,134.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1,,99.00,2,,
1,100002,66,3,1,2,68.0,175.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1,,52.00,0,,
2,100003,64,3,1,2,70.0,180.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1,,66.00,2,,
3,100004,60,5,1,2,70.0,205.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,45.0,34.00,2,,
4,100005,64,2,1,2,67.0,220.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,61.0,92.00,3,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53447,218890,73,3,1,2,64.0,206.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1,,120.00,0,,
53448,218891,66,2,1,2,71.0,175.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1,,50.00,0,,
53449,218892,56,5,1,2,72.0,172.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,56.0,38.00,0,,
53450,218893,69,4,1,2,68.0,210.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,68.0,41.25,0,,


### apply inclusion/exclusion criteria
![inclusion criteria!](./screening_cohort_criteria.png)

In [22]:
# apply inclusion/exclusion criteria
sc_df = sc_df[(sc_df['age']>=55) & (sc_df['age'] <= 74)] # age
sc_df = sc_df[sc_df['pkyr'] >=30] # pkr
sc_df['canclung'] = sc_df['canclung'].fillna(0) # prior lung cancer
sc_df = sc_df[sc_df['canclung']==0]
# assume NLST excluded subjects w/ chest ct within 18 mo, hemoptysis, weight loss



### Preprocess and derive features
- BMI
- PMH and FH collapse into single features
- tobacco use status: current or former smoker?
- smoking quit time: calc yrs bw smoking cessation age and screening age
- pack-years

In [23]:
# demo
sc_df['age'] = sc_df['age']
def parse_edu(x):
    mapper = {
        1:1, # less than high school
        2:1,
        3:2, # high school
        4:3, # post high school training
        5:4, # some college
        6:5, # college degree
        7:6, # graduate degree
        8:0, # other, missing, decline to answer
        95:0,
        99:0,
        98:0,
    }
    return mapper[x]    
sc_df['educat'] = sc_df['educat'].apply(lambda x: parse_edu(x))
def parse_race(x):
    mapper= {
        1:1, #White
        2:2, #Black
        3:4, #Asian
        4:5, #American Indian or Alaskan Native
        5:6, #Native Hawaiian or Other Pacific Islander
        6:0, #Mixed, other, missing, unkown, decline to answer
        7:0,
        95:0,
        96:0,
        98:0,
        99:0
    }
    return mapper[x]
sc_df['race'] = sc_df['race'].apply(lambda x: parse_race(x))

# set race to 3 if ethnicity is hispanic
def parse_ethnic(x):
    mapper = {
        1:1, # hispanic
        2:0, # neither hispanic nor latino, missing, decline to answer
        7:0,
        95:0,
        98:0,
        99:0
    }
    return mapper[x]
sc_df['ethnic'] = sc_df['ethnic'].apply(lambda x: parse_ethnic(x))
sc_df.loc[sc_df['ethnic']==1, 'race'] = 3

# calculate bmi = kg/m^2
sc_df['weight'] = 0.45359237*sc_df['weight'] # lb to kg
sc_df['height'] = 0.0254*sc_df['height'] # in to m
sc_df['bmi'] = sc_df['weight'].div(np.power(sc_df['height'], 2))
sc_df['race'].value_counts(dropna=False)

1    47861
2     2356
4     1071
3      935
0      835
6      184
5      161
Name: race, dtype: int64

### COPD and emphysema

In [24]:
# copd
sc_df['copd'] = sc_df['diagcopd']
print(sc_df['copd'].value_counts(dropna=False))

# emphysema as a reported abnormality from radiology read
abnorm_df = pd.read_csv("/nfs/masi/NLST/package-nlst-7-2018.09.24/Spiral CT Abnormalities/sct_abnormalities.data.d100517.csv")
abnorm_df

0.0    50383
1.0     2684
NaN      336
Name: copd, dtype: int64


Unnamed: 0,dataset_version,pid,sct_ab_desc,sct_ab_num,sct_epi_loc,sct_found_after_comp,sct_long_dia,sct_margins,sct_perp_dia,sct_pre_att,sct_slice_num,study_yr
0,2011.02.03/10.05.17,100002,65,1,,0.0,,,,,,0
1,2011.02.03/10.05.17,100002,64,1,,0.0,,,,,,1
2,2011.02.03/10.05.17,100002,65,1,,0.0,,,,,,2
3,2011.02.03/10.05.17,100004,51,1,1.0,0.0,4.0,2.0,3.0,1.0,26.0,0
4,2011.02.03/10.05.17,100004,64,2,,0.0,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...
177482,2011.02.03/10.05.17,218894,65,2,,0.0,,,,,,0
177483,2011.02.03/10.05.17,218894,52,1,,0.0,,,,,,1
177484,2011.02.03/10.05.17,218894,53,2,,0.0,,,,,,1
177485,2011.02.03/10.05.17,218894,65,3,,0.0,,,,,,1


In [25]:
# aggregate PMH and FH to single features

# fill na and convert to intergers
sc_df[pmh_ft + fmh_ft] = sc_df[pmh_ft + fmh_ft].fillna(0)
sc_df[pmh_ft + fmh_ft] = sc_df[pmh_ft + fmh_ft].astype(bool) 

# pmh true if any pmh feature true
sc_df['phist'] = sc_df[pmh_ft].isin([1]).any(axis=1)

# fh true if any fh feature true
sc_df['fhist'] = sc_df[fmh_ft].isin([1]).any(axis=1)

print(f"PMH: \n {sc_df['phist'].value_counts()}")
print(f"FH: \n {sc_df['fhist'].value_counts()}")


PMH: 
 False    51116
True      2287
Name: phist, dtype: int64
FH: 
 False    41794
True     11609
Name: fhist, dtype: int64


In [26]:
# smoking quit time: calc yrs bw smoking cessation age and screening age
sc_df = sc_df[sc_df['cigsmok'].notnull()] # drop subjects with a NaN smoking status
sc_df['cigsmok'] = sc_df['cigsmok'].astype(int)
sc_df['quit_time'] = sc_df['age'] - sc_df['age_quit']
sc_df['quit_time'] = sc_df['quit_time'].apply(lambda x: 0 if x < 0 else x).fillna(0)
sc_df['quit_time'].unique()

array([ 0., 15.,  3., 14.,  2., 13.,  8.,  5.,  1.,  7.,  6., 11., 12.,
       10.,  9.,  4., 16., 17., 18., 21., 22., 19., 23., 24., 25., 20.,
       30., 31., 34., 26., 60., 29., 40., 53.])

### Positive/Negative labels
positive = biopsy-confirmed diagnosis of lung cancer within 2 years of imaging date

In [27]:
sc_df = sc_df.mask(sc_df['conflc'].isin([0, 3, 4]))
sc_df = sc_df[sc_df['conflc'].notnull()] # remove subjects whos lung cancer status unavailable

pos = sc_df[sc_df['conflc']==1.0]
pos_merged = pos.merge(scan_df, on=["pid"], how='inner')
# for each pid, get latest scan session
max_scan = pos_merged.groupby(['pid']).agg({'session':max})
max_scan = max_scan.merge(pos, on=['pid'], how='inner')
# # (session + 2)*365 = 2 years after latest date in days from randomization 
pos = max_scan[max_scan['candx_days'] <= ((max_scan['session']+2)*365)]
pos['lung_cancer'] = 1 
pos = pos[['pid', 'lung_cancer']]
sc_df = sc_df.merge(pos, on=['pid'], how='left')
sc_df['lung_cancer'] = sc_df['lung_cancer'].fillna(0) # all subjects without biopsy within 2 years are neg
sc_df

Unnamed: 0,pid,age,educat,race,ethnic,height,weight,diagcopd,cancblad,cancbrea,...,pkyr,conflc,cancyr,candx_days,bmi,copd,phist,fhist,quit_time,lung_cancer
0,100001,70.0,1.0,1.0,0.0,1.7780,60.781378,0.0,False,False,...,99.00,2.0,,,19.226801,0.0,False,False,0.0,0.0
1,100003,64.0,2.0,1.0,0.0,1.7780,81.646627,0.0,False,False,...,66.00,2.0,,,25.827046,0.0,False,False,0.0,0.0
2,100004,60.0,4.0,1.0,0.0,1.7780,92.986436,0.0,False,False,...,34.00,2.0,,,29.414135,0.0,False,False,15.0,0.0
3,100007,69.0,5.0,1.0,0.0,1.7526,108.862169,0.0,False,False,...,36.75,2.0,,,35.441441,0.0,False,False,0.0,0.0
4,100012,61.0,6.0,1.0,0.0,1.7018,64.410117,0.0,False,False,...,37.00,1.0,1.0,454.0,22.240116,0.0,False,False,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9531,218838,56.0,2.0,1.0,0.0,1.7780,88.450512,0.0,False,False,...,60.00,1.0,2.0,1077.0,27.979300,0.0,False,False,0.0,0.0
9532,218866,57.0,6.0,1.0,0.0,1.8288,77.110703,0.0,False,False,...,33.30,2.0,,,23.055908,0.0,False,True,4.0,0.0
9533,218873,61.0,3.0,1.0,0.0,1.8288,86.182550,0.0,False,False,...,67.50,1.0,4.0,1632.0,25.768368,0.0,False,False,0.0,0.0
9534,218874,56.0,2.0,2.0,0.0,1.7526,63.502932,0.0,False,False,...,70.00,2.0,,,20.674174,0.0,False,False,6.0,0.0


In [28]:
print(sc_df['lung_cancer'].value_counts(dropna=False))


0.0    8846
1.0     690
Name: lung_cancer, dtype: int64


In [30]:
merged = sc_df.merge(scan_df, on='pid', how='inner')
merged

Unnamed: 0,pid,age,educat,race,ethnic,height,weight,diagcopd,cancblad,cancbrea,...,candx_days,bmi,copd,phist,fhist,quit_time,lung_cancer,year,id,session
0,100004,60.0,4.0,1.0,0.0,1.7780,92.986436,0.0,False,False,...,,29.414135,0.0,False,False,15.0,0.0,1999,100004time1999,0
1,100004,60.0,4.0,1.0,0.0,1.7780,92.986436,0.0,False,False,...,,29.414135,0.0,False,False,15.0,0.0,2000,100004time2000,1
2,100004,60.0,4.0,1.0,0.0,1.7780,92.986436,0.0,False,False,...,,29.414135,0.0,False,False,15.0,0.0,2001,100004time2001,2
3,100012,61.0,6.0,1.0,0.0,1.7018,64.410117,0.0,False,False,...,454.0,22.240116,0.0,False,False,0.0,1.0,1999,100012time1999,0
4,100012,61.0,6.0,1.0,0.0,1.7018,64.410117,0.0,False,False,...,454.0,22.240116,0.0,False,False,0.0,1.0,2000,100012time2000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16853,218819,66.0,3.0,1.0,0.0,1.7272,58.967008,0.0,False,False,...,,19.766230,0.0,False,False,0.0,0.0,1999,218819time1999,0
16854,218819,66.0,3.0,1.0,0.0,1.7272,58.967008,0.0,False,False,...,,19.766230,0.0,False,False,0.0,0.0,2000,218819time2000,1
16855,218819,66.0,3.0,1.0,0.0,1.7272,58.967008,0.0,False,False,...,,19.766230,0.0,False,False,0.0,0.0,2001,218819time2001,2
16856,218866,57.0,6.0,1.0,0.0,1.8288,77.110703,0.0,False,False,...,,23.055908,0.0,False,True,4.0,0.0,1999,218866time1999,0


In [31]:
# get final features and rename for DeepLungScreening pipeline
ft = ['pid', 'id', 'session', 'age', 'race', 'education',  'bmi',  'copd', 'phist', 'fhist', 'smo_status', 'quit_time', 'pkyr', 'lung_cancer']
merged['education'] = merged['educat']
merged['smo_status'] = merged['cigsmok']
# sc_df['duration'] = sc_df['candx_days']
cohort_df = merged[ft]
cohort_df

Unnamed: 0,pid,id,session,age,race,education,bmi,copd,phist,fhist,smo_status,quit_time,pkyr,lung_cancer
0,100004,100004time1999,0,60.0,1.0,4.0,29.414135,0.0,False,False,0.0,15.0,34.0,0.0
1,100004,100004time2000,1,60.0,1.0,4.0,29.414135,0.0,False,False,0.0,15.0,34.0,0.0
2,100004,100004time2001,2,60.0,1.0,4.0,29.414135,0.0,False,False,0.0,15.0,34.0,0.0
3,100012,100012time1999,0,61.0,1.0,6.0,22.240116,0.0,False,False,1.0,0.0,37.0,1.0
4,100012,100012time2000,1,61.0,1.0,6.0,22.240116,0.0,False,False,1.0,0.0,37.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16853,218819,218819time1999,0,66.0,1.0,3.0,19.766230,0.0,False,False,1.0,0.0,52.0,0.0
16854,218819,218819time2000,1,66.0,1.0,3.0,19.766230,0.0,False,False,1.0,0.0,52.0,0.0
16855,218819,218819time2001,2,66.0,1.0,3.0,19.766230,0.0,False,False,1.0,0.0,52.0,0.0
16856,218866,218866time1999,0,57.0,1.0,6.0,23.055908,0.0,False,True,0.0,4.0,33.3,0.0


In [6]:
out_path = "/home/local/VANDERBILT/litz/github/MASILab/DeepLungScreening/cohorts/nlst_cohort_prep_v2.csv"
# cohort_df.to_csv(out_path, index_label=False)
a = pd.read_csv(out_path)

### Cohort statistics

In [352]:
grp = cohort_df.groupby('pid').max()
grp['lung_cancer'].value_counts()

0.0    5604
1.0     724
Name: lung_cancer, dtype: int64

In [23]:
cohort_df = pd.read_csv(out_path)
len(cohort_df['id'].tolist())

Unnamed: 0,pid,id,age,education,bmi,phist,fhist,smo_status,quit_time,pkyr,lung_cancer
0,100004,100004time1999,60.0,5.0,29.414135,False,False,0.0,15.0,34.0,0.0
1,100004,100004time2000,60.0,5.0,29.414135,False,False,0.0,15.0,34.0,0.0
2,100004,100004time2001,60.0,5.0,29.414135,False,False,0.0,15.0,34.0,0.0
3,100012,100012time1999,61.0,7.0,22.240116,False,False,1.0,,37.0,1.0
4,100012,100012time2000,61.0,7.0,22.240116,False,False,1.0,,37.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
17327,218819,218819time1999,66.0,4.0,19.766230,False,False,1.0,,52.0,0.0
17328,218819,218819time2000,66.0,4.0,19.766230,False,False,1.0,,52.0,0.0
17329,218819,218819time2001,66.0,4.0,19.766230,False,False,1.0,,52.0,0.0
17330,218866,218866time1999,57.0,7.0,23.055908,False,True,0.0,4.0,33.3,0.0


In [20]:
grp = cohort_df.groupby('pid').max()
grp['education'].value_counts(dropna=False)

3.0     1606
5.0     1455
6.0     1013
4.0      950
7.0      774
2.0      312
1.0      107
8.0       96
99.0      12
95.0       3
Name: education, dtype: int64

In [19]:
print(grp['pkyr'].mean())
print(grp['pkyr'].std())

58.45636061946903
25.604840129798834
