# Multi-Institute MCL Cohort


In [775]:
import pandas as pd
pd.options.mode.chained_assignment = None
import os
import glob
from os import path
import numpy as np
from datetime import datetime
import math

In [776]:
cli_path = "/nfs/masi/MCL/file/clinical/LabelFile/20210615MK/20210615_nopassword.csv"
cli_df = pd.read_csv(cli_path, dtype={'MCL.ID':str, 'Patient.ID': str, 'Patient.ID.1':str, 'MRN':str, 'MCL_ID':str, 'Smoking.Status': str})
cli_df = cli_df.rename(columns={'MCL_ID':'mcl_id'})

# decamp mcl_ids
decamp_path = "/nfs/masi/MCL/file/clinical/LabelFile/DECAMP_2020JUN12.xlsx"
decamp = pd.read_excel(decamp_path, dtype={'MCL_ID':str})
decamp_merge = cli_df.merge(decamp, left_on='mcl_id', right_on='Subject')
decamp_merge['mcl_id'] = decamp_merge['MCL_ID'] # replace local IDs with MCL_ID
decamp_merge = decamp_merge[cli_df.columns.tolist()]

# remove initial decamp rows and replace with new ones
cli_df = cli_df[cli_df['Cohort.1.Vandy.2.UPMC.3.DECAMP.4.UCD']!='3']
cli_df = pd.concat([cli_df, decamp_merge])

cli_df['Cohort.1.Vandy.2.UPMC.3.DECAMP.4.UCD'].value_counts(dropna=False)

1       527
NaN     249
2       158
3       143
4       118
V-II      1
Name: Cohort.1.Vandy.2.UPMC.3.DECAMP.4.UCD, dtype: int64

In [777]:

# get scans from root data 
# root_dir = "/nfs/masi/MCL/nifti/combine"
# root_mcls = os.listdir(root_dir)
# mcls = cli_df[cli_df['mcl_id'].isin(root_mcls)]
# print(mcls['Cohort.1.Vandy.2.UPMC.3.DECAMP.4.UCD'].value_counts())
# print(mcls['Sub.Cohort'].value_counts())

# rows = []
# for mcl in os.listdir(root_dir):
#     for scanid in os.listdir(os.path.join(root_dir, mcl)):
#         try:
#             scan_date = datetime.strptime(scanid, "%Y%m%d")
#             for fname in os.listdir(os.path.join(root_dir, mcl, scanid)):
#                 rows.append({'mcl_id': mcl, 'scan_date': scan_date, 'filename':fname})
#         except:
#             continue
        
prep_dir = "/home/local/VANDERBILT/litz/data/multi_mcl/DeepLungScreening/prep/"
rows = []
for p in glob.glob(os.path.join(prep_dir, "*_clean.nii.gz")):
    scanid = os.path.basename(p).split('_clean.nii.gz')[0]
    mcl, date = scanid.split('time')
    scan_date = datetime.strptime(date, "%Y%m%d")
    rows.append({'mcl_id': mcl, 'scan_date': scan_date, 'filename': scanid})

scan_df = pd.DataFrame(rows)
scan_df = scan_df.sort_values(by=['mcl_id', 'scan_date'])
scan_df['session'] = scan_df.groupby(['mcl_id'])['scan_date'].rank('dense', ascending=True) # assign a T0, T1, or T2 for each scan
scan_df['session'] = scan_df['session'] - 1
scan_df['session'] = scan_df['session'].astype(int)
scan_df


Unnamed: 0,mcl_id,scan_date,filename,session
317,10198136207,2016-04-07,10198136207time20160407,0
215,10227752708,2012-01-01,10227752708time20120101,0
1087,10249443060,2017-01-01,10249443060time20170101,0
553,10250356363,2013-03-27,10250356363time20130327,0
50,10250356363,2013-09-23,10250356363time20130923,1
...,...,...,...,...
588,9794774018,2006-07-25,9794774018time20060725,0
336,9868652359,2008-08-06,9868652359time20080806,0
135,9906725564,2015-02-22,9906725564time20150222,0
517,9906725564,2015-03-04,9906725564time20150304,1


In [778]:
# merge clinical with scans
merged = cli_df.merge(scan_df, on='mcl_id', how='left')
merged = merged[~merged['Cohort.1.Vandy.2.UPMC.3.DECAMP.4.UCD'].isnull()]

### Radiology features

In [779]:
merged.columns.tolist()

['Patient.ID',
 'MRN',
 'MCL.ID',
 'Patient.ID.1',
 'Sample.Barcode',
 'QC',
 'Changelog & Notes',
 'Sub.Cohort',
 'Cohort.1.Vandy.2.UPMC.3.DECAMP.4.UCD',
 'Has.Cyfra',
 'Has.HM',
 'Has.Both',
 'Hard.Exclude',
 'Excl.ST',
 'Excl.Contrast',
 'Excl.Size',
 'Excl.CT<>BD',
 'Has.CYFRA.No.Exclusion',
 'Has.HM.No.Exclusion',
 'Has.Both.No.Exclusion',
 'CancerNo',
 'Flag.IPN',
 'Flag.ST.2.5',
 'Flag.ST.3',
 'Flag.Contrast',
 'CBM Risk',
 'Radiomics Risk',
 'Mayo.Risk',
 'Age',
 'Smoking.Status.1.CURRENT.OR.FORMER.0.NEVER',
 'Extrathoracic.Cancer.More.Then.5.Years.Prior.1.1..0.0',
 'Nodule.Size.MM',
 'Spiculation.1.spiculated.0.not.spiculated.',
 'Upper.Lobe.1.yes.0.no',
 'Mayo.Logit',
 'Brock.Risk',
 'Age.Brock',
 'Sex.Brock',
 'Family History.Brock',
 'Emphysema.Brock',
 'Nodule Size.Brock',
 'Nodule Nonsolid or GGO.Brock',
 'Nodule Part-Solid.Brock',
 'Upper Lobe.Brock',
 'Nodule Count.Brock',
 'Nodule Spiculation.Brock',
 'Brock LOGIT',
 '1.Incidental.0.Screening.',
 'Diagnosis.Date.1',
 '

In [780]:
# most rad features already available

def get_type(x):
    if x['Nodule Nonsolid or GGO.Brock'] != 0:
        return 0 # nonsolid or ground glass opacity
    elif x['Nodule Part-Solid.Brock'] != 0:
        return 1 # part solid
    else:
        return 2 # solid

merged['spiculation'] = merged['Nodule Spiculation.Brock']
merged['upper_lobe'] = merged['Upper Lobe.Brock']
merged['nodule_size'] = merged['Nodule Size.Brock']
merged['nodule_type'] = merged.apply(lambda x: get_type(x), axis=1)


### Clinical features
we don't apply any inclusion/exclusion criteria because these cohorts were prospectively enrolled, so we assume criteria are already applied

In [781]:
print(cli_df['Cohort.1.Vandy.2.UPMC.3.DECAMP.4.UCD'].value_counts(dropna=False))

print(merged['Cohort.1.Vandy.2.UPMC.3.DECAMP.4.UCD'].value_counts(dropna=False))
# a = merged.groupby('mcl_id').max()
# print(a['Cohort.1.Vandy.2.UPMC.3.DECAMP.4.UCD'].value_counts(dropna=False))
merged
# merged[merged['Cohort.1.Vandy.2.UPMC.3.DECAMP.4.UCD']=='1']

1       527
NaN     249
2       158
3       143
4       118
V-II      1
Name: Cohort.1.Vandy.2.UPMC.3.DECAMP.4.UCD, dtype: int64
1       940
3       245
2       158
4       118
V-II      1
Name: Cohort.1.Vandy.2.UPMC.3.DECAMP.4.UCD, dtype: int64


Unnamed: 0,Patient.ID,MRN,MCL.ID,Patient.ID.1,Sample.Barcode,QC,Changelog & Notes,Sub.Cohort,Cohort.1.Vandy.2.UPMC.3.DECAMP.4.UCD,Has.Cyfra,...,BMayoBrock,more_MCLID,mcl_id,scan_date,filename,session,spiculation,upper_lobe,nodule_size,nodule_type
0,376873159,,376873159,376873159,02898-6,,,upmc,2,1,...,1.0,,376873159,NaT,,,0,1.0,21.9,2
1,398957338,,398957338,398957338,2011-636,,,upmc,2,1,...,1.0,,398957338,2011-01-01,398957338time20110101,0.0,0,1.0,11,2
2,1759365513,,1759365513,1759365513,2010-578,,,upmc,2,1,...,1.0,,1759365513,2010-01-01,1759365513time20100101,0.0,0,1.0,24.6,2
3,2368954232,,2368954232,2368954232,2012-649,,,upmc,2,1,...,1.0,,2368954232,2012-01-01,2368954232time20120101,0.0,1,1.0,25.7,2
4,2962357115,,2962357115,2962357115,2012-666,,,upmc,2,1,...,1.0,,2962357115,2011-01-01,2962357115time20110101,0.0,0,0.0,6.3,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1706,4703-77,,,4703-77,4797-013,,2019.08.09 Diam from 12 to 39.5,DECAMP-1,3,1,...,1.0,,20449484282,2014-02-18,20449484282time20140218,1.0,1,0.0,39.5,2
1707,4703-78,,,4703-78,4202-007,,,DECAMP-1,3,1,...,1.0,,20719819907,NaT,,,0,1.0,18,2
1708,4703-88,,,4703-88,4794-003,,,DECAMP-1,3,1,...,1.0,,19182262002,2014-02-25,19182262002time20140225,0.0,1,1.0,13,2
1709,4703-88,,,4703-88,4794-003,,,DECAMP-1,3,1,...,1.0,,19182262002,2014-05-09,19182262002time20140509,1.0,1,1.0,13,2


In [782]:
# cohort
def get_cohort(x):
    mapper = {
        '1': 1,
        '2': 2,
        '3': 3,
        '4': 4,
        'V-II': np.nan
    }
    return mapper[x]
merged['cohort'] = merged['Cohort.1.Vandy.2.UPMC.3.DECAMP.4.UCD'].apply(lambda x: get_cohort(x))
merged = merged[~merged['cohort'].isnull()] # remove subjects not belonging to a cohort

# demo
def get_sex(x):
    mapper = {
        0:0,
        1:1,
        73:np.nan
    }
    return mapper[x]
merged['sex'] = merged['Sex.Brock'].apply(lambda x: get_sex(x))
merged['age'] = merged['Age.Brock']

def get_race(x):
    mapper = {
        'Caucasian': 1,
        'NaN': np.nan,
        'White': 1,
        'African American': 2,
        'Asian': 4,
        'Native Hawaiian or Other Pacific Islander': 6,
        'Native American': 5,
        'Unknown/Other': 0,
        '2017-06-02 00:00:00': np.nan,
        '2012-05-03 00:00:00': np.nan,
    }
    if pd.isnull(x):
        return np.nan
    else:
        return mapper[x]
merged['race'] = merged['Race'].apply(lambda x: get_race(x))
merged['bmi'] = pd.to_numeric(merged['BMI'], errors='coerce')

# cancer history
def get_phist(x):
    mapper = {
        '0': 0,
        '1': 1,
        'No': 0,
        'Yes': 1,
    }
    if pd.isnull(x):
        return np.nan
    else:
        return mapper[x]

merged['phist'] = merged['Extrathoracic.Cancer.More.Then.5.Years.Prior.1.1..0.0'].apply(lambda x: get_phist(x))
merged['fhist'] = merged['Family History.Brock']

#emphysema (Brock requires this)
merged['emphysema'] = merged['Emphysema.Brock']

# smoking vars
def get_smo_status(x):
    # current, former, never, or NaN
    mapper = {
        'Ex-smoker': 0,
        'Current smoker': 1,
        'NaN': np.nan,
        'Never smoker': 2,
        'ex-smoker quit at least 12 months ago': 0,
        'current smoker': 1,
        '71.477302266779': np.nan,
        'never smoker': 2,
        'Former smoker': 0,
        'M': np.nan,
        'P': np.nan,
    }
    if pd.isnull(x):
        return np.nan
    else:
        return mapper[x]
merged['smo_status'] = merged['Smoking.Status'].apply(lambda x: get_smo_status(x))
merged['smo_status'].value_counts(dropna=False)
merged['pkyr'] = merged['PKY']

# CYFRA
merged['cyfra'] = merged['CSI..CYFRA21.1..ng.mL.']
merged['with_marker'] = ~merged['cyfra'].isnull()

### Cases and controls

In [783]:
def get_cancer(x):
    mapper={
        '1': 1,
        '0': 0,
        'Yes': 1,
        'No': 0
    }
    return mapper[x]

merged['lung_cancer'] = merged['CancerNo'].apply(lambda x: get_cancer(x))

### Final cohort
* each row uniquely identifies a session and subject
* missing COPD, edu, and smoking quit time

In [784]:
merged[merged['mcl_id']=='19182262002'][['CT.Date', 'scan_date', 'session']]

Unnamed: 0,CT.Date,scan_date,session
1708,2014-02-25,2014-02-25,0.0
1709,2014-02-25,2014-05-09,1.0
1710,2014-02-25,2015-04-07,2.0


In [790]:
# final features and rename
def parse_id(x):
    try:
        return x.split('.nii.gz')[0]
    except:
        return x
merged['pid'] = merged['mcl_id']
merged['id'] = merged['filename'].apply(lambda x: parse_id(x))
merged['with_image'] = ~merged['filename'].isnull()
merged['mayo_risk'] = merged['Mayo.Risk']
merged['CBM_risk'] = merged['CBM Risk']
merged['radiomics_risk'] = merged['Radiomics Risk']
merged['brock_risk'] = merged['Brock.Risk']

# this cohort doesn't have COPD or edu
ft = ['pid', 'id', 'filename', 'session', 'scan_date', 'cohort', 'with_image', 'with_marker', 'age', 'race', 'bmi', 'emphysema', 
    'phist', 'fhist', 'smo_status', 'pkyr', 'cyfra', 'lung_cancer']
risks_ft = ['mayo_risk', 'CBM_risk', 'radiomics_risk', 'brock_risk']
cohort = merged[ft+risks_ft]
cohort

Unnamed: 0,pid,id,filename,session,scan_date,cohort,with_image,with_marker,age,race,...,phist,fhist,smo_status,pkyr,cyfra,lung_cancer,mayo_risk,CBM_risk,radiomics_risk,brock_risk
0,376873159,,,,NaT,2.0,False,True,64.0,1.0,...,0,0,0.0,70.0,0.268318,0,0.510189,0.366301,0.742949,0.278794
1,398957338,398957338time20110101,398957338time20110101,0.0,2011-01-01,2.0,True,True,66.0,1.0,...,0,0,1.0,50.0,1.185300,0,0.219309,0.093341,0.335967,0.084037
2,1759365513,1759365513time20100101,1759365513time20100101,0.0,2010-01-01,2.0,True,True,80.0,1.0,...,0,0,0.0,10.0,1.545812,1,0.733087,0.863324,0.795813,0.429047
3,2368954232,2368954232time20120101,2368954232time20120101,0.0,2012-01-01,2.0,True,True,52.0,1.0,...,0,0,1.0,50.0,3.212954,1,0.749581,0.966498,0.822853,0.588785
4,2962357115,2962357115time20110101,2962357115time20110101,0.0,2011-01-01,2.0,True,True,68.0,1.0,...,0,0,0.0,94.5,0.995075,0,0.070825,0.039126,0.299580,0.009568
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1706,20449484282,20449484282time20140218,20449484282time20140218,1.0,2014-02-18,3.0,True,True,77.0,,...,0,0,0.0,94.5,0.046962,1,0.954705,,,0.614620
1707,20719819907,,,,NaT,3.0,False,True,76.0,,...,0,0,1.0,29.5,1.489888,0,0.503275,,,0.272698
1708,19182262002,19182262002time20140225,19182262002time20140225,0.0,2014-02-25,3.0,True,True,51.0,,...,0,0,1.0,30.0,0.083823,1,0.363386,,,0.163168
1709,19182262002,19182262002time20140509,19182262002time20140509,1.0,2014-05-09,3.0,True,True,51.0,,...,0,0,1.0,30.0,0.083823,1,0.363386,,,0.163168


In [769]:
# export
# out_path = "/home/local/VANDERBILT/litz/github/MASILab/DeepLungScreening/cohorts/multi_mcl_v1.csv"
out_path = "/home/local/VANDERBILT/litz/github/MASILab/DeepLungScreening/cohorts/multi_mcl/multi_mcl_prep_v1.csv"
# cohort.to_csv(out_path, index=False)
a = pd.read_csv(out_path, index=False)
a

TypeError: read_csv() got an unexpected keyword argument 'index'

### Cross sectional imaging cohort
get latest scan from each subject

In [772]:
nonan_cohort = cohort[~cohort['session'].isnull()]
idxmax = nonan_cohort.groupby('pid')['session'].idxmax()
latest_scan = nonan_cohort.loc[idxmax]
latest_scan
out_path = "/home/local/VANDERBILT/litz/github/MASILab/DeepLungScreening/cohorts/multi_mcl/multi_mcl_latest_scan_v1.csv"
latest_scan.to_csv(out_path, index=False)

Unnamed: 0,pid,id,filename,session,scan_date,cohort,with_image,with_marker,age,race,...,phist,fhist,smo_status,pkyr,cyfra,lung_cancer,mayo_risk,CBM_risk,radiomics_risk,brock_risk
0,10198136207,10198136207time20160407,10198136207time20160407,0.0,2016-04-07,3.0,True,True,65.0,,...,1,0,0.0,30.0,0.133729,0,0.980870,,,0.759329
1,10227752708,10227752708time20120101,10227752708time20120101,0.0,2012-01-01,4.0,True,True,65.0,1.0,...,0,0,1.0,50.0,0.363379,0,0.233606,0.257367,0.735378,0.125234
2,10249443060,10249443060time20170101,10249443060time20170101,0.0,2017-01-01,4.0,True,True,72.0,1.0,...,0,0,0.0,15.0,0.330331,0,0.264568,0.138162,0.593763,0.100351
3,10250356363,10250356363time20130923,10250356363time20130923,1.0,2013-09-23,1.0,True,True,82.0,1.0,...,1,1,,8.0,1.820069,1,0.913006,0.904306,0.784516,0.599193
4,10325219903,10325219903time20170620,10325219903time20170620,0.0,2017-06-20,1.0,True,True,82.0,1.0,...,1,0,0.0,35.0,1.364643,1,0.961702,,,0.586237
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
784,9759774468,9759774468time20060101,9759774468time20060101,0.0,2006-01-01,2.0,True,True,70.0,1.0,...,0,0,0.0,80.0,0.290620,0,0.562147,0.287875,0.655700,0.311100
785,9794774018,9794774018time20060725,9794774018time20060725,0.0,2006-07-25,1.0,True,True,73.0,1.0,...,1,0,0.0,25.0,0.265997,0,0.584871,,,0.100850
786,9868652359,9868652359time20080806,9868652359time20080806,0.0,2008-08-06,1.0,True,True,65.0,1.0,...,1,0,1.0,25.0,1.014378,1,0.786507,,,0.269155
787,9906725564,9906725564time20150304,9906725564time20150304,1.0,2015-03-04,3.0,True,True,68.0,,...,0,0,0.0,44.0,0.340891,1,0.108835,0.155835,0.693003,0.037683


### Longitudinal Imaging Cohort with time distance
get subset of subjects with at least 2 scans

In [791]:
# subjects with at least 2 scans
long = cohort[~cohort['filename'].isnull()]
long_pids = long.groupby('pid', as_index=False)['id'].count()
long_pids = long_pids[long_pids['id']>1]
long = long.merge(long_pids['pid'], on='pid')

# num subjects w/ longitudinal img per cohort
a = long.groupby('pid').max()
print(a['cohort'].value_counts())
print(len(long))

# compute relative time distance between each scan
max_date = long.groupby('pid', as_index=False)['scan_date'].max()
max_date = max_date.rename(columns={'scan_date': 'latest_scan_date'})
long = long.merge(max_date, on='pid')
long['Duration'] = long['latest_scan_date'] - long['scan_date']
long['Duration'] = long['Duration'].apply(lambda x: x.days)

# get 2 latest scan per subject
two_scan = long.loc[long.groupby('pid')['session'].nlargest(2).index.get_level_values(1)]
two_scan

1.0    143
3.0     46
Name: cohort, dtype: int64
704


Unnamed: 0,pid,id,filename,session,scan_date,cohort,with_image,with_marker,age,race,...,smo_status,pkyr,cyfra,lung_cancer,mayo_risk,CBM_risk,radiomics_risk,brock_risk,latest_scan_date,Duration
371,10250356363,10250356363time20130923,10250356363time20130923,1.0,2013-09-23,1.0,True,True,82.0,1.0,...,,8.0,1.820069,1,0.913006,0.904306,0.784516,0.599193,2013-09-23,0
370,10250356363,10250356363time20130327,10250356363time20130327,0.0,2013-03-27,1.0,True,True,82.0,1.0,...,,8.0,1.820069,1,0.913006,0.904306,0.784516,0.599193,2013-09-23,180
577,10491077387,10491077387time20170614,10491077387time20170614,2.0,2017-06-14,3.0,True,True,68.0,,...,0.0,50.0,0.482718,1,0.290842,0.258759,0.672441,0.028431,2017-06-14,0
576,10491077387,10491077387time20170605,10491077387time20170605,1.0,2017-06-05,3.0,True,True,68.0,,...,0.0,50.0,0.482718,1,0.290842,0.258759,0.672441,0.028431,2017-06-14,9
168,10492942672,10492942672time20100924,10492942672time20100924,1.0,2010-09-24,1.0,True,True,58.0,1.0,...,0.0,37.5,1.722980,1,0.454848,0.786468,0.727257,0.353116,2010-09-24,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78,9420311481,9420311481time20150624,9420311481time20150624,3.0,2015-06-24,1.0,True,True,84.0,1.0,...,0.0,30.0,6.862000,1,0.869903,0.992393,0.812131,0.391713,2015-08-05,42
426,9663495080,9663495080time20171030,9663495080time20171030,3.0,2017-10-30,1.0,True,True,52.0,1.0,...,2.0,0.0,0.211000,0,0.060871,0.052780,0.553849,0.080170,2017-10-30,0
425,9663495080,9663495080time20170227,9663495080time20170227,2.0,2017-02-27,1.0,True,True,52.0,1.0,...,2.0,0.0,0.211000,0,0.060871,0.052780,0.553849,0.080170,2017-10-30,245
560,9906725564,9906725564time20150304,9906725564time20150304,1.0,2015-03-04,3.0,True,True,68.0,,...,0.0,44.0,0.340891,1,0.108835,0.155835,0.693003,0.037683,2015-03-04,0


In [792]:
two_scan_out = "/home/local/VANDERBILT/litz/github/MASILab/DeepLungScreening/cohorts/multi_mcl/multi_mcl_2scan.csv"
two_scan.to_csv(two_scan_out)

In [720]:
rgao_long_path = "/home/local/VANDERBILT/litz/github/MASILab/RNNLung/compare/mcl2021/data/20210615_nopassword.csv"
rgao_long = pd.read_csv(rgao_long_path, dtype={'Patient.ID':str, 'Patient.ID.1':str, 'MCL.ID':str})
rgao_long.columns.tolist()

['Patient.ID',
 'MRN',
 'MCL.ID',
 'Patient.ID.1',
 'Sub.Cohort',
 'Cohort_ori',
 'Cohort',
 'Has.Cyfra',
 'Has.HM',
 'Has.Both',
 'Hard.Exclude',
 'Excl.ST',
 'Excl.Contrast',
 'Excl.Size',
 'Excl.CT<>BD',
 'Has.CYFRA.No.Exclusion',
 'Has.HM.No.Exclusion',
 'Has.Both.No.Exclusion',
 'CancerNo',
 'Flag.IPN',
 'Flag.ST.2.5',
 'Flag.ST.3',
 'Flag.Contrast',
 'CBM_Risk',
 'Radiomics Risk',
 'Mayo.Risk',
 'Age',
 'Smoking.Status.1.CURRENT.OR.FORMER.0.NEVER',
 'Extrathoracic.Cancer.More.Then.5.Years.Prior.1.1..0.0',
 'Nodule.Size.MM',
 'spic',
 'upper',
 'Mayo.Logit',
 'Brock.Risk',
 'Age.Brock',
 'Sex.Brock',
 'Family History.Brock',
 'Emphysema.Brock',
 'Nodule Size.Brock',
 'Nodule Nonsolid or GGO.Brock',
 'Nodule Part-Solid.Brock',
 'Upper Lobe.Brock',
 'Nodule Count.Brock',
 'Nodule Spiculation.Brock',
 'Brock LOGIT',
 '1.Incidental.0.Screening.',
 'Diagnosis.Date.1',
 'Sample Blood Draw Date',
 'Blood.Draw.Date',
 'BD/CT',
 'BD/CT<>Diag',
 'BD<>Diag 120',
 'CT.Date',
 'LDKA',
 'Death.

### Characterize cohort

In [523]:
grp = cohort.groupby('pid', as_index=False).max()
print('# subjects per cohort')
print(grp['cohort'].value_counts())

print('# scans per cohort')
with_scan = cohort[cohort['with_image']]
print(with_scan['cohort'].value_counts())

print('# subjects with scans per cohort')
with_scan_grp = with_scan.groupby('pid', as_index=False).max()
print(with_scan_grp['cohort'].value_counts())

print('# subjects with cyfra')
with_marker = cohort[cohort['with_marker']]
with_marker = with_marker.groupby('pid', as_index=False).max()
print(with_marker['cohort'].value_counts())

print('# subjects with both')
both = cohort[cohort['with_marker'] & cohort['with_image']] 
both = both.groupby('pid', as_index=False).max()
print(both['cohort'].value_counts())

print('# subjects with neither')
neither = cohort[~cohort['with_marker'] & ~cohort['with_image']] 
neither = neither.groupby('pid', as_index=False).max()
print(neither['cohort'].value_counts())
# cbm = grp[~grp['CBM_risk'].isnull()]
# cbm['cohort'].value_counts()


# subjects per cohort
1    527
2    158
3    143
4    118
Name: cohort, dtype: int64
# scans per cohort
1    833
3    265
2    154
4    115
Name: cohort, dtype: int64
# subjects with scans per cohort
1    403
2    154
3    141
4    115
Name: cohort, dtype: int64
# subjects with cyfra
1    447
2    154
3    142
4    118
Name: cohort, dtype: int64
# subjects with both
1    373
2    150
3    140
4    115
Name: cohort, dtype: int64
# subjects with neither
1    50
Name: cohort, dtype: int64


In [532]:
# case control breakdown
label = 4
a = grp[grp['cohort']==label]
print(a['lung_cancer'].value_counts())

wa = with_scan[with_scan['cohort']==label]
print(wa['lung_cancer'].value_counts())


0    59
1    59
Name: lung_cancer, dtype: int64
0    58
1    57
Name: lung_cancer, dtype: int64


In [502]:
# num scans per subject in each cohort
counts = cohort.groupby(['pid']).size().reset_index(name='counts')
grp_counts = grp.merge(counts, on='pid')
grp_counts.groupby('cohort').agg({'counts': [np.mean, np.std]})

Unnamed: 0_level_0,counts,counts
Unnamed: 0_level_1,mean,std
cohort,Unnamed: 1_level_2,Unnamed: 2_level_2
1,1.815939,1.888099
2,1.0,0.0
3,1.867133,1.440059
4,1.0,0.0


In [537]:
# summary of scalar values
grp.groupby(['cohort']).agg({'age': [np.mean, np.std]})

Unnamed: 0_level_0,age,age
Unnamed: 0_level_1,mean,std
cohort,Unnamed: 1_level_2,Unnamed: 2_level_2
1,68.927894,10.422196
2,67.708861,8.791501
3,67.811189,7.795765
4,65.686441,8.254187


In [543]:
# summary of categorical, make sure to check for nulls
covar = 'smo_status'
print(len(grp[grp[covar].isnull()]))
grp.groupby(['cohort'])[covar].value_counts()

33


cohort  smo_status
1       0.0           260
        1.0           200
        2.0            45
2       0.0           103
        1.0            55
3       0.0            74
        1.0            58
4       0.0            54
        1.0            40
        2.0            24
Name: smo_status, dtype: int64

In [548]:
def parse_id(x):
    try:
        return x.split('.nii.gz')[0]
    except:
        return x
a = cohort['filename'].apply(lambda x: parse_id(x))

Series([], Name: filename, dtype: object)

In [554]:
cohort.iloc[138:150]

Unnamed: 0,pid,id,filename,session,scan_date,cohort,with_image,with_marker,age,race,...,phist,fhist,smo_status,pkyr,cyfra,lung_cancer,mayo_risk,CBM_risk,radiomics_risk,brock_risk
138,19040409199,19040409199time20150101,19040409199time20150101.nii.gz,0.0,2015-01-01,4,True,True,67.0,1.0,...,1,0,0.0,72.0,2.810831,1,0.666909,0.957421,0.841403,0.281728
139,467250240,467250240time20120101,467250240time20120101.nii.gz,0.0,2012-01-01,4,True,True,65.0,1.0,...,0,0,1.0,50.0,1.56851,1,0.318434,0.558521,0.621676,0.162946
140,5646816557,5646816557time20150101,5646816557time20150101.nii.gz,0.0,2015-01-01,4,True,True,69.0,1.0,...,0,0,0.0,53.0,0.666322,1,0.284331,0.195759,0.565707,0.209689
141,10617382127,10617382127time20160101,10617382127time20160101.nii.gz,0.0,2016-01-01,4,True,True,66.0,1.0,...,1,0,1.0,24.0,7.029161,1,0.868406,0.986458,0.691569,0.354492
142,14650366062,14650366062time20160101,14650366062time20160101.nii.gz,0.0,2016-01-01,4,True,True,64.0,1.0,...,0,0,1.0,23.0,2.332667,1,0.550428,0.874442,0.711002,0.281791
143,18801678155,18801678155time20110101,18801678155time20110101.nii.gz,0.0,2011-01-01,4,True,True,64.0,1.0,...,0,0,0.0,100.0,1.487534,1,0.595016,0.86248,0.839245,0.321925
144,20330042946,20330042946time20160101,20330042946time20160101.nii.gz,0.0,2016-01-01,4,True,True,70.0,1.0,...,0,0,2.0,,0.719942,1,0.320316,0.494646,0.782643,0.245187
145,39479816584,,,,NaT,4,False,True,64.0,1.0,...,0,0,2.0,,0.519972,1,0.1682,0.290407,0.755305,0.160985
146,16509940243,16509940243time20170101,16509940243time20170101.nii.gz,0.0,2017-01-01,4,True,True,47.0,1.0,...,0,0,2.0,,1.822617,1,0.097001,0.635386,0.743756,0.178904
147,869728078,869728078time20160101,869728078time20160101.nii.gz,0.0,2016-01-01,4,True,True,74.0,1.0,...,1,0,0.0,50.0,0.254863,0,0.654007,0.286109,0.64504,0.148345
