In [1]:
import os
import pandas as pd
import numpy as np
import math
import scipy.stats
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
import statsmodels.formula.api as smf
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
pd.set_option('display.max_colwidth', None)

In [33]:
cohort_df = "/home/local/VANDERBILT/litz/github/MASILab/DeepLungScreening/cohorts/bronch/bronch_cs_prep_v1.csv"
cohort_df = pd.read_csv(cohort_df)
cohort_df

Unnamed: 0,pid,age,sex,bmi,emphysema,copd,phist,fhist,smo_status,quit_time,...,upper_lobe,nodule_size,nodule_type,nodule_count,scan_date,filename,session,id,with_marker,with_image
0,10140118338,62,1,25.8,0,0,0,0,2,,...,1,1.2,0,1,2018-10-19,10140118338time20181019,0,10140118338time20181019,1,1
1,10232218755,64,0,23.0,0,0,1,0,1,0.0,...,1,1.9,2,1,2019-02-04,10232218755time20190204,0,10232218755time20190204,1,1
2,10302863632,65,0,18.9,0,1,1,1,2,,...,0,1.2,2,1,2019-02-16,10302863632time20190216,0,10302863632time20190216,1,1
3,10310069205,62,0,18.2,0,1,0,0,1,0.0,...,1,2.6,2,1,2018-10-05,10310069205time20181005,0,10310069205time20181005,1,1
4,10501496583,41,1,30.0,0,0,1,1,0,8.0,...,0,1.8,2,1,2019-01-08,10501496583time20190108,0,10501496583time20190108,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
362,8666755991,69,1,28.1,0,1,1,0,0,17.0,...,1,1.4,2,1,2017-12-01,8666755991time20171201,0,8666755991time20171201,1,1
363,9057763661,67,1,19.8,0,0,1,0,0,16.0,...,1,1.4,2,1,2018-08-10,9057763661time20180810,0,9057763661time20180810,1,1
364,9345567287,37,0,20.2,0,0,0,0,2,,...,1,3.2,2,1,2018-07-13,9345567287time20180713,0,9345567287time20180713,1,1
365,9473800086,66,0,25.5,1,0,0,1,0,15.0,...,1,2.1,2,1,2018-08-13,9473800086time20180813,0,9473800086time20180813,1,1


### DL predictions

In [34]:
cscnn = "/home/local/VANDERBILT/litz/github/MASILab/DSB2017/models/config_bronch_1212/preds.csv"
cscnn = pd.read_csv(cscnn)
cscnn['cscnn_risk'] = cscnn['pred']
cscnn['id'] = cscnn['id'].apply(lambda x: x.split("'")[1])
cohort_df = cohort_df.merge(cscnn[['id', 'cscnn_risk']], on='id')

In [35]:
def Brock(age, sex, fhist, emphysema, nodule_size, upper_lobe, nodule_count, spiculation, nodule_nonsolid, nodule_partsolid, nodule_solid):
    """Full Model, with Spiculation from DOI: 10.1056/NEJMoa1214726"""
    age_var = 0.0287*(age - 62)
    sex_var = 0.6011* sex
    fhist_var = 0.2961*fhist
    emphysema_var = 0.2953*emphysema
    nodule_size_var = -5.3854* (np.sqrt(10/max(nodule_size -4, 1e-6))) -1.58113883 # nonlinear transform
    nodule_nonsolid_var = -0.1276*nodule_nonsolid
    nodule_partsolid_var = 0.3770*nodule_partsolid
    nodule_solid_var = 0*nodule_solid
    upper_lobe_var = 0.6581*upper_lobe
    nodule_count_var = -0.0824*(nodule_count - 4)
    spiculation_var = 0.7729*spiculation
    res = age_var + sex_var + fhist_var + emphysema_var + nodule_size_var + nodule_nonsolid_var + nodule_partsolid_var \
        + nodule_solid_var + upper_lobe_var + nodule_count_var + spiculation_var - 6.7892
    res = np.exp(res) / (1 + np.exp(res))
    return res

features = ['age', 'sex', 'fhist', 'emphysema', 
'spiculation', 'upper_lobe', 'nodule_size', 'nodule_type', 'nodule_count']
cohort_df['lung_cancer'] = cohort_df['lung_cancer'].astype(int)
features_df = cohort_df[features]

x = pd.get_dummies(features_df, columns=['nodule_type'])
enc_features = x.columns
# Multiple linear imputation
imp = IterativeImputer(max_iter=10, random_state=0)
imp.fit(x)
imp_x = pd.DataFrame(imp.transform(x), columns=enc_features)

brock_enc = pd.merge(cohort_df[['pid','id','session','lung_cancer']], imp_x, left_index=True, right_index=True)

# nodule size must be >=4 to be considered nodule
# brock_enc = brock_enc[brock_enc['nodule_size']>=4]

brock_enc['brock_risk'] = brock_enc.apply(lambda x: Brock(age=x['age'], sex=x['sex'], fhist=x['fhist'], emphysema=x['emphysema'], \
    nodule_size=x['nodule_size'], upper_lobe=x['upper_lobe'], nodule_count=x['nodule_count'], spiculation=x['spiculation'], \
    nodule_nonsolid=x['nodule_type_0'], nodule_partsolid=x['nodule_type_1'], nodule_solid=x['nodule_type_2']), axis=1)

y, y_hat = brock_enc['lung_cancer'].to_numpy().astype(float), brock_enc['brock_risk'].to_numpy().astype(float)
roc = roc_auc_score(y, y_hat)
print(f"AUC: {roc}")

cohort_df = cohort_df.merge(brock_enc[['pid', 'brock_risk']], on='pid')
cohort_df

AUC: 0.4989593041317179


Unnamed: 0,pid,age,sex,bmi,emphysema,copd,phist,fhist,smo_status,quit_time,...,nodule_type,nodule_count,scan_date,filename,session,id,with_marker,with_image,cscnn_risk,brock_risk
0,10140118338,62,1,25.8,0,0,0,0,2,,...,0,1,2018-10-19,10140118338time20181019,0,10140118338time20181019,1,1,0.990012,0.0
1,10232218755,64,0,23.0,0,0,1,0,1,0.0,...,2,1,2019-02-04,10232218755time20190204,0,10232218755time20190204,1,1,0.824158,0.0
2,10302863632,65,0,18.9,0,1,1,1,2,,...,2,1,2019-02-16,10302863632time20190216,0,10302863632time20190216,1,1,0.624185,0.0
3,10310069205,62,0,18.2,0,1,0,0,1,0.0,...,2,1,2018-10-05,10310069205time20181005,0,10310069205time20181005,1,1,0.982637,0.0
4,10501496583,41,1,30.0,0,0,1,1,0,8.0,...,2,1,2019-01-08,10501496583time20190108,0,10501496583time20190108,1,1,0.999945,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
362,8666755991,69,1,28.1,0,1,1,0,0,17.0,...,2,1,2017-12-01,8666755991time20171201,0,8666755991time20171201,1,1,0.998855,0.0
363,9057763661,67,1,19.8,0,0,1,0,0,16.0,...,2,1,2018-08-10,9057763661time20180810,0,9057763661time20180810,1,1,0.995421,0.0
364,9345567287,37,0,20.2,0,0,0,0,2,,...,2,1,2018-07-13,9345567287time20180713,0,9345567287time20180713,1,1,0.687220,0.0
365,9473800086,66,0,25.5,1,0,0,1,0,15.0,...,2,1,2018-08-13,9473800086time20180813,0,9473800086time20180813,1,1,0.849994,0.0


In [36]:
def Mayo(age, phist, smo_status, nodule_size, spiculation, upper_lobe):
    """Full Model, with Spiculation from DOI: 10.1056/NEJMoa1214726"""
    age_var = 0.0391*age
    phist_var = 1.3388*int(phist)
    smo_status_var = 0.7917*smo_status
    nodule_size_var = 0.1274*nodule_size # nonlinear transform
    upper_lobe_var = 0.7838*upper_lobe
    spiculation_var = 1.0407*spiculation
    res = age_var + phist_var+ smo_status_var + nodule_size_var + upper_lobe_var + spiculation_var - 6.8272
    res = np.exp(res) / (1 + np.exp(res))
    return res

features = ['age', 'phist', 'smo_status', 'spiculation', 'upper_lobe', 'nodule_size', 'lung_cancer']
mayo_df = cohort_df.copy()
mayo_df['lung_cancer'] = mayo_df['lung_cancer'].astype(int)
features_df = mayo_df[features]
x = features_df
enc_features = x.columns
# Multiple linear imputation
imp = IterativeImputer(max_iter=10, random_state=0)
imp.fit(x)
imp_x = pd.DataFrame(imp.transform(x), columns=enc_features)

mayo_enc = pd.merge(mayo_df.drop(columns=['nodule_size']), imp_x['nodule_size'], left_index=True, right_index=True)

mayo_enc['mayo_risk'] = mayo_enc.apply(lambda x: Mayo(age=x['age'], phist=x['phist'], smo_status=x['smo_status'],
    nodule_size=x['nodule_size'], spiculation=x['spiculation'], upper_lobe=x['upper_lobe']), axis=1)

y, y_hat = mayo_enc['lung_cancer'].to_numpy().astype(float), mayo_enc['mayo_risk'].to_numpy().astype(float)
roc = roc_auc_score(y, y_hat)
print(f"AUC: {roc}")

cohort_df = cohort_df.merge(mayo_enc[['pid', 'mayo_risk']], on='pid')

AUC: 0.6162938800869835


Unnamed: 0,pid,age,sex,bmi,emphysema,copd,phist,fhist,smo_status,quit_time,...,nodule_count,scan_date,filename,session,id,with_marker,with_image,cscnn_risk,brock_risk,mayo_risk
0,10140118338,62,1,25.8,0,0,0,0,2,,...,1,2018-10-19,10140118338time20181019,0,10140118338time20181019,1,1,0.990012,0.0,0.132054
1,10232218755,64,0,23.0,0,0,1,0,1,0.0,...,1,2019-02-04,10232218755time20190204,0,10232218755time20190204,1,1,0.824158,0.0,0.468108
2,10302863632,65,0,18.9,0,1,1,1,2,,...,1,2019-02-16,10302863632time20190216,0,10302863632time20190216,1,1,0.624185,0.0,0.457622
3,10310069205,62,0,18.2,0,1,0,0,1,0.0,...,1,2018-10-05,10310069205time20181005,0,10310069205time20181005,1,1,0.982637,0.0,0.189147
4,10501496583,41,1,30.0,0,0,1,1,0,8.0,...,1,2019-01-08,10501496583time20190108,0,10501496583time20190108,1,1,0.999945,0.0,0.025185
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
362,8666755991,69,1,28.1,0,1,1,0,0,17.0,...,1,2017-12-01,8666755991time20171201,0,8666755991time20171201,1,1,0.998855,0.0,0.138436
363,9057763661,67,1,19.8,0,0,1,0,0,16.0,...,1,2018-08-10,9057763661time20180810,0,9057763661time20180810,1,1,0.995421,0.0,0.129370
364,9345567287,37,0,20.2,0,0,0,0,2,,...,1,2018-07-13,9345567287time20180713,0,9345567287time20180713,1,1,0.687220,0.0,0.068778
365,9473800086,66,0,25.5,1,0,0,1,0,15.0,...,1,2018-08-13,9473800086time20180813,0,9473800086time20180813,1,1,0.849994,0.0,0.103906


### Eval models

In [37]:
andom_seed=56
n_bootstrap = 1000 # number of bootstrap samples with replacement (the samples are same size as cohort)

model_names = ['mayo', 'brock', 'cscnn']

def compute_model_auc(sample, model):
    y = sample['lung_cancer']
    y_prob = sample[f"{model}_risk"]
    return roc_auc_score(y, y_prob)

def compute_ci(data, confidence=0.95):
    a = 1.0*np.array(data)
    n = len(a)
    mu, se = np.mean(a), scipy.stats.sem(a)
    h = se*scipy.stats.t.ppf((1+confidence)/2.0, n-1)
    # print(mu)
    return mu, mu-h, mu+h

nonnull_cohort = cohort_df[~cohort_df['mayo_risk'].isnull() & ~cohort_df['brock_risk'].isnull() & ~cohort_df['cscnn_risk'].isnull()]

dfrows= []
for model in model_names:
    # calculate 95% CI with bootstrap sampling
    aucs = []
    for i in range(n_bootstrap):
        sample = nonnull_cohort.sample(frac=1.0, replace=True)
        aucs.append(compute_model_auc(sample, model))

    mean_auc, ci_low, ci_high = compute_ci(aucs)
    dfrows.append({'model': model, 'mean_AUC': mean_auc, 'ci_low':ci_low, 'ci_high':ci_high, 'std': np.std(aucs)})

metrics = pd.DataFrame(dfrows)
metrics

Unnamed: 0,model,mean_AUC,ci_low,ci_high,std
0,mayo,0.614675,0.612696,0.616653,0.031869
1,brock,0.498898,0.498059,0.499737,0.013508
2,cscnn,0.545109,0.543203,0.547015,0.0307
