# Evaluate models on IPN cohorts

In [3]:
import os
import pandas as pd
import numpy as np
import math
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
import statsmodels.formula.api as smf
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
pd.set_option('display.max_colwidth', None)

In [5]:
cohort_path = "/home/local/VANDERBILT/litz/github/MASILab/DeepLungScreening/cohorts/nlst/nlst_ipn_v2.csv"
cohort_df = pd.read_csv(cohort_path)
cohort_df

Unnamed: 0,pid,id,session,age,sex,education,bmi,phist,fhist,emphysema,...,quit_time,pkyr,spiculation,upper_lobe,nodule_size,nodule_type,nodule_count,with_image,with_marker,lung_cancer
0,100004,100004time2000,1,60,1,4,29.414135,False,False,False,...,15.0,34.0,0,1,4.0,2.0,1,True,True,0.0
1,100012,100012time2000,1,61,0,6,22.240116,False,False,False,...,0.0,37.0,1,1,15.0,0.0,1,True,True,1.0
2,100019,100019time2000,1,61,1,4,23.962608,False,False,True,...,3.0,78.0,1,1,14.0,2.0,1,True,True,0.0
3,100026,100026time2001,2,57,1,3,35.146505,False,False,False,...,0.0,61.5,0,0,5.0,2.0,2,True,True,0.0
4,100035,100035time2001,2,55,0,3,22.096473,False,False,True,...,0.0,38.0,0,0,5.0,2.0,1,True,True,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5784,218391,218391time2000,0,66,1,6,23.674792,False,False,True,...,10.0,40.0,0,1,17.0,1.0,4,True,True,1.0
5785,218499,218499time1999,0,63,1,2,23.110395,False,False,False,...,0.0,70.5,1,1,12.0,1.0,1,True,True,1.0
5786,218510,218510time2001,2,64,1,6,27.802713,False,False,True,...,0.0,70.5,0,1,14.0,2.0,4,True,True,1.0
5787,218705,218705time2001,2,68,1,4,37.305733,False,False,True,...,0.0,84.0,0,1,6.0,2.0,2,True,True,0.0


## DL models

In [8]:
dls_path = "/home/local/VANDERBILT/litz/data/nlst/DeepLungScreening/pred/ipn_pred_v2.csv"
dls = pd.read_csv(dls_path)
dls['dls_risk'] = dls['pred']
cohort_df = cohort_df.merge(dls[['id', 'dls_risk']], on='id')


In [None]:
dlstm_path = ""

## Brock

In [75]:
def Brock(age, sex, fhist, emphysema, nodule_size, upper_lobe, nodule_count, spiculation, nodule_nonsolid, nodule_partsolid, nodule_solid):
    """Full Model, with Spiculation from DOI: 10.1056/NEJMoa1214726"""
    age_var = 0.0287*(age - 62)
    sex_var = 0.6011* sex
    fhist_var = 0.2961*fhist
    emphysema_var = 0.2953*emphysema
    nodule_size_var = -5.3854* (np.sqrt(10/max(nodule_size -4, 1e-6))) -1.58113883 # nonlinear transform
    nodule_nonsolid_var = -0.1276*nodule_nonsolid
    nodule_partsolid_var = 0.3770*nodule_partsolid
    nodule_solid_var = 0*nodule_solid
    upper_lobe_var = 0.6581*upper_lobe
    nodule_count_var = -0.0824*(nodule_count - 4)
    spiculation_var = 0.7729*spiculation
    res = age_var + sex_var + fhist_var + emphysema_var + nodule_size_var + nodule_nonsolid_var + nodule_partsolid_var \
        + nodule_solid_var + upper_lobe_var + nodule_count_var + spiculation_var - 6.7892
    res = np.exp(res) / (1 + np.exp(res))
    return res


In [76]:
brock_path = "/home/local/VANDERBILT/litz/github/MASILab/DeepLungScreening/cohorts/nlst_brock_v1.csv"
brock_df = pd.read_csv(brock_path, dtype={'pid':str})
brock_df

Unnamed: 0,pid,id,session,age,sex,fhist,emphysema,spiculation,upper_lobe,nodule_size,nodule_type,nodule_count,lung_cancer
0,100004,100004time2000,1,60,1,False,False,0,1,4.0,2.0,1,0.0
1,100012,100012time2000,1,61,0,False,False,1,1,15.0,0.0,1,1.0
2,100019,100019time2000,1,61,1,False,True,1,1,14.0,2.0,1,0.0
3,100026,100026time2001,2,57,1,False,False,0,0,5.0,2.0,2,0.0
4,100035,100035time2001,2,55,0,False,True,0,0,5.0,2.0,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5784,218391,218391time2000,0,66,1,False,True,0,1,17.0,1.0,4,1.0
5785,218499,218499time1999,0,63,1,False,False,1,1,12.0,1.0,1,1.0
5786,218510,218510time2001,2,64,1,False,True,0,1,14.0,2.0,4,1.0
5787,218705,218705time2001,2,68,1,False,True,0,1,6.0,2.0,2,0.0


In [77]:
features = ['age', 'sex', 'fhist', 'emphysema', 
'spiculation', 'upper_lobe', 'nodule_size', 'nodule_type', 'nodule_count']
brock_df['lung_cancer'] = brock_df['lung_cancer'].astype(int)
features_df = brock_df[features]
x = pd.get_dummies(features_df, columns=['nodule_type'])
enc_features = x.columns
# Multiple linear imputation
imp = IterativeImputer(max_iter=10, random_state=0)
imp.fit(x)
imp_x = pd.DataFrame(imp.transform(x), columns=enc_features)

brock_enc = pd.merge(brock_df[['pid','id','session','lung_cancer']], imp_x, left_index=True, right_index=True)

# nodule size must be >=4 to be considered nodule
brock_enc = brock_enc[brock_enc['nodule_size']>=4]

In [79]:
brock_enc['brock'] = brock_enc.apply(lambda x: Brock(age=x['age'], sex=x['sex'], fhist=x['fhist'], emphysema=x['emphysema'], \
    nodule_size=x['nodule_size'], upper_lobe=x['upper_lobe'], nodule_count=x['nodule_count'], spiculation=x['spiculation'], \
    nodule_nonsolid=x['nodule_type_0.0'], nodule_partsolid=x['nodule_type_1.0'], nodule_solid=x['nodule_type_2.0']), axis=1)

y, y_hat = brock_enc['lung_cancer'].to_numpy().astype(float), brock_enc['brock'].to_numpy().astype(float)
roc = roc_auc_score(y, y_hat)
print(f"AUC: {roc}")

AUC: 0.8411565736761094


In [8]:
# fitting a lienar model instead of using original weights

scalars = ['age', 'nodule_size', 'nodule_count']
brock_enc[scalars] = brock_enc[scalars].astype(float)
brock_enc[scalars].min()
brock_enc[scalars] = (brock_enc[scalars] - brock_enc[scalars].min())/(brock_enc[scalars].max() - brock_enc[scalars].min())

import statsmodels.formula.api as smf
# formula_str = 'age+bmi+copd+phist+fhist+smo_status+quit_time+pkyr+C(race, Treatment(reference=1))+C(education, Treatment(reference=1))'
formula_str = 'age+sex+fhist+emphysema+spiculation+upper_lobe+nodule_size+nodule_type+nodule_count'
smf_lr = smf.logit(f"lung_cancer ~ {formula_str}", data=brock_enc).fit()
smf_lr.summary()

odds = pd.DataFrame({"OR": smf_lr.params, "Lower CI": smf_lr.conf_int()[0], "Upper CI": smf_lr.conf_int()[1]})
odds = np.exp(odds)
odds["OR"] = odds["OR"].apply(lambda x: '{:.4f}'.format(x))

# report model performance
X, y = brock_enc[features], brock_enc['lung_cancer'].to_numpy().ravel()
y_prob = smf_lr.predict(brock_enc[features])
y_hat = round(y_prob)

# accuracy_score
roc = roc_auc_score(y, y_prob)
report = classification_report(y, y_hat)
acc = accuracy_score(y, y_hat)
print(f"AUC: {roc}")
print(report)
print(f"Accuracy: {acc}")
print(f"Chi-squared statistic: {smf_lr.llr} p:{smf_lr.llr_pvalue}")

## Mayo

In [87]:
def Mayo(age, phist, smo_status, nodule_size, spiculation, upper_lobe):
    """Full Model, with Spiculation from DOI: 10.1056/NEJMoa1214726"""
    age_var = 0.0391*age
    phist_var = 1.3388*int(phist)
    smo_status_var = 0.7917*smo_status
    nodule_size_var = 0.1274*nodule_size # nonlinear transform
    upper_lobe_var = 0.7838*upper_lobe
    spiculation_var = 1.0407*spiculation
    res = age_var + phist_var+ smo_status_var + nodule_size_var + upper_lobe_var + spiculation_var - 6.8272
    res = np.exp(res) / (1 + np.exp(res))
    return res

In [88]:
mayo_path = "/home/local/VANDERBILT/litz/github/MASILab/DeepLungScreening/cohorts/nlst_mayo_v1.csv"
mayo_df = pd.read_csv(mayo_path, dtype={'pid':str})

features = ['age', 'phist', 'smo_status', 'spiculation', 'upper_lobe', 'nodule_size', 'lung_cancer']
mayo_df['lung_cancer'] = mayo_df['lung_cancer'].astype(int)
features_df = mayo_df[features]
x = features_df
enc_features = x.columns
# Multiple linear imputation
imp = IterativeImputer(max_iter=10, random_state=0)
imp.fit(x)
imp_x = pd.DataFrame(imp.transform(x), columns=enc_features)

mayo_enc = pd.merge(mayo_df.drop(columns=['nodule_size']), imp_x['nodule_size'], left_index=True, right_index=True)


In [90]:
mayo_enc['mayo'] = mayo_enc.apply(lambda x: Mayo(age=x['age'], phist=x['phist'], smo_status=x['smo_status'],
    nodule_size=x['nodule_size'], spiculation=x['spiculation'], upper_lobe=x['upper_lobe']), axis=1)

y, y_hat = mayo_enc['lung_cancer'].to_numpy().astype(float), mayo_enc['mayo'].to_numpy().astype(float)
roc = roc_auc_score(y, y_hat)
print(f"AUC: {roc}")

AUC: 0.8329631974188083


: 

In [13]:
# fitting a linear model instead of using original weights

scalars = ['age', 'nodule_size']
mayo_enc[scalars] = mayo_enc[scalars].astype(float)
mayo_enc[scalars].min()
mayo_enc[scalars] = (mayo_enc[scalars] - mayo_enc[scalars].min())/(mayo_enc[scalars].max() - mayo_enc[scalars].min())

import statsmodels.formula.api as smf
# formula_str = 'age+bmi+copd+phist+fhist+smo_status+quit_time+pkyr+C(race, Treatment(reference=1))+C(education, Treatment(reference=1))'
formula_str = 'age+phist+spiculation+upper_lobe+nodule_size'
smf_lr = smf.logit(f"lung_cancer ~ {formula_str}", data=mayo_enc).fit()
smf_lr.summary()

odds = pd.DataFrame({"OR": smf_lr.params, "Lower CI": smf_lr.conf_int()[0], "Upper CI": smf_lr.conf_int()[1]})
odds = np.exp(odds)
odds["OR"] = odds["OR"].apply(lambda x: '{:.4f}'.format(x))
odds

Unnamed: 0,OR,Lower CI,Upper CI
Intercept,0.0181,0.01429,0.02294381
phist[T.True],1.6943,1.176479,2.439914
age,2.8807,2.074507,4.000129
spiculation,3.9296,3.202045,4.822584
upper_lobe,1.9693,1.623425,2.388929
nodule_size,1371255.5907,293343.016998,6410045.0


In [14]:
# report model performance
X, y = mayo_enc[features], mayo_enc['lung_cancer'].to_numpy().ravel()
y_prob = smf_lr.predict(mayo_enc[features])
y_hat = round(y_prob)

# accuracy_score
roc = roc_auc_score(y, y_prob)
report = classification_report(y, y_hat)
acc = accuracy_score(y, y_hat)
print(f"AUC: {roc}")
print(report)
print(f"Accuracy: {acc}")
print(f"Chi-squared statistic: {smf_lr.llr} p:{smf_lr.llr_pvalue}")

AUC: 0.8377482504857672
              precision    recall  f1-score   support

           0       0.91      0.98      0.94      5113
           1       0.66      0.25      0.36       676

    accuracy                           0.90      5789
   macro avg       0.78      0.62      0.65      5789
weighted avg       0.88      0.90      0.88      5789

Accuracy: 0.8970461219554328
Chi-squared statistic: 981.6339867773659 p:5.687978103138167e-210
