# Logistic regression models w/ multiple imputation

In [2]:
import os 
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

In [3]:
def PLCOm2012(age, race, education, body_mass_index, copd, phist, fhist,
              smoking_status, smoking_intensity, duration, quit_time):
    def get_race(x):
        d = {
            0: 0, # unknown
            1: 0, # white
            2: 0.3944778, # black
            3: -0.7434744, # hispanic
            4: -0.466585, # asian
            5: 0, # american indian or alaskan native
            6: 1.027152 # Native hawaiian or pacific islander
        }
        return d[x]
    age_item = 0.0778868 * (age - 62)
    edu_item = -0.0812744 * (education - 4)
    bmi_item = -0.0274194 * (body_mass_index - 27)
    copd_item = 0.3553063 * int(copd)
    phist_item = 0.4589971 * int(phist)
    fhist_item = 0.587185 * int(fhist)
    sstatus_item = 0.2597431 * (smoking_status - 1)
    sint_item = - 1.822606 * (10 / smoking_intensity - 0.4021541613)
    duration_item = 0.0317321 * (duration - 27)
    qt_item = -0.0308572 * (quit_time - 10)
    res = age_item + get_race(race) + edu_item + bmi_item \
          + copd_item + phist_item + fhist_item + sstatus_item \
          + sint_item + duration_item + qt_item - 4.532506
    res = np.exp(res) / (1 + np.exp(res))
    return res

# VLSP

In [5]:
vlsp = "/home/local/VANDERBILT/litz/github/MASILab/DeepLungScreening/cohorts/vlsp_cohort_v1.csv"
cohort_df = pd.read_csv(vlsp, dtype={'pid':str})
cohort = cohort_df.groupby('pid', as_index=False).max()

### Missing data Imputation

In [6]:


# one-hot encoding
features = ['age', 'race', 'education',  'bmi',  'copd', 'phist', 'fhist', 'smo_status', 'quit_time', 'pkyr']
label = ['lung_cancer']
features_df = cohort[features]
x = pd.get_dummies(features_df, columns=['race', 'education'])
enc_features = x.columns
# Multiple linear imputation
imp = IterativeImputer(max_iter=10, random_state=0)
imp.fit(x)
imp_x = pd.DataFrame(imp.transform(x), columns=enc_features)
imp_x[['copd', 'phist', 'fhist']] = imp_x[['copd', 'phist', 'fhist']].round()

# merge imputed values with cohort labels using an index join
cohort_enc = pd.merge(cohort[['pid', 'id', 'session', 'lung_cancer', 'race', 'education']], imp_x, left_index=True, right_index=True)
cohort_enc

Unnamed: 0,pid,id,session,lung_cancer,race,education,age,bmi,copd,phist,...,race_4,race_5,race_6,education_0,education_1,education_2,education_3,education_4,education_5,education_6
0,00000009,00000009time20180517,2,False,1,2,77.0,25.253111,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,00000010,00000010time20180321,4,False,1,5,69.0,31.617330,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,00000012,00000012time20180411,1,False,1,3,61.0,31.783737,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,00000013,00000013time20140226,0,False,1,0,62.0,21.613634,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,00000014,00000014time20140226,0,False,1,0,62.0,21.143772,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
857,00001124,00001124time20180105,0,False,1,4,59.0,21.256828,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
858,00001125,00001125time20180426,0,False,1,3,67.0,37.703191,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
859,00001126,00001126time20180214,0,False,1,1,63.0,20.231186,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
860,00001127,00001127time20180307,0,False,1,6,65.0,19.052911,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [13]:
cohort_enc.quit_time

0      6.0
1      2.0
2      0.0
3      0.0
4      7.0
      ... 
857    2.0
858    0.0
859    0.0
860    0.0
861    0.0
Name: quit_time, Length: 862, dtype: float64

In [28]:


# normalize by subtracting mean and dividing by std
scalars = ['age', 'bmi', 'quit_time', 'pkyr']
cohort_enc[scalars].min()
cohort_enc[scalars] = (cohort_enc[scalars] - cohort_enc[scalars].min())/(cohort_enc[scalars].max() - cohort_enc[scalars].min())
cohort_enc


Unnamed: 0,pid,id,session,lung_cancer,age,bmi,copd,phist,fhist,smo_status,...,race_4,race_5,race_6,education_0,education_1,education_2,education_3,education_4,education_5,education_6
0,00000009,00000009time20180517,2,False,0.93750,0.235697,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,00000010,00000010time20180321,4,False,0.68750,0.391412,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,00000012,00000012time20180411,1,False,0.43750,0.395483,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,00000013,00000013time20140226,0,False,0.46875,0.146649,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,00000014,00000014time20140226,0,False,0.46875,0.135153,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
857,00001124,00001124time20180105,0,False,0.37500,0.137919,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
858,00001125,00001125time20180426,0,False,0.62500,0.540316,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
859,00001126,00001126time20180214,0,False,0.50000,0.112824,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
860,00001127,00001127time20180307,0,False,0.56250,0.083995,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [29]:
# fit model
train, test = train_test_split(cohort_enc, test_size=0.2)
X_train, y_train = train[enc_features].to_numpy(), train[label].to_numpy().ravel()
lr = LogisticRegression(solver='sag', random_state=0).fit(X_train, y_train)

In [30]:
# score model with test set
X_test, y_test = test[enc_features].to_numpy(), test[label].to_numpy().ravel()
lr.predict(X_test)
lr.score(X_test, y_test)

0.976878612716763

In [42]:
y_prob_test = lr.predict_proba(X_test)[:, 1]
y_hat_test = lr.predict(X_test)
# accuracy_score
roc = roc_auc_score(y_test, y_prob_test)
report = classification_report(y_test, y_hat_test)
acc = accuracy_score(y_test, y_hat_test)
print(f"AUC: {roc}")
print(report)
print(f"Accuracy: {acc}")

AUC: 0.7573964497041421
              precision    recall  f1-score   support

       False       0.98      1.00      0.99       169
        True       0.00      0.00      0.00         4

    accuracy                           0.98       173
   macro avg       0.49      0.50      0.49       173
weighted avg       0.95      0.98      0.97       173

Accuracy: 0.976878612716763


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# NLST

In [30]:
def PLCOm2012(age, race, education, body_mass_index, copd, phist, fhist,
              smoking_status, smoking_intensity, duration, quit_time):
    def get_race(x):
        d = {
            0: 0, # unknown
            1: 0, # white
            2: 0.3944778, # black
            3: -0.7434744, # hispanic
            4: -0.466585, # asian
            5: 0, # american indian or alaskan native
            6: 1.027152 # Native hawaiian or pacific islander
        }
        return d[x]
    age_item = 0.0778868 * (age - 62)
    edu_item = -0.0812744 * (education - 4)
    bmi_item = -0.0274194 * (body_mass_index - 27)
    copd_item = 0.3553063 * int(copd)
    phist_item = 0.4589971 * int(phist)
    fhist_item = 0.587185 * int(fhist)
    sstatus_item = 0.2597431 * (smoking_status - 1)
    sint_item = - 1.822606 * (10 / smoking_intensity - 0.4021541613)
    duration_item = 0.0317321 * (duration - 27)
    qt_item = -0.0308572 * (quit_time - 10)
    res = age_item + get_race(race) + edu_item + bmi_item \
          + copd_item + phist_item + fhist_item + sstatus_item \
          + sint_item + duration_item + qt_item - 4.532506
    res = np.exp(res) / (1 + np.exp(res))
    return res

In [31]:
nlst = "/home/local/VANDERBILT/litz/github/MASILab/DeepLungScreening/cohorts/nlst_cohort_prep_v2.csv"
cohort_df = pd.read_csv(nlst, dtype={'pid':str})
cohort = cohort_df.groupby('pid', as_index=False).max()
# one-hot encoding
features = ['age', 'race', 'education',  'bmi',  'copd', 'phist', 'fhist', 'smo_status', 'smo_intensity', 'smo_duration', 'quit_time']
label = ['lung_cancer']
features_df = cohort[features]
x = pd.get_dummies(features_df, columns=['race', 'education'])
enc_features = x.columns
# Multiple linear imputation
imp = IterativeImputer(max_iter=10, random_state=0)
imp.fit(x)
imp_x = pd.DataFrame(imp.transform(x), columns=enc_features)
imp_x[['copd', 'phist', 'fhist']] = imp_x[['copd', 'phist', 'fhist']].round()

# merge imputed values with cohort labels using an index join
cohort_enc = pd.merge(cohort[['pid', 'id', 'session', 'lung_cancer', 'race', 'education']], imp_x, left_index=True, right_index=True)

cohort_enc['plco'] = cohort_enc.apply(lambda item: PLCOm2012(age = item['age'], race = item['race'], education = item['education'], body_mass_index = item['bmi'], \
    copd = item['copd'], phist = item['phist'], fhist = item['fhist'], smoking_status = item['smo_status'], smoking_intensity = item['smo_intensity'], \
    duration = item['smo_duration'], quit_time = item['quit_time']), axis=1)

In [50]:
y, y_hat = cohort_enc['lung_cancer'].to_numpy().astype(float), cohort_enc['plco'].to_numpy().astype(float)
roc = roc_auc_score(y, y_hat)
print(f"AUC: {roc}")

AUC: 0.6630832496267891


In [51]:
len(cohort_enc)

6230

In [None]:
# normalize by subtracting mean and dividing by std
scalars = ['age', 'bmi', 'quit_time', 'pkyr']
cohort_enc[scalars].min()
cohort_enc[scalars] = (cohort_enc[scalars] - cohort_enc[scalars].min())/(cohort_enc[scalars].max() - cohort_enc[scalars].min())

# fit model
train, test = train_test_split(cohort_enc, test_size=0.2)
X_train, y_train = train[enc_features].to_numpy(), train[label].to_numpy().ravel()
lr = LogisticRegression(solver='sag', random_state=0).fit(X_train, y_train)

# score model with test set
X_test, y_test = test[enc_features].to_numpy(), test[label].to_numpy().ravel()
lr.predict(X_test)
lr.score(X_test, y_test)

In [45]:
y_prob_test = lr.predict_proba(X_test)[:, 1]
y_hat_test = lr.predict(X_test)
# accuracy_score
roc = roc_auc_score(y_test, y_prob_test)
report = classification_report(y_test, y_hat_test)
acc = accuracy_score(y_test, y_hat_test)
print(f"AUC: {roc}")
print(report)
print(f"Accuracy: {acc}")

AUC: 0.6709620716973659
              precision    recall  f1-score   support

         0.0       0.89      1.00      0.94      1122
         1.0       0.00      0.00      0.00       144

    accuracy                           0.89      1266
   macro avg       0.44      0.50      0.47      1266
weighted avg       0.79      0.89      0.83      1266

Accuracy: 0.8862559241706162


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
cohort['lung_cancer'].value_counts()

0.0    5604
1.0     724
Name: lung_cancer, dtype: int64