# VLSP clinical data pipeline

In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None
import os
from os import path
import numpy as np
import matplotlib.pyplot as plt
import glob 

### demographic features

In [2]:
# demographic csv
raw_cli = "/nfs/masi/SPORE/file/clinical/combine_17and18.xlsx"
raw_df = pd.read_excel(raw_cli)
print(raw_df.columns)
print(len(raw_df))
# raw_df

Index(['sub_name', 'sex', 'race', 'hispanic', 'accession', 'proccode', 'dob',
       'studydate', 'Age', 'LungRADS', 'ctscannermake', 'ctscannermodel',
       'ctdivol', 'dlp', 'reconstructedwidth', 'tubecurrenttime',
       'tubevoltage', 'scanningtime', 'scanningvolume', 'pitch',
       'smokingfrequency', 'firstLCS', 'heightinches', 'weightpounds',
       'smokingstatus', 'packsperday', 'packyearsreported', 'yearssincequit',
       'lastsmoked', 'cessationcounseling', 'shareddecisionmaking', 'symptoms',
       'comorbidities', 'occupationalexposure', 'copd', 'education',
       'riskcalculation', 'Coronary Artery Calc', 'Lung Nodule Clinic refer',
       'historycomplete', 'Qualifies USPSTF', 'Qualify USPSTF',
       'personal_cancer_history', 'family_cancer_history', 'therapy_history',
       'If on Xnat (no=0, yes=1)'],
      dtype='object')
1688


In [3]:
ft = ['sub_name', 'sex', 'race', 'Age', 'studydate', 'heightinches', 'weightpounds', 'education']
hist_ft = ['personal_cancer_history', 'family_cancer_history']
smoking_ft = ['smokingstatus', 'packsperday', 'packyearsreported', 'yearssincequit', 'lastsmoked', 'copd']
raw_df = raw_df[ft+hist_ft+smoking_ft]
raw_df['pid'] = raw_df['sub_name'].apply(lambda x: x.split('_')[-1])
raw_df

Unnamed: 0,sub_name,sex,race,Age,studydate,heightinches,weightpounds,education,personal_cancer_history,family_cancer_history,smokingstatus,packsperday,packyearsreported,yearssincequit,lastsmoked,copd,pid
0,SPORE_00000001,M,Caucasian/ White,59,2013-12-05,72,214,,,,9,0.0,999.0,0,,,00000001
1,SPORE_00000002,F,Caucasian/ White,67,2013-12-13,61,157,,,,9,0.0,999.0,0,,,00000002
2,SPORE_00000003,F,Caucasian/ White,46,2014-01-09,66,194,,,,9,0.0,999.0,0,,,00000003
3,SPORE_00000004,F,Caucasian/ White,59,2014-01-13,65,145,,,,9,0.0,999.0,0,,,00000004
4,SPORE_00000005,M,Caucasian/ White,61,2014-01-14,70,270,,,,9,0.0,999.0,11,,,00000005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1683,SPORE_00000604,F,Caucasian/ White,56,2018-04-18,68,209,9-11th grade,The patient has no personal history of other c...,The patient has no family history of other can...,Current Smoker,0.0,92.0,0,Less than 24 hours,No,00000604
1684,SPORE_00001128,F,Caucasian/ White,58,2018-05-31,61,164,Associate degree some,The patient has no personal history of other c...,,Current Smoker,0.0,92.0,0,Less than 24 hours,Yes,00001128
1685,SPORE_00000411,M,Hispanic/ Latnio,76,2018-01-19,67,130,High school graduate,The patient has no personal history of other c...,The patient has no family history of other can...,Current Smoker,0.0,96.0,0,Less than 24 hours,No,00000411
1686,SPORE_00000459,M,Caucasian/ White,66,2018-04-25,70,148,Associate degree some,The patient has no personal history of other c...,The patient has no family history of other can...,Current Smoker,0.0,96.0,0,Less than 24 hours,No,00000459


### available scans

In [18]:
# scan info
scan_dir = "/nfs/masi/SPORE/nifti/combine"
rows = []
for pid in os.listdir(scan_dir):
    for scan_date in os.listdir(os.path.join(scan_dir, pid)):
        for filename in os.listdir(os.path.join(scan_dir, pid, scan_date)):
            if "time" in filename:
                scanid = os.path.basename(filename).split(".nii.gz")[0]
                rows.append({"pid":pid, "scan_date": scan_date, "id": scanid})


In [19]:
scan_df = pd.DataFrame(rows)
scan_df = scan_df.sort_values(by=['pid', 'id'])
# number each session
scan_df['session'] = scan_df.groupby(['pid'])['scan_date'].rank('dense', ascending=True) # assign a T0, T1, or T2 for each scan
scan_df['session'] = scan_df['session'] - 1
scan_df['session'] = scan_df['session'].astype(int)

# conver scan_date str to datetime
scan_df['studydate'] = scan_df['scan_date'].apply(lambda x: pd.to_datetime(x, format='%Y%m%d'))
scan_df

Unnamed: 0,pid,scan_date,id,session,studydate
1373,00000001,20131205,00000001time20131205,0,2013-12-05
982,00000002,20131213,00000002time20131213,0,2013-12-13
200,00000003,20140109,00000003time20140109,0,2014-01-09
227,00000004,20140113,00000004time20140113,0,2014-01-13
831,00000005,20140114,00000005time20140114,0,2014-01-14
...,...,...,...,...,...
654,00001132,20180412,00001132time20180412,0,2018-04-12
869,00002002,20180223,00002002time20180223,0,2018-02-23
868,00002002,20180709,00002002time20180709,1,2018-07-09
852,00002008,20180206,00002008time20180206,0,2018-02-06


In [83]:
# images that passed preprocessing
scan_dir = "/home/local/VANDERBILT/litz/data/vlsp/DeepLungScreening/prep"
rows = []
for scan in glob.glob(os.path.join(scan_dir, "*_clean.nii.gz")):
    scanid = os.path.basename(scan).split("_clean.nii.gz")[0]
    pid, scan_date = scanid.split("time")
    rows.append({"pid": pid, "scan_date": scan_date, "id": scanid})
scan_df = pd.DataFrame(rows)
scan_df = scan_df.sort_values(by=['pid', 'id'])
scan_df['session'] = scan_df.groupby(['pid'])['scan_date'].rank('dense', ascending=True) # assign a T0, T1, or T2 for each scan
scan_df['session'] = scan_df['session'] - 1
scan_df['session'] = scan_df['session'].astype(int)
scan_df['studydate'] = scan_df['scan_date'].apply(lambda x: pd.to_datetime(x, format='%Y%m%d'))
scan_df



Unnamed: 0,pid,scan_date,id,session,studydate
985,00000009,20160420,00000009time20160420,0,2016-04-20
1115,00000009,20170517,00000009time20170517,1,2017-05-17
882,00000009,20180517,00000009time20180517,2,2018-05-17
684,00000010,20140214,00000010time20140214,0,2014-02-14
232,00000010,20160127,00000010time20160127,1,2016-01-27
...,...,...,...,...,...
1117,00001122,20180706,00001122time20180706,0,2018-07-06
649,00001123,20180713,00001123time20180713,0,2018-07-13
588,00001125,20180426,00001125time20180426,0,2018-04-26
295,00001127,20180307,00001127time20180307,0,2018-03-07


### lung cancer labels

In [20]:
# lung cancer labels label csv
raw_label_path = "/nfs/masi/SPORE/file/clinical/SPORE_diaginfo_v1.xlsx"
raw_label_df = pd.read_excel(raw_label_path)
print(raw_label_df.columns)
print(len(raw_label_df))

Index(['SPORE', 'Path', 'Stage', 'Column1'], dtype='object')
49


In [21]:
raw_label_df['pid'] = raw_label_df['SPORE'].apply(lambda x: x.split('_')[-1])
lmerged = raw_df.merge(raw_label_df, on='pid', how='left')
lmerged['lung_cancer'] = ~lmerged['SPORE'].isnull()
print(lmerged['lung_cancer'].value_counts())

False    1633
True       55
Name: lung_cancer, dtype: int64


### merge all three

In [22]:
sc_df = lmerged.merge(scan_df, on=['pid', 'studydate'], how='inner')
sc_df = sc_df.groupby(['pid', 'studydate'], as_index=False).min()
print(sc_df['lung_cancer'].value_counts())
sc_df

False    1222
True       30
Name: lung_cancer, dtype: int64


Unnamed: 0,pid,studydate,sub_name,sex,race,Age,heightinches,weightpounds,education,personal_cancer_history,...,lastsmoked,copd,SPORE,Path,Stage,Column1,lung_cancer,scan_date,id,session
0,00000001,2013-12-05,SPORE_00000001,M,Caucasian/ White,59,72,214,,,...,,,,,,,False,20131205,00000001time20131205,0
1,00000002,2013-12-13,SPORE_00000002,F,Caucasian/ White,67,61,157,,,...,,,,,,,False,20131213,00000002time20131213,0
2,00000003,2014-01-09,SPORE_00000003,F,Caucasian/ White,46,66,194,,,...,,,,,,,False,20140109,00000003time20140109,0
3,00000004,2014-01-13,SPORE_00000004,F,Caucasian/ White,59,65,145,,,...,,,,,,,False,20140113,00000004time20140113,0
4,00000005,2014-01-14,SPORE_00000005,M,Caucasian/ White,61,70,270,,,...,,,,,,,False,20140114,00000005time20140114,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1247,00001124,2018-01-05,SPORE_00001124,F,Caucasian/ White,59,63,120,Associate degree some,The patient has the following personal history...,...,Greater than 24 months bu,No,,,,,False,20180105,00001124time20180105,0
1248,00001125,2018-04-26,SPORE_00001125,M,Caucasian/ White,67,72,278,Post high school trai,The patient has no personal history of other c...,...,Less than 24 hours,Yes,,,,,False,20180426,00001125time20180426,0
1249,00001126,2018-02-14,SPORE_00001126,M,Caucasian/ White,63,70,141,9-11th grade,The patient has no personal history of other c...,...,Less than 24 hours,No,,,,,False,20180214,00001126time20180214,0
1250,00001127,2018-03-07,SPORE_00001127,F,Caucasian/ White,65,64,111,Graduate or professio,The patient has the following personal history...,...,Less than 24 hours,No,,,,,False,20180307,00001127time20180307,0


### Preprocess and derive features
We do NOT have smoking intensity (average packs per day) and smoking duration for VLSP

![PLCO](/home/local/VANDERBILT/litz/github/MASILab/DeepLungScreening/PLCOm2012_feature_set.png)

In [23]:
# calculate bmi = kg/m^2
sc_df['bmi'] = 0.45359237*sc_df['weightpounds'].div(np.power(0.0254*sc_df['heightinches'], 2))
sc_df = sc_df[~sc_df['bmi'].isnull()] # there are 2 cases of missing BMI


In [24]:
# parse race
def parse_race(x):
    map={
        'Caucasian/ White': 1, 
        'African American/ Black': 2, 
        'Hispanic/ Latnio': 3, 
        'Asian': 4, 
        'American Indian, Alaska Native': 5, 
        'Native Hawaiian/ Pacific Islan': 6, 
        'Unknown': 0
    }
    return map[x]
sc_df['race'] = sc_df['race'].apply(lambda x : parse_race(x))
sc_df['race'].value_counts()

1    1054
2     164
3      13
4      11
0       3
5       3
6       1
Name: race, dtype: int64

In [25]:
sc_df['education'].value_counts(dropna=False)

Associate                166
High schoo               165
Bachelors                145
Associate degree some    129
NaN                      122
High school graduate     119
Graduate o               107
Bachelors degree          99
9-11th gra                54
Graduate or professio     54
9-11th grade              33
Post high                 31
Post high school trai     21
Unknown-pr                 2
Otherth gr                 1
Unknown-prefer not to      1
Name: education, dtype: int64

In [26]:
# parse education
def parse_edu(x):
    mapper = {
        "Associate": 4,
        "High schoo": 2,
        "Bachelors": 5,
        "Associate degree some": 4,
        "High school graduate": 2,
        np.nan: 0,
        "Graduate o": 6,
        "Bachelors degree": 5,
        "9-11th gra": 1,
        "Graduate or professio": 6,
        "9-11th grade": 1,              
        "Post high": 3,                 
        "Post high school trai": 3,     
        "Unknown-pr": 0,                
        "Otherth gr": 3,                 
        "Unknown-prefer not to": 0,
    }
    return mapper[x]

sc_df['education'] = sc_df['education'].apply(lambda x: parse_edu(x))

In [27]:
# parse PMH and FH
def parse_phist(x):
    if x=='The patient has no personal history of other cancer.' or x=='No':
        return 0
    elif pd.isnull(x):
        return np.nan
    else:
        return 1

def parse_fhist(x):
    if x=='The patient has no family history of other cancer.':
        return 0
    elif pd.isnull(x):
        return np.nan
    else: 
        return 1

sc_df['phist'] = sc_df['personal_cancer_history'].apply(lambda x: parse_phist(x))
sc_df['fhist'] = sc_df['family_cancer_history'].apply(lambda x: parse_fhist(x))


In [28]:
# copd
def parse_copd(x):
    if x=="Yes":
        return 1
    elif x=="No":
        return 0
    else:
        return np.nan

sc_df['copd'] = sc_df['copd'].apply(lambda x: parse_copd(x))
sc_df['copd'].value_counts(dropna=False)

0.0    817
1.0    312
NaN    120
Name: copd, dtype: int64

In [29]:
# smoking status
# drop subjects who never smoked
sc_df = sc_df[(sc_df['smokingstatus']=='Current Smoker') | (sc_df['smokingstatus']=='Former Smoker')]

def smoker_to_int(x):
    return 1 if x=='Current Smoker' else 0

sc_df['smo_status']=sc_df['smokingstatus'].apply(lambda x: smoker_to_int(x))
print(sc_df['smo_status'].value_counts(dropna=False))\

# pack years
# drop subjects without pack years
sc_df = sc_df[sc_df['packyearsreported']!=999]

# smoking quit time
sc_df['quit_time'] = sc_df['yearssincequit']

1    746
0    493
Name: smo_status, dtype: int64


In [30]:
# sc_df['id'] = sc_df['filename'].apply(lambda x: x.split('.nii.gz')[0])
sc_df['age'] = sc_df['Age']
sc_df['pkyr'] = sc_df['packyearsreported']
sc_df['with_image'] = 1
sc_df['with_marker']=1
sc_df['scan_date'] = sc_df['studydate']
ft = ['pid', 'id', 'session', 'scan_date', 'age', 'race', 'education',  'bmi',  'copd', 'phist', 'fhist', 'smo_status', 'quit_time', 'pkyr', 'lung_cancer', 'with_image', 'with_marker']
cohort_df = sc_df[ft]
cohort_df

Unnamed: 0,pid,id,session,scan_date,age,race,education,bmi,copd,phist,fhist,smo_status,quit_time,pkyr,lung_cancer,with_image,with_marker
8,00000009,00000009time20160420,0,2016-04-20,75,1,0,24.822661,,,,0,4,62.50,False,1,1
9,00000009,00000009time20170517,1,2017-05-17,76,1,2,25.253111,0.0,0.0,,0,5,62.50,False,1,1
10,00000009,00000009time20180517,2,2018-05-17,77,1,2,24.679177,0.0,0.0,0.0,0,6,62.50,False,1,1
11,00000010,00000010time20140214,0,2014-02-14,65,1,4,26.125899,0.0,0.0,,1,0,31.00,False,1,1
12,00000010,00000010time20160127,1,2016-01-27,67,1,4,26.125899,,0.0,,0,1,31.80,False,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1247,00001124,00001124time20180105,0,2018-01-05,59,1,4,21.256828,0.0,1.0,,0,2,82.75,False,1,1
1248,00001125,00001125time20180426,0,2018-04-26,67,1,3,37.703191,1.0,0.0,0.0,1,0,88.00,False,1,1
1249,00001126,00001126time20180214,0,2018-02-14,63,1,1,20.231186,0.0,0.0,1.0,1,0,88.25,False,1,1
1250,00001127,00001127time20180307,0,2018-03-07,65,1,6,19.052911,0.0,1.0,0.0,1,0,90.00,False,1,1


In [31]:
# reset index and save as csv
cohort_df = cohort_df.reset_index(drop=True)
# out_path = "/home/local/VANDERBILT/litz/github/MASILab/DeepLungScreening/cohorts/vlsp/vlsp_cohort_v1.csv"
# cohort_df.to_csv(out_path, index=False)

### Cross sectional cohort
* remove subjects without full clinical features
* take the latest scan from each subject

In [142]:
cohort_df = pd.read_csv(out_path, dtype={'pid': str})
max_session = cohort_df.groupby('pid')['session'].idxmax()
cs_df = cohort_df.iloc[max_session]
cs_df = cs_df.reset_index(drop=True)

print(f"subjects with null values: {len(cs_df[cs_df.isnull().any(axis=1)])} / {len(cs_df)}")


subjects with null values: 385 / 844


### Impute missing values

In [149]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# one-hot encoding
features = ['age', 'race', 'education',  'bmi',  'copd', 'phist', 'fhist', 'smo_status', 'quit_time', 'pkyr']
label = ['lung_cancer']
features_df = cs_df[features]
x = pd.get_dummies(features_df, columns=['race', 'education'])
enc_features = x.columns
# Multiple linear imputation
imp = IterativeImputer(max_iter=10, random_state=0)
imp.fit(x)
imp_x = pd.DataFrame(imp.transform(x), columns=enc_features)
imp_x[['copd', 'phist', 'fhist']] = imp_x[['copd', 'phist', 'fhist']].round()
imp_x = imp_x[['age', 'bmi',  'copd', 'phist', 'fhist', 'smo_status', 'quit_time', 'pkyr']] # drop categorical 
# merge imputed values with cohort labels using an index join
cs_imp = pd.merge(cs_df[['pid', 'id', 'session', 'lung_cancer', 'race', 'education']], imp_x, left_index=True, right_index=True)
cs_imp

Unnamed: 0,pid,id,session,lung_cancer,race,education,age,bmi,copd,phist,fhist,smo_status,quit_time,pkyr
0,00000009,00000009time20180517,2,False,1,2,77.0,24.679177,0.0,0.0,0.0,0.0,6.0,62.5
1,00000010,00000010time20180321,3,False,1,5,69.0,31.617330,0.0,0.0,0.0,0.0,2.0,31.8
2,00000012,00000012time20180411,1,False,1,3,61.0,31.783737,0.0,0.0,1.0,1.0,0.0,47.5
3,00000013,00000013time20140226,0,False,1,0,62.0,21.613634,0.0,0.0,0.0,1.0,0.0,40.0
4,00000014,00000014time20140226,0,False,1,0,62.0,21.143772,0.0,0.0,0.0,0.0,7.0,42.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
839,00001122,00001122time20180706,0,False,1,4,72.0,28.665190,0.0,0.0,1.0,1.0,0.0,81.0
840,00001123,00001123time20180713,0,False,2,2,55.0,32.931005,1.0,0.0,0.0,0.0,1.0,82.5
841,00001125,00001125time20180426,0,False,1,3,67.0,37.703191,1.0,0.0,0.0,1.0,0.0,88.0
842,00001127,00001127time20180307,0,False,1,6,65.0,19.052911,0.0,1.0,0.0,1.0,0.0,90.0


In [151]:
cs_imp['with_image'] = 1
cs_imp['with_marker'] = 1
cs_out_path = "/home/local/VANDERBILT/litz/github/MASILab/DeepLungScreening/cohorts/vlsp_cs_imp_v1.csv"
cs_imp.to_csv(cs_out_path, index=False)

In [161]:
cs_imp['lung_cancer'].value_counts()

False    821
True      23
Name: lung_cancer, dtype: int64

In [10]:
# characterize cohort
vlsp_path = "/home/local/VANDERBILT/litz/github/MASILab/DeepLungScreening/cohorts/vlsp_cohort_prep_v1.csv"
df = pd.read_csv(vlsp_path, dtype={'pid':str})
grp = df.groupby('pid', as_index=False).max()

In [15]:
covar = 'pkyr'
print(grp[covar].mean())
print(grp[covar].std())

48.404573459715635
20.61771441029971


In [19]:
covar = 'education'
print(grp[covar].value_counts())

4    229
2    198
5    181
6    112
1     68
3     40
0     16
Name: education, dtype: int64


### Longitudinal cohort
* remove subjects without full clinical features
* take the two latest scans from each subject

In [152]:
# remove subjects with less than two scans 
# cohort_df = cohort_df[~cohort_df.isnull().any(axis=1)]
scan_counts = cohort_df.groupby('pid', as_index=False)['session'].count()
scan_counts = scan_counts.rename(columns={'session':'scan_count'})
scan_counts = cohort_df.merge(scan_counts, on='pid')
two_scan = scan_counts[scan_counts['scan_count']>=2]

# gest two latest scans from each subject
two_latest = two_scan.groupby('pid')['session'].nlargest(2) # returns a multindex of two largest for each pid 
two_latest = two_latest.droplevel(level=0) # returns the row idx
two_scan.loc[two_latest.index] # use row idx to select from original df


# print(f"number of scans in multi-scan cohort: {len(scan_counts)}")
# print(f"total num scans in two-scan cohort: {len(two_scan)}")

Unnamed: 0,pid,id,session,age,race,education,bmi,copd,phist,fhist,smo_status,quit_time,pkyr,lung_cancer,with_image,with_marker,scan_count
2,00000009,00000009time20180517,2,77,1,2,24.679177,0.0,0.0,0.0,0,6,62.5,False,1,1,3
1,00000009,00000009time20170517,1,76,1,2,25.253111,0.0,0.0,,0,5,62.5,False,1,1,3
6,00000010,00000010time20180321,3,69,1,5,31.617330,0.0,0.0,0.0,0,2,31.8,False,1,1,4
5,00000010,00000010time20170118,2,68,1,4,26.125899,0.0,0.0,,1,1,31.8,False,1,1,4
8,00000012,00000012time20180411,1,61,1,3,31.783737,0.0,0.0,1.0,1,0,47.5,False,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
801,00000712,00000712time20170630,0,55,1,5,35.736786,0.0,0.0,1.0,0,7,31.0,False,1,1,2
811,00000722,00000722time20180711,1,70,1,4,23.515755,0.0,0.0,0.0,1,0,43.0,False,1,1,2
810,00000722,00000722time20170712,0,69,1,4,23.000811,0.0,0.0,,1,0,30.0,False,1,1,2
886,00000798,00000798time20180404,1,72,2,2,21.399360,1.0,0.0,0.0,1,0,55.0,False,1,1,2


In [158]:
a = two_scan.groupby('pid').max()
a['lung_cancer'].value_counts()

False    263
True       6
Name: lung_cancer, dtype: int64

### Missing data imputation

In [114]:
# check for NaN
print(len(cohort_df[cohort_df['age'].isnull()]))
print(len(cohort_df[cohort_df['race'].isnull()]))
print(len(cohort_df[cohort_df['education'].isnull()]))
print(len(cohort_df[cohort_df['bmi'].isnull()])) # removed 2 cases without
print(len(cohort_df[cohort_df['copd'].isnull()]))
print(len(cohort_df[cohort_df['phist'].isnull()]))
print(len(cohort_df[cohort_df['fhist'].isnull()]))
print(len(cohort_df[cohort_df['smo_status'].isnull()]))
print(len(cohort_df[cohort_df['quit_time'].isnull()]))
print(len(cohort_df[cohort_df['pkyr'].isnull()]))
print(len(cohort_df[cohort_df['lung_cancer'].isnull()]))

0
0
0
0
111
203
689
0
0
0
0


In [143]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# one-hot encoding
features = ['age', 'race', 'education',  'bmi',  'copd', 'phist', 'fhist', 'smo_status', 'quit_time', 'pkyr']
label = ['lung_cancer']
features_df = cohort_df[features]
x = pd.get_dummies(features_df, columns=['race', 'education'])
enc_features = x.columns
# Multiple linear imputation
imp = IterativeImputer(max_iter=10, random_state=0)
imp.fit(x)
imp_x = pd.DataFrame(imp.transform(x), columns=enc_features)
imp_x[['copd', 'phist', 'fhist']] = imp_x[['copd', 'phist', 'fhist']].round()

In [144]:
# merge imputed values with cohort labels using an index join
cohort_enc = pd.merge(cohort_df[['pid', 'id', 'session', 'lung_cancer']], imp_x, left_index=True, right_index=True)
cohort_enc

Unnamed: 0,pid,id,session,lung_cancer,age,bmi,copd,phist,fhist,smo_status,...,race_4,race_5,race_6,education_0,education_1,education_2,education_3,education_4,education_5,education_6
0,00000010,00000010time20140214,0,False,65.0,26.125899,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,00000013,00000013time20140226,0,False,62.0,21.613634,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,00000014,00000014time20140226,0,False,62.0,21.143772,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,00000015,00000015time20140307,0,False,65.0,25.318819,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,00000016,00000016time20140307,0,False,55.0,25.970529,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1238,00000604,00000604time20180418,0,False,56.0,31.778015,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1239,00001128,00001128time20180531,0,False,58.0,30.987211,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1240,00000411,00000411time20180119,1,False,76.0,20.360669,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1241,00000459,00000459time20180425,1,False,66.0,21.235571,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [145]:
cohort_enc.columns

Index(['pid', 'id', 'session', 'lung_cancer', 'age', 'bmi', 'copd', 'phist',
       'fhist', 'smo_status', 'quit_time', 'pkyr', 'race_0', 'race_1',
       'race_2', 'race_3', 'race_4', 'race_5', 'race_6', 'education_0',
       'education_1', 'education_2', 'education_3', 'education_4',
       'education_5', 'education_6'],
      dtype='object')

### Logistic Regression

In [147]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# normalize by subtracting mean and dividing by std
scalars = ['age', 'bmi', 'quit_time', 'pkyr']
cohort_enc[scalars].min()
cohort_enc[scalars] = (cohort_enc[scalars] - cohort_enc[scalars].min())/(cohort_enc[scalars].max() - cohort_enc[scalars].min())
cohort_enc


Unnamed: 0,pid,id,session,lung_cancer,age,bmi,copd,phist,fhist,smo_status,...,race_4,race_5,race_6,education_0,education_1,education_2,education_3,education_4,education_5,education_6
0,00000010,00000010time20140214,0,False,0.56250,0.278996,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,00000013,00000013time20140226,0,False,0.46875,0.171854,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,00000014,00000014time20140226,0,False,0.46875,0.160697,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,00000015,00000015time20140307,0,False,0.56250,0.259832,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,00000016,00000016time20140307,0,False,0.25000,0.275306,1.0,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1238,00000604,00000604time20180418,0,False,0.28125,0.413203,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1239,00001128,00001128time20180531,0,False,0.34375,0.394425,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1240,00000411,00000411time20180119,1,False,0.90625,0.142103,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1241,00000459,00000459time20180425,1,False,0.59375,0.162877,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [151]:
# fit model
train, test = train_test_split(cohort_enc, test_size=0.2)
X_train, y_train = train[enc_features].to_numpy(), train[label].to_numpy().ravel()
lr = LogisticRegression(solver='sag', random_state=0).fit(X_train, y_train)


In [152]:
# score model with test set
X_test, y_test = test[enc_features].to_numpy(), test[label].to_numpy().ravel()
lr.predict(X_test)
lr.score(X_test, y_test)

0.9678714859437751

In [154]:
# parameters
lr.coef_

array([[ 0.33259813,  0.15753824,  0.02066335,  0.54713064,  0.34667221,
         0.08009621,  0.15217798,  0.43773478, -0.02187434,  0.57821158,
        -0.29348304, -0.1221164 , -0.08431111, -0.04283327, -0.01402931,
         0.70726862,  0.51222347, -0.40347247, -0.61785786,  0.1054228 ,
         0.14098999, -0.44501044]])