This kernel is a fork from Jonne's kernel (https://www.kaggle.com/jonnedtc/cnn-segmentation-connected-components). Jonne creates a submission using a convolutional neural network. However, Jonne does not use any DICOM data for the prediction. I am creating this kernel to improve on Jonne's predictions by using the DICOM data. The model I am using is LightGBM, since it is fast, often accurate, and reliable.

This kernel is also a fork from jtlowery's kernel (https://www.kaggle.com/jtlowery/intro-eda-with-dicom-metadata). Jtlowery's kernel has functions I can copy to read in the DICOM data.[](http://)

In [1]:
from functools import partial
from collections import defaultdict
import pydicom
import os
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

np.warnings.filterwarnings('ignore')

In [2]:
DATA_DIR = '../pneumonia_data'
IMAGE_DIR = '../pneumonia_data/test_images'
labels = pd.read_csv(DATA_DIR+'/stage_1_train_labels.csv')
details = pd.read_csv(DATA_DIR+'/stage_1_detailed_class_info.csv')
# duplicates in details just have the same class so can be safely dropped
details = details.drop_duplicates('patientId').reset_index(drop=True)
labels_w_class = labels.merge(details, how='inner', on='patientId')

In [3]:
# get lists of all train/test dicom filepaths
train_dcm_fps = glob.glob(IMAGE_DIR+'/*.dcm')
test_dcm_fps = glob.glob(IMAGE_DIR+'/*.dcm')

train_dcms = [pydicom.read_file(x, stop_before_pixels=True) for x in train_dcm_fps]
test_dcms = [pydicom.read_file(x, stop_before_pixels=True) for x in test_dcm_fps]

In [4]:
def parse_dcm_metadata(dcm):
    unpacked_data = {}
    group_elem_to_keywords = {}
    # iterating here to force conversion from lazy RawDataElement to DataElement
    for d in dcm:
        pass
    # keys are pydicom.tag.BaseTag, values are pydicom.dataelem.DataElement
    for tag, elem in dcm.items():
        tag_group = tag.group
        tag_elem = tag.elem
        keyword = elem.keyword
        group_elem_to_keywords[(tag_group, tag_elem)] = keyword
        value = elem.value
        unpacked_data[keyword] = value
    return unpacked_data, group_elem_to_keywords

train_meta_dicts, tag_to_keyword_train = zip(*[parse_dcm_metadata(x) for x in train_dcms])
test_meta_dicts, tag_to_keyword_test = zip(*[parse_dcm_metadata(x) for x in test_dcms])

In [5]:
# join all the dicts
unified_tag_to_key_train = {k:v for dict_ in tag_to_keyword_train for k,v in dict_.items()}
unified_tag_to_key_test = {k:v for dict_ in tag_to_keyword_test for k,v in dict_.items()}

# quick check to make sure there are no different keys between test/train
assert len(set(unified_tag_to_key_test.keys()).symmetric_difference(set(unified_tag_to_key_train.keys()))) == 0

tag_to_key = {**unified_tag_to_key_test, **unified_tag_to_key_train}
tag_to_key

{(8, 5): 'SpecificCharacterSet',
 (8, 22): 'SOPClassUID',
 (8, 24): 'SOPInstanceUID',
 (8, 32): 'StudyDate',
 (8, 48): 'StudyTime',
 (8, 80): 'AccessionNumber',
 (8, 96): 'Modality',
 (8, 100): 'ConversionType',
 (8, 144): 'ReferringPhysicianName',
 (8, 4158): 'SeriesDescription',
 (16, 16): 'PatientName',
 (16, 32): 'PatientID',
 (16, 48): 'PatientBirthDate',
 (16, 64): 'PatientSex',
 (16, 4112): 'PatientAge',
 (24, 21): 'BodyPartExamined',
 (24, 20737): 'ViewPosition',
 (32, 13): 'StudyInstanceUID',
 (32, 14): 'SeriesInstanceUID',
 (32, 16): 'StudyID',
 (32, 17): 'SeriesNumber',
 (32, 19): 'InstanceNumber',
 (32, 32): 'PatientOrientation',
 (40, 2): 'SamplesPerPixel',
 (40, 4): 'PhotometricInterpretation',
 (40, 16): 'Rows',
 (40, 17): 'Columns',
 (40, 48): 'PixelSpacing',
 (40, 256): 'BitsAllocated',
 (40, 257): 'BitsStored',
 (40, 258): 'HighBit',
 (40, 259): 'PixelRepresentation',
 (40, 8464): 'LossyImageCompression',
 (40, 8468): 'LossyImageCompressionMethod'}

In [6]:
# using from_records here since some values in the dicts will be iterables and some are constants
train_df = pd.DataFrame.from_records(data=train_meta_dicts)
test_df = pd.DataFrame.from_records(data=test_meta_dicts)
train_df['dataset'] = 'train'
test_df['dataset'] = 'test'
df = pd.concat([train_df, test_df])

In [7]:
df.head(1)

Unnamed: 0,AccessionNumber,BitsAllocated,BitsStored,BodyPartExamined,Columns,ConversionType,HighBit,InstanceNumber,LossyImageCompression,LossyImageCompressionMethod,...,SeriesDescription,SeriesInstanceUID,SeriesNumber,SpecificCharacterSet,StudyDate,StudyID,StudyInstanceUID,StudyTime,ViewPosition,dataset
0,,8,8,CHEST,1024,WSD,7,1,1,ISO_10918_1,...,view: AP,1.2.276.0.7230010.3.1.3.8323329.20023.15178744...,1,ISO_IR 100,19010101,,1.2.276.0.7230010.3.1.2.8323329.20023.15178744...,0.0,AP,train


In [8]:
# separating PixelSpacing list to single values
df['PixelSpacing_x'] = df['PixelSpacing'].apply(lambda x: x[0])
df['PixelSpacing_y'] = df['PixelSpacing'].apply(lambda x: x[1])
df = df.drop(['PixelSpacing'], axis='columns')

# x and y are always the same
assert sum(df['PixelSpacing_x'] != df['PixelSpacing_y']) == 0

In [9]:
# ReferringPhysicianName appears to just be empty strings
assert sum(df['ReferringPhysicianName'] != '') == 0

# SeriesDescription appears to be 'view: {}'.format(ViewPosition)
set(df['SeriesDescription'].unique())

# so these two columns don't have any useful info and can be safely dropped

{'view: AP', 'view: PA'}

In [10]:
nunique_all = df.aggregate('nunique')
nunique_all

AccessionNumber                   1
BitsAllocated                     1
BitsStored                        1
BodyPartExamined                  1
Columns                           1
ConversionType                    1
HighBit                           1
InstanceNumber                    1
LossyImageCompression             1
LossyImageCompressionMethod       1
Modality                          1
PatientAge                       79
PatientBirthDate                  1
PatientID                      1000
PatientName                    2000
PatientOrientation                1
PatientSex                        2
PhotometricInterpretation         1
PixelRepresentation               1
ReferringPhysicianName          875
Rows                              1
SOPClassUID                       1
SOPInstanceUID                 1000
SamplesPerPixel                   1
SeriesDescription                 2
SeriesInstanceUID              1000
SeriesNumber                      1
SpecificCharacterSet        

In [11]:
# drop constant cols and other two from above
#ReferringPhysicianName is all ''
#PatientName is the same as PatientID
#PixelSpacing_y is the same as PixelSpacing_x
#The series and SOP UID's are just random numbers / id's, so I'm deleting them too
df = df.drop(nunique_all[nunique_all == 1].index.tolist() + ['SeriesDescription', 'ReferringPhysicianName', 'PatientName', 'PixelSpacing_y', 'SOPInstanceUID','SeriesInstanceUID','StudyInstanceUID'], axis='columns')

# now that we have a clean metadata dataframe we can merge back to our initial tabular data with target and class info
df = df.merge(labels_w_class, how='left', left_on='PatientID', right_on='patientId')

df['PatientAge'] = df['PatientAge'].astype(int)

In [12]:
# df now has multiple rows for some patients (those with multiple bounding boxes in label_w_class)
# so creating one with no duplicates for patients
df_deduped = df.drop_duplicates('PatientID', keep='first')

In [13]:
df_deduped.head()

Unnamed: 0,PatientAge,PatientID,PatientSex,ViewPosition,dataset,PixelSpacing_x,patientId,x,y,width,height,Target,class
0,19,000924cf-0f8d-42bd-9158-1af53881a557,F,AP,train,0.139,,,,,,,
1,25,000db696-cf54-4385-b10b-6b16fbb3f985,F,AP,train,0.168,,,,,,,
2,40,000fe35a-2649-43d4-b027-e67796d412e0,M,AP,train,0.171,,,,,,,
3,57,001031d9-f904-4a23-b3e5-2c088acd19c6,M,PA,train,0.139,,,,,,,
4,56,0010f549-b242-4e94-87a8-57d79de215fc,M,PA,train,0.194311,,,,,,,


In [14]:
#Correct ages that are mistyped
df_deduped.loc[df_deduped['PatientAge'] > 140, 'PatientAge'] = df_deduped.loc[df_deduped['PatientAge'] > 140, 'PatientAge'] - 100

In [15]:
#Convert binary features from categorical to 0/1
# Categorical features with Binary encode (0 or 1; two categories)
for bin_feature in ['PatientSex', 'ViewPosition']:
    df_deduped[bin_feature], uniques = pd.factorize(df_deduped[bin_feature])

In [16]:
#Drop the duplicated column patientID
del df_deduped['patientId']

#Drop columns that are going to be repetitive
del df_deduped['dataset']

In [17]:
df_deduped.head()

Unnamed: 0,PatientAge,PatientID,PatientSex,ViewPosition,PixelSpacing_x,x,y,width,height,Target,class
0,19,000924cf-0f8d-42bd-9158-1af53881a557,0,0,0.139,,,,,,
1,25,000db696-cf54-4385-b10b-6b16fbb3f985,0,0,0.168,,,,,,
2,40,000fe35a-2649-43d4-b027-e67796d412e0,1,0,0.171,,,,,,
3,57,001031d9-f904-4a23-b3e5-2c088acd19c6,1,1,0.139,,,,,,
4,56,0010f549-b242-4e94-87a8-57d79de215fc,1,1,0.194311,,,,,,


Now that we have a data frame that links PatientID to DICOM data, let's merge this with train and the submission file.

In [18]:
#TODO: Input my on prediction

jonneoofs = pd.read_csv("../input/jonneoofs/jonne_oofs.csv")
jonneoofs = jonneoofs.sort_values('patientID').reset_index(drop=True)
andyharless_sub = pd.read_csv("../input/andyharless/submission (7).csv")

FileNotFoundError: File b'../input/jonneoofs/jonne_oofs.csv' does not exist

In [None]:
labels.head() #The real train

In [None]:
jonneoofs.head() #The oofs from Jonne's kernel

In [None]:
andyharless_sub.head() # The submission from Andy Harless, which is a fork from Jonne

In [None]:
jonneoofs['i_am_train'] = 1
andyharless_sub['i_am_train'] = 0
tr_te = jonneoofs.append(andyharless_sub)

In [None]:
del tr_te['confidence'] #Not used in grading

In [None]:
tr_te.columns = ['PatientID','x_guess','y_guess','width_guess','height_guess','i_am_train']
tr_te.head()

In [None]:
df_deduped.head()

In [None]:
merged_df = tr_te.merge(df_deduped, how='left', on='PatientID')
merged_df.head()

In [None]:
filledmerged_df = merged_df.fillna(-1) #Fill in missings

# Predict for x

In [None]:
import lightgbm as lgb
from sklearn.model_selection import KFold, StratifiedKFold

train_df = filledmerged_df[filledmerged_df['i_am_train']==1]
test_df = filledmerged_df[filledmerged_df['i_am_train']==0]
             
#Cross validate with K Fold, 5 splits
folds = KFold(n_splits= 5, shuffle=True, random_state=2222)

# Create arrays and dataframes to store results
oof_preds = np.zeros(train_df.shape[0])
sub_preds = np.zeros(test_df.shape[0])
             
feats = [f for f in train_df.columns if f not in ['PatientID', 'i_am_train', 'x','y','width','height','Target','class']]

for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats])):
    dtrain = lgb.Dataset(data=train_df[feats].iloc[train_idx], 
                         label=train_df['x'].iloc[train_idx], 
                         free_raw_data=False, silent=True)
    dvalid = lgb.Dataset(data=train_df[feats].iloc[valid_idx], 
                         label=train_df['x'].iloc[valid_idx], 
                         free_raw_data=False, silent=True)

    params = {
        'objective': 'regression',
        'boosting_type': 'gbdt',
        'nthread': 4,
        'learning_rate': 0.10, 
        'max_depth': 2,
        #'reg_alpha': 0,
        #'reg_lambda': 0,
        #'min_split_gain': 0.0222415,
        'seed': 15000,
        'verbose': 50,
        'metric': 'l2',
    }

    clf = lgb.train(
        params=params,
        train_set=dtrain,
        num_boost_round=1000,
        valid_sets=[dtrain, dvalid],
        early_stopping_rounds=50,
        verbose_eval=True
    )

    oof_preds[valid_idx] = clf.predict(dvalid.data)
    sub_preds += clf.predict(test_df[feats]) / folds.n_splits


In [None]:
xpreds_oof = oof_preds.copy()
xpreds_sub = sub_preds.copy()

# Predict for y

In [None]:
import lightgbm as lgb
from sklearn.model_selection import KFold, StratifiedKFold

train_df = filledmerged_df[filledmerged_df['i_am_train']==1]
test_df = filledmerged_df[filledmerged_df['i_am_train']==0]
             
#Cross validate with K Fold, 5 splits
folds = KFold(n_splits= 5, shuffle=True, random_state=2222)

# Create arrays and dataframes to store results
oof_preds = np.zeros(train_df.shape[0])
sub_preds = np.zeros(test_df.shape[0])
             
feats = [f for f in train_df.columns if f not in ['PatientID', 'i_am_train', 'x','y','width','height','Target','class']]

for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats])):
    dtrain = lgb.Dataset(data=train_df[feats].iloc[train_idx], 
                         label=train_df['y'].iloc[train_idx], 
                         free_raw_data=False, silent=True)
    dvalid = lgb.Dataset(data=train_df[feats].iloc[valid_idx], 
                         label=train_df['y'].iloc[valid_idx], 
                         free_raw_data=False, silent=True)

    params = {
        'objective': 'regression',
        'boosting_type': 'gbdt',
        'nthread': 4,
        'learning_rate': 0.10, 
        'max_depth': 2,
        #'reg_alpha': 0,
        #'reg_lambda': 0,
        #'min_split_gain': 0.0222415,
        'seed': 15000,
        'verbose': 50,
        'metric': 'l2',
    }

    clf = lgb.train(
        params=params,
        train_set=dtrain,
        num_boost_round=1000,
        valid_sets=[dtrain, dvalid],
        early_stopping_rounds=50,
        verbose_eval=True
    )

    oof_preds[valid_idx] = clf.predict(dvalid.data)
    sub_preds += clf.predict(test_df[feats]) / folds.n_splits


In [None]:
ypreds_oof = oof_preds.copy()
ypreds_sub = sub_preds.copy()

# Predict for width

In [None]:
import lightgbm as lgb
from sklearn.model_selection import KFold, StratifiedKFold

train_df = filledmerged_df[filledmerged_df['i_am_train']==1]
test_df = filledmerged_df[filledmerged_df['i_am_train']==0]
             
#Cross validate with K Fold, 5 splits
folds = KFold(n_splits= 5, shuffle=True, random_state=2222)

# Create arrays and dataframes to store results
oof_preds = np.zeros(train_df.shape[0])
sub_preds = np.zeros(test_df.shape[0])
             
feats = [f for f in train_df.columns if f not in ['PatientID', 'i_am_train', 'x','y','width','height','Target','class']]

for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats])):
    dtrain = lgb.Dataset(data=train_df[feats].iloc[train_idx], 
                         label=train_df['width'].iloc[train_idx], 
                         free_raw_data=False, silent=True)
    dvalid = lgb.Dataset(data=train_df[feats].iloc[valid_idx], 
                         label=train_df['width'].iloc[valid_idx], 
                         free_raw_data=False, silent=True)

    params = {
        'objective': 'regression',
        'boosting_type': 'gbdt',
        'nthread': 4,
        'learning_rate': 0.10, 
        'max_depth': 2,
        #'reg_alpha': 0,
        #'reg_lambda': 0,
        #'min_split_gain': 0.0222415,
        'seed': 15000,
        'verbose': 50,
        'metric': 'l2',
    }

    clf = lgb.train(
        params=params,
        train_set=dtrain,
        num_boost_round=1000,
        valid_sets=[dtrain, dvalid],
        early_stopping_rounds=50,
        verbose_eval=True
    )

    oof_preds[valid_idx] = clf.predict(dvalid.data)
    sub_preds += clf.predict(test_df[feats]) / folds.n_splits


In [None]:
widthpreds_oof = oof_preds.copy()
widthpreds_sub = sub_preds.copy()

# Predict for height

In [None]:
import lightgbm as lgb
from sklearn.model_selection import KFold, StratifiedKFold

train_df = filledmerged_df[filledmerged_df['i_am_train']==1]
test_df = filledmerged_df[filledmerged_df['i_am_train']==0]
             
#Cross validate with K Fold, 5 splits
folds = KFold(n_splits= 5, shuffle=True, random_state=2222)

# Create arrays and dataframes to store results
oof_preds = np.zeros(train_df.shape[0])
sub_preds = np.zeros(test_df.shape[0])
             
feats = [f for f in train_df.columns if f not in ['PatientID', 'i_am_train', 'x','y','width','height','Target','class']]

for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats])):
    dtrain = lgb.Dataset(data=train_df[feats].iloc[train_idx], 
                         label=train_df['height'].iloc[train_idx], 
                         free_raw_data=False, silent=True)
    dvalid = lgb.Dataset(data=train_df[feats].iloc[valid_idx], 
                         label=train_df['height'].iloc[valid_idx], 
                         free_raw_data=False, silent=True)

    params = {
        'objective': 'regression',
        'boosting_type': 'gbdt',
        'nthread': 4,
        'learning_rate': 0.10, 
        'max_depth': 2,
        #'reg_alpha': 0,
        #'reg_lambda': 0,
        #'min_split_gain': 0.0222415,
        'seed': 15000,
        'verbose': 50,
        'metric': 'l2',
    }

    clf = lgb.train(
        params=params,
        train_set=dtrain,
        num_boost_round=1000,
        valid_sets=[dtrain, dvalid],
        early_stopping_rounds=50,
        verbose_eval=True
    )

    oof_preds[valid_idx] = clf.predict(dvalid.data)
    sub_preds += clf.predict(test_df[feats]) / folds.n_splits


In [None]:
heightpreds_oof = oof_preds.copy()
heightpreds_sub = sub_preds.copy()

# Remove any boxes below a threshold

In [None]:
# What is the number of rows where we have a box?
train_df.loc[train_df['x'] > -1]['x'].shape[0] / train_df.shape[0]

0.22 rows have a box, so now let's cull our predictions until only there is 0.22

In [None]:
train_df['xpredsoof'] = xpreds_oof
train_df['ypredsoof'] = ypreds_oof
train_df['widthpredsoof'] = widthpreds_oof
train_df['heightpredsoof'] = heightpreds_oof

In [None]:
train_df.loc[train_df['widthpredsoof'] <= 100]

In [None]:
#train_df.loc[(train_df['xpredsoof'] > 130) & (train_df['ypredsoof'] > 134)].shape[0] / train_df.shape[0]
train_df.loc[(train_df['widthpredsoof'] > 100)].shape[0] / train_df.shape[0]

In [None]:
andyharless_sub['xpred'] = xpreds_sub
andyharless_sub['ypred'] = ypreds_sub
andyharless_sub['widthpred'] = widthpreds_sub
andyharless_sub['heightpred'] = heightpreds_sub

andyharless_sub['xpred'] = andyharless_sub['xpred'].round()
andyharless_sub['ypred'] = andyharless_sub['ypred'].round()
andyharless_sub['widthpred'] = andyharless_sub['widthpred'].round()
andyharless_sub['heightpred'] = andyharless_sub['heightpred'].round()

In [None]:
#andyharless_sub.loc[andyharless_sub['widthpred'] <= 100, 'xpred'] = ''
#andyharless_sub.loc[andyharless_sub['widthpred'] <= 100, 'ypred'] = ''
#andyharless_sub.loc[andyharless_sub['widthpred'] <= 100, 'heightpred'] = ''
#andyharless_sub.loc[andyharless_sub['widthpred'] <= 100, 'widthpred'] = ''
andyharless_sub['confidence'] = '1'

In [None]:
andyharless_sub.head()

In [None]:
#del andyharless_sub['x']
#del andyharless_sub['y']
#del andyharless_sub['width']
#del andyharless_sub['height']
#del andyharless_sub['i_am_train']

In [None]:
andyharless_sub['PredictionString'] = andyharless_sub['confidence'].map(str)+' '+andyharless_sub['xpred'].map(str)+' '+andyharless_sub['ypred'].map(str)+' '+andyharless_sub['widthpred'].map(str)+' '+andyharless_sub['heightpred'].map(str)

In [None]:
andyharless_sub.loc[andyharless_sub['PredictionString']=='1    ', 'PredictionString'] = '' #Correct empties

In [None]:
andyharless_sub.loc[andyharless_sub['x'].isnull(), 'PredictionString'] = '' #Remove boxes if we predicted there were none

In [None]:
andyharless_sub[['patientID','PredictionString']].to_csv('dicom_corrections.csv', index=False)