In [51]:
# Imports
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from os import listdir
from os.path import isfile, join
import matplotlib.pylab as plt
import os
import seaborn as sns
from tqdm import tqdm

from keras.applications import DenseNet121
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import Callback, ModelCheckpoint
from keras.preprocessing.image import ImageDataGenerator
from keras.initializers import Constant
from keras.models import Sequential
from keras.optimizers import Adam
from keras import layers

import os


# Any results you write to the current directory are saved as output.

Set the input path of the data for easy access when importing the files.

In [30]:
# Root folder of the competition data; every other path is built from it.
INPUT_PATH = '../input/rsna-intracranial-hemorrhage-detection/'
# Image sub-folders, relative to INPUT_PATH (trailing slash kept so they can be concatenated).
TRAIN_DIRECTORY = "stage_1_train_images/"
TEST_DIRECTORY = "stage_1_test_images/"

In [22]:
# Load the stage-1 training labels (one row per image/hemorrhage-type pair).
train_csv_path = INPUT_PATH + "stage_1_train.csv"
train_dataframe = pd.read_csv(train_csv_path)
train_dataframe.head()

Unnamed: 0,ID,Label
0,ID_63eb1e259_epidural,0
1,ID_63eb1e259_intraparenchymal,0
2,ID_63eb1e259_intraventricular,0
3,ID_63eb1e259_subarachnoid,0
4,ID_63eb1e259_subdural,0


In [4]:
# Label column as a NumPy array.
label = train_dataframe["Label"].values

Each `ID` value combines the image ID and the hemorrhage type (e.g. `ID_63eb1e259_epidural`). We need to split these apart so the data can be structured for a CNN.


Reformat the CSV

In [24]:
# Split every ID ("ID_<image>_<type>") once, then derive both new columns from the pieces.
id_parts = train_dataframe['ID'].map(lambda st: st.split('_'))
# Name of the converted PNG for the image this row belongs to.
train_dataframe['filename'] = id_parts.map(lambda parts: "ID_" + parts[1] + ".png")
# Hemorrhage type encoded as the third piece of the ID.
train_dataframe['type'] = id_parts.map(lambda parts: parts[2])

In [25]:
# Check that the new 'filename' and 'type' columns were added as expected.
train_dataframe.head()

Unnamed: 0,ID,Label,filename,type
0,ID_63eb1e259_epidural,0,ID_63eb1e259.png,epidural
1,ID_63eb1e259_intraparenchymal,0,ID_63eb1e259.png,intraparenchymal
2,ID_63eb1e259_intraventricular,0,ID_63eb1e259.png,intraventricular
3,ID_63eb1e259_subarachnoid,0,ID_63eb1e259.png,subarachnoid
4,ID_63eb1e259_subdural,0,ID_63eb1e259.png,subdural


We should pivot the data so that each row is one image and each hemorrhage type is its own label column, which is the layout a multi-label CNN expects.

In [27]:
# Keep only the columns needed for the pivot and drop exact duplicate rows first:
# DataFrame.pivot raises if an (index, column) pair occurs more than once.
label_rows = train_dataframe[['Label', 'filename', 'type']].drop_duplicates()
# One row per image file, one column per hemorrhage type, cell value = Label.
pivot_df = label_rows.pivot(index='filename', columns='type', values='Label')
# Turn 'filename' back into an ordinary column.
pivot_df = pivot_df.reset_index()
print(pivot_df.shape)
pivot_df.head()

(674258, 7)


type,filename,any,epidural,intraparenchymal,intraventricular,subarachnoid,subdural
0,ID_000039fa0.png,0,0,0,0,0,0
1,ID_00005679d.png,0,0,0,0,0,0
2,ID_00008ce3c.png,0,0,0,0,0,0
3,ID_0000950d7.png,0,0,0,0,0,0
4,ID_0000aee4b.png,0,0,0,0,0,0


In [7]:
# Build the training-image folder from the shared path constants instead of
# repeating the directory literal, so the location is defined in one place.
train_dir = INPUT_PATH + TRAIN_DIRECTORY
# Every entry in the folder; its length is the number of training images.
train_files = os.listdir(train_dir)
train_size = len(train_files)
train_size

674258

**Image Examples, loading images**

In [12]:
def _list_files(directory):
    """Return the names of the regular files located directly inside `directory`."""
    # os.scandir reports the entry type while iterating the directory, which
    # avoids one extra isfile() lookup per entry on these very large folders.
    with os.scandir(directory) as entries:
        return [entry.name for entry in entries if entry.is_file()]

# Training image directory and its files, built from the shared path constants.
train_images_directory = INPUT_PATH + TRAIN_DIRECTORY
train_images = _list_files(train_images_directory)
# Same for the test images.
test_images_directory = INPUT_PATH + TEST_DIRECTORY
test_images = _list_files(test_images_directory)
# Check some image filenames.
print('5 Training Image Files', train_images[:5] )

5 Training Image Files ['ID_ff816e9b6.dcm', 'ID_282a7f3cd.dcm', 'ID_f1c65b76e.dcm', 'ID_3b59681d3.dcm', 'ID_f506d79aa.dcm']


We need to do some preprocessing before we can feed the images into a neural network.

Source: https://www.kaggle.com/omission/eda-view-dicom-images-with-correct-windowing

We should apply the DICOM rescale and intensity window to each image, resize it, and convert it to PNG.



In [31]:
def window_image(img, window_center,window_width, intercept, slope, rescale=True):
    """Apply the DICOM linear rescale and an intensity window to a pixel array.

    Parameters
    ----------
    img : array-like
        Stored pixel values (e.g. ``dcm.pixel_array``). The input is not modified.
    window_center, window_width : int or float
        Centre and width of the intensity window, in rescaled units. The window
        spans ``window_center - window_width // 2`` to
        ``window_center + window_width // 2``.
    intercept, slope : int or float
        Linear rescale applied before windowing: ``img * slope + intercept``.
    rescale : bool, default True
        If True, map the clipped window linearly onto [0, 1].

    Returns
    -------
    numpy.ndarray
        float64 array with the same shape as ``img``: values clipped to the
        window, and additionally scaled to [0, 1] when ``rescale`` is True.
        A window whose bounds coincide yields all zeros when rescaling.
    """
    # Do the arithmetic in float64: in the stored integer dtype, unsigned pixels
    # combined with a negative intercept can overflow or raise, depending on the
    # NumPy version.
    img = np.asarray(img, dtype=np.float64) * slope + intercept

    img_min = window_center - window_width//2
    img_max = window_center + window_width//2
    img = np.clip(img, img_min, img_max)

    if rescale:
        # Extra rescaling to 0-1, not in the original notebook
        span = img_max - img_min
        if span == 0:
            # Degenerate window: every pixel equals the single bound, so avoid
            # dividing by zero and report a flat image instead of NaNs.
            return np.zeros_like(img)
        img = (img - img_min) / span

    return img
    
def get_first_of_dicom_field_as_int(x):
    """Convert a DICOM field value to int.

    A value whose type is exactly ``pydicom.multival.MultiValue`` contributes
    its first element; any other value is converted directly.
    """
    is_multi_valued = type(x) == pydicom.multival.MultiValue
    scalar = x[0] if is_multi_valued else x
    return int(scalar)

def get_windowing(data):
    """Read the windowing parameters from a DICOM dataset.

    Returns a list ``[window_center, window_width, intercept, slope]`` where
    each entry has been converted to int by ``get_first_of_dicom_field_as_int``.
    """
    tags = (
        ('0028','1050'),  # window center
        ('0028','1051'),  # window width
        ('0028','1052'),  # intercept
        ('0028','1053'),  # slope
    )
    # Look up every field before converting any of them.
    raw_values = [data[tag].value for tag in tags]
    return list(map(get_first_of_dicom_field_as_int, raw_values))

In [38]:
from tqdm import tqdm
import pydicom
import cv2
def save_and_resize(filenames, load_dir, save_dir='/kaggle/tmp/', size=(224, 224)):
    """Window, resize and save DICOM files as 8-bit PNG images.

    Parameters
    ----------
    filenames : iterable of str
        Names of the ``.dcm`` files to convert, relative to ``load_dir``.
    load_dir : str
        Directory containing the DICOM files.
    save_dir : str, default '/kaggle/tmp/'
        Directory the PNG files are written to; created if it does not exist.
    size : tuple of int, default (224, 224)
        Target size passed to ``cv2.resize``.

    Raises
    ------
    IOError
        If OpenCV reports that a PNG could not be written.
    """
    os.makedirs(save_dir, exist_ok=True)

    for filename in tqdm(filenames):
        path = os.path.join(load_dir, filename)
        new_path = os.path.join(save_dir, filename.replace('.dcm', '.png'))

        dcm = pydicom.dcmread(path)
        window_center, window_width, intercept, slope = get_windowing(dcm)
        # window_image returns the windowed pixels scaled to the [0, 1] range.
        img = window_image(dcm.pixel_array, window_center, window_width, intercept, slope)

        resized = cv2.resize(img, size)
        # cv2.imwrite converts a float image to 8-bit without scaling, so a
        # [0, 1] image would be stored as values 0 and 1 only (almost black).
        # Stretch it to the full 0-255 range before writing.
        png_pixels = np.clip(np.rint(resized * 255.0), 0, 255).astype(np.uint8)

        # imwrite signals failure through its return value rather than raising.
        if not cv2.imwrite(new_path, png_pixels):
            raise IOError('Could not write image to ' + new_path)

In [52]:
# This step is commented out to save time during the commit; converting every image takes a long time.
# TODO: Speed this up, or look at working directly with the DICOM images?

#save_and_resize(filenames=train_files, load_dir=INPUT_PATH + TRAIN_DIRECTORY)
#save_and_resize(filenames=os.listdir(INPUT_PATH + TEST_DIRECTORY), load_dir=INPUT_PATH + TEST_DIRECTORY)

In [47]:
# Let's try DenseNet-121 as the convolutional backbone: no pre-trained weights
# (weights=None), no fully-connected top, and 224x224 inputs with 3 channels.
densenet = DenseNet121(include_top=False, weights=None, input_shape=(224, 224, 3))

In [53]:
def build_dense_model():
    """Build and compile the multi-label hemorrhage classifier.

    The model stacks the module-level ``densenet`` backbone, a global average
    pooling layer, and a 6-unit sigmoid output layer (one unit per label column
    of the pivoted training table).

    Returns
    -------
    keras.models.Sequential
        The compiled model.
    """
    model = Sequential()
    model.add(densenet)
    model.add(layers.GlobalAveragePooling2D())
    # A bias of -5.5 makes the initial sigmoid outputs about 0.004, so training
    # starts from "no hemorrhage" predictions for every label.
    model.add(layers.Dense(6, activation='sigmoid',
                           bias_initializer=Constant(value=-5.5)))

    model.compile(
        # The six sigmoid outputs are independent yes/no labels, so each one needs
        # its own binary cross-entropy term. categorical_crossentropy assumes a
        # single one-hot target across the outputs and is wrong for this setup.
        loss='binary_crossentropy',
        optimizer=Adam(lr=0.001),
        metrics=['accuracy']
    )

    return model

In [54]:
# Instantiate the compiled model and print its layer-by-layer summary.
model = build_dense_model()
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
densenet121 (Model)          (None, 7, 7, 1024)        7037504   
_________________________________________________________________
global_average_pooling2d_1 ( (None, 1024)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 6150      
Total params: 7,043,654
Trainable params: 6,960,006
Non-trainable params: 83,648
_________________________________________________________________
