In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Sanity check: show the entries found directly under ../input.
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
# Locations of the competition inputs, all relative to the shared input folder.
data_path = '../input/'

# Folder holding one .dcm file per patient id.
images_path = os.path.join(data_path, 'stage_2_train_images')
# CSV tables describing the training examples.
labels_path = os.path.join(data_path, 'stage_2_train_labels.csv')
detailed_class_info_path = os.path.join(data_path, 'stage_2_detailed_class_info.csv')

In [None]:
# Load the label table. A patientId can appear on more than one row
# (parse_data groups repeated ids), so count distinct ids rather than rows
# to get the number of patients.
df = pd.read_csv(labels_path)
number_of_patients = df['patientId'].nunique()
print('# of patients', number_of_patients)

In [None]:
# Summary statistics of the label table.
df.describe()

In [None]:
# Preview the first rows of the label table.
df.head()

In [None]:
# Plain Python lists of the id and target columns, aligned row by row.
ids, labels = (df[column].tolist() for column in ('patientId', 'Target'))

In [None]:
# The first CSV column becomes the index, so the index values are the ids
# collected below.
detailed_class_df = pd.read_csv(detailed_class_info_path, index_col=0)

# One id list per detailed class, in the order the rows appear in the file.
not_normal_ids, healthy_ids, pnemonia_ids = (
    detailed_class_df.index[detailed_class_df['class'] == class_name].tolist()
    for class_name in ('No Lung Opacity / Not Normal', 'Normal', 'Lung Opacity')
)

In [None]:
# Tally how many rows carry each detailed class label
# (keys keep first-seen order).
summary = {}
for class_name in detailed_class_df['class']:
    summary[class_name] = summary.get(class_name, 0) + 1

print(summary)

In [None]:
import pydicom

def get_dicom_data(patient_id):
    """Read the DICOM file of one patient.

    Args:
        patient_id: id used as the file name (``<patient_id>.dcm``) inside
            ``images_path``.

    Returns:
        The dataset object produced by pydicom for that file.
    """
    dcm_file = images_path + '/%s.dcm' % patient_id
    # dcmread is the supported reader entry point; the older read_file alias
    # is deprecated and no longer present in pydicom 3.
    dcm_data = pydicom.dcmread(dcm_file)
    return dcm_data

def get_image_array(patient_id):
    """Return the pixel data of a patient's DICOM file as a NumPy array.

    The pixels are passed through ``np.array`` so the caller gets its own
    array object rather than the one held by the dataset.
    """
    pixels = get_dicom_data(patient_id).pixel_array
    return np.array(pixels)

In [None]:
def get_meta_data(patientId):
    """Collect a few per-patient fields from the DICOM file.

    Returns:
        dict with
        - 'age': PatientAge converted to int,
        - 'gender': PatientSex as stored in the file,
        - 'pixel_spacing': first PixelSpacing entry converted to float,
        - 'mean_black_pixels': fraction of pixels whose value is exactly 0.
    """
    dicom = get_dicom_data(patientId)

    age = int(dicom.PatientAge)
    gender = dicom.PatientSex
    spacing = float(dicom.PixelSpacing[0])
    # Mean of a boolean mask == share of True entries.
    black_share = np.mean(dicom.pixel_array == 0)

    return {
        'age': age,
        'gender': gender,
        'pixel_spacing': spacing,
        'mean_black_pixels': black_share,
    }

In [None]:
def extract_box(row):
    """Return the row's box as ``[x, y, height, width]`` (in that order)."""
    return [row[field] for field in ('x', 'y', 'height', 'width')]

def parse_data(df):
    """Group the rows of the label table by patient.

    Args:
        df: DataFrame with 'patientId' and 'Target' columns plus the columns
            read by ``extract_box``. A patient may occupy several rows.

    Returns:
        dict mapping patient id -> dict holding
        - 'label': the 'Target' value of the patient's first row,
        - 'boxes': one ``extract_box`` result per row when the label is 1,
          otherwise an empty list,
        - every key returned by ``get_meta_data`` for that patient.
    """
    parsed = dict()
    for _, row in df.iterrows():
        patient_id = row['patientId']

        # Always look the record up by id: relying on the variable left over
        # from the previous iteration attaches boxes to the wrong patient
        # whenever rows of one patient are not adjacent.
        patient_data = parsed.get(patient_id)
        if patient_data is None:
            # First row of this patient: read the DICOM metadata once.
            metadata = get_meta_data(patient_id)
            patient_data = {
                'label': row['Target'],
                'boxes': []
            }
            patient_data.update(metadata)
            parsed[patient_id] = patient_data

        if patient_data['label'] == 1:
            patient_data['boxes'].append(extract_box(row))

    return parsed

# Build the per-patient lookup from the label table. parse_data calls
# get_meta_data (which reads a DICOM file) the first time each id is seen.
parsed = parse_data(df)

In [None]:
def overlay_box(im, box, rgb, stroke=1):
    """Draw a rectangular outline onto ``im`` in place and return it.

    Args:
        im: array indexed as ``im[row, column]``; it is modified in place.
        box: sequence ``[x, y, height, width]``; each entry is truncated to
            int.
        rgb: value assigned to every outline pixel.
        stroke: outline thickness in pixels.

    Returns:
        The same ``im`` object that was passed in.
    """
    # --- Convert coordinates to integers
    x1, y1, height, width = (int(b) for b in box)

    # --- Derive the far corner
    y2 = y1 + height
    x2 = x1 + width

    im[y1:y1 + stroke, x1:x2] = rgb           # top edge
    im[y2:y2 + stroke, x1:x2] = rgb           # bottom edge
    im[y1:y2, x1:x1 + stroke] = rgb           # left edge
    # The right edge runs down to y2 + stroke so the bottom-right corner is
    # filled too; stopping at y2 leaves a stroke x stroke notch there.
    im[y1:y2 + stroke, x2:x2 + stroke] = rgb  # right edge incl. corner

    return im

def get_image_with_boxes(patient_id):
    """Load a patient's image and outline every box stored in ``parsed``.

    Each box gets its own randomly drawn value in [0, 255] and a 6-pixel
    stroke. Returns the image array after all boxes were drawn.
    """
    record = parsed[patient_id]
    canvas = get_image_array(patient_id)

    for bbox in record['boxes']:
        # One random draw per box, scaled to 0..255 and truncated to int.
        shade = np.floor(np.random.rand(1) * 256).astype('int')
        canvas = overlay_box(im=canvas, box=bbox, rgb=shade, stroke=6)

    return canvas
    

In [None]:
from matplotlib import pyplot as plt
import random

def draw_random_images(ids):
    """Show a 3x3 grid of images for ids picked at random from ``ids``.

    Each panel shows the image returned by ``get_image_with_boxes`` in a gray
    colormap, titled with its panel number. The panel number and the chosen
    id are also printed. Ids are drawn independently, so the same id can
    appear in more than one panel.

    Args:
        ids: non-empty sequence of patient ids to sample from.
    """
    cols = 3
    rows = 3
    fig, axes = plt.subplots(ncols=cols, nrows=rows, figsize=(20, 20))

    # Flatten the axes grid once instead of on every access.
    for i, panel in enumerate(axes.ravel()):
        random_id = random.choice(ids)
        print(i, random_id)
        image = get_image_with_boxes(random_id)
        # Name the colormap explicitly: plt.gray() returns None and only
        # worked by switching the global default colormap as a side effect.
        panel.imshow(image, cmap='gray')
        panel.set_title(i)
        panel.set_axis_off()
    plt.show()

In [None]:
# Random samples from the ids whose detailed class is 'Lung Opacity'.
draw_random_images(pnemonia_ids)

In [None]:
# Random samples from the ids whose detailed class is 'Normal'.
draw_random_images(healthy_ids)

In [None]:
# Random samples from the ids whose detailed class is
# 'No Lung Opacity / Not Normal'.
draw_random_images(not_normal_ids)

In [None]:
# TODO: remaining pipeline steps
# - explore
# - split
# - generator
# - load base model
# - create model
# - train
# - evaluate
