In [2]:
%load_ext autoreload
%autoreload 2

In [40]:
import math
import os
from glob import glob

import numpy as np
import pandas as pd

In [4]:
from xray import data, params, trainer, utils

In [5]:
csv_labels = "sample_labels.csv"
path_to_csv = os.path.join("../../raw_data/sample-data/", csv_labels)

In [6]:
df = data.get_data(
    path_to_csv,
)
df.head(3)

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImageWidth,OriginalImageHeight,OriginalImagePixelSpacing_x,OriginalImagePixelSpacing_y
0,00000013_005.png,Emphysema|Infiltration|Pleural_Thickening|Pneu...,5,13,060Y,M,AP,3056,2544,0.139,0.139
1,00000013_026.png,Cardiomegaly|Emphysema,26,13,057Y,M,AP,2500,2048,0.168,0.168
2,00000017_001.png,No Finding,1,17,077Y,M,AP,2500,2048,0.168,0.168


In [7]:
df["labels"] = df["Finding Labels"].map(lambda x: x.split("|"))
df.head(3)

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImageWidth,OriginalImageHeight,OriginalImagePixelSpacing_x,OriginalImagePixelSpacing_y,labels
0,00000013_005.png,Emphysema|Infiltration|Pleural_Thickening|Pneu...,5,13,060Y,M,AP,3056,2544,0.139,0.139,"[Emphysema, Infiltration, Pleural_Thickening, ..."
1,00000013_026.png,Cardiomegaly|Emphysema,26,13,057Y,M,AP,2500,2048,0.168,0.168,"[Cardiomegaly, Emphysema]"
2,00000017_001.png,No Finding,1,17,077Y,M,AP,2500,2048,0.168,0.168,[No Finding]


In [8]:
df = df[df["Finding Labels"] != "No Finding"]

In [10]:
path_to_png = "../../raw_data/sample-data/images"

In [11]:
utils.get_paths(df, path_to_png, return_relative=False)

In [12]:
from sklearn.preprocessing import MultiLabelBinarizer

In [13]:
mlb = MultiLabelBinarizer().fit(df.labels)
mlb_classes = mlb.classes_
mlb_classes.shape

(14,)

In [14]:
y = mlb.transform(df.labels).astype("int16")

In [15]:
df_train, df_val, df_test = data.split_df(
    df, "Patient ID", (0.65, 0.175, 0.175), total_filter=0.3
)

## `tf.data.Dataset`

In [19]:
import random
from glob import glob

import tensorflow as tf


def make_dataset(
    path, batch_size, filenames, label_array, img_size: tuple = (224, 224)
):
    """
    - path: root to image folders
    - batch_size: to iterate
    - filenames: nd.array with list of absolute paths (filenames), in same order as label_array
    - label_array: matching index as filenames
    """

    def parse_image(filename):
        image = tf.io.read_file(filename)
        image = tf.image.decode_jpeg(image, channels=3)
        image = tf.image.resize(image, img_size)
        return image

    def configure_for_performance(ds):
        ds = ds.shuffle(buffer_size=1000)
        ds = ds.batch(batch_size)
        ds = ds.repeat()
        ds = ds.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
        return ds

    # classes = os.listdir(path)
    # filenames = glob(path + "/*/*")
    filenames = filenames.unique()
    random.shuffle(filenames)
    # it = np.nditer(filenames, flags=['refs_ok', 'c_index'], )
    # for file in it:
    #     labels =
    # labels = [classes.index(name.split("/")[-2]) for name in filenames]
    labels = y

    filenames_ds = tf.data.Dataset.from_tensor_slices(filenames)
    images_ds = filenames_ds.map(
        parse_image, num_parallel_calls=tf.data.experimental.AUTOTUNE
    )
    labels_ds = tf.data.Dataset.from_tensor_slices(labels)
    ds = tf.data.Dataset.zip((images_ds, labels_ds))
    ds = configure_for_performance(ds)

    return ds

In [27]:
ds_train = make_dataset(path_to_png, 32, df_train.path, y)
ds_val = make_dataset(path_to_png, 32, df_val.path, y)

In [23]:
classes_dict = pd.DataFrame(mlb.classes_).to_dict()[0]
classes = mlb.classes_
num_images = df.shape[0]

In [28]:
model = trainer.Trainer(ds_train, ds_val, "multilabel")

2021-10-18 00:31:59.810600: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 411041792 exceeds 10% of free system memory.
2021-10-18 00:31:59.961941: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 411041792 exceeds 10% of free system memory.
2021-10-18 00:32:00.022862: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 411041792 exceeds 10% of free system memory.
2021-10-18 00:32:00.135648: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 67108864 exceeds 10% of free system memory.
2021-10-18 00:32:00.165154: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 67108864 exceeds 10% of free system memory.


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels.h5


In [29]:
img_size = (224, 224)

In [31]:
model.build_cnn(
    input_shape=img_size,
    output_shape=len(classes),
    dense_layer_geometry=(1024, 512, 256),
    dropout_layers=True,
    dropout_rate=0.25,
)

In [33]:
model.pipeline.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
vgg16 (Functional)           (None, 7, 7, 512)         14714688  
_________________________________________________________________
flatten_1 (Flatten)          (None, 25088)             0         
_________________________________________________________________
dense_4 (Dense)              (None, 1024)              25691136  
_________________________________________________________________
dropout_3 (Dropout)          (None, 1024)              0         
_________________________________________________________________
dense_5 (Dense)              (None, 512)               524800    
_________________________________________________________________
dropout_4 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 256)              

In [38]:
model.compile_model()

In [48]:
# model.fit(dataset, batch_size=32, epochs=5, steps_per_epoch=math.ceil(num_images/32))
batch_size = 32
epochs = 5
training_images = df_train.shape[0]
steps_per_epoch = math.ceil(training_images / 32)

validation_images = df_train.shape[0]
validation_steps = math.ceil(validation_images / 32)

In [49]:
steps_per_epoch

16

In [50]:
model.fit_model(
    epochs=epochs,
    batch_size=batch_size,
    steps_per_epoch=steps_per_epoch,
    validation_steps=validation_steps,
)

Epoch 1/5

Epoch 00001: val_loss improved from inf to 0.39249, saving model to best_weights.hdf5
Epoch 2/5

Epoch 00002: val_loss did not improve from 0.39249
Epoch 3/5

Epoch 00003: val_loss improved from 0.39249 to 0.38338, saving model to best_weights.hdf5
Epoch 4/5

Epoch 00004: val_loss did not improve from 0.38338
Epoch 5/5

Epoch 00005: val_loss did not improve from 0.38338


<keras.callbacks.History at 0x7f4a6dcb3c70>

# Aux

In [None]:
filenames[0:10]

In [None]:
df[df["Image Index"] == "00010162_000.png"].index[0]

In [None]:
for path in filenames[0:10]:
    name = path.split("/")[-1]
    idx = df[df["Image Index"] == name].index[0]
    label = y[idx]
    print(idx)
    print(label)

In [None]:
labels = [df[df["Image Index"] == path.split("/")[-1]].index[0] for path in filenames]

In [None]:
len(df["Image Index"])

In [None]:
y.shape

In [None]:
IMG_SIZE = 224

In [None]:
os.listdir(path_to_png)

In [None]:
df.drop(
    columns=[
        "Follow-up #",
        "Patient Age",
        "Patient Gender",
        "View Position",
        "OriginalImagePixelSpacing_x",
        "OriginalImagePixelSpacing_y",
        "OriginalImageWidth",
        "OriginalImageHeight",
        "OriginalImage[Width",
        "Height]",
        "OriginalImagePixelSpacing[x",
        "y]",
        "Count_diseases",
    ],
    inplace=True,
    errors="ignore",
)
df.head(3)