In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import math
import os
from glob import glob

import numpy as np
import pandas as pd

In [None]:
# Uncomment if on Collab or VM.

# !pip install git+https://github.com/Giovita/xray-exam-diagnosis-cnn.git

# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
from xray import data, params, trainer, utils

In [None]:
csv_labels = "sample_labels.csv"
path_to_csv = os.path.join("../../raw_data/sample-data/", csv_labels)

In [None]:
filename = "xray_df.csv"
df = data.get_data(
    path_to_csv,
)
# df.head(3)

In [None]:
df = data.get_data_from_gcp(filename)
df.head(3)

In [None]:
df["labels"] = df["Fixed_Labels"].map(lambda x: x.split("|"))
# df.head(3)

In [None]:
df = df[df["Enfermo"] != True]

In [None]:
path_to_png = "../../raw_data/sample-data/images"

In [None]:
utils.get_paths(df, path_to_png, return_relative=False)

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
mlb = MultiLabelBinarizer().fit(df.labels)
mlb_classes = mlb.classes_
mlb_classes.shape

In [None]:
y = mlb.transform(df.labels).astype("int16")

In [None]:
df_train, df_val, df_test = data.split_df(
    df, "Patient ID", (0.65, 0.175, 0.175), total_filter=0.3
)

#### `tf.data.Dataset`

In [None]:
import random
from glob import glob

import tensorflow as tf


def make_dataset(
    path, batch_size, filenames, label_array, img_size: tuple = (224, 224)
):
    """
    - path: root to image folders
    - batch_size: to iterate
    - filenames: nd.array with list of absolute paths (filenames), in same order as label_array
    - label_array: matching index as filenames
    """

    def parse_image(filename):
        image = tf.io.read_file(filename)
        image = tf.image.decode_jpeg(image, channels=3)
        image = tf.image.resize(image, img_size)
        return image

    def configure_for_performance(ds):
        ds = ds.shuffle(buffer_size=1000)
        ds = ds.batch(batch_size)
        ds = ds.repeat()
        ds = ds.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
        return ds

    # classes = os.listdir(path)
    # filenames = glob(path + "/*/*")
    filenames = filenames.unique()
    random.shuffle(filenames)
    # it = np.nditer(filenames, flags=['refs_ok', 'c_index'], )
    # for file in it:
    #     labels =
    # labels = [classes.index(name.split("/")[-2]) for name in filenames]
    labels = y

    filenames_ds = tf.data.Dataset.from_tensor_slices(filenames)
    images_ds = filenames_ds.map(
        parse_image, num_parallel_calls=tf.data.experimental.AUTOTUNE
    )
    labels_ds = tf.data.Dataset.from_tensor_slices(labels)
    ds = tf.data.Dataset.zip((images_ds, labels_ds))
    ds = configure_for_performance(ds)

    return ds

In [None]:
# # y = y.tolist()
# df_train = df_train.path.to_list()
# df_val = df_val.path.to_list()
# df_test = df_test.path.to_list()

ds_train = data.make_dataset(path_to_png, 32, df_train, y)
ds_val = data.make_dataset(path_to_png, 32, df_val, y)

In [None]:
# ds_train = data.make_dataset(path_to_png, 32, df_train.path, y)
# ds_val = data.make_dataset(path_to_png, 32, df_val.path, y)

In [None]:
classes_dict = pd.DataFrame(mlb.classes_).to_dict()[0]
classes = mlb.classes_
num_images = df.shape[0]

In [None]:
model = trainer.Trainer(ds_train, ds_val, "multilabel")

In [None]:
img_size = (224, 224)

In [None]:
model.build_cnn(
    input_shape=img_size,
    output_shape=len(classes),
    dense_layer_geometry=(1024, 512, 256),
    dropout_layers=True,
    dropout_rate=0.25,
)

In [None]:
model.pipeline.summary()

In [None]:
model.compile_model()

In [None]:
# model.fit(dataset, batch_size=32, epochs=5, steps_per_epoch=math.ceil(num_images/32))
batch_size = 32
epochs = 3
training_images = len(df_train)
steps_per_epoch = math.ceil(training_images / 32)

validation_images = len(df_val)
validation_steps = math.ceil(validation_images / 32)

In [None]:
steps_per_epoch

In [None]:
model.fit_model(
    epochs=epochs,
    batch_size=batch_size,
    steps_per_epoch=steps_per_epoch,
    validation_steps=validation_steps,
)

# Test GCP_BUCKETS

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import math
import os
from glob import glob

import numpy as np
import pandas as pd

In [3]:
# Uncomment if on Collab or VM.

# !pip install git+https://github.com/Giovita/xray-exam-diagnosis-cnn.git

# from google.colab import drive
# drive.mount('/content/drive')

In [4]:
from xray import data, params, trainer, utils

In [5]:

# df = data.get_data(
#     path_to_csv,
# )
# # df.head(3)

In [7]:
filename = 'xray_df.csv'
df = data.get_data_from_gcp(filename)
df.head(3)

Unnamed: 0,Image Index,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],...,Infiltration,Mass,No Finding,Nodule,Pleural_Thickening,Pneumonia,Pneumothorax,Fixed_Labels,path,cat_col
0,00000001_000.png,0,1,58,M,PA,2682,2749,0.143,0.143,...,0,0,0,0,0,0,0,Cardiomegaly,/content/drive/MyDrive/Proyecto_Lewagon_Rayos_...,['Cardiomegaly']
1,00000001_001.png,1,1,58,M,PA,2894,2729,0.143,0.143,...,0,0,0,0,0,0,0,Cardiomegaly|Emphysema,/content/drive/MyDrive/Proyecto_Lewagon_Rayos_...,"['Cardiomegaly', 'Emphysema']"
2,00000001_002.png,2,1,58,M,PA,2500,2048,0.168,0.168,...,0,0,0,0,0,0,0,Cardiomegaly|Effusion,/content/drive/MyDrive/Proyecto_Lewagon_Rayos_...,"['Cardiomegaly', 'Effusion']"


In [22]:
# df["labels"] = df["Finding Labels"].map(lambda x: x.split("|"))
# df.head(3)

In [8]:
df = df[df["Fixed_Labels"] != "No Finding"]
df.shape

(51751, 30)

In [9]:
df.path[0]

'/content/drive/MyDrive/Proyecto_Lewagon_Rayos_X/images_001/images/00000001_000.png'

In [10]:
df['path'] = df.path.map(lambda x: "/".join(x.split("/")[-3:]))

In [11]:
df.path[0]

'images_001/images/00000001_000.png'

In [12]:
path_to_png = params.GCP_IMAGE_BUCKET

In [13]:
# utils.get_paths(df, path_to_png, overwrite_path=True)

In [14]:
df.path.nunique()

51751

In [15]:
# utils.get_paths(df, path_to_png, return_relative=False)

In [16]:
from sklearn.preprocessing import MultiLabelBinarizer

In [17]:
df["labels"] = df["Fixed_Labels"].map(lambda x: x.split("|"))

In [18]:
mlb = MultiLabelBinarizer().fit(df.labels)
mlb_classes = mlb.classes_
mlb_classes.shape

(14,)

In [19]:
y = mlb.transform(df.labels).astype("int16")

In [20]:
# df.path = df['Image Index']

In [21]:
df.path = df.path.map(lambda x: os.path.join(params.GCP_IMAGE_BUCKET, x))

In [22]:
# df.path

In [23]:
df_train, df_val, df_test = data.split_df(
    df, "Patient ID", (0.65, 0.175, 0.175), total_filter=0.3
)

### `tf.data.Dataset`

In [24]:
import random
from glob import glob

import tensorflow as tf


def make_dataset(
    path, batch_size, filenames, label_array, img_size: tuple = (224, 224)
):
    """
    - path: root to image folders
    - batch_size: to iterate
    - filenames: nd.array with list of absolute paths (filenames), in same order as label_array
    - label_array: matching index as filenames
    """

    def parse_image(filename):
        image = tf.io.read_file(filename)
        image = tf.image.decode_jpeg(image, channels=3)
        image = tf.image.resize(image, img_size)
        return image

    def configure_for_performance(ds):
        ds = ds.shuffle(buffer_size=1000)
        ds = ds.batch(batch_size)
        ds = ds.repeat()
        ds = ds.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
        return ds

    # classes = os.listdir(path)
    # filenames = glob(path + "/*/*")
    filenames = filenames.unique()
    random.shuffle(filenames)
    # it = np.nditer(filenames, flags=['refs_ok', 'c_index'], )
    # for file in it:
    #     labels =
    # labels = [classes.index(name.split("/")[-2]) for name in filenames]
    labels = y

    filenames_ds = tf.data.Dataset.from_tensor_slices(filenames)
    images_ds = filenames_ds.map(
        parse_image, num_parallel_calls=tf.data.experimental.AUTOTUNE
    )
    labels_ds = tf.data.Dataset.from_tensor_slices(labels)
    ds = tf.data.Dataset.zip((images_ds, labels_ds))
    ds = configure_for_performance(ds)

    return ds

In [25]:
path_to_png

'gs://images-xray-lewagon/'

In [26]:
y = y.tolist()
df_train = df_train.path.to_list()
df_val = df_val.path.to_list()
df_test = df_test.path.to_list()

ds_train = data.make_dataset(path_to_png, 32, df_train, y)
ds_val = data.make_dataset(path_to_png, 32, df_val, y)

2021-10-19 16:52:23.228552: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2021-10-19 16:52:23.228593: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (alienware): /proc/driver/nvidia/version does not exist
2021-10-19 16:52:23.229058: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [27]:
# ds_train = data.make_dataset(path_to_png, 32, df_train.path, y)
# ds_val = data.make_dataset(path_to_png, 32, df_val.path, y)

In [29]:
classes_dict = pd.DataFrame(mlb.classes_).to_dict()[0]
classes = mlb.classes_
# num_images = df.shape[0]

In [31]:
classes_dict = pd.DataFrame(mlb.classes_).to_dict()[0]
classes_dict

{0: 'Atelectasis',
 1: 'Cardiomegaly',
 2: 'Consolidation',
 3: 'Edema',
 4: 'Effusion',
 5: 'Emphysema',
 6: 'Fibrosis',
 7: 'Hernia',
 8: 'Infiltration',
 9: 'Mass',
 10: 'Nodule',
 11: 'Pleural_Thickening',
 12: 'Pneumonia',
 13: 'Pneumothorax'}

In [44]:
model = trainer.Trainer(ds_train, ds_val, "multilabel")

In [45]:
img_size = (224, 224)

In [46]:
model.build_cnn(
    input_shape=img_size,
    output_shape=len(classes),
    dense_layer_geometry=(1024, 512, 256),
    dropout_layers=True,
    dropout_rate=0.25,
)

In [47]:
model.pipeline.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
vgg16 (Functional)           (None, 7, 7, 512)         14714688  
_________________________________________________________________
flatten (Flatten)            (None, 25088)             0         
_________________________________________________________________
dense (Dense)                (None, 1024)              25691136  
_________________________________________________________________
dropout (Dropout)            (None, 1024)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 512)               524800    
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 256)               1

In [48]:
model.compile_model()

In [49]:
# model.fit(dataset, batch_size=32, epochs=5, steps_per_epoch=math.ceil(num_images/32))
batch_size = 32
epochs = 3
training_images = len(df_train)
steps_per_epoch = math.ceil(training_images / 32)

validation_images = len(df_val)
validation_steps = math.ceil(validation_images / 32)

In [50]:
steps_per_epoch

333

In [51]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from oauth2client.client import GoogleCredentials

In [None]:
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [52]:
model.fit_model(
    epochs=epochs,
    batch_size=batch_size,
    steps_per_epoch=steps_per_epoch,
    validation_steps=validation_steps,
)

Epoch 1/3


2021-10-18 19:59:42.397519: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
2021-10-18 19:59:52.845310: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:175] Filling up shuffle buffer (this may take a while): 69 of 1000
2021-10-18 20:00:02.803986: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:175] Filling up shuffle buffer (this may take a while): 155 of 1000
2021-10-18 20:00:12.864273: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:175] Filling up shuffle buffer (this may take a while): 239 of 1000
2021-10-18 20:00:23.090447: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:175] Filling up shuffle buffer (this may take a while): 315 of 1000
2021-10-18 20:00:32.785618: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:175] Filling up shuffle buffer (this may take a while): 399 of 1000
2021-10-18 20:00:42.703919: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:175] Filling up shu



2021-10-18 20:29:21.082678: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:175] Filling up shuffle buffer (this may take a while): 83 of 1000
2021-10-18 20:29:29.594320: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:175] Filling up shuffle buffer (this may take a while): 54 of 1000
2021-10-18 20:29:31.215737: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:175] Filling up shuffle buffer (this may take a while): 147 of 1000
2021-10-18 20:29:39.440913: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:175] Filling up shuffle buffer (this may take a while): 114 of 1000
2021-10-18 20:29:41.256042: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:175] Filling up shuffle buffer (this may take a while): 211 of 1000
2021-10-18 20:29:50.496526: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:175] Filling up shuffle buffer (this may take a while): 184 of 1000
2021-10-18 20:29:51.893131: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:175] Filling up shuffle buf


Epoch 00001: val_loss improved from inf to 0.32352, saving model to best_weights.hdf5
Epoch 2/3

2021-10-18 21:05:41.398501: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:175] Filling up shuffle buffer (this may take a while): 80 of 1000




2021-10-18 21:05:51.639747: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:175] Filling up shuffle buffer (this may take a while): 178 of 1000
2021-10-18 21:05:59.878201: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:175] Filling up shuffle buffer (this may take a while): 51 of 1000
2021-10-18 21:06:01.637736: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:175] Filling up shuffle buffer (this may take a while): 256 of 1000
2021-10-18 21:06:09.758097: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:175] Filling up shuffle buffer (this may take a while): 115 of 1000
2021-10-18 21:06:11.447331: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:175] Filling up shuffle buffer (this may take a while): 338 of 1000
2021-10-18 21:06:19.831302: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:175] Filling up shuffle buffer (this may take a while): 170 of 1000
2021-10-18 21:06:21.395414: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:175] Filling up shuffle bu


Epoch 00002: val_loss improved from 0.32352 to 0.32309, saving model to best_weights.hdf5
Epoch 3/3

2021-10-18 21:41:46.984734: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:175] Filling up shuffle buffer (this may take a while): 34 of 1000




2021-10-18 21:41:56.953676: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:175] Filling up shuffle buffer (this may take a while): 100 of 1000
2021-10-18 21:42:03.515674: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:175] Filling up shuffle buffer (this may take a while): 65 of 1000
2021-10-18 21:42:06.744100: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:175] Filling up shuffle buffer (this may take a while): 167 of 1000
2021-10-18 21:42:14.274939: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:175] Filling up shuffle buffer (this may take a while): 135 of 1000
2021-10-18 21:42:21.468417: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:175] Filling up shuffle buffer (this may take a while): 224 of 1000
2021-10-18 21:42:23.593744: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:175] Filling up shuffle buffer (this may take a while): 157 of 1000
2021-10-18 21:42:26.851076: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:175] Filling up shuffle bu


Epoch 00003: val_loss improved from 0.32309 to 0.31725, saving model to best_weights.hdf5


<keras.callbacks.History at 0x7fd844323610>

# Aux

In [None]:
filenames[0:10]

In [None]:
df[df["Image Index"] == "00010162_000.png"].index[0]

In [None]:
for path in filenames[0:10]:
    name = path.split("/")[-1]
    idx = df[df["Image Index"] == name].index[0]
    label = y[idx]
    print(idx)
    print(label)

In [None]:
labels = [df[df["Image Index"] == path.split("/")[-1]].index[0] for path in filenames]

In [None]:
len(df["Image Index"])

In [None]:
y.shape

In [None]:
IMG_SIZE = 224

In [None]:
os.listdir(path_to_png)

In [None]:
df.drop(
    columns=[
        "Follow-up #",
        "Patient Age",
        "Patient Gender",
        "View Position",
        "OriginalImagePixelSpacing_x",
        "OriginalImagePixelSpacing_y",
        "OriginalImageWidth",
        "OriginalImageHeight",
        "OriginalImage[Width",
        "Height]",
        "OriginalImagePixelSpacing[x",
        "y]",
        "Count_diseases",
    ],
    inplace=True,
    errors="ignore",
)
df.head(3)