In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import math
import os
from glob import glob

import numpy as np
import pandas as pd

In [None]:
from xray import data, params, trainer, utils

In [None]:
# Location of the label CSV, given relative to the notebook's working directory.
csv_labels = "sample_labels.csv"
path_to_csv = os.path.join("../../raw_data/sample-data/", csv_labels)

In [None]:
# Read the label CSV through the project loader and preview the first rows.
df = data.get_data(path_to_csv)
df.head(3)

In [None]:
# "Finding Labels" is a "|"-separated string; keep it as a list per image in "labels".
df["labels"] = df["Finding Labels"].apply(lambda finding: finding.split("|"))
df.head(3)

In [None]:
# Keep only images that have a finding; the original index is kept (not reset).
df = df.loc[df["Finding Labels"].ne("No Finding")]

In [None]:
# Directory holding the image files, relative to the notebook's working directory.
path_to_png = "../../raw_data/sample-data/images"

In [None]:
# The return value is not assigned, only displayed as the cell output.
# NOTE(review): presumably this adds a `path` column to `df` in place, since
# later cells read `df_train.path` — confirm against utils.get_paths.
utils.get_paths(df, path_to_png, return_relative=False)

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
# Learn the set of finding classes from the per-image label lists.
mlb = MultiLabelBinarizer()
mlb.fit(df["labels"])
mlb_classes = mlb.classes_
mlb_classes.shape

In [None]:
# Multi-hot label matrix for ALL rows of `df`: row i of `y` belongs to the
# i-th row of `df` by position (not by index label).
y = mlb.transform(df.labels).astype("int16")

In [None]:
# Split `df` into three frames using the "Patient ID" column.
# NOTE(review): presumably (0.65, 0.175, 0.175) are the train/val/test
# fractions and the split is grouped by patient; the meaning of
# total_filter=0.3 is not visible here — confirm against data.split_df.
df_train, df_val, df_test = data.split_df(
    df, "Patient ID", (0.65, 0.175, 0.175), total_filter=0.3
)

## `tf.data.Dataset`

In [None]:
import random
from glob import glob

import tensorflow as tf


def make_dataset(
    path, batch_size, filenames, label_array, img_size: tuple = (224, 224)
):
    """
    Build a shuffled, batched, endlessly repeating ``tf.data.Dataset`` that
    yields ``(images, labels)`` batches.

    - path: root to image folders. Not used by this function; kept so that
      existing calls keep working.
    - batch_size: number of samples per batch.
    - filenames: sequence (list, nd.array or pandas Series) of image file
      paths, in the same order as label_array.
    - label_array: labels where row ``i`` belongs to ``filenames[i]``.
    - img_size: size passed to ``tf.image.resize`` for every decoded image.

    Raises ValueError when no filenames are given. Emits a warning when
    filenames and label_array differ in length, because the pairs are then
    very likely misaligned; only the common prefix is used in that case.

    The dataset ends with ``repeat()``, so it never runs out: the consumer
    has to bound it (e.g. with ``steps_per_epoch``).
    """
    import warnings

    def parse_image(filename):
        image = tf.io.read_file(filename)
        image = tf.image.decode_jpeg(image, channels=3)
        image = tf.image.resize(image, img_size)
        return image

    # Filenames are deliberately neither de-duplicated nor shuffled on their
    # own: either would break the row-by-row pairing with label_array.
    file_list = [str(name) for name in filenames]
    if not file_list:
        raise ValueError("make_dataset needs at least one filename")

    n_labels = len(label_array)
    if len(file_list) != n_labels:
        warnings.warn(
            f"make_dataset got {len(file_list)} filenames but {n_labels} label "
            "rows; images and labels are probably misaligned.",
            stacklevel=2,
        )

    # Explicit string dtype so the tensor type does not depend on inference.
    filenames_ds = tf.data.Dataset.from_tensor_slices(
        tf.constant(file_list, dtype=tf.string)
    )
    labels_ds = tf.data.Dataset.from_tensor_slices(label_array)

    # Pair first, then shuffle: every path keeps its own label, and the
    # shuffle buffer only holds short strings instead of decoded images.
    ds = tf.data.Dataset.zip((filenames_ds, labels_ds))
    ds = ds.shuffle(buffer_size=min(len(file_list), n_labels) or 1)
    ds = ds.map(
        lambda filename, label: (parse_image(filename), label),
        num_parallel_calls=tf.data.experimental.AUTOTUNE,
    )
    ds = ds.batch(batch_size)
    ds = ds.repeat()
    ds = ds.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

    return ds

In [None]:
# `y` covers every row of `df`, so it does not line up with the rows of a
# split. Labels are therefore rebuilt per split with the already-fitted
# binarizer, giving one label row per path, in the same order.
# NOTE(review): assumes data.split_df keeps the "labels" column — confirm.
y_train = mlb.transform(df_train.labels).astype("int16")
y_val = mlb.transform(df_val.labels).astype("int16")

ds_train = make_dataset(path_to_png, 32, df_train.path, y_train)
ds_val = make_dataset(path_to_png, 32, df_val.path, y_val)

In [None]:
# Class names learned by the binarizer, plus a position -> name lookup.
classes = mlb.classes_
classes_dict = pd.DataFrame(classes).to_dict()[0]
num_images = len(df)

In [None]:
# Wrap the training and validation datasets in the project's Trainer.
# NOTE(review): "multilabel" presumably selects the task type — confirm.
model = trainer.Trainer(ds_train, ds_val, "multilabel")

In [None]:
# Same value as the default img_size of make_dataset.
img_size = (224, 224)

In [None]:
# One output unit per class learned by the binarizer.
# NOTE(review): img_size carries no channel dimension while make_dataset
# decodes images with channels=3 — confirm build_cnn adds the channel axis.
model.build_cnn(
    input_shape=img_size,
    output_shape=len(classes),
    dense_layer_geometry=(1024, 512, 256),
    dropout_layers=True,
    dropout_rate=0.25,
)

In [None]:
# Show the summary of the pipeline created by build_cnn.
model.pipeline.summary()

In [None]:
# Compile with the Trainer's defaults (no arguments are passed).
model.compile_model()

In [None]:
# The datasets repeat forever, so every epoch needs an explicit step count:
# enough batches to see each image of the split once.
batch_size = 32  # must match the batch size the datasets were built with
epochs = 5
training_images = df_train.shape[0]
steps_per_epoch = math.ceil(training_images / batch_size)

# Validation steps are derived from the validation split, not the training one.
validation_images = df_val.shape[0]
validation_steps = math.ceil(validation_images / batch_size)

In [None]:
# Display the number of training batches per epoch.
steps_per_epoch

In [None]:
# Train using the epoch count, batch size and step counts computed above.
model.fit_model(
    epochs=epochs,
    batch_size=batch_size,
    steps_per_epoch=steps_per_epoch,
    validation_steps=validation_steps,
)

# Aux

In [None]:
# `filenames` is otherwise only a local name inside make_dataset; build it
# here from the training split so this cell runs on a fresh kernel.
filenames = df_train.path.to_numpy()
filenames[0:10]

In [None]:
# Index label of one hard-coded image. This is a label of `df.index`, not a
# row position, and raises IndexError when the image is not present in `df`.
df[df["Image Index"] == "00010162_000.png"].index[0]

In [None]:
# Spot-check: print the row position and label vector of the first ten
# training images.
image_names = df["Image Index"].to_numpy()
for path in df_train.path.iloc[:10]:
    name = os.path.basename(path)
    # `y` is indexed by row position, while `df` kept its original index when
    # the "No Finding" rows were removed, so look up the position, not the label.
    idx = np.flatnonzero(image_names == name)[0]
    label = y[idx]
    print(idx)
    print(label)

In [None]:
# Map each image name to the index label of its first row in `df` in a single
# pass, instead of scanning the whole frame once per path.
first_index_by_name = {}
for idx, name in zip(df.index, df["Image Index"]):
    first_index_by_name.setdefault(name, idx)

# These are labels of `df.index`, not row positions into `y`.
labels = [first_index_by_name[os.path.basename(path)] for path in df_train.path]

In [None]:
# Number of rows in `df`.
len(df["Image Index"])

In [None]:
# Shape of the label matrix produced by mlb.transform.
y.shape

In [None]:
# Not referenced by any other visible cell; the model cells use the
# `img_size` tuple instead.
IMG_SIZE = 224

In [None]:
# Lists every entry of the image directory; the output can be very long.
os.listdir(path_to_png)

In [None]:
# Remove metadata columns from `df` in place. errors="ignore" skips any
# listed column that is not present, so re-running the cell does not fail.
df.drop(
    columns=[
        "Follow-up #",
        "Patient Age",
        "Patient Gender",
        "View Position",
        "OriginalImagePixelSpacing_x",
        "OriginalImagePixelSpacing_y",
        "OriginalImageWidth",
        "OriginalImageHeight",
        "OriginalImage[Width",
        "Height]",
        "OriginalImagePixelSpacing[x",
        "y]",
        "Count_diseases",
    ],
    inplace=True,
    errors="ignore",
)
df.head(3)