In [1]:
from imblearn.under_sampling import RandomUnderSampler
import tensorflow as tf
import pandas
import numpy
import os

# Seed for the random under-sampler below.
RANDOM_SEED = 0
# Dataset and pre-trained weight locations, resolved against the current working directory.
DATASET_DIR = os.path.join(os.getcwd(), 'chexpert/chexpertchestxrays-u20210408')
DATA_CSV_PATH = os.path.join(DATASET_DIR, 'train_cheXbert.csv')
CHEXNET_PATH = os.path.join(os.getcwd(), 'modelos/pre_treinados/brucechou1983_CheXNet_Keras_0.3.0_weights.h5')
# Maps the raw 'Pneumonia' column values to class indices (0 -> 0, 1 -> 1, -1 -> 2).
LABEL_DICT = {0: 0, 1: 1, -1: 2}

data_df = pandas.read_csv(DATA_CSV_PATH)

# Drop the lateral views and replace NaN with zero. Done as one chained,
# non-mutating expression so the frame is always rebuilt from the CSV contents
# instead of being modified in place.
data_df = data_df[data_df['Frontal/Lateral'] == 'Frontal'].fillna(0)

# Build the image paths. DATASET_DIR is already anchored at the working
# directory, so joining it with the relative 'Path' column is sufficient.
img_paths = [os.path.join(DATASET_DIR, path) for path in data_df['Path'].values]

# Build the labels; a value missing from LABEL_DICT raises KeyError on purpose.
labels = numpy.array([LABEL_DICT[int(value)] for value in data_df['Pneumonia'].values])

# Class counts before under-sampling, printed in class-index order 0, 1, 2.
for class_index in (0, 1, 2):
    print(numpy.count_nonzero(labels == class_index))

# Random under-sampling. The sampler expects a 2-D feature matrix, so the paths
# are reshaped into a single column and flattened back afterwards.
img_paths, labels = (
    RandomUnderSampler(random_state=RANDOM_SEED)
    .fit_resample(numpy.array(img_paths).reshape(-1, 1), labels)
)
img_paths = img_paths.flatten()

2024-09-18 18:27:43.045872: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-18 18:27:43.062161: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-18 18:27:43.067102: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-18 18:27:43.079456: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


170685
3738
16604


In [2]:
import keras
from keras import backend as K
from keras import layers
from keras import callbacks
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import gc

# NOTE(review): NUM_FOLDS is not referenced by any code visible in this notebook — confirm it is still needed.
NUM_FOLDS = 15
# Number of samples per batch in the training and validation tf.data pipelines.
BATCH_SIZE = 32
# Side length, in pixels, of the square images the loader produces and the model takes as input.
IMG_SIZE = 224

def preprocess_and_load(img_path, label):
    """Load one image from disk and prepare it as model input.

    Args:
        img_path: path of the JPEG file to read.
        label: label paired with the image; passed through untouched.

    Returns:
        A `(image, label)` tuple where `image` has been decoded with three
        channels, resized to IMG_SIZE x IMG_SIZE (antialiased) and run through
        Keras' DenseNet input preprocessing.
    """
    raw_bytes = tf.io.read_file(img_path)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
    resized = tf.image.resize(decoded, [IMG_SIZE, IMG_SIZE], antialias=True)
    return keras.applications.densenet.preprocess_input(resized), label


def train_model(model, train_set, val_set, finetune):
    """Compile and fit `model` for one of the two training phases.

    Args:
        model: Keras model that contains a layer named 'chexnet'.
        train_set: dataset handed to `model.fit` as training data.
        val_set: dataset handed to `model.fit` as validation data.
        finetune: when truthy, the 'chexnet' layer is made trainable and the
            model is fitted with learning rate 1e-4 for at most 4 epochs;
            otherwise the layer is frozen and the model is fitted with
            learning rate 1e-3 for at most 40 epochs.

    Returns:
        The history object returned by `model.fit`.
    """
    # (learning rate, maximum epochs) for the selected phase.
    learning_rate, max_epochs = (1e-4, 4) if finetune else (1e-3, 40)
    model.get_layer('chexnet').trainable = bool(finetune)

    # Compiling after toggling `trainable` so the change is picked up by training.
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        loss=keras.losses.CategoricalCrossentropy(),
        metrics=[keras.metrics.AUC(curve='ROC', name='roc_auc'), 'accuracy'],
    )

    early_stopping = callbacks.EarlyStopping(
        monitor='val_loss',
        patience=8,
        min_delta=1e-4,
        restore_best_weights=True,
        verbose=1,
    )
    # The monitored quantity is left at the callback's default.
    lr_on_plateau = callbacks.ReduceLROnPlateau(patience=4, min_delta=1e-4, verbose=1)

    return model.fit(
        train_set,
        validation_data=val_set,
        epochs=max_epochs,
        verbose=1,
        callbacks=[early_stopping, lr_on_plateau],
    )

def create_model():
    """Build the 3-class classifier on top of a DenseNet121 loaded from CHEXNET_PATH.

    The DenseNet121 backbone is first wrapped with a 14-unit sigmoid layer so
    that its layout matches the weight file, the weights are loaded, and the
    14-unit layer is then dropped again so that the sub-model named 'chexnet'
    outputs the globally average-pooled features. A new head (dropout, three
    ReLU dense layers, 3-unit softmax) is stacked on top.

    The whole model is built from the standalone `keras` namespace, matching
    the rest of this file, rather than mixing it with `tf.keras`.

    Returns:
        An uncompiled `keras.Model` taking (IMG_SIZE, IMG_SIZE, 3) images and
        producing a 3-way softmax.
    """
    # Loading the CheXNet weights
    densenet = keras.applications.DenseNet121(
        weights=None,
        include_top=False,
        input_shape=(IMG_SIZE, IMG_SIZE, 3),
        pooling='avg',
    )
    output = layers.Dense(14, activation='sigmoid', name='output')(densenet.output)
    chexnet = keras.Model(inputs=densenet.input, outputs=output)
    chexnet.load_weights(CHEXNET_PATH)
    # Re-wrap without the 14-unit layer: layers[-2] is the layer feeding it.
    chexnet = keras.Model(name='chexnet', inputs=chexnet.input, outputs=chexnet.layers[-2].output)

    # Main model
    inputs = keras.Input(shape=(IMG_SIZE, IMG_SIZE, 3))
    x = layers.RandomFlip('horizontal')(inputs)
    # training=False keeps the backbone in inference mode when called, even
    # after train_model() marks it trainable for fine-tuning.
    x = chexnet(x, training=False)
    x = layers.Dropout(0.25)(x)
    x = layers.Dense(units=512, activation='relu')(x)
    x = layers.Dense(units=256, activation='relu')(x)
    x = layers.Dense(units=128, activation='relu')(x)
    outputs = layers.Dense(
        name='output_layer',
        units=3,
        activation='softmax',
    )(x)
    return keras.Model(inputs, outputs)

# Per-fold results: TPR curves interpolated onto `mean_fpr`, and the matching AUCs.
tprs = []
aucs = []
mean_fpr = numpy.linspace(0, 1, 100)

# Seeds used:
#     0, 42, 2024, 2002, 412, 253,
#     54, 621, 3432, 333, 1234, 460,
#     12, 3451, 553, 541
# Seeds evaluated by this run (one train/validation split per seed).
FOLD_SEEDS = [12, 3451, 553, 541]

# Number of classes, and the class index the ROC curve is computed for
# (the index that the raw label -1 is mapped to).
NUM_CLASSES = len(LABEL_DICT)
ROC_CLASS = LABEL_DICT[-1]

for fold, seed in enumerate(FOLD_SEEDS):
    print(f'Fold {fold}...')

    # Seed Python, NumPy and the backend so that weight initialisation,
    # dropout, augmentation and shuffling are reproducible for this fold.
    keras.utils.set_random_seed(seed)

    x_train, x_val, y_train, y_val = train_test_split(
        img_paths,
        labels,
        test_size=0.15,
        stratify=labels,
        random_state=seed,
    )

    # Fix the one-hot width explicitly so it always matches the 3-unit softmax
    # instead of being inferred from the largest label present in the split.
    y_train = keras.utils.to_categorical(y_train, num_classes=NUM_CLASSES)
    y_val = keras.utils.to_categorical(y_val, num_classes=NUM_CLASSES)

    # Training dataset: decoded images are cached, then shuffled on every pass.
    train_set = (
        tf.data.Dataset.from_tensor_slices((x_train, y_train))
        .map(preprocess_and_load, num_parallel_calls=tf.data.AUTOTUNE)
        .cache()
        .shuffle(buffer_size=1000, seed=seed)
        .batch(BATCH_SIZE)
        .prefetch(buffer_size=tf.data.AUTOTUNE)
    )

    # Validation dataset: never shuffled, so predictions stay aligned with y_val.
    val_set = (
        tf.data.Dataset.from_tensor_slices((x_val, y_val))
        .map(preprocess_and_load, num_parallel_calls=tf.data.AUTOTUNE)
        .cache()
        .batch(BATCH_SIZE)
        .prefetch(buffer_size=tf.data.AUTOTUNE)
    )

    # Two phases on the same model: frozen backbone first, then fine-tuning.
    model = create_model()
    _ = train_model(model, train_set, val_set, finetune=False)
    _ = train_model(model, train_set, val_set, finetune=True)

    # Evaluating the model.
    # NOTE(review): the same split drives early stopping and this ROC estimate,
    # so the reported AUC may be optimistic — confirm this is intended.
    y_pred = model.predict(val_set)

    # False-positive / true-positive rates for the selected class, interpolated
    # onto the shared FPR grid so the curves can be averaged across folds.
    fpr, tpr, _ = roc_curve(y_val[:, ROC_CLASS], y_pred[:, ROC_CLASS])
    roc_auc = auc(fpr, tpr)
    interp_tpr = numpy.interp(mean_fpr, fpr, tpr)
    interp_tpr[0] = 0.0
    tprs.append(interp_tpr)
    aucs.append(roc_auc)

    # Release the model and datasets before the next fold is built.
    del model, train_set, val_set
    gc.collect()
    K.clear_session()

Fold 0...


I0000 00:00:1726618265.876418 3262769 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1726618268.354283 3262769 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1726618268.354551 3262769 cuda_executor.cc:1015] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
I0000 00:00:1726618268.356004 3262769 cuda_executor.cc:1015] successful NUMA node read from SysFS ha

Epoch 1/40


2024-09-17 21:11:29.527173: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:450] ShuffleDatasetV3:3: Filling up shuffle buffer (this may take a while): 461 of 1000
2024-09-17 21:11:39.561492: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:450] ShuffleDatasetV3:3: Filling up shuffle buffer (this may take a while): 974 of 1000
2024-09-17 21:11:40.200688: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:480] Shuffle buffer filled.
I0000 00:00:1726618300.700323 3262878 service.cc:146] XLA service 0x7fca64002350 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1726618300.700358 3262878 service.cc:154]   StreamExecutor device (0): Quadro P2200, Compute Capability 6.1
2024-09-17 21:11:41.069746: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-09-17 21:11:42.885812: I external/local_xla/xla/stream_executor/cuda/cuda_

[1m  1/298[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m3:54:53[0m 47s/step - accuracy: 0.2500 - loss: 1.1876 - roc_auc: 0.3765

I0000 00:00:1726618319.571512 3262878 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m298/298[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m267s[0m 740ms/step - accuracy: 0.3540 - loss: 1.1130 - roc_auc: 0.5250 - val_accuracy: 0.3803 - val_loss: 1.0836 - val_roc_auc: 0.5803 - learning_rate: 0.0010
Epoch 2/40
[1m298/298[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 126ms/step - accuracy: 0.4196 - loss: 1.0659 - roc_auc: 0.6036 - val_accuracy: 0.4652 - val_loss: 1.0433 - val_roc_auc: 0.6553 - learning_rate: 0.0010
Epoch 3/40
[1m298/298[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 159ms/step - accuracy: 0.4205 - loss: 1.0569 - roc_auc: 0.6171 - val_accuracy: 0.4480 - val_loss: 1.0474 - val_roc_auc: 0.6325 - learning_rate: 0.0010
Epoch 4/40
[1m298/298[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 159ms/step - accuracy: 0.4347 - loss: 1.0496 - roc_auc: 0.6254 - val_accuracy: 0.3535 - val_loss: 1.0906 - val_roc_auc: 0.5663 - learning_rate: 0.0010
Epoch 5/40
[1m298/298[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 159ms/step - acc




[1m297/298[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 573ms/step - accuracy: 0.4674 - loss: 1.0309 - roc_auc: 0.6551




[1m298/298[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m379s[0m 885ms/step - accuracy: 0.4674 - loss: 1.0308 - roc_auc: 0.6551 - val_accuracy: 0.4902 - val_loss: 1.0047 - val_roc_auc: 0.6819 - learning_rate: 1.0000e-04
Epoch 2/4
[1m298/298[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m178s[0m 598ms/step - accuracy: 0.4849 - loss: 1.0003 - roc_auc: 0.6824 - val_accuracy: 0.4985 - val_loss: 1.0000 - val_roc_auc: 0.6876 - learning_rate: 1.0000e-04
Epoch 3/4
[1m298/298[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m179s[0m 600ms/step - accuracy: 0.5071 - loss: 0.9800 - roc_auc: 0.7007 - val_accuracy: 0.4991 - val_loss: 0.9944 - val_roc_auc: 0.6906 - learning_rate: 1.0000e-04
Epoch 4/4
[1m298/298[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m151s[0m 505ms/step - accuracy: 0.5114 - loss: 0.9633 - roc_auc: 0.7136 - val_accuracy: 0.5051 - val_loss: 0.9958 - val_roc_auc: 0.6908 - learning_rate: 1.0000e-04
Restoring model weights from the end of the best epoch: 3.
[1m53/53[0m [32m

2024-09-17 23:06:22.837757: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:450] ShuffleDatasetV3:42: Filling up shuffle buffer (this may take a while): 485 of 1000
2024-09-17 23:06:32.949439: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:450] ShuffleDatasetV3:42: Filling up shuffle buffer (this may take a while): 848 of 1000
2024-09-17 23:06:35.737072: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:480] Shuffle buffer filled.


[1m298/298[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m307s[0m 913ms/step - accuracy: 0.3451 - loss: 1.1120 - roc_auc: 0.5190 - val_accuracy: 0.4201 - val_loss: 1.0657 - val_roc_auc: 0.6090 - learning_rate: 0.0010
Epoch 2/40
[1m298/298[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 158ms/step - accuracy: 0.3996 - loss: 1.0751 - roc_auc: 0.5909 - val_accuracy: 0.4023 - val_loss: 1.0531 - val_roc_auc: 0.6219 - learning_rate: 0.0010
Epoch 3/40
[1m298/298[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 159ms/step - accuracy: 0.4147 - loss: 1.0646 - roc_auc: 0.6056 - val_accuracy: 0.4510 - val_loss: 1.0348 - val_roc_auc: 0.6568 - learning_rate: 0.0010
Epoch 4/40
[1m298/298[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 159ms/step - accuracy: 0.4329 - loss: 1.0550 - roc_auc: 0.6250 - val_accuracy: 0.4902 - val_loss: 1.0221 - val_roc_auc: 0.6711 - learning_rate: 0.0010
Epoch 5/40
[1m298/298[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 159ms/step - acc

In [3]:
from IPython.display import display

def _append_fold_rows(frame, csv_path, file_exists):
    """Append `frame` to `csv_path`, writing the header only for a new file.

    Args:
        frame: DataFrame holding the rows to store.
        csv_path: destination CSV file.
        file_exists: whether `csv_path` already existed; selects append mode
            without a header versus write mode with a header.

    An empty frame is skipped. Writing it would create the file without the
    real column header, and later appends would then omit the header because
    the file already exists.
    """
    if frame.empty:
        print(f'Nothing to save to {csv_path}: no fold results available.')
        return
    frame.to_csv(
        path_or_buf=csv_path,
        mode='a' if file_exists else 'w',
        header=not file_exists,
        index=False,
    )


results_csv_exists = os.path.isfile('resultados_folds.csv')
tprs_csv_exists = os.path.isfile('tprs_folds.csv')

# One row per fold: the AUC collected by the training loop.
results_df = pandas.DataFrame({
    'auc': aucs,
})
_append_fold_rows(results_df, 'resultados_folds.csv', results_csv_exists)

# One row per fold: the TPR curve collected by the training loop.
tprs_df = pandas.DataFrame(tprs)
_append_fold_rows(tprs_df, 'tprs_folds.csv', tprs_csv_exists)

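# # Plotting the mean ROC curve (disabled). NOTE(review): `ax` and `uncertain` are not defined in the visible cells — define them before re-enabling.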
# mean_tpr = numpy.mean(tprs, axis=0)
# mean_tpr[-1] = 1.0
# mean_auc = auc(mean_fpr, mean_tpr)
# std_auc = numpy.std(aucs)
# ax.plot(
#     mean_fpr,
#     mean_tpr,
#     color="b",
#     label=r"Mean ROC (AUC = %0.2f $\pm$ %0.2f)" % (mean_auc, std_auc),
#     lw=2,
#     alpha=0.8,
# )
# std_tpr = numpy.std(tprs, axis=0)
# tprs_upper = numpy.minimum(mean_tpr + std_tpr, 1)
# tprs_lower = numpy.maximum(mean_tpr - std_tpr, 0)
# ax.fill_between(
#     mean_fpr,
#     tprs_lower,
#     tprs_upper,
#     color="grey",
#     alpha=0.2,
#     label=r"$\pm$ 1 std. dev.",
# )
# ax.set(
#     xlabel="False Positive Rate",
#     ylabel="True Positive Rate",
#     title=f"Mean ROC curve with variability\n(Positive label '{uncertain}')",
# )
# ax.legend(loc="lower right")
# plt.show()

In [4]:
# from scipy.stats import ttest_1samp

# t_stat, p_value = ttest_1samp(aucs, 0.5, alternative='greater')
# alpha = 0.05

# print("T statistic:", t_stat)
# print("P-value:", p_value)

# if p_value < alpha:
#     print('Null hypothesis rejected')

# else:
#     print('Failed to reject null hypothesis')