In [39]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob
import os
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.applications import DenseNet121
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.metrics import roc_auc_score, roc_curve, auc
from PIL import Image
import matplotlib.gridspec as gridspec
import matplotlib.ticker as ticker
sns.set_style('whitegrid')
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [40]:
# Load the data
data = pd.read_csv('E:/A__CVPR/Dataset/bbox/Data_Entry_2017_v2020.csv')
data = data[data['Patient Age'] < 100]  # Removing invalid data points
data_image_paths = {os.path.basename(x): x for x in glob(os.path.join('D:/New CX/CXR8/images', '**', '*.png'))}
print('Scans found:', len(data_image_paths), ', Total Headers', data.shape[0])

data['path'] = data['Image Index'].map(data_image_paths.get)
data['Patient Age'] = data['Patient Age'].map(lambda x: int(x))

Scans found: 0 , Total Headers 112120


In [41]:
# Preprocess the data and create labels
data['Finding Labels'] = data['Finding Labels'].map(lambda x: x.replace('No Finding', ''))
from itertools import chain
all_labels = np.unique(list(chain(*data['Finding Labels'].map(lambda x: x.split('|')).tolist())))
all_labels = [x for x in all_labels if len(x) > 0]

for c_label in all_labels:
    if len(c_label) > 1:  # leave out empty labels
        data[c_label] = data['Finding Labels'].map(lambda finding: 1.0 if c_label in finding else 0)


In [42]:
# Filter out cases with at least 1000 samples
MIN_CASES = 1000
all_labels = [c_label for c_label in all_labels if data[c_label].sum() > MIN_CASES]
print('Clean Labels:', [(c_label, int(data[c_label].sum())) for c_label in all_labels])

Clean Labels: [('Atelectasis', 11559), ('Cardiomegaly', 2776), ('Consolidation', 4667), ('Edema', 2303), ('Effusion', 13317), ('Emphysema', 2516), ('Fibrosis', 1686), ('Infiltration', 19894), ('Mass', 5782), ('Nodule', 6331), ('Pleural_Thickening', 3385), ('Pneumonia', 1431), ('Pneumothorax', 5302)]


In [43]:
# Resample to have a balanced dataset
sample_weights = data['Finding Labels'].map(lambda x: len(x.split('|')) if len(x) > 0 else 0).values + 4e-2
sample_weights /= sample_weights.sum()
data = data.sample(40000, weights=sample_weights)

In [44]:
# Create vectors for disease labels
data['disease_vec'] = data.apply(lambda x: [x[all_labels].values], 1).map(lambda x: x[0])


In [45]:
# Split the dataset into train, validation, and test sets
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(data, test_size=0.20, random_state=2018, stratify=data['Finding Labels'].map(lambda x: x[:4]))
train_df, valid_df = train_test_split(train_df, test_size=0.10, random_state=2018, stratify=train_df['Finding Labels'].map(lambda x: x[:4]))

In [50]:

from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Define the custom data generator for loading images from DataFrame
def flow_from_dataframe(img_data_gen, in_df, path_col, y_col, **dflow_args):
    df_gen = img_data_gen.flow_from_dataframe(
        dataframe=in_df,
        directory=None,  # Since full paths are provided in the 'path' column
        x_col=path_col,
        y_col=y_col,
        class_mode='raw',  # Use 'raw' for multi-label classification
        **dflow_args
    )
    return df_gen

# Set up ImageDataGenerator with the appropriate preprocessing function for DenseNet
core_idg_dense = ImageDataGenerator(preprocessing_function=tf.keras.applications.densenet.preprocess_input)

# Set the image size
IMG_SIZE = (224, 224)

# Ensure that all values in the 'path' column are strings and drop rows with NaN or invalid paths
train_df['path'] = train_df['path'].astype(str)
valid_df['path'] = valid_df['path'].astype(str)

# Drop rows where paths are NaN or empty
train_df = train_df[train_df['path'].notna()]
valid_df = valid_df[valid_df['path'].notna()]

# Check for any issues with the paths (optional debugging step)
print(f"Number of training samples: {train_df.shape[0]}")
print(f"Number of validation samples: {valid_df.shape[0]}")

# Now create the generators again
train_gen = flow_from_dataframe(core_idg_dense, train_df, 
                                path_col='path', y_col='disease_vec', 
                                target_size=IMG_SIZE, color_mode='rgb', 
                                batch_size=16)

valid_gen = flow_from_dataframe(core_idg_dense, valid_df, 
                                path_col='path', y_col='disease_vec', 
                                target_size=IMG_SIZE, color_mode='rgb', 
                                batch_size=32)




Number of training samples: 28800
Number of validation samples: 3200
Found 0 validated image filenames.
Found 0 validated image filenames.


In [52]:
# Create DenseNet model
img_in = Input(shape=(224, 224, 3))
base_model = DenseNet121(include_top=False, weights='imagenet', input_tensor=img_in, pooling='avg')
x = base_model.output
predictions = Dense(len(all_labels), activation="sigmoid", name="predictions")(x)
model = Model(inputs=img_in, outputs=predictions)

In [55]:
print(f"Training generator samples: {train_gen.samples}")
print(f"Validation generator samples: {valid_gen.samples}")


Training generator samples: 0
Validation generator samples: 0


In [53]:
# Compile the model
optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss="binary_crossentropy", metrics=[tf.keras.metrics.BinaryAccuracy()])

In [54]:
# Train the model
history = model.fit(train_gen, steps_per_epoch=100, validation_data=valid_gen, epochs=20)

ValueError: Asked to retrieve element 0, but the Sequence has length 0