In [171]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from glob import glob
%matplotlib inline
import matplotlib.pyplot as plt

In [172]:
all_xray_df = pd.read_csv("E:/A__CVPR/Dataset/bbox/Data_Entry_2017.csv")
all_image_paths = {os.path.basename(x): x for x in 
                glob(os.path.join('D:/New CX/CXR8/images', '*.png'))}
print('Scans found:', len(all_image_paths), ', Total Headers', all_xray_df.shape[0])
all_xray_df['path'] = all_xray_df['Image Index'].map(all_image_paths.get)
all_xray_df['path'] = all_xray_df['Image Index'].map(all_image_paths.get)

label_counts = all_xray_df['Finding Labels'].value_counts()[:15]

all_xray_df['Finding Labels'] = all_xray_df['Finding Labels'].map(lambda x: x.replace('No Finding', ''))
from itertools import chain
all_labels = np.unique(list(chain(*all_xray_df['Finding Labels'].map(lambda x: x.split('|')).tolist())))
all_labels = [x for x in all_labels if len(x)>0]
print('All Labels ({}): {}'.format(len(all_labels), all_labels))
for c_label in all_labels:
    if len(c_label)>1: # leave out empty labels
        all_xray_df[c_label] = all_xray_df['Finding Labels'].map(lambda finding: 1.0 if c_label in finding else 0)

# keep at least 1000 cases
MIN_CASES = 1000
all_labels = [c_label for c_label in all_labels if all_xray_df[c_label].sum()>MIN_CASES]
print('Clean Labels ({})'.format(len(all_labels)), 
    [(c_label,int(all_xray_df[c_label].sum())) for c_label in all_labels])

# since the dataset is very unbiased, we can resample it to be a more reasonable collection
# weight is 0.1 + number of findings
sample_weights = all_xray_df['Finding Labels'].map(lambda x: len(x.split('|')) if len(x)>0 else 0).values + 4e-2
sample_weights /= sample_weights.sum()

all_xray_df = all_xray_df.sample(40000, weights=sample_weights)

label_counts = all_xray_df['Finding Labels'].value_counts()[:15]

label_counts = 100*np.mean(all_xray_df[all_labels].values,0)

Scans found: 112120 , Total Headers 112120
All Labels (14): ['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Effusion', 'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration', 'Mass', 'Nodule', 'Pleural_Thickening', 'Pneumonia', 'Pneumothorax']
Clean Labels (13) [('Atelectasis', 11559), ('Cardiomegaly', 2776), ('Consolidation', 4667), ('Edema', 2303), ('Effusion', 13317), ('Emphysema', 2516), ('Fibrosis', 1686), ('Infiltration', 19894), ('Mass', 5782), ('Nodule', 6331), ('Pleural_Thickening', 3385), ('Pneumonia', 1431), ('Pneumothorax', 5302)]


In [173]:
all_xray_df['disease_vec'] = all_xray_df.apply(lambda x: [x[all_labels].values], 1).map(lambda x: x[0])

In [174]:
from sklearn.model_selection import train_test_split
train_df, valid_df = train_test_split(all_xray_df, 
                                test_size = 0.25, 
                                random_state = 2018,
                                stratify = all_xray_df['Finding Labels'].map(lambda x: x[:4]))
print('train', train_df.shape[0], 'validation', valid_df.shape[0])

train 30000 validation 10000


In [175]:
valid_df['newLabel'] = valid_df.apply(lambda x: x['Finding Labels'].split('|'), axis=1)
train_df['newLabel'] = train_df.apply(lambda x: x['Finding Labels'].split('|'), axis=1)

In [176]:
from keras.preprocessing.image import ImageDataGenerator
core_idg = ImageDataGenerator(samplewise_center=True, 
                            samplewise_std_normalization=True, 
                            horizontal_flip = True, 
                            vertical_flip = False, 
                            height_shift_range= 0.05, 
                            width_shift_range=0.1, 
                            rotation_range=5, 
                            shear_range = 0.1,
                            fill_mode = 'reflect',
                            zoom_range=0.15)



In [177]:
from keras.callbacks import ModelCheckpoint, LearningRateScheduler, EarlyStopping, ReduceLROnPlateau
weight_path="{}_weights.best.hdf5".format('xray_class')

checkpoint = ModelCheckpoint('xray_class_weights.best.hdf5',
                            monitor='val_loss',
                            verbose=1,
                            save_best_only=True,
                            save_weights_only=True,  # Save only the weights, not the full model
                            mode='min')


early = EarlyStopping(monitor="val_loss", 
                    mode="min", 
                    patience=5)
callbacks_list = [checkpoint, early]

In [178]:
import tensorflow as tf

class AdamAccumulate(tf.keras.optimizers.Optimizer):
    def __init__(self, learning_rate=0.001, accum_iters=4, **kwargs):
        super(AdamAccumulate, self).__init__(name="AdamAccumulate", **kwargs)
        self.learning_rate = learning_rate
        self.accum_iters = tf.constant(accum_iters, dtype=tf.float32)
        self.iterations_accum = tf.Variable(0, dtype=tf.int64)

    def _create_slots(self, var_list):
        """ Create slots for all trainable variables """
        for var in var_list:
            if self.has_slot(var, 'accum_grad'):
                continue
            self.add_slot(var, 'accum_grad')
    
    def apply_gradients(self, grads_and_vars, name=None, experimental_aggregate_gradients=True):
        """ Apply accumulated gradients after accum_iters steps """
        grads, variables = zip(*grads_and_vars)
        self._create_slots(variables)  # Ensure slots are created for all variables

        accum_grads = [self.get_slot(var, 'accum_grad') for var in variables]
        self.iterations_accum.assign_add(1)

        # Accumulate the gradients
        for g, accum_grad in zip(grads, accum_grads):
            accum_grad.assign_add(g)

        def apply_accumulated_grads():
            applied_grads = []
            for accum_grad, var in zip(accum_grads, variables):
                applied_grads.append(self._apply_gradients(var, accum_grad / self.accum_iters))
                accum_grad.assign(tf.zeros_like(accum_grad))  # Reset the gradients
            return tf.group(*applied_grads)

        # Apply accumulated gradients only every accum_iters steps
        apply_grads_op = tf.cond(
            tf.equal(self.iterations_accum % self.accum_iters, 0),
            true_fn=apply_accumulated_grads,
            false_fn=lambda: tf.no_op()
        )
        return apply_grads_op

    def _apply_gradients(self, var, grad):
        """ Update the weights based on the accumulated gradients """
        return var.assign_sub(self.learning_rate * grad)

    def get_config(self):
        """ Return the optimizer config for serialization """
        config = {'learning_rate': self.learning_rate, 'accum_iters': self.accum_iters.numpy()}
        base_config = super(AdamAccumulate, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))


In [179]:
import tensorflow as tf
from tensorflow.keras.applications.mobilenet import MobileNet
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense, Dropout

class AccumModel(tf.keras.Model):
    def __init__(self, num_classes, IMG_SIZE, channels=1, accum_steps=4):
        super(AccumModel, self).__init__()
        self.base_model = MobileNet(input_shape=(*IMG_SIZE, channels),
                                    include_top=False, weights=None)
        self.pooling = GlobalAveragePooling2D()
        self.dropout1 = Dropout(0.5)
        self.dense1 = Dense(512, activation='relu')
        self.dropout2 = Dropout(0.5)
        self.dense2 = Dense(num_classes, activation='sigmoid')
        self.accum_steps = accum_steps  # Number of steps to accumulate gradients
        self.step = 0

    def build(self, input_shape):
        super(AccumModel, self).build(input_shape)
        # Initialize accum_grads after the model is built as tf.Variables
        self.accum_grads = [tf.Variable(tf.zeros_like(var), trainable=False) for var in self.trainable_variables]

    def call(self, inputs, training=False):
        x = self.base_model(inputs, training=training)
        x = self.pooling(x)
        x = self.dropout1(x, training=training)
        x = self.dense1(x)
        x = self.dropout2(x, training=training)
        return self.dense2(x)

    def reset_accumulated_gradients(self):
        for var in self.accum_grads:
            var.assign(tf.zeros_like(var))

    def train_step(self, data):
        x, y = data

        with tf.GradientTape() as tape:
            y_pred = self(x, training=True)
            loss = self.compiled_loss(y, y_pred)

        gradients = tape.gradient(loss, self.trainable_variables)

        # Accumulate gradients
        for i, grad in enumerate(gradients):
            self.accum_grads[i].assign_add(grad)

        self.step += 1

        if self.step % self.accum_steps == 0:
            # Apply accumulated gradients
            self.optimizer.apply_gradients(zip(self.accum_grads, self.trainable_variables))
            # Reset accumulated gradients
            self.reset_accumulated_gradients()

        # Update metrics (includes the metric that tracks the loss)
        self.compiled_metrics.update_state(y, y_pred)

        # Return a dict mapping metric names to current value
        return {m.name: m.result() for m in self.metrics}


In [180]:

IMG_SIZE_LIST = [(1024, 1024), (512, 512), (256, 256), (224, 224), (192, 192), (128, 128), (64, 64)]
BATCH_SIZE_LIST = [4, 8, 16, 32, 32, 32, 64]

STEPS_PER_EPOCH = 10000
EPOCHS = 10

In [181]:
def MakeModel(IMG_SIZE, num_classes, channels=1, accum_steps=4):
    model = AccumModel(num_classes, IMG_SIZE, channels, accum_steps)
    return model



In [182]:
import gc
from collections import defaultdict
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss

run_this_code = True  # This Code Block Takes a long time to run

if run_this_code:
    train_results = defaultdict(dict)
    test_results = defaultdict(dict)

    lr = 0.0005
    syntheticBatch = 256

    plt.figure(figsize=(20, 5))

    for imageSize, batchSize in zip(IMG_SIZE_LIST, BATCH_SIZE_LIST):  # Number of batches before gradient accumulation

        batch = int(256 / batchSize)  # Gradient Accumulation Step

        train_gen = core_idg.flow_from_dataframe(dataframe=train_df,
                                                directory=None,
                                                x_col='path',
                                                y_col='newLabel',
                                                class_mode='categorical',
                                                classes=all_labels,
                                                target_size=imageSize,
                                                color_mode='grayscale',
                                                batch_size=batchSize)

        valid_gen = core_idg.flow_from_dataframe(dataframe=valid_df,
                                                irectory=None,
                                                x_col='path',
                                                y_col='newLabel',
                                                class_mode='categorical',
                                                classes=all_labels,
                                                target_size=imageSize,
                                                color_mode='grayscale',
                                                batch_size=batchSize)  # we can use much larger batches for evaluation

        print('Running Image Size: ', imageSize, 'Running Batch size : ',
            batchSize, 'Learning Rate : ', lr)

        predictions_train = pd.DataFrame()
        predictions_test = pd.DataFrame()

        multi_disease_model = MakeModel(imageSize, len(all_labels), accum_steps=batch)
        opt = tf.keras.optimizers.Adam(learning_rate=lr)
        multi_disease_model.compile(optimizer=opt, loss='binary_crossentropy',
                                    metrics=['binary_accuracy', 'mae'])

        history = multi_disease_model.fit(train_gen,
                                        steps_per_epoch=STEPS_PER_EPOCH,
                                        validation_data=valid_gen,
                                        epochs=EPOCHS,
                                        callbacks=callbacks_list,
                                        validation_steps=1000)


        plt.plot(history.history['val_loss'])

        p = history.history['val_loss'][0]

        del multi_disease_model, history
        gc.collect()
        print('*' * 50)
        print('')

        test_results[imageSize[0]][lr] = p
        imageSizeFile = pd.DataFrame(test_results)
        imageSizeFile.to_csv("imageSize.csv", index=True)

    plt.legend([str(x[0]) for x in IMG_SIZE_LIST], loc='upper right')
    plt.title('model accuracy')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.show()
    plt.savefig('image_size_selection.png', bbox_inches='tight')

Found 28235 validated image filenames belonging to 13 classes.
Found 9414 validated image filenames belonging to 13 classes.
Running Image Size:  (1024, 1024) Running Batch size :  4 Learning Rate :  0.0005
Epoch 1/10

Epoch 1: val_loss improved from inf to 0.71697, saving model to xray_class_weights.best.hdf5


ValueError: Unable to synchronously create dataset (name already exists)

<Figure size 2000x500 with 0 Axes>