# Data Loader and Preprocessing

In [1]:
import tensorflow as tf

print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [2]:
import tensorflow as tf

# List available physical devices
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        # Restrict TensorFlow to only use the first GPU
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')

        # Allow memory growth to avoid consuming all GPU memory
        tf.config.experimental.set_memory_growth(gpus[0], True)
        print("GPU is being used")
    except RuntimeError as e:
        # Visible devices must be set before GPUs have been initialized
        print(e)


GPU is being used


In [3]:
import numpy as np
from keras.utils import img_to_array
from keras.utils.image_utils import load_img
import os
import glob

def load_images_to_array(image_dir, target_size=(128, 128)):
    # List of image file paths
    image_paths = glob.glob(os.path.join(image_dir, '*.jpeg'))

    # Initialize list to hold image arrays
    images = []

    for image_path in image_paths:
        # Load and preprocess image
        img = load_img(image_path, target_size=target_size)
        img_array = img_to_array(img)
        images.append(img_array)

    # Convert list of arrays to a single NumPy array
    images_array = np.array(images)

    return images_array

# Example: flattening images with shape (128, 128, 3)
def flatten_images(image_array):
    return image_array.reshape(image_array.shape[0], -1)  # Flatten each image

In [4]:
minority_class_dir = 'D:\\programing\\Pneumonia\\Images\\Chest_xray\\2D\\train\\NORMAL'
majority_class_dir = 'D:\\programing\\Pneumonia\\Images\\Chest_xray\\2D\\train\\PNEUMONIA'
minor_images_array = load_images_to_array(minority_class_dir, target_size=(128, 128))
major_images_array = load_images_to_array(majority_class_dir, target_size=(128, 128))
images_array = np.concatenate((minor_images_array, major_images_array), axis=0)

In [5]:
# Suppose `images` is your NumPy array with shape (num_images, 128, 128, 3)
images_flattened = flatten_images(images_array)
smote_labels = np.array([0]*len(minor_images_array) + [1]*len(major_images_array))

In [6]:
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.utils import shuffle

# Flattened images and labels
X = images_flattened
y = smote_labels

# Initialize SMOTE
smote = SMOTE(sampling_strategy='auto')

# Parameters
batch_size = 100  # Adjust batch size as needed
n_samples = len(X)
n_batches = n_samples // batch_size + 1

# Shuffle data to ensure randomness
X, y = shuffle(X, y, random_state=42)

# Containers for resampled data
X_resampled_batches = []
y_resampled_batches = []

for _ in range(n_batches):
    # Randomly select indices for the batch
    indices = np.random.choice(n_samples, batch_size, replace=False)
    
    X_batch = X[indices]
    y_batch = y[indices]
    
    # Apply SMOTE to the random batch
    X_resampled, y_resampled = smote.fit_resample(X_batch, y_batch)
    
    # Store the results
    X_resampled_batches.append(X_resampled)
    y_resampled_batches.append(y_resampled)

# Concatenate all batches
X_resampled = np.concatenate(X_resampled_batches, axis=0)
y_resampled = np.concatenate(y_resampled_batches, axis=0)

print(X_resampled.shape)  # Shape will reflect the new number of samples

(7892, 49152)


In [7]:
from numba import cuda
from sklearn.model_selection import train_test_split
cuda.select_device(0)
cuda.close()

# Assuming synthetic_data and synthetic_labels are your NumPy arrays
synthetic_data = X_resampled.reshape(-1, 128, 128, 3)  # Ensure the shape is correct
synthetic_labels = y_resampled

# Split the data into training and remaining sets (80% training, 20% remaining)
train_data, remaining_data, train_labels, remaining_labels = train_test_split(
    synthetic_data, synthetic_labels, test_size=0.2, random_state=42
)

# Split the remaining data into validation and test sets (50% validation, 50% test)
val_data, test_data, val_labels, test_labels = train_test_split(
    remaining_data, remaining_labels, test_size=0.5, random_state=42
)

In [8]:
import tensorflow as tf

# Convert NumPy arrays to TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((train_data, train_labels))
val_dataset = tf.data.Dataset.from_tensor_slices((val_data, val_labels))
test_dataset = tf.data.Dataset.from_tensor_slices((test_data, test_labels))

# Shuffle, batch, and prefetch the datasets as needed
batch_size = 32

train_dataset = train_dataset.shuffle(buffer_size=len(train_data)).batch(batch_size).prefetch(tf.data.AUTOTUNE)
val_dataset = val_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)
test_dataset = test_dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)


In [10]:
from collections import Counter

# Initialize a Counter to count the number of images per class
class_counts = Counter()

# Iterate over the dataset and count the labels
for images, labels in train_dataset:
    class_counts.update(labels.numpy())

# Get class names from the dataset
class_names = {
    0: 'Normal',
    1: 'Pneumonia'
}

# Print the count of images per class
for class_index, class_count in class_counts.items():
    class_name = class_names[class_index]
    print(f"Class '{class_name}' has {class_count} images.")

Class 'Normal' has 3182 images.
Class 'Pneumonia' has 3131 images.


In [11]:
import plotly.express as px

fig = px.bar(x=class_names, y=list(class_counts.values())[::-1])
fig.show()

In [None]:
# # Save the datasets
# tf.data.experimental.save(train_dataset, 'train_dataset')
# tf.data.experimental.save(val_dataset, 'val_dataset')
# tf.data.experimental.save(test_dataset, 'test_dataset')

# Model training

In [12]:
# import tensorflow as tf
# from keras import layers, models

# # Set a random seed for reproducibility
# tf.random.set_seed(42)

In [13]:
# # Define a simple CNN model
# model = models.Sequential([
#     layers.Rescaling(1./255, input_shape=(128, 128, 3)),
#     layers.Conv2D(32, (3, 3), activation='relu'),
#     layers.MaxPooling2D((2, 2)),
#     layers.Conv2D(64, (3, 3), activation='relu'),
#     layers.MaxPooling2D((2, 2)),
#     layers.Conv2D(128, (3, 3), activation='relu'),
#     layers.MaxPooling2D((2, 2)),
#     layers.Conv2D(256, (3, 3), activation='relu'),
#     layers.Flatten(),
#     layers.Dense(256, activation='relu'),
#     layers.Dense(128, activation='relu'),
#     layers.Dense(1, activation='sigmoid')  # Use softmax if you have more than two classes
# ])
# # Compile the model
# model.compile(optimizer='adam',
#               loss='binary_crossentropy',  # Use sparse_categorical_crossentropy for more than two classes
#               metrics=['accuracy'])

In [14]:
# model.summary()

In [15]:
# # Train the model
# history = model.fit(
#     train_dataset,
#     validation_data=val_dataset,
#     epochs=3
# )

# # Evaluate the model on the test dataset
# test_loss, test_acc = model.evaluate(test_dataset)
# print(f"Test accuracy: {test_acc}")

In [16]:
# from numba import cuda 
# device = cuda.get_current_device()
# device.reset()

In [17]:
# def resnet_block(input, filters, strides=1):
#     x = layers.Conv2D(filters, kernel_size=3, strides=strides, padding='same', use_bias=False)(input)
#     x = layers.BatchNormalization()(x)
#     x = layers.ReLU()(x)

#     x = layers.Conv2D(filters, kernel_size=3, strides=1, padding='same', use_bias=False)(x)
#     x = layers.BatchNormalization()(x)

#     if strides != 1 or input.shape[-1] != filters:
#         input = layers.Conv2D(filters, kernel_size=1, strides=strides, padding='same', use_bias=False)(input)
#         input = layers.BatchNormalization()(input)

#     x = layers.add([x, input])
#     x = layers.ReLU()(x)
#     return x

# def resnet18(input_shape, num_classes):
#     inputs = layers.Input(shape=input_shape)

#     x = layers.Conv2D(64, kernel_size=7, strides=2, padding='same', use_bias=False)(inputs)
#     x = layers.BatchNormalization()(x)
#     x = layers.ReLU()(x)
#     x = layers.MaxPooling2D(pool_size=3, strides=2, padding='same')(x)

#     x = resnet_block(x, 64)
#     x = resnet_block(x, 64)

#     x = resnet_block(x, 128, strides=2)
#     x = resnet_block(x, 128)

#     x = resnet_block(x, 256, strides=2)
#     x = resnet_block(x, 256)

#     x = resnet_block(x, 512, strides=2)
#     x = resnet_block(x, 512)

#     x = layers.GlobalAveragePooling2D()(x)
#     outputs = layers.Dense(num_classes, activation='softmax')(x)

#     model = models.Model(inputs, outputs)
#     return model

In [18]:
# input_shape = (128, 128, 3)  # Modify based on your data shape
# num_classes = 2  # Modify based on your number of classes

# model = resnet18(input_shape=input_shape, num_classes=num_classes)

# model.compile(optimizer='adam',
#               loss='binary_crossentropy',
#               metrics=['accuracy'])