In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing import image
import os
import shutil

The original data was split into train and test folders, but here the images were merged into a single set, keeping the per-class folder structure that provides the labels.
This way we get 4 folders: glioma_tumor, meningioma_tumor, no_tumor and pituitary_tumor.

In [2]:
# Root folder that holds one subfolder per class
data_path = 'data/Unified'

# Every entry found directly under the data folder
subfolders = os.listdir(data_path)

# Map each class folder to the number of entries it contains;
# anything that is not a directory is left out of the mapping
image_counts_data = {
    entry: len(os.listdir(os.path.join(data_path, entry)))
    for entry in subfolders
    if os.path.isdir(os.path.join(data_path, entry))
}

# Report the per-class totals
print("Number of images in each class:")
for class_dir in image_counts_data:
    print(f"{class_dir}: {image_counts_data[class_dir]}")

Number of images in each class:
glioma_tumor: 926
meningioma_tumor: 937
no_tumor: 500
pituitary_tumor: 901


As we can see, the no_tumor class is underrepresented compared to the tumor classes. So we will use the ImageDataGenerator class to generate additional images for the no_tumor class until it reaches the size of the most represented class (in this case meningioma_tumor).

In [3]:
# Source folder with the original images and destination folder for the balanced copy
original_training_data_path = 'data/Unified'
augmented_training_data_path = 'data/Augmented'

# Augmentation parameters handed to ImageDataGenerator
augmentation_settings = dict(
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',
)

# Generator used by augment_images to produce randomly transformed images
datagen = ImageDataGenerator(**augmentation_settings)

# Function to augment images of a specific class
def augment_images(class_name, target_count):
    """Copy a class's original images and top the class up with augmented ones.

    Every entry of ``original_training_data_path/class_name`` is copied into
    ``augmented_training_data_path/class_name`` (entries already present there
    are left untouched). Randomly chosen originals are then loaded at
    128x128, passed through ``datagen`` and saved as
    ``aug_<i>_<original name>`` until the destination folder holds
    ``target_count`` entries.

    The number of images still to generate is derived from what is currently
    in the destination folder rather than from the size of the original
    folder, so re-running after a partial or interrupted run tops the folder
    up to ``target_count`` instead of overshooting it.

    Args:
        class_name: Name of the class subfolder to process.
        target_count: Desired total number of entries in the destination
            class folder. Nothing is generated if the folder already holds
            at least this many entries.

    Raises:
        ValueError: If augmentation is required but the original class
            folder is empty, so there is nothing to augment from.
    """
    class_path = os.path.join(original_training_data_path, class_name)
    augmented_class_path = os.path.join(augmented_training_data_path, class_name)

    # Create subfolder in augmented_training if it doesn't exist
    os.makedirs(augmented_class_path, exist_ok=True)

    # List existing images
    existing_images = os.listdir(class_path)

    # Copy existing images to augmented_training
    for image_name in existing_images:
        src_path = os.path.join(class_path, image_name)
        dst_path = os.path.join(augmented_class_path, image_name)
        if not os.path.exists(dst_path):  # Check if image already exists in destination
            shutil.copy(src_path, dst_path)

    # Everything currently in the destination: the copied originals plus any
    # augmented files left behind by an earlier run.
    present_names = set(os.listdir(augmented_class_path))
    if len(present_names) >= target_count:
        return

    if not existing_images:
        raise ValueError(
            f"Cannot augment class '{class_name}': no source images in {class_path}"
        )

    # Start augmentation until target_count is reached
    i = 0
    while len(present_names) < target_count:
        # Randomly select an image to augment
        image_name = np.random.choice(existing_images)
        save_name = f"aug_{i}_{image_name}"
        i += 1
        if save_name in present_names:
            # Saving under this name would overwrite a file from a previous
            # run without increasing the folder size; try the next index.
            continue

        image_path = os.path.join(class_path, image_name)
        img = image.load_img(image_path, target_size=(128, 128))

        # Convert to numpy array and add batch dimension
        x = image.img_to_array(img)
        x = np.expand_dims(x, axis=0)

        # Generate augmented image
        augmented_images = next(datagen.flow(x, batch_size=1))
        augmented_image = image.array_to_img(augmented_images[0])

        # Save augmented image
        augmented_image.save(os.path.join(augmented_class_path, save_name))
        present_names.add(save_name)

# Function to copy images of a specific class
def copy_images(class_name):
    """Mirror one class folder from the original data into the augmented data.

    Entries already present in the destination folder are not copied again.

    Args:
        class_name: Name of the class subfolder to copy.
    """
    source_dir = os.path.join(original_training_data_path, class_name)
    target_dir = os.path.join(augmented_training_data_path, class_name)

    # Make sure the destination class folder is there before copying into it
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    for file_name in os.listdir(source_dir):
        destination = os.path.join(target_dir, file_name)
        # Anything copied on a previous run is left as it is
        if os.path.exists(destination):
            continue
        shutil.copy(os.path.join(source_dir, file_name), destination)

# Count the entries in every class folder of the original training data
image_counts = {}
for class_name in os.listdir(original_training_data_path):
    class_dir = os.path.join(original_training_data_path, class_name)
    if os.path.isdir(class_dir):
        image_counts[class_name] = len(os.listdir(class_dir))

# Size of the largest class; 'no_tumor' is topped up to this number
max_count = max(image_counts.values())

# 'no_tumor' is complete only when its augmented folder exists and already
# holds at least max_count entries
augmented_no_tumor_path = os.path.join(augmented_training_data_path, 'no_tumor')
no_tumor_is_complete = (
    os.path.exists(augmented_no_tumor_path)
    and len(os.listdir(augmented_no_tumor_path)) >= max_count
)

if no_tumor_is_complete:
    print("Augmented images already exist. Skipping augmentation.")
else:
    # Create augmented_training folder if it doesn't exist
    if not os.path.exists(augmented_training_data_path):
        os.makedirs(augmented_training_data_path)

    augment_images('no_tumor', max_count)

# Every other class is copied over unchanged ('no_tumor' was handled above)
for class_name in image_counts:
    if class_name == 'no_tumor':
        continue
    copy_images(class_name)

Augmented images already exist. Skipping augmentation.


In [4]:
# Folder holding the balanced (augmented) dataset; later cells load from this path
data_path = 'data/Augmented'

# Entries found directly inside the data folder
subfolders = os.listdir(data_path)

# Per-class entry totals, keyed by subfolder name
image_counts_data = {}
for entry in subfolders:
    entry_path = os.path.join(data_path, entry)
    if not os.path.isdir(entry_path):
        continue  # ignore anything that is not a class folder
    image_counts_data[entry] = len(os.listdir(entry_path))

# Display the results
print("Number of images in each class after data augmentation:")
for class_dir, total in image_counts_data.items():
    print(f"{class_dir}: {total}")

Number of images in each class after data augmentation:
glioma_tumor: 926
meningioma_tumor: 937
no_tumor: 937
pituitary_tumor: 901


## Neural Networks

Now that the classes are better balanced, the training and validation datasets are generated.

In [5]:
# Image size (in pixels) used when loading the dataset and as the model input shape
img_height = 128
img_width = 128

# Number of images per dataset batch
batch_size = 32

In [10]:
# Load the images under data_path and split them 80/20 into a training and a
# validation dataset; subset="both" returns the two datasets together.
split_datasets = tf.keras.utils.image_dataset_from_directory(
    data_path,
    validation_split=0.2,
    subset="both",
    seed=42,
    image_size=(img_height, img_width),
    batch_size=batch_size,
)
train_ds, val_ds = split_datasets

Found 3701 files belonging to 4 classes.
Using 2961 files for training.
Using 740 files for validation.


In [12]:
# Class labels exposed by the training dataset; the later cells use their
# count to size the output layer of each model.
class_names = train_ds.class_names
print(class_names)

['glioma_tumor', 'meningioma_tumor', 'no_tumor', 'pituitary_tumor']


## Convolutional Neural Network (CNN)

Now that we have the training and validation datasets, we will try several neural network configurations.

In [13]:
# Fix the TensorFlow random seed before building the model
tf.random.set_seed(42)

# One output unit per class
num_classes = len(class_names)

# Baseline CNN: pixel rescaling, three Conv2D/MaxPooling2D stages
# (16, 32 and 64 filters), then a dense head that outputs raw logits
model_1_layers = [layers.Rescaling(1./255, input_shape=(img_height, img_width, 3))]
for n_filters in (16, 32, 64):
    model_1_layers.append(layers.Conv2D(n_filters, 3, padding='same', activation='relu'))
    model_1_layers.append(layers.MaxPooling2D())
model_1_layers.extend([
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dense(num_classes),
])
model_1 = keras.Sequential(model_1_layers)

# The last layer has no activation, so the loss is told to expect logits
logits_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model_1.compile(optimizer='adam', loss=logits_loss, metrics=['accuracy'])

# Train for 10 epochs, evaluating on the validation dataset after each one
history_1 = model_1.fit(train_ds, validation_data=val_ds, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


The accuracy on the training data is much higher than the validation accuracy, which suggests the model is overfitting, so we will try other configurations.

In [14]:
# Same architecture as model_1 without the Conv2D(64, 3, padding='same', activation='relu')
# layer and the MaxPooling2D() layer that followed it

# Fix the TensorFlow random seed before building the model
tf.random.set_seed(42)

# One output unit per class
num_classes = len(class_names)

# Pixel rescaling, two Conv2D/MaxPooling2D stages (16 and 32 filters),
# then a dense head that outputs raw logits
model_2_layers = [layers.Rescaling(1./255, input_shape=(img_height, img_width, 3))]
for n_filters in (16, 32):
    model_2_layers.append(layers.Conv2D(n_filters, 3, padding='same', activation='relu'))
    model_2_layers.append(layers.MaxPooling2D())
model_2_layers.extend([
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dense(num_classes),
])
model_2 = keras.Sequential(model_2_layers)

# The last layer has no activation, so the loss is told to expect logits
logits_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model_2.compile(optimizer='adam', loss=logits_loss, metrics=['accuracy'])

# Train for 10 epochs, evaluating on the validation dataset after each one
history_2 = model_2.fit(train_ds, validation_data=val_ds, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [15]:
# Fix the TensorFlow random seed before building the model
tf.random.set_seed(42)

# One output unit per class
num_classes = len(class_names)

# Augmentation block placed at the front of the network:
# horizontal flip, rotation and zoom layers
augmentation_layers = [
    layers.RandomFlip("horizontal", input_shape=(img_height, img_width, 3)),
    layers.RandomRotation(0.1),
    layers.RandomZoom(0.1),
]
data_augmentation = keras.Sequential(augmentation_layers)

# Augmentation, pixel rescaling, three Conv2D/MaxPooling2D stages
# (16, 32 and 64 filters), dropout, then a dense head that outputs raw logits
model_3_layers = [data_augmentation, layers.Rescaling(1./255)]
for n_filters in (16, 32, 64):
    model_3_layers.append(layers.Conv2D(n_filters, 3, padding='same', activation='relu'))
    model_3_layers.append(layers.MaxPooling2D())
model_3_layers.extend([
    layers.Dropout(0.2),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dense(num_classes),
])
model_3 = keras.Sequential(model_3_layers)

# The last layer has no activation, so the loss is told to expect logits
logits_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model_3.compile(optimizer='adam', loss=logits_loss, metrics=['accuracy'])

# Train for 10 epochs, evaluating on the validation dataset after each one
history_3 = model_3.fit(train_ds, validation_data=val_ds, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In this model, a data augmentation block is added at the start and a dropout layer is added after the last pooling layer.
For the purposes of this exercise, ~83% accuracy will be considered acceptable, so the model is saved so that it can be used in a predictive system.

In [16]:
# Persist the trained model_3 under the path "best_model" so it can be loaded elsewhere.
# NOTE(review): the recorded output shows assets written under best_model\assets, i.e. a
# directory was produced. Keras versions that require a ".keras"/".h5" file extension
# would reject this extension-less path; confirm the TensorFlow/Keras version in use.
model_3.save("best_model")

INFO:tensorflow:Assets written to: best_model\assets


INFO:tensorflow:Assets written to: best_model\assets
