In [11]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Dataset location and loading parameters
dataset_path = "/content/cell_images/cell_images"
batch_size = 32  # images per batch; loading in batches keeps RAM usage low
img_size = (128, 128)  # every image is resized to 128x128

# A single generator serves both subsets: it rescales pixel values by 1/255
# and reserves 20% of the files for validation.
datagen = ImageDataGenerator(validation_split=0.2, rescale=1./255)

# Arguments shared by the training and validation iterators
flow_kwargs = dict(
    target_size=img_size,
    batch_size=batch_size,
    class_mode="binary",
)

# Training subset (the 80% not reserved by validation_split)
train_generator = datagen.flow_from_directory(dataset_path, subset="training", **flow_kwargs)

# Validation subset (the reserved 20%)
val_generator = datagen.flow_from_directory(dataset_path, subset="validation", **flow_kwargs)

# Report how many batches each iterator provides
for split_label, generator in (("Training", train_generator), ("Validation", val_generator)):
    print(f"{split_label} batches:", len(generator))


Found 22048 images belonging to 2 classes.
Found 5510 images belonging to 2 classes.
Training batches: 689
Validation batches: 173


The next cells locate the Parasitized and Uninfected image folders and define a function that loads and preprocesses the images from both classes.

In [12]:
import os

# Root folder of the extracted archive
dataset_path = "/content/cell_images/cell_images"

# Show what the dataset folder contains
print("Contents of '/content/cell_images/cell_images':", os.listdir(dataset_path))

# Per-class image folders
parasitized_dir = os.path.join(dataset_path, "Parasitized")
uninfected_dir = os.path.join(dataset_path, "Uninfected")

# Report, for each class folder in turn, whether it is present on disk
for class_name, class_dir in (("Parasitized", parasitized_dir), ("Uninfected", uninfected_dir)):
    if os.path.exists(class_dir):
        print(f"Found {class_name} Directory: {class_dir}")
    else:
        print(f"Error: {class_dir} does not exist.")


Contents of '/content/cell_images/cell_images': ['Uninfected', 'Parasitized']
Found Parasitized Directory: /content/cell_images/cell_images/Parasitized
Found Uninfected Directory: /content/cell_images/cell_images/Uninfected


In [13]:
import os
import cv2
import numpy as np
import gc  # Garbage collection module

IMG_SIZE = 64  # Side length (in pixels) that every image is resized to
BATCH_SIZE = 50  # Number of images per yielded batch


def load_and_preprocess_images(directory, label, img_size=IMG_SIZE, batch_size=BATCH_SIZE):
    """Yield preprocessed images from ``directory`` in batches.

    Each readable file is loaded with OpenCV, converted from BGR to RGB,
    resized to ``img_size`` x ``img_size`` and scaled to [0, 1].

    Args:
        directory: Folder whose files are loaded (``Thumbs.db`` is ignored).
        label: Value stored in the label array for every image of this folder.
        img_size: Target width and height in pixels.
        batch_size: Maximum number of images per yielded batch (must be >= 1).

    Yields:
        ``(images, labels)`` tuples of NumPy arrays. ``images`` has shape
        ``(n, img_size, img_size, 3)`` and dtype float32, ``labels`` has shape
        ``(n,)``; ``n`` equals ``batch_size`` except possibly for the last batch.
        Empty batches are never yielded.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    images, labels = [], []

    # os.listdir() returns entries in arbitrary order; sorting makes the sample
    # order (and therefore any seeded split performed later) reproducible.
    valid_files = sorted(f for f in os.listdir(directory) if f.lower() != "thumbs.db")

    for img_name in valid_files:
        img = cv2.imread(os.path.join(directory, img_name))
        if img is None:
            # imread signals an unreadable file by returning None
            print(f"Skipping corrupted file: {img_name}")
            continue

        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = cv2.resize(img, (img_size, img_size))

        # float32 instead of the float64 produced by plain `img / 255.0`:
        # halves the memory needed to hold the dataset.
        images.append(img.astype(np.float32) / 255.0)
        labels.append(label)

        # Batch on the number of images actually collected, not on the file
        # index: skipped files must not produce short or empty batches.
        if len(images) == batch_size:
            yield np.array(images), np.array(labels)
            images, labels = [], []  # Reset for next batch
            gc.collect()  # Release the batch that was just handed over

    # Yield last (partial) batch if any images remain
    if images:
        yield np.array(images), np.array(labels)
    gc.collect()  # Final garbage collection

def _load_class(directory, label):
    """Collect every batch produced for one class folder and merge them into two arrays."""
    image_batches, label_batches = [], []
    for batch_images, batch_labels in load_and_preprocess_images(directory, label=label):
        image_batches.append(batch_images)
        label_batches.append(batch_labels)
    # Concatenate all the batches into one array per kind
    return np.concatenate(image_batches), np.concatenate(label_batches)


# Parasitized images are labelled 1, Uninfected images 0
parasitized_images, parasitized_labels = _load_class(parasitized_dir, label=1)
uninfected_images, uninfected_labels = _load_class(uninfected_dir, label=0)

# Report how many images were loaded for each class
print(f"Loaded {len(parasitized_images)} Parasitized images")
print(f"Loaded {len(uninfected_images)} Uninfected images")


Loaded 13779 Parasitized images
Loaded 13779 Uninfected images


In [14]:
import psutil
# Free RAM reported by psutil, converted from bytes to GiB
available_gb = psutil.virtual_memory().available / (1024**3)
print("Available memory (in GB):", available_gb)


Available memory (in GB): 0.9740028381347656


In [None]:
from sklearn.model_selection import train_test_split

# Stack both classes into one image array and one label array (parasitized first)
images = np.concatenate((parasitized_images, uninfected_images))
labels = np.concatenate((parasitized_labels, uninfected_labels))

# Hold out 20% of the samples for testing; the fixed seed makes the shuffle repeatable
X_train, X_test, y_train, y_test = train_test_split(
    images,
    labels,
    random_state=42,
    test_size=0.2,
)

# Report the shape of each split
for split_name, split_data in (("Training", X_train), ("Testing", X_test)):
    print(f"{split_name} data shape: {split_data.shape}")


Custom CNN Models (2-Layer vs. 3-Layer)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

In [None]:
def build_2_layer_cnn(input_shape=(128, 128, 3)):
    """Build and compile a CNN with two convolution / max-pooling stages.

    Args:
        input_shape: Shape of one input image, passed to the first Conv2D layer.

    Returns:
        A compiled Sequential model with a single sigmoid output unit, using
        the Adam optimizer, binary cross-entropy loss and an accuracy metric.
    """
    model = Sequential()

    # Convolutional feature extractor: 32 then 64 filters, each stage followed by 2x2 pooling
    model.add(Conv2D(32, (3, 3), activation='relu', input_shape=input_shape))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Conv2D(64, (3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))

    # Classifier head: dense layer with dropout, then one sigmoid unit
    # for binary classification (parasitized vs uninfected)
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


In [None]:
def build_3_layer_cnn(input_shape=(128, 128, 3)):
    """Build and compile a CNN with three convolution / max-pooling stages.

    Args:
        input_shape: Shape of one input image, passed to the first Conv2D layer.

    Returns:
        A compiled Sequential model with a single sigmoid output unit, using
        the Adam optimizer, binary cross-entropy loss and an accuracy metric.
    """
    model = Sequential()

    # First stage receives the input shape; later stages only widen the filter count
    model.add(Conv2D(32, (3, 3), activation='relu', input_shape=input_shape))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    for n_filters in (64, 128):
        model.add(Conv2D(n_filters, (3, 3), activation='relu'))
        model.add(MaxPooling2D(pool_size=(2, 2)))

    # Classifier head: dense layer with dropout, then one sigmoid unit
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

Train the Models

In [None]:
# Show the array shapes produced by the train/test split
for array_name, array in (("X_train", X_train), ("X_test", X_test)):
    print(f"{array_name} shape before reshaping: {array.shape}")


In [None]:
# The images were loaded at IMG_SIZE x IMG_SIZE (64x64), not 128x128.
# Reshaping to (-1, 128, 128, 3) either raises (element count not divisible)
# or silently packs four images into one sample, so images and labels no
# longer line up. Reshape to the size the data actually has.
X_train_cnn = X_train.reshape(-1, IMG_SIZE, IMG_SIZE, 3)
X_test_cnn = X_test.reshape(-1, IMG_SIZE, IMG_SIZE, 3)

# Every sample must still have exactly one label
assert len(X_train_cnn) == len(y_train), "X_train_cnn / y_train length mismatch"
assert len(X_test_cnn) == len(y_test), "X_test_cnn / y_test length mismatch"

# NOTE: the model builders default to input_shape=(128, 128, 3); pass
# input_shape=X_train_cnn.shape[1:] when building models for this data.
