In [2]:
import numpy as np
import pandas as pd
import cv2
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MultiLabelBinarizer
from tqdm import tqdm
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

In [3]:
# Load your CSV file
csv_file_path = '../DL_for_Hin_Chest_X_Ray/HIN_archive/Data_Entry_2017.csv'
df = pd.read_csv(csv_file_path)

# Initialize constants
IMAGE_DIR = "../DL_for_Hin_Chest_X_Ray/HIN_archive/images/"
IMAGE_SIZE = (224, 224)

# Initialize the multi-label binarizer
mlb = MultiLabelBinarizer()

unique_labels = df["Finding Labels"].str.split("|").explode().unique()
mlb.fit([unique_labels])
unique_labels

array(['Cardiomegaly', 'Emphysema', 'Effusion', 'No Finding', 'Hernia',
       'Infiltration', 'Mass', 'Nodule', 'Atelectasis', 'Pneumothorax',
       'Pleural_Thickening', 'Pneumonia', 'Fibrosis', 'Edema',
       'Consolidation'], dtype=object)

In [3]:
def preprocess_image(file_path, image_size=IMAGE_SIZE):
    """
    Loads and preprocesses an image from the given file path.
    Resizes to the specified image size and normalizes pixel values.
    """
    image = cv2.imread(file_path, cv2.IMREAD_GRAYSCALE)  # Load image in grayscale
    if image is None:
        return None
    image = cv2.resize(image, image_size)
    image = image / 255.0  # Normalize pixel values to [0, 1]
    return image


def prepare_data(df, image_dir=IMAGE_DIR, image_size=IMAGE_SIZE):
    """
    Prepares images and labels from the dataset for model training.
    - Loads images based on 'Image Index' in the dataframe.
    - Converts 'Finding Labels' to one-hot encoded vectors.
    - Returns arrays of images and labels.
    """
    images = []
    labels = []
    
    for _, row in df.iterrows():
        # Construct image path
        image_path = os.path.join(image_dir, row["Image Index"])
        image = preprocess_image(image_path, image_size)
        
        if image is not None:
            images.append(image)
            # Convert labels into a list of diseases, then one-hot encode
            label = row["Finding Labels"].split("|")
            labels.append(label)
    
    # Convert lists to arrays
    images = np.array(images).reshape(-1, image_size[0], image_size[1], 1)  # Adding channel dimension for grayscale
    labels = mlb.transform(labels)  # Convert labels to multi-label one-hot encoding
    
    return images, labels

In [4]:
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(IMAGE_SIZE[0], IMAGE_SIZE[1], 1)),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(len(unique_labels), activation='sigmoid')  # For multi-label classification
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
2025-02-13 20:55:05.225148: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2025-02-13 20:55:05.225270: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2025-02-13 20:55:05.225303: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
I0000 00:00:1739476505.226052 28171155 pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
I0000 00:00:1739476505.226237 28171155 pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [1]:
images, labels = prepare_data(df)

image_sizes = [(64, 64), (128, 128), (256, 256)]
percentages = [0.1, 0.2, 0.3]
results = {}

for image_size in image_sizes:
    for pct in percentages:
        # Resize images
        resized_images = np.array(images).reshape(-1, image_size[0], image_size[1], 1)
        # Slice data based on percentage
        subset_images = resized_images[:int(pct * len(images))]
        subset_labels = labels[:int(pct * len(labels))]

        # Train the model
        model.fit(subset_images, subset_labels, epochs=10, batch_size=32, validation_split=0.2)

        # Evaluate and store accuracy
        accuracy = model.evaluate(subset_images, subset_labels)
        results[(image_size, pct)] = accuracy

print(results)

NameError: name 'prepare_data' is not defined