In [1]:
import numpy as np
import pandas as pd
import cv2
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MultiLabelBinarizer
from tqdm import tqdm
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

In [2]:
# Load your CSV file
csv_file_path = '../DL_for_Hin_Chest_X_Ray/HIN_archive/Data_Entry_2017.csv'
df = pd.read_csv(csv_file_path)

# Initialize constants
IMAGE_DIR = "../DL_for_Hin_Chest_X_Ray/HIN_archive/images/"
IMAGE_SIZE = (256, 256)

# Initialize the multi-label binarizer
mlb = MultiLabelBinarizer()

unique_labels = df["Finding Labels"].str.split("|").explode().unique()
mlb.fit([unique_labels])
unique_labels

array(['Cardiomegaly', 'Emphysema', 'Effusion', 'No Finding', 'Hernia',
       'Infiltration', 'Mass', 'Nodule', 'Atelectasis', 'Pneumothorax',
       'Pleural_Thickening', 'Pneumonia', 'Fibrosis', 'Edema',
       'Consolidation'], dtype=object)

In [3]:
def preprocess_image(file_path, image_size=IMAGE_SIZE):
    """
    Loads and preprocesses an image from the given file path.
    Resizes to the specified image size and normalizes pixel values.
    """
    image = cv2.imread(file_path, cv2.IMREAD_GRAYSCALE)  # Load image in grayscale
    if image is None:
        return None
    image = cv2.resize(image, image_size)
    image = image / 255.0  # Normalize pixel values to [0, 1]
    return image


def prepare_data(df, image_dir=IMAGE_DIR, image_size=IMAGE_SIZE):
    """
    Prepares images and labels from the dataset for model training.
    - Loads images based on 'Image Index' in the dataframe.
    - Converts 'Finding Labels' to one-hot encoded vectors.
    - Returns arrays of images and labels.
    """
    images = []
    labels = []
    
    for _, row in df.iterrows():
        # Construct image path
        image_path = os.path.join(image_dir, row["Image Index"])
        image = preprocess_image(image_path, image_size)
        
        if image is not None:
            images.append(image)
            # Convert labels into a list of diseases, then one-hot encode
            label = row["Finding Labels"].split("|")
            labels.append(label)
    
    # Convert lists to arrays
    images = np.array(images).reshape(-1, image_size[0], image_size[1], 1)  # Adding channel dimension for grayscale
    labels = mlb.transform(labels)  # Convert labels to multi-label one-hot encoding
    
    return images, labels

In [7]:
def create_model(image_size_x, image_size_y):

    model = Sequential([
        Conv2D(32, (3, 3), activation='relu', input_shape=(image_size_x, image_size_y, 1)),
        MaxPooling2D((2, 2)),
        Conv2D(64, (3, 3), activation='relu'),
        MaxPooling2D((2, 2)),
        Flatten(),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(len(unique_labels), activation='sigmoid')  # For multi-label classification
    ])

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [None]:
images, labels = prepare_data(df)



In [8]:
image_sizes = [(256, 256), (128, 128), (64, 64)]
percentages = [0.05]
results = {}

for image_size in image_sizes:
    for pct in percentages:
        # Resize images
        resized_images = np.array(images).reshape(-1, image_size[0], image_size[1], 1)
        # Slice data based on percentage
        subset_images = resized_images[:int(pct * len(images))]
        subset_labels = labels[:int(pct * len(labels))]

        model = create_model(image_size[0], image_size[1])

        # Train the model
        model.fit(subset_images, subset_labels, epochs=10, batch_size=32, validation_split=0.2)

        # Evaluate and store accuracy
        accuracy = model.evaluate(subset_images, subset_labels)
        results[(image_size, pct)] = accuracy

print(results)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10


2025-02-14 16:39:27.149687: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 369ms/step - accuracy: 0.1477 - loss: 1.1117 - val_accuracy: 0.1333 - val_loss: 0.3369
Epoch 2/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 200ms/step - accuracy: 0.0953 - loss: 0.4827 - val_accuracy: 0.1333 - val_loss: 0.2911
Epoch 3/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 177ms/step - accuracy: 0.1122 - loss: 0.4206 - val_accuracy: 0.3667 - val_loss: 0.2900
Epoch 4/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 175ms/step - accuracy: 0.1978 - loss: 0.3707 - val_accuracy: 0.3667 - val_loss: 0.3073
Epoch 5/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 186ms/step - accuracy: 0.2571 - loss: 0.3315 - val_accuracy: 0.2500 - val_loss: 0.3076
Epoch 6/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 238ms/step - accuracy: 0.3105 - loss: 0.3113 - val_accuracy: 0.3500 - val_loss: 0.3053
Epoch 7/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[