In [1]:
!pip install kaggle



In [3]:
# Upload the kaggle.json file
from google.colab import files
files.upload()  # Upload kaggle.json file again if needed
# Remove the conflicting .kaggle file (if it exists)
!rm -f /root/.kaggle

# Create the .kaggle directory
!mkdir -p /root/.kaggle
# Skip mkdir and directly copy kaggle.json
!cp kaggle.json ~/.kaggle/

# Set proper permissions for the file
!chmod 600 ~/.kaggle/kaggle.json

cp: cannot stat 'kaggle.json': No such file or directory
chmod: cannot access '/root/.kaggle/kaggle.json': No such file or directory


In [2]:
!kaggle datasets list

Traceback (most recent call last):
  File "/usr/local/bin/kaggle", line 5, in <module>
    from kaggle.cli import main
  File "/usr/local/lib/python3.10/dist-packages/kaggle/__init__.py", line 7, in <module>
    api.authenticate()
  File "/usr/local/lib/python3.10/dist-packages/kaggle/api/kaggle_api_extended.py", line 407, in authenticate
    raise IOError('Could not find {}. Make sure it\'s located in'
OSError: Could not find kaggle.json. Make sure it's located in /root/.config/kaggle. Or use the environment method. See setup instructions at https://github.com/Kaggle/kaggle-api/


In [2]:
!kaggle datasets download -d itsanmol124/mimic-cxr

Dataset URL: https://www.kaggle.com/datasets/itsanmol124/mimic-cxr
License(s): unknown
Downloading mimic-cxr.zip to /content
100% 2.04G/2.04G [00:31<00:00, 32.2MB/s]
100% 2.04G/2.04G [00:31<00:00, 68.9MB/s]


In [3]:
import zipfile

# Unzip the dataset
with zipfile.ZipFile("/content/mimic-cxr.zip", "r") as zip_ref:
    zip_ref.extractall("/content/mimic_cxr")
print("Dataset extracted to /content/mimic_cxr")

Dataset extracted to /content/mimic_cxr


In [10]:
# Define the model
def create_hybrid_model(input_shape, num_labels):
    inputs = tf.keras.Input(shape=input_shape)
    x = layers.Conv2D(32, (3, 3), activation="relu")(inputs)
    x = layers.MaxPooling2D((2, 2))(x)
    x = layers.Conv2D(64, (3, 3), activation="relu")(x)
    x = layers.MaxPooling2D((2, 2))(x)
    x = layers.Conv2D(128, (3, 3), activation="relu")(x)
    x = layers.MaxPooling2D((2, 2))(x)
    x = layers.Flatten()(x)
    x = layers.Dense(256, activation="relu")(x)
    x = layers.Dropout(0.5)(x)
    x = layers.Dense(128, activation="relu")(x)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(num_labels, activation="sigmoid")(x)
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    return model

In [4]:
import os
import zipfile
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split

# Load CSV file
extract_dir = "/content/mimic_cxr"
csv_file = os.path.join(extract_dir, "mimic-cxr.csv")

# Step 2: Load CSV and define features
data_df = pd.read_csv(csv_file)
tabular_features = [
    "Atelectasis", "Cardiomegaly", "Consolidation", "Edema",
    "Enlarged Cardiomediastinum", "Lung Lesion", "Lung Opacity",
    "Pleural Effusion", "Pneumonia", "Pneumothorax"
]
for feature in tabular_features:
    data_df[feature] = data_df[feature].astype(np.float32)

# Preprocess the label column for multi-label classification
all_labels = [
    "Normal", "Atelectasis", "Cardiomegaly", "Consolidation", "Edema",
    "Enlarged Cardiomediastinum", "Lung Lesion", "Lung Opacity",
    "Pleural Effusion", "Pneumonia", "Pneumothorax"
]

def process_labels(label_str):
    """Convert label strings to one-hot encoded vectors."""
    labels = label_str.split(", ")
    return [1 if label in labels else 0 for label in all_labels]

data_df['label'] = data_df['label'].apply(process_labels)


data_df[tabular_features] = data_df[tabular_features].astype(float)

#data_df['filepath'] = data_df['filename'].apply(lambda x: os.path.join(extract_dir, x))
data_df['filepath'] = data_df.apply(lambda row: os.path.join(extract_dir, row['split'], row['filename']), axis=1)


In [5]:
# Step 2: Prepare undersampled training dataset
train_data = data_df[data_df['split'] == 'train']
valid_data = data_df[data_df['split'] == 'valid']
test_data = data_df[data_df['split'] == 'test']

print(f"Training Set: {len(train_data)}")
print(f"Validation Set: {len(valid_data)}")
print(f"Testing Set: {len(test_data)}")


Training Set: 83837
Validation Set: 711
Testing Set: 1455


In [6]:
# Undersample training dataset
undersampled_train_data = train_data.sample(n=10000, random_state=42)

# Combine valid and test datasets without modification
valid_data.reset_index(drop=True, inplace=True)
test_data.reset_index(drop=True, inplace=True)

print(f"Undersampled Training Set: {len(undersampled_train_data)}")
print(f"Validation Set: {len(valid_data)}")
print(f"Testing Set: {len(test_data)}")

# Verify label column is one-hot encoded
print(undersampled_train_data['label'].head())


Undersampled Training Set: 10000
Validation Set: 711
Testing Set: 1455
43437    [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
42418    [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
44136    [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0]
65209    [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]
14608    [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Name: label, dtype: object


In [7]:
# Step 3: Define preprocessing functions
# Preprocessing functions
def preprocess_image(image_path):
    image = tf.io.read_file(image_path)
    image = tf.image.decode_jpeg(image, channels=3)
    image = tf.image.resize(image, [224, 224])
    image /= 255.0  # Normalize to [0, 1]
    return image

def preprocess_tabular_data(tabular_values):
    # Explicitly cast tabular values to float32
    tabular_tensor = tf.convert_to_tensor(tabular_values, dtype=tf.float32)
    return tabular_tensor

def load_and_preprocess(filepath, label, tabular_values):
    """Load and preprocess image, tabular data, and label."""
    image = preprocess_image(filepath)
    tabular_data = preprocess_tabular_data(tabular_values)

    # Cast label to float32
    label = tf.cast(label, dtype=tf.float32)

    return (image, tabular_data), label

In [8]:
def create_tf_dataset(data):
    filepaths = data["filepath"].tolist()
    labels = np.array(data["label"].tolist())  # Convert labels to NumPy arrays
    tabular_data = data[tabular_features].values.astype(np.float32)  # Ensure tabular data is float32

    dataset = tf.data.Dataset.from_tensor_slices((filepaths, labels, tabular_data))
    dataset = dataset.map(
        lambda filepath, label, tabular_values: load_and_preprocess(filepath, label, tabular_values),
        num_parallel_calls=tf.data.AUTOTUNE
    )
    return dataset.batch(32).prefetch(tf.data.AUTOTUNE)

train_dataset = create_tf_dataset(undersampled_train_data)
valid_dataset = create_tf_dataset(valid_data)
test_dataset = create_tf_dataset(test_data)

In [9]:
# Hybrid Model Definition
image_input = layers.Input(shape=(224, 224, 3), name="image_input")
image_model = tf.keras.applications.ResNet50(weights="imagenet", include_top=False, input_tensor=image_input)
image_features = layers.Flatten()(image_model.output)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m94765736/94765736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [10]:
# Tabular input
tabular_input = layers.Input(shape=(len(tabular_features),), name="tabular_input")
tabular_features_dense = layers.Dense(128, activation="relu")(tabular_input)

# Combine image and tabular features
combined = layers.concatenate([image_features, tabular_features_dense])
combined_dense = layers.Dense(128, activation="relu")(combined)
output = layers.Dense(11, activation="sigmoid")(combined_dense)

# Build the hybrid model
hybrid_model = models.Model(inputs=[image_input, tabular_input], outputs=output)
hybrid_model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Train the model
epochs = 30
history = hybrid_model.fit(
    train_dataset,
    validation_data=valid_dataset,
    epochs=epochs
)


Epoch 1/30
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m188s[0m 405ms/step - accuracy: 0.4102 - loss: 0.7329 - val_accuracy: 0.7932 - val_loss: 0.2761
Epoch 2/30
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 297ms/step - accuracy: 0.7930 - loss: 0.0505 - val_accuracy: 0.8650 - val_loss: 0.0349
Epoch 3/30
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 301ms/step - accuracy: 0.8176 - loss: 0.0086 - val_accuracy: 0.8594 - val_loss: 0.0047
Epoch 4/30
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 302ms/step - accuracy: 0.8218 - loss: 0.0029 - val_accuracy: 0.8608 - val_loss: 0.0017
Epoch 5/30
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 303ms/step - accuracy: 0.8267 - loss: 0.0013 - val_accuracy: 0.8523 - val_loss: 0.0010
Epoch 6/30
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 302ms/step - accuracy: 0.8284 - loss: 6.7294e-04 - val_accuracy: 0.8495 - val_loss: 6.9319e