In [1]:
!pip install spektral



In [2]:
!kaggle datasets download -d itsanmol124/mimic-cxr

Dataset URL: https://www.kaggle.com/datasets/itsanmol124/mimic-cxr
License(s): unknown
mimic-cxr.zip: Skipping, found more recently modified local copy (use --force to force download)


In [3]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow.sparse as tf_sparse
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Dense, GlobalAveragePooling2D, concatenate
from spektral.utils import normalized_adjacency
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras import layers, models, metrics, regularizers, callbacks
from spektral.layers import GATConv
from sklearn.metrics.pairwise import cosine_similarity

# Unzip the dataset (if not already done)
import zipfile
with zipfile.ZipFile("/content/mimic-cxr.zip", "r") as zip_ref:
    zip_ref.extractall("/content/mimic_cxr")
print("Dataset extracted to /content/mimic_cxr")

Dataset extracted to /content/mimic_cxr


In [4]:
# Load CSV file
extract_dir = "/content/mimic_cxr"
csv_file = os.path.join(extract_dir, "mimic-cxr.csv")

data_df = pd.read_csv(csv_file)

# Define tabular features
tabular_features = [
    "Atelectasis", "Cardiomegaly", "Consolidation", "Edema",
    "Enlarged Cardiomediastinum", "Lung Lesion", "Lung Opacity",
    "Pleural Effusion", "Pneumonia", "Pneumothorax"
]

# Convert tabular features to float32
data_df[tabular_features] = data_df[tabular_features].astype(np.float32)

In [5]:
# Process labels
def process_labels(label_str):
    all_labels = [
        "Normal", "Atelectasis", "Cardiomegaly", "Consolidation", "Edema",
        "Enlarged Cardiomediastinum", "Lung Lesion", "Lung Opacity",
        "Pleural Effusion", "Pneumonia", "Pneumothorax"
    ]
    labels = label_str.split(", ")
    return [1 if label in labels else 0 for label in all_labels]

data_df['label'] = data_df['label'].apply(process_labels)

# Add file paths
data_df['filepath'] = data_df.apply(lambda row: os.path.join(extract_dir, row['split'], row['filename']), axis=1)

# Split data into train, validation, and test sets
train_data = data_df[data_df['split'] == 'train'].sample(n=1000, random_state=42)
valid_data = data_df[data_df['split'] == 'valid'].reset_index(drop=True)
test_data = data_df[data_df['split'] == 'test'].reset_index(drop=True)

In [6]:
# Preprocess images
def preprocess_image(image_path):
    image = tf.io.read_file(image_path)
    image = tf.image.decode_jpeg(image, channels=3)
    image = tf.image.resize(image, [224, 224])
    image /= 255.0
    return image

# Preprocess tabular data
def preprocess_tabular_data(tabular_values):
    return tf.convert_to_tensor(tabular_values, dtype=tf.float32)

# Load and preprocess data
def load_and_preprocess(filepath, label, tabular_values):
    image = preprocess_image(filepath)
    tabular_data = preprocess_tabular_data(tabular_values)
    label = tf.cast(label, dtype=tf.float32)
    return (image, tabular_data), label

# Create TensorFlow datasets
def create_tf_dataset(data):
    filepaths = data["filepath"].tolist()
    labels = np.array(data["label"].tolist())
    tabular_data = data[tabular_features].values.astype(np.float32)
    dataset = tf.data.Dataset.from_tensor_slices((filepaths, labels, tabular_data))
    dataset = dataset.map(lambda filepath, label, tabular_values: load_and_preprocess(filepath, label, tabular_values),
                          num_parallel_calls=tf.data.AUTOTUNE)
    return dataset.batch(32).prefetch(buffer_size=tf.data.AUTOTUNE)

train_dataset = create_tf_dataset(train_data)
valid_dataset = create_tf_dataset(valid_data)
test_dataset = create_tf_dataset(test_data)

In [7]:
# Custom GATConv layer
class CustomGATConv(GATConv):
    def call(self, inputs):
        x, a = inputs  # x: node features, a: adjacency matrix
        return super().call([x, a])

# Build hybrid GAT model
def build_hybrid_gat_model(num_features, num_nodes):
    # Image input branch
    image_input = Input(shape=(224, 224, 3), name="image_input")
    image_model = tf.keras.applications.ResNet50(weights="imagenet", include_top=False, input_tensor=image_input)
    image_features = GlobalAveragePooling2D()(image_model.output)

    # Tabular input branch
    tabular_input = Input(shape=(num_nodes, num_features), name="tabular_input")
    adj_input = Input(shape=(num_nodes, num_nodes), sparse=True, name="adj_input")

    # GATConv layer
    gat_output = CustomGATConv(channels=16, attn_heads=8, concat_heads=True)([tabular_input, adj_input])
    gat_output = layers.GlobalAveragePooling1D()(gat_output)

    # Combine image and GAT features
    combined = concatenate([image_features, gat_output])
    output = Dense(11, activation="sigmoid")(combined)

    model = tf.keras.Model(inputs=[image_input, tabular_input, adj_input], outputs=output)
    return model

In [8]:
# Define number of nodes and features
num_nodes = 100
num_features = len(tabular_features)

# Construct adjacency matrix using cosine similarity
tabular_data = train_data[tabular_features].values
adj_matrix = cosine_similarity(tabular_data)
adj_matrix = normalized_adjacency(adj_matrix)

# Convert adjacency matrix to sparse tensor
indices = np.array(np.nonzero(adj_matrix)).T
values = adj_matrix[indices[:, 0], indices[:, 1]]
adjacency_sparse = tf.sparse.SparseTensor(indices=indices, values=values, dense_shape=adj_matrix.shape)
adjacency_sparse = tf.sparse.reorder(adjacency_sparse)

In [9]:
# Build the model
hybrid_gat_model = build_hybrid_gat_model(num_features, num_nodes)
hybrid_gat_model.summary()

# Compile the model
hybrid_gat_model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy", Precision(name="precision"), Recall(name="recall")]
)

In [10]:
# Prepare training data
train_images = np.array([preprocess_image(filepath) for filepath in train_data['filepath']])
train_tabular = train_data[tabular_features].values
train_labels = np.array(train_data['label'].tolist())

# Prepare validation data
val_images = np.array([preprocess_image(filepath) for filepath in valid_data['filepath']])
val_tabular = valid_data[tabular_features].values
val_labels = np.array(valid_data['label'].tolist())

# Expand the tabular data to match the expected input shape (batch_size, num_nodes, num_features)
train_tabular = np.expand_dims(train_tabular, axis=1)  # Add num_nodes dimension
train_tabular = np.repeat(train_tabular, num_nodes, axis=1)  # Repeat along num_nodes dimension

val_tabular = np.expand_dims(val_tabular, axis=1)  # Add num_nodes dimension
val_tabular = np.repeat(val_tabular, num_nodes, axis=1)  # Repeat along num_nodes dimension

# Convert the adjacency matrix to a sparse tensor
adjacency_sparse = tf.sparse.SparseTensor(
    indices=np.array([[0, 1], [1, 2], [2, 3]]),  # Example indices
    values=np.array([1.0, 1.0, 1.0], dtype=np.float32),  # Example values
    dense_shape=(100, 100)  # Shape of the adjacency matrix
)
adjacency_sparse = tf.sparse.reorder(adjacency_sparse)  # Ensure it's properly ordered

# Repeat the adjacency matrix for each sample in the batch
adjacency_sparse_train = tf.sparse.expand_dims(adjacency_sparse, axis=0)  # Add batch dimension
adjacency_sparse_train = tf.sparse.concat(0, [adjacency_sparse_train] * len(train_images))  # Repeat for each sample

adjacency_sparse_val = tf.sparse.expand_dims(adjacency_sparse, axis=0)  # Add batch dimension
adjacency_sparse_val = tf.sparse.concat(0, [adjacency_sparse_val] * len(val_images))  # Repeat f

# Print shapes to verify
print("Train Images Shape:", train_images.shape)  # (500, 224, 224, 3)
print("Train Tabular Shape:", train_tabular.shape)  # (500, 10)
print("Adjacency Matrix Train Shape:", adjacency_sparse_train.shape)  # (500, 100, 100)
print("Train Labels Shape:", train_labels.shape)  # (500, 11)

Train Images Shape: (1000, 224, 224, 3)
Train Tabular Shape: (1000, 100, 10)
Adjacency Matrix Train Shape: (1000, 100, 100)
Train Labels Shape: (1000, 11)


In [11]:
# Train the model
history = hybrid_gat_model.fit(
    x=[train_images, train_tabular, adjacency_sparse_train],
    y=train_labels,
    batch_size=16,
    epochs=25,
    validation_data=([val_images, val_tabular, adjacency_sparse_val], val_labels)
)

# Calculate Precision, Recall, and F1-Score
precision = history.history['precision'][-1]
recall = history.history['recall'][-1]
f1_score = 2 * (precision * recall) / (precision + recall + 1e-7)
#Calculate for validation
val_precision = history.history['val_precision'][-1]
val_recall = history.history['val_recall'][-1]
val_f1_score = 2 * (val_precision * val_recall) / (val_precision + val_recall + 1e-7)
'''
print(f"Final Precision: {precision:.4f}")
print(f"Final Recall: {recall:.4f}")
print(f"Final F1-Score: {f1_score:.4f}")
'''
print(f"Final Validation Precision: {val_precision:.4f}")
print(f"Final Validation Recall: {val_recall:.4f}")
print(f"Final Validation F1-Score: {val_f1_score:.4f}")

Epoch 1/25
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 828ms/step - accuracy: 0.2735 - loss: 0.4381 - precision: 0.2697 - recall: 0.1170 - val_accuracy: 0.4191 - val_loss: 45.4956 - val_precision: 0.1876 - val_recall: 0.5820
Epoch 2/25
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 201ms/step - accuracy: 0.5711 - loss: 0.2798 - precision: 0.9439 - recall: 0.2594 - val_accuracy: 0.7243 - val_loss: 0.3425 - val_precision: 0.8179 - val_recall: 0.9171
Epoch 3/25
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 203ms/step - accuracy: 0.6608 - loss: 0.1652 - precision: 0.9581 - recall: 0.7082 - val_accuracy: 0.8059 - val_loss: 0.3221 - val_precision: 0.7332 - val_recall: 0.9930
Epoch 4/25
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 207ms/step - accuracy: 0.7210 - loss: 0.1037 - precision: 0.9792 - recall: 0.8781 - val_accuracy: 0.8017 - val_loss: 0.3201 - val_precision: 0.7140 - val_recall: 1.0000
Epoch 5/25
[1m63/

In [12]:
# Prepare test data
test_images = np.array([preprocess_image(filepath) for filepath in test_data['filepath']])
test_tabular = test_data[tabular_features].values
test_labels = np.array(test_data['label'].tolist())

# Expand the tabular data to match the expected input shape (batch_size, num_nodes, num_features)
test_tabular = np.expand_dims(test_tabular, axis=1)  # Add num_nodes dimension
test_tabular = np.repeat(test_tabular, num_nodes, axis=1)  # Repeat along num_nodes dimension

# Repeat the adjacency matrix for each sample in the batch
adjacency_sparse_test = tf.sparse.expand_dims(adjacency_sparse, axis=0)  # Add batch dimension
adjacency_sparse_test = tf.sparse.concat(0, [adjacency_sparse_test] * len(test_images))  # Repeat for each sample

# Print shapes to verify
print("Test Images Shape:", test_images.shape)
print("Test Tabular Shape:", test_tabular.shape)
print("Adjacency Matrix Test Shape:", adjacency_sparse_test.shape)
print("Test Labels Shape:", test_labels.shape)

# Evaluate the model on the test dataset
test_results = hybrid_gat_model.evaluate(
    x=[test_images, test_tabular, adjacency_sparse_test],
    y=test_labels,
    batch_size=16,
    verbose=1
)

# Extract metrics from test results
test_loss = test_results[0]
test_accuracy = test_results[1]
test_precision = test_results[2]
test_recall = test_results[3]
test_f1_score = 2 * (test_precision * test_recall) / (test_precision + test_recall + 1e-7)

# Print test metrics
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test Precision: {test_precision:.4f}")
print(f"Test Recall: {test_recall:.4f}")
print(f"Test F1-Score: {test_f1_score:.4f}")

Test Images Shape: (1455, 224, 224, 3)
Test Tabular Shape: (1455, 100, 10)
Adjacency Matrix Test Shape: (1455, 100, 100)
Test Labels Shape: (1455, 11)
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 94ms/step - accuracy: 0.7485 - loss: 0.0210 - precision: 0.9963 - recall: 0.9684
Test Loss: 0.0221
Test Accuracy: 0.7485
Test Precision: 0.9963
Test Recall: 0.9616
Test F1-Score: 0.9787
