In [None]:

import numpy as np
import pandas as pd
import os
from pathlib import Path
import nibabel as nib
from skimage.transform import resize
import matplotlib.pyplot as plt

# Seed NumPy's global RNG so every synthetic array generated below (and any
# later cell that draws from np.random) is identical after
# Restart Kernel -> Run All.
SEED = 42
np.random.seed(SEED)

# Set directories (all relative to the notebook's working directory)
BASE_DIR = Path("./experiments")
IMG_DIR = BASE_DIR / "images"
MRI_DIR = BASE_DIR / "mri_volumes"
TABULAR_DIR = BASE_DIR / "tabular"

for d in [IMG_DIR, MRI_DIR, TABULAR_DIR]:
    d.mkdir(parents=True, exist_ok=True)

# Synthetic X-ray images: 128x128 grayscale, uint8 intensities in [0, 255]
n_xray = 10
xray_images = np.random.randint(0, 256, size=(n_xray, 128, 128), dtype=np.uint8)
xray_labels = np.random.randint(0, 2, size=(n_xray,))  # binary classification

# Save X-rays as numpy arrays, one file per image: xray_<index>.npy
for i, img in enumerate(xray_images):
    np.save(IMG_DIR / f"xray_{i}.npy", img)

# Persist the labels as well (entry i belongs to xray_<i>.npy); otherwise the
# image/label pairing is lost as soon as the kernel restarts. The file name
# deliberately does NOT match the "xray_*.npy" pattern, so globbing for the
# images never picks up the label vector by accident.
np.save(IMG_DIR / "labels.npy", xray_labels)

# Synthetic MRI volumes: 64x64x64 float32 in [0, 1), each with a binary mask
n_mri = 5
mri_volumes = np.random.rand(n_mri, 64, 64, 64).astype(np.float32)
mri_masks = np.random.randint(0, 2, size=(n_mri, 64, 64, 64)).astype(np.uint8)

# Volume i and its mask share the same index: mri_<i>.npy / mask_<i>.npy
for i, vol in enumerate(mri_volumes):
    np.save(MRI_DIR / f"mri_{i}.npy", vol)
    np.save(MRI_DIR / f"mask_{i}.npy", mri_masks[i])

# Synthetic tabular data: one row per patient, binary target in "label"
n_patients = 20
tabular_df = pd.DataFrame({
    "age": np.random.randint(30, 80, size=n_patients),
    "sex": np.random.choice(["male", "female"], size=n_patients),
    "bp": np.random.normal(120, 15, size=n_patients),
    "chol": np.random.normal(200, 30, size=n_patients),
    "label": np.random.randint(0, 2, size=n_patients),
})
tabular_df.to_csv(TABULAR_DIR / "clinical_data.csv", index=False)

print("Synthetic X-rays, MRI volumes, and tabular data generated and saved.")


In [None]:

from tensorflow.keras.utils import to_categorical

# Load X-rays.
# Keep only files named xray_<integer>.npy and order them by that integer:
# a plain sorted() is lexicographic (xray_10 would come before xray_2), and
# the bare "xray_*.npy" glob would also match a label file such as
# xray_labels.npy and try to stack it with the images.
xray_files = sorted(
    (f for f in IMG_DIR.glob("xray_*.npy") if f.stem.split("_", 1)[1].isdecimal()),
    key=lambda f: int(f.stem.split("_", 1)[1]),
)
if not xray_files:
    raise FileNotFoundError(f"No xray_<index>.npy files found in {IMG_DIR}")
xray_ids = [int(f.stem.split("_", 1)[1]) for f in xray_files]

xray_data = np.stack([np.load(f) for f in xray_files])
xray_data = xray_data.astype("float32") / 255.0  # normalize to 0-1
xray_data = xray_data[..., np.newaxis]  # add channel dimension

# Load labels.
# A label file sits next to the images and is loaded ONCE, then indexed by
# image number. (Looking the file up per image, as before, would have loaded
# the whole vector for every image and produced an (n, n) array.)
# NOTE(review): assumes the file holds a 1-D vector of integer class ids where
# entry i belongs to xray_<i>.npy -- confirm against whatever writes it.
label_candidates = [IMG_DIR / "xray_labels.npy", IMG_DIR / "labels.npy"]
label_file = next((p for p in label_candidates if p.exists()), None)
if label_file is not None:
    stored_labels = np.load(label_file)
    if stored_labels.ndim != 1 or max(xray_ids) >= len(stored_labels):
        raise ValueError(
            f"{label_file} has shape {stored_labels.shape}; expected a 1-D vector "
            f"with at least {max(xray_ids) + 1} entries"
        )
    class_ids = stored_labels[xray_ids]
else:
    # No labels on disk: fall back to random placeholder labels, and say so
    # instead of doing it silently.
    print(f"No label file found in {IMG_DIR}; using random placeholder labels.")
    class_ids = np.random.randint(0, 2, size=len(xray_files))
xray_labels = to_categorical(class_ids, num_classes=2)

print("X-ray data shape:", xray_data.shape)
print("X-ray labels shape:", xray_labels.shape)

# Quick visualization of up to four images (fewer if the dataset is smaller)
n_show = min(4, len(xray_data))
fig, axes = plt.subplots(1, n_show, figsize=(8, 4), squeeze=False)
for ax, image, one_hot in zip(axes[0], xray_data, xray_labels):
    ax.imshow(image[..., 0], cmap="gray")
    ax.set_title(f"Label: {np.argmax(one_hot)}")
    ax.axis("off")
plt.show()


In [None]:

# Load MRI volumes and masks.
# Volumes are the files named mri_<integer>.npy, ordered numerically (a plain
# sorted() would put mri_10 before mri_2).
mri_files = sorted(
    (f for f in MRI_DIR.glob("mri_*.npy") if f.stem.split("_", 1)[1].isdecimal()),
    key=lambda f: int(f.stem.split("_", 1)[1]),
)
if not mri_files:
    raise FileNotFoundError(f"No mri_<index>.npy files found in {MRI_DIR}")

# Derive each mask path from its volume's index rather than globbing the masks
# separately, so volume i is always paired with mask i and a missing mask is
# reported instead of silently shifting the pairing.
mask_files = [f.with_name(f"mask_{f.stem.split('_', 1)[1]}.npy") for f in mri_files]
missing_masks = [m.name for m in mask_files if not m.exists()]
if missing_masks:
    raise FileNotFoundError(f"Missing mask files in {MRI_DIR}: {missing_masks}")

mri_raw = np.stack([np.load(f) for f in mri_files])
mri_masks = np.stack([np.load(f) for f in mask_files])
if mri_raw.shape != mri_masks.shape:
    raise ValueError(
        f"Volume stack {mri_raw.shape} and mask stack {mri_masks.shape} differ in shape"
    )

# Normalize MRI volumes to 0-1 using the min/max over ALL volumes together.
# Guard the constant-intensity case, which would otherwise divide by zero.
mri_min = mri_raw.min()
mri_range = mri_raw.max() - mri_min
if mri_range > 0:
    mri_data = (mri_raw - mri_min) / mri_range
else:
    mri_data = np.zeros(mri_raw.shape, dtype=np.float32)
mri_data = mri_data[..., np.newaxis]  # add channel dimension

# Add a channel dimension to the masks. They stay integer class ids in a
# single channel; no one-hot expansion is performed here.
mri_masks = mri_masks[..., np.newaxis]

print("MRI volumes shape:", mri_data.shape)
print("MRI masks shape:", mri_masks.shape)

# Quick slice visualization: middle slice along the third spatial axis for up
# to three volumes (top row) with their masks (bottom row).
slice_idx = mri_data.shape[3] // 2
n_show = min(3, mri_data.shape[0])
fig, axes = plt.subplots(2, n_show, figsize=(10, 4), squeeze=False)
for i in range(n_show):
    axes[0, i].imshow(mri_data[i, :, :, slice_idx, 0], cmap="gray")
    axes[0, i].set_title(f"Volume {i}")
    axes[0, i].axis("off")

    axes[1, i].imshow(mri_masks[i, :, :, slice_idx, 0], cmap="gray")
    axes[1, i].set_title(f"Mask {i}")
    axes[1, i].axis("off")
plt.show()


In [None]:

# Load tabular data
clinical_csv = TABULAR_DIR / "clinical_data.csv"
clinical_raw = pd.read_csv(clinical_csv)

# Encode categorical features.
# Series.map turns every value that is not a key of the mapping into NaN, so
# an unexpected category (e.g. "Male", "other") would silently become a NaN
# feature. Detect that case explicitly; values that were already missing in
# the CSV are left as NaN, exactly as before.
sex_codes = {"male": 0, "female": 1}
sex_encoded = clinical_raw["sex"].map(sex_codes)
unknown_sex = clinical_raw.loc[
    sex_encoded.isna() & clinical_raw["sex"].notna(), "sex"
].unique()
if len(unknown_sex) > 0:
    raise ValueError(
        f"Unrecognised values in 'sex' column: {sorted(map(str, unknown_sex))}; "
        f"expected one of {sorted(sex_codes)}"
    )
# Build a new frame instead of mutating the loaded one, so the raw data stays
# available for inspection.
clinical_df = clinical_raw.assign(sex=sex_encoded)

# Separate features (every column except "label") and labels
X_tab = clinical_df.drop(columns=['label']).values.astype('float32')
y_tab = clinical_df['label'].values.astype('int')

print("Tabular features shape:", X_tab.shape)
print("Tabular labels shape:", y_tab.shape)
print(clinical_df.head())

# Save preprocessed tabular data for later use
np.save(TABULAR_DIR / "X_tab.npy", X_tab)
np.save(TABULAR_DIR / "y_tab.npy", y_tab)
print("Preprocessed tabular data saved.")


In [None]:

# Collect every preprocessed array in a single output folder.
PREPROCESSED_DIR = BASE_DIR / "preprocessed"
PREPROCESSED_DIR.mkdir(parents=True, exist_ok=True)

# File name -> array, written in this order: X-ray data and labels, MRI
# volumes and masks, then tabular features and labels.
preprocessed_arrays = {
    "xray_data.npy": xray_data,
    "xray_labels.npy": xray_labels,
    "mri_data.npy": mri_data,
    "mri_masks.npy": mri_masks,
    "X_tab.npy": X_tab,
    "y_tab.npy": y_tab,
}
for out_name, out_array in preprocessed_arrays.items():
    np.save(PREPROCESSED_DIR / out_name, out_array)

print(f"All preprocessed datasets saved to {PREPROCESSED_DIR.resolve()}")

# Shape summary, one line per modality.
print("Summary of preprocessed data:")
print(f"X-ray data: {xray_data.shape}, labels: {xray_labels.shape}")
print(f"MRI volumes: {mri_data.shape}, masks: {mri_masks.shape}")
print(f"Tabular features: {X_tab.shape}, labels: {y_tab.shape}")
