In [3]:
import pandas as pd

# Load the test-split table from the SYMILE-MIMIC release.
df = pd.read_csv("../../../scratch/physionet.org/files/symile-mimic/1.0.0/test.csv")

# print column names
print("Columns in test.csv:", df.columns.tolist())

# The two trailing columns are addressed by position, so resolve them once
# instead of repeating the positional lookup in every expression below.
# NOTE(review): presumably these are `label_hadm_id` (second to last) and a
# 0/1 match flag (last) -- confirm against df.columns before relying on it.
label_col = df.columns[-2]
match_col = df.columns[-1]

# Number of positive rows (and distinct label ids) the checks below expect.
EXPECTED_POSITIVES = 464

# 1) Keep only the positive match per query admission
#    (.copy() so later edits to `uni` never write back into `df`).
uni = df[df[match_col].astype(int) == 1].copy()

# 2) Sanity checks: exactly one positive row per distinct label id.
n_unique_labels = uni[label_col].nunique()
assert n_unique_labels == EXPECTED_POSITIVES, (
    f"Expected {EXPECTED_POSITIVES} unique label_hadm_id, got {n_unique_labels}"
)
assert len(uni) == EXPECTED_POSITIVES, (
    f"Expected {EXPECTED_POSITIVES} rows, got {len(uni)}"
)

Columns in test.csv: ['subject_id', 'hadm_id', 'cxr_path', 'Atelectasis', 'Cardiomegaly', 'Edema', 'Lung Opacity', 'No Finding', 'Pleural Effusion', 'ecg_path', '51221', '51265', '50912', '50971', '51222', '51301', '51249', '51279', '51250', '51248', '51277', '51006', '50983', '50902', '50882', '50868', '50931', '50960', '50893', '50970', '51237', '51274', '51275', '51146', '51256', '51254', '51200', '51244', '52172', '50934', '51678', '50947', '50861', '50878', '50813', '50863', '50885', '50820', '50862', '50802', '50821', '50804', '50818', '52075', '52073', '52074', '52069', '51133', '50910', '52135', '51221_percentile', '51265_percentile', '50912_percentile', '50971_percentile', '51222_percentile', '51301_percentile', '51249_percentile', '51279_percentile', '51250_percentile', '51248_percentile', '51277_percentile', '51006_percentile', '50983_percentile', '50902_percentile', '50882_percentile', '50868_percentile', '50931_percentile', '50960_percentile', '50893_percentile', '50970_pe

In [4]:
# Boolean row selector over `df`: True where the last CSV column, cast to
# int, equals 1. Stored as a plain NumPy array so it can index the .npy
# arrays as well as the DataFrame.
last_col = df.columns.tolist()[-1]
is_positive = df[last_col].astype(int) == 1
mask = is_positive.values
print("Kept rows:", mask.sum())  # should be 464

Kept rows: 464


In [6]:
import pandas as pd
import numpy as np
from pathlib import Path

data_root = Path("../../../scratch/physionet.org/files/symile-mimic/1.0.0/data_npy/test")

# Load the four test-split arrays in a fixed order: CXR, ECG, labs, hadm ids.
cxr, ecg_features, labs_features, hadm_id = (
    np.load(data_root / fname)
    for fname in (
        "cxr_test.npy",
        "ecg_features.npy",
        "labs_features.npy",
        "hadm_id_test.npy",
    )
)

# Every array must have the same leading dimension as `cxr` ...
N = cxr.shape[0]
leading_dims = {arr.shape[0] for arr in (ecg_features, labs_features, hadm_id)}
assert leading_dims == {N}
# ... and so must the boolean mask derived from the CSV, otherwise the mask
# cannot be used to index the arrays row-for-row.
assert len(mask) == N

In [7]:
# Apply the same boolean row mask to each modality array.
cxr_u, ecg_u, labs_features_u = (
    arr[mask] for arr in (cxr, ecg_features, labs_features)
)

# IMPORTANT:
# The ids for the kept rows come from the second-to-last CSV column
# (label_hadm_id), not from the candidate hadm_id.
label_id_col = df.columns.tolist()[-2]
hadm_id_u = df.loc[mask, label_id_col].values

In [8]:
# The filtered split must hold 464 rows, each with a distinct label id.
expected_n = 464
assert len(cxr_u) == expected_n
assert np.unique(hadm_id_u).size == expected_n


In [9]:
out_dir = Path("../../../scratch/physionet.org/files/symile-mimic/1.0.0/data_npy/test_uni/")
out_dir.mkdir(parents=True, exist_ok=True)

# Persist the filtered arrays, one .npy file each, in a fixed order.
# NOTE(review): only the CXR file gets a "_uni" suffix; the other three
# reuse the names from the source directory -- confirm this is intentional.
outputs = (
    ("cxr_test_uni.npy", cxr_u),
    ("ecg_features.npy", ecg_u),
    ("labs_features.npy", labs_features_u),
    ("hadm_id_test.npy", hadm_id_u),
)
for fname, arr in outputs:
    np.save(out_dir / fname, arr)

print("Wrote UNI test split with", cxr_u.shape[0], "samples")

Wrote UNI test split with 464 samples
