In [1]:
# convert_to_npz.py
import pandas as pd
import numpy as np
from pathlib import Path

SRC = Path("B/time_series")
OUT = Path("data_npz")
OUT.mkdir(exist_ok=True)
LABELS = pd.read_csv("B/labels.csv")  # must contain patient_id or filename + label

# normalize labels: accept column "filename" or "patient_id"
if "filename" in LABELS.columns:
    LABELS["patient_id"] = LABELS["filename"].str.replace(r"\.csv$", "", regex=True)
elif "patient_id" not in LABELS.columns:
    raise SystemExit("labels.csv must contain 'patient_id' or 'filename' column")

label_map = dict(zip(LABELS["patient_id"].astype(str), LABELS["label"].astype(int)))

for csv_path in sorted(SRC.glob("*.csv")):
    pid = csv_path.stem
    df = pd.read_csv(csv_path)            # simple read; adjust read_csv args if needed
    # optional: ensure consistent column order here if you want
    X = df.to_numpy(dtype=np.float32)     # shape (T, F)
    y = label_map.get(pid, -1)            # -1 if missing label
    np.savez_compressed(OUT / f"{pid}.npz", X=X, y=np.int8(y))
print("Done. Wrote .npz files to", OUT)


Done. Wrote .npz files to data_npz


In [2]:
# view_npz_matrix.py
import numpy as np
from pathlib import Path
import pandas as pd

FILE = Path("data_npz/p100001.npz")  # adjust if needed

def load(file: Path):
    arr = np.load(file, allow_pickle=True)
    X = arr["X"] if "X" in arr.files else None
    y = arr["y"].item() if "y" in arr.files else None
    extras = {k: arr[k] for k in arr.files if k not in ("X", "y")}
    return X, y, extras

def print_overview(X, y, extras):
    print("File:", FILE)
    print("Label y:", y)
    if X is None:
        print("No 'X' array found.")
        return
    print("Shape (T, F):", X.shape)
    print("Dtype:", X.dtype)
    print("Total elements:", X.size)
    print("Number of NaNs:", int(np.isnan(X).sum()))
    if extras:
        print("Extra arrays in file:", list(extras.keys()))
    print()

def show_full(X):
    # Only use if matrix is reasonably small
    np.set_printoptions(threshold=100000, precision=4, suppress=True)
    print(X)

def show_head(X, n=10):
    print(f"First {n} rows:")
    print(X[:n])

def show_tail(X, n=10):
    print(f"Last {n} rows:")
    print(X[-n:])

def show_as_dataframe(X, extras):
    # If feature names exist in extras, use them
    cols = None
    if "feature_names" in extras:
        try:
            cols = [str(x) for x in extras["feature_names"].tolist()]
        except Exception:
            cols = None
    if cols is None:
        cols = [f"f{i}" for i in range(X.shape[1])]
    df = pd.DataFrame(X, columns=cols)
    print(df.head(20))  # show first 20 rows in table form

def main():
    X, y, extras = load(FILE)
    print_overview(X, y, extras)
    if X is None:
        return

    # Choose one of the following inspection methods:
    # 1) Safe quick view
    show_head(X, n=8)
    # 2) Tail
    show_tail(X, n=8)
    # 3) Pretty table if you want column names
    # show_as_dataframe(X, extras)
    # 4) Full print only if small
    # show_full(X)

if __name__ == "__main__":
    main()


File: data_npz\p100001.npz
Label y: 0
Shape (T, F): (23, 40)
Dtype: float32
Total elements: 920
Number of NaNs: 629

First 8 rows:
[[  93.     92.5      nan  110.     76.     56.     22.       nan     nan
      nan     nan     nan     nan     nan     nan     nan     nan     nan
      nan     nan     nan     nan     nan     nan     nan     nan     nan
      nan     nan     nan     nan     nan     nan     nan   73.      1.
     1.      0.   -214.64    2.  ]
 [  91.     96.       nan  108.     84.5    72.     23.5      nan     nan
      nan     nan     nan     nan     nan     nan     nan     nan     nan
      nan     nan     nan  233.       nan     nan     nan     nan     nan
      nan     nan     nan     nan     nan     nan     nan   73.      1.
     1.      0.   -214.64    3.  ]
 [  93.     98.       nan  123.     87.     61.     21.       nan     nan
      nan     nan     nan     nan     nan     nan     nan     nan     nan
      nan     nan     nan     nan     nan     nan     nan     n

In [None]:
import jax.numpy as jnp
import numpy as np


class HospData:

    class InvalidData(Exception):
        def __init__(self):
            pass
            
        def __str__(self):
            return f"Something went wrong\n"
        
        
    def __init__(self, dataset):
        self.dataset = dataset
           
        
    def extract_data(self):
        """returns data_train, data_validation, data_test"""
        try:
            data_train = jnp.array(self.dataset["data_train"])
            data_validation = jnp.array(self.dataset["data_validation"])
            data_test = jnp.array(self.dataset["data_test"])
        except Exception as e:
            raise self.InvalidData(self.dataset)
        
        return data_train, data_validation,  data_test
    
    
    
    
        
try:
    TrainD = HospData(np.load(''))
    TestD = HospData(np.load(''))
    ValD = HospData(np.load(''))
except Exception as e:
    raise e
    




In [5]:
import numpy as np
import jax.numpy as jnp
from pathlib import Path
import random
from typing import Mapping, Union

class HospData:
    class InvalidData(Exception):
        def __init__(self, info=None):
            self.info = info
        def __str__(self):
            return "Something went wrong\n" + (str(self.info) if self.info is not None else "")

    def __init__(self, dataset: Union[Mapping, str, Path], seed: int = 42):
        """
        dataset: entweder ein Mapping mit keys
                 'data_train','data_validation','data_test' (numpy arrays),
                 oder ein Pfad zu einem Ordner mit per-patient .npz Dateien (je Datei: 'X' und 'y').
        """
        if isinstance(dataset, (str, Path)):
            self.dataset = self._build_from_dir(Path(dataset), seed)
        elif isinstance(dataset, Mapping):
            self.dataset = dataset
        else:
            raise TypeError("dataset must be a mapping or a path to a directory")

    def _build_from_dir(self, folder: Path, seed: int):
        files = sorted(folder.glob("*.npz"))
        if not files:
            raise FileNotFoundError(f"No .npz files in {folder}")
        ids = [p.stem for p in files]
        random.Random(seed).shuffle(ids)

        n = len(ids)
        n_train = int(n * 0.8)
        n_val = int(n * 0.1)
        train_ids = ids[:n_train]
        val_ids = ids[n_train:n_train + n_val]
        test_ids = ids[n_train + n_val:]

        def load_list(id_list):
            seqs = []
            for pid in id_list:
                arr = np.load(folder / f"{pid}.npz", allow_pickle=True)
                if "X" not in arr.files:
                    raise KeyError(f"'X' not in {pid}.npz")
                seqs.append(arr["X"].astype(np.float32))
            return seqs

        X_train = load_list(train_ids)
        X_val = load_list(val_ids)
        X_test = load_list(test_ids)

        # pad to global max length
        all_seqs = X_train + X_val + X_test
        T_max = max(x.shape[0] for x in all_seqs)
        F = all_seqs[0].shape[1]

        def pad(seqs):
            out = np.zeros((len(seqs), T_max, F), dtype=np.float32)
            for i, s in enumerate(seqs):
                out[i, : s.shape[0], :] = s
            return out

        return {
            "data_train": pad(X_train),
            "data_validation": pad(X_val),
            "data_test": pad(X_test)
        }

    def extract_data(self):
        """returns data_train, data_validation, data_test as jnp arrays"""
        try:
            dt = jnp.array(self.dataset["data_train"])
            dv = jnp.array(self.dataset["data_validation"])
            dte = jnp.array(self.dataset["data_test"])
        except Exception as e:
            raise self.InvalidData(e)
        return dt, dv, dte
#print the shape of the data arrays
try:
    TrainD = HospData(np.load('data_npz'))
    TestD = HospData(np.load('data_npz'))
    ValD = HospData(np.load('data_npz'))
except Exception as e:
    raise e


PermissionError: [Errno 13] Permission denied: 'data_npz'