Table of Contents
# Train Test Split
####### TODO: noch überarbeiten

1. [Dataset Split](#Create-stratified-Train-/-Test-Split)
2. Sanity Checks
3. Export to /data

In [2]:
#Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split


In [None]:

# Read the full dataset from disk.
df_raw = pd.read_csv('../data/Dataset.csv')

# The split is done per patient rather than per row, so every row of a patient
# ends up in the same split. The label used for stratification is the maximum
# of 'SepsisLabel' over all rows of that patient.
patient_ids = df_raw["Patient_ID"].unique()
patient_labels = df_raw.groupby("Patient_ID")["SepsisLabel"].max()

# Stratified 80/20 split of the patient IDs with a fixed seed.
train_pat, test_pat = train_test_split(
    patient_ids,
    test_size=0.2,
    random_state=42,
    stratify=patient_labels.loc[patient_ids],
)

# Boolean row masks selecting the rows that belong to each patient group.
train_mask = df_raw["Patient_ID"].isin(train_pat)
test_mask = df_raw["Patient_ID"].isin(test_pat)

# Independent copies so later edits do not touch df_raw.
df_raw_train = df_raw.loc[train_mask].copy()
df_raw_test = df_raw.loc[test_mask].copy()


In [4]:

# Sanity check: number of patients and class distribution per split.
# The label proportions of both splits should be close to each other
# because the split above was stratified on the patient-level label.

for count_label, split_ids in [("Patients train:", train_pat), ("Patients test:", test_pat)]:
    print(count_label, len(split_ids))

for dist_label, split_ids in [
    ("Label distribution (patients) train:", train_pat),
    ("Label distribution (patients) test:", test_pat),
]:
    print(dist_label)
    print(patient_labels.loc[split_ids].value_counts(normalize=True))


Patients train: 32268
Patients test: 8068
Label distribution (patients) train:
SepsisLabel
0    0.927296
1    0.072704
Name: proportion, dtype: float64
Label distribution (patients) test:
SepsisLabel
0    0.927367
1    0.072633
Name: proportion, dtype: float64


In [5]:
# Write both splits to CSV (without the DataFrame index) so the model
# notebooks can load them.
export_targets = [
    (df_raw_train, '../data/train.csv'),
    (df_raw_test, '../data/test.csv'),
]
for split_frame, csv_path in export_targets:
    split_frame.to_csv(csv_path, index=False)

In [None]:
# Helper functions to copy the per-patient PSV files into train/test folders based on train.csv and test.csv

from pathlib import Path
import shutil


import os

# Root data directory. Defaults to the previously hardcoded location; set the
# SEPSIS_DATA_DIR environment variable to run this cell on another machine
# without editing the notebook.
BASE = Path(os.environ.get("SEPSIS_DATA_DIR", "/teamspace/studios/this_studio/detecting_Sepsis/data"))

# Input locations: folder with one PSV file per patient and the split CSVs.
SRC  = BASE / "PSV_Patients_ALL"
TRAIN_CSV = BASE / "train.csv"
TEST_CSV  = BASE / "test.csv"

# Output folders that receive the copied PSV files of each split.
OUT_TRAIN = BASE / "PSV_Patients_TRAIN"
OUT_TEST  = BASE / "PSV_Patients_TEST"

def get_id_col(df):
    """Return the name of the patient-ID column of ``df``.

    The known spellings are tried in a fixed order and the first one that is
    present in ``df.columns`` is returned.

    Raises:
        ValueError: if none of the known spellings is a column of ``df``; the
            message lists up to the first 20 column names.
    """
    candidates = ("Patient_ID", "PATIENT_ID", "patient_id", "patientID")
    found = next((name for name in candidates if name in df.columns), None)
    if found is None:
        raise ValueError(f"Keine Patient-ID-Spalte gefunden. Spalten sind: {list(df.columns)[:20]} ...")
    return found

def ids_from_csv(path):
    """Read a CSV and return its unique patient IDs as integers.

    Args:
        path: path object of the CSV file (``path.name`` is used in error
            messages, so a ``pathlib.Path`` is expected).

    Returns:
        A tuple ``(ids, id_col)`` where ``ids`` is the set of unique integer
        patient IDs and ``id_col`` is the name of the column they were read from.

    Raises:
        ValueError: if no patient-ID column is found, if any ID cannot be
            converted to a number, or if any ID has a fractional part.
    """
    df = pd.read_csv(path)
    id_col = get_id_col(df)

    # Unparseable entries become NaN here so they can be reported together.
    ids = pd.to_numeric(df[id_col], errors="coerce")
    if ids.isna().any():
        bad = df[id_col][ids.isna()].head(10).tolist()
        raise ValueError(f"{path.name}: Nicht-numerische Patient IDs gefunden (z.B.): {bad}")

    # astype(int) would silently truncate values such as 12.7 to 12 and thereby
    # point at a different patient's file, so reject fractional IDs explicitly.
    has_fraction = (ids % 1) != 0
    if has_fraction.any():
        bad = df[id_col][has_fraction].head(10).tolist()
        raise ValueError(f"{path.name}: Nicht-ganzzahlige Patient IDs gefunden (z.B.): {bad}")

    ids = ids.astype(int)
    return set(ids.unique()), id_col

def pid_to_fname(pid: int) -> str:
    """Build the PSV file name for a patient ID.

    The ID is zero-padded to at least six digits, prefixed with ``p`` and
    suffixed with ``.psv`` (e.g. ``1`` -> ``p000001.psv``).
    """
    padded_id = format(pid, "06d")
    return "p" + padded_id + ".psv"

# Basic existence checks: fail early if the source folder or a split CSV is missing.
for p in [SRC, TRAIN_CSV, TEST_CSV]:
    if not p.exists():
        raise FileNotFoundError(f"Fehlt: {p}")

# Unique patient IDs of each split plus the name of the ID column they came from.
train_ids, train_col = ids_from_csv(TRAIN_CSV)
test_ids,  test_col  = ids_from_csv(TEST_CSV)

print("train:", len(train_ids), "test:", len(test_ids))
overlap = train_ids & test_ids
print("overlap:", len(overlap))

# A patient present in both splits would be copied into both output folders
# and leak between train and test, so stop here instead of only printing it.
if overlap:
    raise ValueError(
        f"Train- und Test-Split teilen sich {len(overlap)} Patient IDs (z.B.): {sorted(overlap)[:10]}"
    )

def check_missing(ids_set, name):
    """Report which patient IDs have no PSV file in ``SRC``.

    Args:
        ids_set: iterable of patient IDs to look up.
        name: label used as prefix in the printed summary.

    Returns:
        List of the IDs whose expected file (see ``pid_to_fname``) does not
        exist in ``SRC``. The count is printed, and up to 20 example IDs are
        printed when the list is non-empty.
    """
    missing = [pid for pid in ids_set if not (SRC / pid_to_fname(pid)).exists()]
    print(f"{name} missing PSV:", len(missing))
    if len(missing) > 0:
        print("examples:", missing[:20])
    return missing

# Determine, per split, which patient IDs have no PSV file in the source folder.
missing_train = check_missing(train_ids, "TRAIN")
missing_test = check_missing(test_ids, "TEST")

# (optional but recommended) start from empty output folders: create them if
# needed and delete any *.psv files left over from a previous run.
for out_dir in (OUT_TRAIN, OUT_TEST):
    out_dir.mkdir(parents=True, exist_ok=True)
    stale_files = list(out_dir.glob("*.psv"))
    for stale_psv in stale_files:
        stale_psv.unlink()

def copy_set(ids_set, out_dir, missing):
    """Copy the PSV files of the given patients from ``SRC`` into ``out_dir``.

    Args:
        ids_set: iterable of patient IDs whose files should be copied.
        out_dir: destination directory (path object).
        missing: IDs to skip, i.e. those without a source file.

    Returns:
        Number of files copied.
    """
    skip = set(missing)
    to_copy = [pid for pid in ids_set if pid not in skip]
    for pid in to_copy:
        source_path = SRC / pid_to_fname(pid)
        # copy2 also copies file metadata; the file name is kept unchanged.
        shutil.copy2(source_path, out_dir / source_path.name)
    return len(to_copy)

# Copy the files of each split, skipping the IDs reported as missing.
c_train = copy_set(train_ids, OUT_TRAIN, missing_train)
c_test = copy_set(test_ids, OUT_TEST, missing_test)

# Compare the number of copied files with what is actually in each folder.
n_train_files = sum(1 for _ in OUT_TRAIN.glob("*.psv"))
print("copied train:", c_train, "files:", n_train_files)
n_test_files = sum(1 for _ in OUT_TEST.glob("*.psv"))
print("copied test :", c_test, "files:", n_test_files)
print("All done.")
