In [20]:
import os
import pandas as pd
import numpy as np

modalities = ["Frag", "CNV", "PFE", "NDR", "NDR2K"]
base_dir = "../database/cfDNA"
out_dir = base_dir


def build_dataset(df_norm, df_can):
    """Combine a healthy and a cancer feature table into one labelled dataset.

    Both tables are expected to hold one sample per row and one feature per
    column. The cancer table is reordered to the healthy table's column
    order before stacking, missing values are imputed, and the healthy rows
    are placed first.

    Args:
        df_norm: DataFrame of healthy samples (label 0).
        df_can: DataFrame of cancer samples (label 1).

    Returns:
        A tuple ``(X, y)`` where ``X`` is a float32 array of shape
        ``(len(df_norm) + len(df_can), n_features)`` and ``y`` is an int64
        array of 0/1 labels in the same row order.

    Raises:
        ValueError: If the two tables do not contain the same set of
            feature columns.
    """
    # np.vstack stacks purely by position, so the two tables must expose the
    # same features in the same order; otherwise features would be silently
    # mixed up between the cohorts.
    if not df_norm.columns.equals(df_can.columns):
        only_norm = df_norm.columns.difference(df_can.columns)
        only_can = df_can.columns.difference(df_norm.columns)
        if len(only_norm) or len(only_can):
            raise ValueError(
                "healthy and cancer tables have different feature columns: "
                f"{len(only_norm)} only in healthy, {len(only_can)} only in cancer"
            )
        df_can = df_can[df_norm.columns]

    # Missing-value imputation: each cohort is filled with its own column
    # means. A column with no observed value in a cohort has a NaN mean and
    # would stay NaN, so fall back to the mean over both cohorts, and finally
    # to 0.0 for features that are missing everywhere.
    # NOTE(review): per-cohort means are computed with knowledge of the label
    # and over all samples; if these arrays are later split into train/test
    # sets, consider imputing inside the training fold instead.
    pooled_mean = pd.concat([df_norm, df_can]).mean()
    df_norm = df_norm.fillna(df_norm.mean()).fillna(pooled_mean).fillna(0.0)
    df_can = df_can.fillna(df_can.mean()).fillna(pooled_mean).fillna(0.0)

    X_norm = df_norm.values.astype(np.float32)
    X_can = df_can.values.astype(np.float32)
    y_norm = np.zeros(X_norm.shape[0], dtype=np.int64)
    y_can = np.ones(X_can.shape[0], dtype=np.int64)

    X = np.vstack([X_norm, X_can])
    y = np.concatenate([y_norm, y_can])
    return X, y


# Runs when executed as a notebook cell or as a script (both set __name__ to
# "__main__"); importing this code only defines build_dataset and the
# configuration constants above. The loop stays at module level so that
# mod / df_norm / df_can / X / y remain available after the cell finishes.
if __name__ == "__main__":
    os.makedirs(out_dir, exist_ok=True)

    for mod in modalities:
        df_norm = pd.read_csv(os.path.join(base_dir, "normal", f"healthy_{mod}.csv"), index_col=0)
        df_can = pd.read_csv(os.path.join(base_dir, "cancer", f"cancer_{mod}.csv"), index_col=0)

        X, y = build_dataset(df_norm, df_can)

        # One compressed archive per modality, holding arrays "X" and "y".
        np.savez_compressed(os.path.join(out_dir, f"{mod}.npz"), X=X, y=y)
        print(f"[{mod}] Completed: X.shape={X.shape}, y.shape={y.shape}")


[Frag] Completed: X.shape=(450, 888), y.shape=(450,)
[CNV] Completed: X.shape=(450, 5760), y.shape=(450,)
[PFE] Completed: X.shape=(450, 19415), y.shape=(450,)
[NDR] Completed: X.shape=(450, 19434), y.shape=(450,)
[NDR2K] Completed: X.shape=(450, 19434), y.shape=(450,)
