#### cfDNA存放目录：/home/maweicheng/database/cfDNA
#### 癌症CT存放目录：/home/maweicheng/database/khct/patch_output/split_libs
#### 正常结节存放目录：/home/maweicheng/resgsca/database/3slice/64/nocancer

In [9]:
import os
import glob
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import StratifiedShuffleSplit
modalities = ["Frag", "PFE", "NDR", "NDR2K"]
cf_base_dir = "/home/maweicheng/database/cfDNA"
ct_cancer_dir = "/home/maweicheng/database/Patch"
ct_normal_dir = "/home/maweicheng/resgsca/database/3slice/64/nocancer"

save_train_path = "/home/maweicheng/database/cancer_normal/train.npz"
save_test_path = "/home/maweicheng/database/cancer_normal/test.npz"

# 统计跳过的样本
skipped_cancer = {"no_ct_files": [], "multiple_ct_files": [], "ct_load_error": [], "shape_error": [], "missing_modality": []}
skipped_normal = {"ct_load_error": [], "no_candidates": 0}

# 1. 读取所有模态数据
cf_data = {}
for mod in modalities:
    df_norm = pd.read_csv(os.path.join(cf_base_dir, "normal", f"healthy_{mod}.csv"), index_col=0)
    df_can  = pd.read_csv(os.path.join(cf_base_dir, "cancer",  f"cancer_{mod}.csv"), index_col=0)
    df_norm = df_norm.fillna(df_norm.mean())
    df_can  = df_can.fillna(df_can.mean())
    cf_data[mod] = {"normal": df_norm, "cancer": df_can}

# 获取共同的 index（癌症部分）
cancer_ids = cf_data["Frag"]["cancer"].index.tolist()
normal_ids = cf_data["Frag"]["normal"].index.tolist()

print(f"总癌症样本数: {len(cancer_ids)}")
print(f"总正常样本数: {len(normal_ids)}")

# 2. 处理 cancer 样本
X_cancer = {mod: [] for mod in modalities}
CT_cancer = []
y_cancer = []
id_cancer = []

for cid in cancer_ids:
    ct_pattern = os.path.join(ct_cancer_dir, cid, "*.npz")
    ct_files = glob.glob(ct_pattern)

    # 确保该病人目录下确实有一个 .npz 文件
    if len(ct_files) == 0:
        skipped_cancer["no_ct_files"].append(cid)
        print(f"[跳过-癌症] {cid}: 未找到CT文件")
        continue
    elif len(ct_files) > 1:
        skipped_cancer["multiple_ct_files"].append(cid)
        print(f"[跳过-癌症] {cid}: 找到多个CT文件 ({len(ct_files)}个)")
        continue

    ct_path = ct_files[0]
    try:
        ct_npz = np.load(ct_path)
        ct_data = ct_npz["data"]
        if ct_data.shape == (3, 64, 64):
            ct_data = np.transpose(ct_data, (1, 2, 0)) 

        if ct_data.shape != (64, 64, 3):
            skipped_cancer["shape_error"].append(cid)
            print(f"[跳过-癌症] {cid}: CT形状异常 {ct_data.shape} (应为 (64, 64, 3))")
            continue  

    except Exception as e:
        skipped_cancer["ct_load_error"].append(cid)
        print(f"[跳过-癌症] {cid}: CT加载失败 - {e}")
        continue

    # 检查 cfDNA 模态是否齐全
    missing_modalities = [mod for mod in modalities if cid not in cf_data[mod]["cancer"].index]
    if missing_modalities:
        skipped_cancer["missing_modality"].append(cid)
        print(f"[跳过-癌症] {cid}: 缺失模态数据 {missing_modalities}")
        continue

    # 添加数据
    for mod in modalities:
        X_cancer[mod].append(cf_data[mod]["cancer"].loc[cid].values.astype(np.float32))
    CT_cancer.append(ct_data.astype(np.float32))
    y_cancer.append(1)
    id_cancer.append(cid)

print(f"成功处理癌症样本: {len(y_cancer)}/{len(cancer_ids)}")

# 3. 处理 normal 样本
X_normal = {mod: [] for mod in modalities}
CT_normal = []
y_normal = []
id_normal = []

used_nocancer = set()
available_ct_files = sorted(os.listdir(ct_normal_dir))

for nid in normal_ids:
    # 随机选择未使用的 CT 文件
    candidates = list(set(available_ct_files) - used_nocancer)
    if not candidates:
        skipped_normal["no_candidates"] = len(normal_ids) - len(y_normal)
        print(f"[跳过-正常] 剩余 {len(normal_ids) - len(y_normal)} 个样本: CT文件不够用")
        break  # 不够用了
    
    selected = random.choice(candidates)
    used_nocancer.add(selected)
    ct_path = os.path.join(ct_normal_dir, selected)
    try:
        ct_npz = np.load(ct_path)
        ct_data = ct_npz["data"]
    except Exception as e:
        skipped_normal["ct_load_error"].append(nid)
        print(f"[跳过-正常] {nid}: CT加载失败 (文件: {selected}) - {e}")
        continue

    for mod in modalities:
        X_normal[mod].append(cf_data[mod]["normal"].loc[nid].values.astype(np.float32))
    CT_normal.append(ct_data.astype(np.float32))
    y_normal.append(0)
    id_normal.append(nid)

print(f"成功处理正常样本: {len(y_normal)}/{len(normal_ids)}")

# 4. 组装训练与测试集（70% 训练 + 30% 测试，按顺序切分）
def split_data(X_dict, CT_list, y_list, id_list, test_size=0.3, random_state=42):
    # 确保所有模态数据和CT数据的样本数一致
    assert all(len(X_dict[mod]) == len(y_list) for mod in modalities)
    assert len(CT_list) == len(y_list)
    assert len(id_list) == len(y_list)
    
    # 使用分层抽样划分
    sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
    
    # y_list需要是numpy数组
    y_array = np.array(y_list)
    
    # 获取划分索引
    for train_idx, test_idx in sss.split(np.zeros(len(y_array)), y_array):
        pass  # 只需要第一次划分
        
    # 划分数据
    train = {mod: np.stack([X_dict[mod][i] for i in train_idx]) for mod in modalities}
    test = {mod: np.stack([X_dict[mod][i] for i in test_idx]) for mod in modalities}
    
    train["CT"] = np.stack([CT_list[i] for i in train_idx])
    test["CT"] = np.stack([CT_list[i] for i in test_idx])
    train["y"] = y_array[train_idx]
    test["y"] = y_array[test_idx]
    train["id"] = np.array([id_list[i] for i in train_idx])
    test["id"] = np.array([id_list[i] for i in test_idx])
    return train, test

# 使用示例
print(f"\n=== 数据统计 ===")
print(f"癌症样本数: {len(y_cancer)}")
print(f"正常样本数: {len(y_normal)}")

# 打印跳过统计
print(f"\n=== 跳过样本统计 ===")
print("癌症样本跳过原因:")
for reason, samples in skipped_cancer.items():
    if isinstance(samples, list) and len(samples) > 0:
        print(f"  {reason}: {len(samples)}个 - {samples[:5]}{'...' if len(samples) > 5 else ''}")
    
print("正常样本跳过原因:")
for reason, count in skipped_normal.items():
    if isinstance(count, int) and count > 0:
        print(f"  {reason}: {count}个")
    elif isinstance(count, list) and len(count) > 0:
        print(f"  {reason}: {len(count)}个 - {count[:5]}{'...' if len(count) > 5 else ''}")

# 设置相同的random_state以确保可重复性
train_can, test_can = split_data(X_cancer, CT_cancer, y_cancer, id_cancer, test_size=0.1, random_state=42)
train_nor, test_nor = split_data(X_normal, CT_normal, y_normal, id_normal, test_size=0.1, random_state=42)

# 合并
train_all = {key: np.concatenate([train_can[key], train_nor[key]]) for key in train_can}
test_all  = {key: np.concatenate([test_can[key], test_nor[key]])   for key in test_can}

# 5. 保存
np.savez_compressed(save_train_path, **train_all)
np.savez_compressed(save_test_path,  **test_all)
print(f"\n已保存：{save_train_path}")
print(f"已保存：{save_test_path}")
print(f"\n最终训练集样本数: {len(train_all['y'])}")
print(f"最终测试集样本数: {len(test_all['y'])}")

总癌症样本数: 397
总正常样本数: 340
[跳过-癌症] Lib-002: 未找到CT文件
[跳过-癌症] Lib-003: 未找到CT文件
[跳过-癌症] Lib-030: 未找到CT文件
[跳过-癌症] Lib-031: 未找到CT文件
[跳过-癌症] Lib-030: 未找到CT文件
[跳过-癌症] Lib-031: 未找到CT文件
[跳过-癌症] Lib-037: 未找到CT文件
[跳过-癌症] Lib-038: 未找到CT文件
[跳过-癌症] Lib-037: 未找到CT文件
[跳过-癌症] Lib-038: 未找到CT文件
[跳过-癌症] Lib-047: 未找到CT文件
[跳过-癌症] Lib-048: 未找到CT文件
[跳过-癌症] Lib-049: 未找到CT文件
[跳过-癌症] Lib-050: 未找到CT文件
[跳过-癌症] Lib-062: 未找到CT文件
[跳过-癌症] Lib-063: 未找到CT文件
[跳过-癌症] Lib-047: 未找到CT文件
[跳过-癌症] Lib-048: 未找到CT文件
[跳过-癌症] Lib-049: 未找到CT文件
[跳过-癌症] Lib-050: 未找到CT文件
[跳过-癌症] Lib-062: 未找到CT文件
[跳过-癌症] Lib-063: 未找到CT文件
[跳过-癌症] Lib-072: 未找到CT文件
[跳过-癌症] Lib-073: 未找到CT文件
[跳过-癌症] Lib-072: 未找到CT文件
[跳过-癌症] Lib-073: 未找到CT文件
[跳过-癌症] Lib-124: 未找到CT文件
[跳过-癌症] Lib-124: 未找到CT文件
[跳过-癌症] Lib-129: 未找到CT文件
[跳过-癌症] Lib-129: 未找到CT文件
[跳过-癌症] Lib-137: 未找到CT文件
[跳过-癌症] Lib-137: 未找到CT文件
[跳过-癌症] Lib-141: 未找到CT文件
[跳过-癌症] Lib-141: 未找到CT文件
[跳过-癌症] Lib-177: 未找到CT文件
[跳过-癌症] Lib-179: 未找到CT文件
[跳过-癌症] Lib-180: 未找到CT文件
[跳过-癌症] Lib-181: 未找到CT文件
[跳过-癌症] Lib-177: 未找到CT文件
[

In [None]:
import os
import glob
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt

modalities = ["Frag", "CNV", "PFE", "NDR", "NDR2K"]
cf_base_dir = "/home/maweicheng/database/cfDNA"
ct_cancer_dir = "/home/maweicheng/database/khct/patch_output/split_libs_new"
ct_normal_dir = "/home/maweicheng/resgsca/database/3slice/64/nocancer"

save_train_path = os.path.join(cf_base_dir, "cancer_and_normal/train.npz")
save_test_path = os.path.join(cf_base_dir, "cancer_and_normal/test.npz")

# 统计跳过的样本
skipped_cancer = {"no_ct_files": [], "multiple_ct_files": [], "ct_load_error": [], "shape_error": [], "missing_modality": []}
skipped_normal = {"ct_load_error": [], "no_candidates": 0}

# 1. 读取所有模态数据
cf_data = {}
for mod in modalities:
    df_norm = pd.read_csv(os.path.join(cf_base_dir, "normal", f"healthy_{mod}.csv"), index_col=0)
    df_can  = pd.read_csv(os.path.join(cf_base_dir, "cancer",  f"cancer_{mod}.csv"), index_col=0)
    df_norm = df_norm.fillna(df_norm.mean())
    df_can  = df_can.fillna(df_can.mean())
    cf_data[mod] = {"normal": df_norm, "cancer": df_can}

# 获取 index
cancer_ids = cf_data["Frag"]["cancer"].index.tolist()
normal_ids = cf_data["Frag"]["normal"].index.tolist()

# 从 normal_ids 中随机抽取与 cancer_ids 相同数量的样本
random.seed(51)  # 确保结果可复现
normal_ids = random.sample(normal_ids, len(cancer_ids))

print(f"总癌症样本数: {len(cancer_ids)}")
print(f"随机选择的正常样本数: {len(normal_ids)}")

# 2. 处理 cancer 样本
X_cancer = {mod: [] for mod in modalities}
CT_cancer = []
y_cancer = []
id_cancer = []

for cid in cancer_ids:
    ct_pattern = os.path.join(ct_cancer_dir, cid, "*.npz")
    ct_files = glob.glob(ct_pattern)
    
    if len(ct_files) == 0:
        skipped_cancer["no_ct_files"].append(cid)
        print(f"[跳过-癌症] {cid}: 未找到CT文件")
        continue
    elif len(ct_files) > 1:
        skipped_cancer["multiple_ct_files"].append(cid)
        print(f"[跳过-癌症] {cid}: 找到多个CT文件 ({len(ct_files)}个)")
        continue
        
    ct_path = ct_files[0]
    try:
        ct_npz = np.load(ct_path)
        ct_data = ct_npz["data"]
        if ct_data.shape == (3, 64, 64):
            ct_data = np.transpose(ct_data, (1, 2, 0))
            
        if ct_data.shape != (64, 64, 3):
            skipped_cancer["shape_error"].append(cid)
            print(f"[跳过-癌症] {cid}: CT形状异常 {ct_data.shape} (应为 (64, 64, 3))")
            continue  
    except Exception as e:
        skipped_cancer["ct_load_error"].append(cid)
        print(f"[跳过-癌症] {cid}: CT加载失败 - {e}")
        continue

    # 检查 cfDNA 模态是否齐全
    missing_modalities = [mod for mod in modalities if cid not in cf_data[mod]["cancer"].index]
    if missing_modalities:
        skipped_cancer["missing_modality"].append(cid)
        print(f"[跳过-癌症] {cid}: 缺失模态数据 {missing_modalities}")
        continue

    for mod in modalities:
        X_cancer[mod].append(cf_data[mod]["cancer"].loc[cid].values.astype(np.float32))

    # plt.imshow(ct_data / np.max(ct_data))  # 归一化展示
    # plt.title(f"Cancer ID: {cid}")
    # plt.axis('off')
    # plt.show()
    CT_cancer.append(ct_data.astype(np.float32))
    y_cancer.append(1)
    id_cancer.append(cid)

print(f"成功处理癌症样本: {len(y_cancer)}/{len(cancer_ids)}")

# 3. 处理 normal 样本
X_normal = {mod: [] for mod in modalities}
CT_normal = []
y_normal = []
id_normal = []

used_nocancer = set()
available_ct_files = sorted(os.listdir(ct_normal_dir))
print("开始处理正常样本")

for nid in normal_ids:
    candidates = list(set(available_ct_files) - used_nocancer)
    if not candidates:
        skipped_normal["no_candidates"] = len(normal_ids) - len(y_normal)
        print(f"[跳过-正常] 剩余 {len(normal_ids) - len(y_normal)} 个样本: CT文件不够用")
        break
        
    selected = random.choice(candidates)
    used_nocancer.add(selected)
    ct_path = os.path.join(ct_normal_dir, selected)
    try:
        ct_npz = np.load(ct_path)
        ct_data = ct_npz["data"]
    except Exception as e:
        skipped_normal["ct_load_error"].append(nid)
        print(f"[跳过-正常] {nid}: CT加载失败 (文件: {selected}) - {e}")
        continue

    for mod in modalities:
        X_normal[mod].append(cf_data[mod]["normal"].loc[nid].values.astype(np.float32))
    
    # plt.imshow(ct_data / np.max(ct_data))  # 归一化展示
    # plt.title(f"Cancer ID: {nid}")
    # plt.axis('off')
    # plt.show()
    CT_normal.append(ct_data.astype(np.float32))
    y_normal.append(0)
    id_normal.append(nid)  # 去掉 .npz 后缀

print(f"成功处理正常样本: {len(y_normal)}/{len(normal_ids)}")

# 4. 组装训练与测试集（90% 训练 + 10% 测试）
def stratified_split_data(X_dict, CT_list, y_list, id_list, test_size=0.1, seed=42):
    sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=seed)
    train_idx, test_idx = next(sss.split(np.zeros(len(y_list)), y_list))

    def subset(data, idx): return [data[i] for i in idx]

    train = {mod: np.stack(subset(X_dict[mod], train_idx)) for mod in modalities}
    test = {mod: np.stack(subset(X_dict[mod], test_idx)) for mod in modalities}
    train["CT"] = np.stack(subset(CT_list, train_idx))
    test["CT"] = np.stack(subset(CT_list, test_idx))
    train["y"] = np.array([y_list[i] for i in train_idx])
    test["y"] = np.array([y_list[i] for i in test_idx])
    train["id"] = np.array([id_list[i] for i in train_idx])
    test["id"] = np.array([id_list[i] for i in test_idx])
    return train, test

print(f"\n=== 数据统计 ===")
print(f"癌症样本数: {len(y_cancer)}")
print(f"正常样本数: {len(y_normal)}")

# 打印跳过统计
print(f"\n=== 跳过样本统计 ===")
print("癌症样本跳过原因:")
for reason, samples in skipped_cancer.items():
    if isinstance(samples, list) and len(samples) > 0:
        print(f"  {reason}: {len(samples)}个 - {samples[:5]}{'...' if len(samples) > 5 else ''}")

print("正常样本跳过原因:")
for reason, count in skipped_normal.items():
    if isinstance(count, int) and count > 0:
        print(f"  {reason}: {count}个")
    elif isinstance(count, list) and len(count) > 0:
        print(f"  {reason}: {len(count)}个 - {count[:5]}{'...' if len(count) > 5 else ''}")

train_can, test_can = stratified_split_data(X_cancer, CT_cancer, y_cancer, id_cancer)
train_nor, test_nor = stratified_split_data(X_normal, CT_normal, y_normal, id_normal)

# 合并
train_all = {key: np.concatenate([train_can[key], train_nor[key]]) for key in train_can}
test_all  = {key: np.concatenate([test_can[key], test_nor[key]])   for key in test_can}

# 5. 保存
np.savez_compressed(save_train_path, **train_all)
np.savez_compressed(save_test_path, **test_all)
print(f"\n已保存：{save_train_path}")
print(f"已保存：{save_test_path}")
print(f"\n最终训练集样本数: {len(train_all['y'])}")
print(f"最终测试集样本数: {len(test_all['y'])}")

开始处理正常样本
癌症样本数: 98
正常样本数: 110
已保存： /home/maweicheng/database/cfDNA/cancer_and_normal/train.npz /home/maweicheng/database/cfDNA/cancer_and_normal/test.npz


# 这儿是恶性良性的

In [None]:
import os
import glob
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedShuffleSplit

# 参数设定
modalities = ["Frag", "CNV", "PFE", "NDR", "NDR2K"]
cf_base_dir = "/home/maweicheng/database/cfDNA"
ct_base_dir = "/home/maweicheng/database/khct/patch_output/split_libs_new_different"
save_train_path = os.path.join(cf_base_dir, "cancer_and_benign/train.npz")
save_test_path = os.path.join(cf_base_dir, "cancer_and_benign/test.npz")

# Library ID 列表
cancer_ids = ["Lib-004", "Lib-019", "Lib-020", "Lib-021", "Lib-022", "Lib-025", "Lib-028", "Lib-032", "Lib-039", "Lib-040", "Lib-066",
              "Lib-067", "Lib-068", "Lib-069", "Lib-076", "Lib-077", "Lib-082", "Lib-083", "Lib-086", "Lib-087", "Lib-088", "Lib-090", "Lib-093", "Lib-094",
              "Lib-107", "Lib-108", "Lib-109", "Lib-110", "Lib-121", "Lib-122", "Lib-123", "Lib-124", "Lib-126", "Lib-128", "Lib-130", "Lib-131", "Lib-137",
              "Lib-140", "Lib-142", "Lib-143", "Lib-145", "Lib-149", "Lib-150", "Lib-157", "Lib-158", "Lib-159", "Lib-160"]

benign_ids = ["Lib-005", "Lib-006", "Lib-026", "Lib-027", "Lib-030", "Lib-031", "Lib-033", "Lib-034", "Lib-035", "Lib-036", "Lib-064", "Lib-065", "Lib-070",
              "Lib-071", "Lib-074", "Lib-075", "Lib-084", "Lib-085", "Lib-089", "Lib-091", "Lib-092", "Lib-095", "Lib-096", "Lib-097", "Lib-125", "Lib-132",
              "Lib-138", "Lib-144", "Lib-147", "Lib-148", "Lib-151", "Lib-152", "Lib-153", "Lib-154"]

print(f"恶性样本ID数: {len(cancer_ids)}")
print(f"良性样本ID数: {len(benign_ids)}")

cf_data = {}
for mod in modalities:
    df_all = pd.read_csv(os.path.join(cf_base_dir,"cancer", f"cancer_{mod}.csv"), index_col=0)
    df_all = df_all.fillna(df_all.mean())  # 缺失值填充
    cf_data[mod] = df_all

# 统计跳过的样本
skipped_samples = {"cancer": {"no_ct_files": [], "multiple_ct_files": [], "ct_load_error": [], "shape_error": [], "missing_modality": []},
                  "benign": {"no_ct_files": [], "multiple_ct_files": [], "ct_load_error": [], "shape_error": [], "missing_modality": []}}

# 通用处理函数
def process_samples(id_list, label):
    X = {mod: [] for mod in modalities}
    CT = []
    y = []
    ids = []
    sample_type = "cancer" if label == 1 else "benign"
    type_name = "恶性" if label == 1 else "良性"

    for pid in id_list:
        # 加载 CT 图像
        ct_pattern = os.path.join(ct_base_dir, pid, "*.npz")
        ct_files = glob.glob(ct_pattern)
        
        if len(ct_files) == 0:
            skipped_samples[sample_type]["no_ct_files"].append(pid)
            print(f"[跳过-{type_name}] {pid}: 未找到CT文件")
            continue
        elif len(ct_files) > 1:
            skipped_samples[sample_type]["multiple_ct_files"].append(pid)
            print(f"[跳过-{type_name}] {pid}: 找到多个CT文件 ({len(ct_files)}个)")
            continue

        try:
            ct_npz = np.load(ct_files[0])
            ct_data = ct_npz["data"]
            if ct_data.shape == (3, 64, 64):
                ct_data = np.transpose(ct_data, (1, 2, 0))  # (64, 64, 3)
                
            if ct_data.shape != (64, 64, 3):
                skipped_samples[sample_type]["shape_error"].append(pid)
                print(f"[跳过-{type_name}] {pid}: CT形状异常 {ct_data.shape} (应为 (64, 64, 3))")
                continue
        except Exception as e:
            skipped_samples[sample_type]["ct_load_error"].append(pid)
            print(f"[跳过-{type_name}] {pid}: CT加载失败 - {e}")
            continue

        # 检查 cfDNA 是否都有
        missing_modalities = [mod for mod in modalities if pid not in cf_data[mod].index]
        if missing_modalities:
            skipped_samples[sample_type]["missing_modality"].append(pid)
            print(f"[跳过-{type_name}] {pid}: 缺失模态数据 {missing_modalities}")
            continue

        # 添加数据
        for mod in modalities:
            X[mod].append(cf_data[mod].loc[pid].values.astype(np.float32))
        CT.append(ct_data.astype(np.float32))
        y.append(label)
        ids.append(pid)

        # 可视化
        # plt.imshow(ct_data / np.max(ct_data))
        # plt.title(f"{type_name} ID: {pid}")
        # plt.axis('off')
        # plt.show()

    print(f"成功处理{type_name}样本: {len(y)}/{len(id_list)}")
    return X, CT, y, ids

# 2. 分别处理癌症/良性
X_cancer, CT_cancer, y_cancer, id_cancer = process_samples(cancer_ids, label=1)
X_benign, CT_benign, y_benign, id_benign = process_samples(benign_ids, label=0)

print(f"\n=== 数据统计 ===")
print(f"恶性样本数: {len(y_cancer)}")
print(f"良性样本数: {len(y_benign)}")

# 打印跳过统计
print(f"\n=== 跳过样本统计 ===")
for sample_type, type_name in [("cancer", "恶性"), ("benign", "良性")]:
    print(f"{type_name}样本跳过原因:")
    for reason, samples in skipped_samples[sample_type].items():
        if len(samples) > 0:
            print(f"  {reason}: {len(samples)}个 - {samples[:5]}{'...' if len(samples) > 5 else ''}")

# 3. 训练测试划分函数
def stratified_split_data(X_dict, CT_list, y_list, id_list, test_size=0.1, seed=42):
    sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=seed)
    train_idx, test_idx = next(sss.split(np.zeros(len(y_list)), y_list))

    def subset(data, idx): return [data[i] for i in idx]

    train = {mod: np.stack(subset(X_dict[mod], train_idx)) for mod in modalities}
    test = {mod: np.stack(subset(X_dict[mod], test_idx)) for mod in modalities}
    train["CT"] = np.stack(subset(CT_list, train_idx))
    test["CT"] = np.stack(subset(CT_list, test_idx))
    train["y"] = np.array([y_list[i] for i in train_idx])
    test["y"] = np.array([y_list[i] for i in test_idx])
    train["id"] = np.array([id_list[i] for i in train_idx])
    test["id"] = np.array([id_list[i] for i in test_idx])
    return train, test

# 4. 拆分并合并训练测试集
train_can, test_can = stratified_split_data(X_cancer, CT_cancer, y_cancer, id_cancer)
train_ben, test_ben = stratified_split_data(X_benign, CT_benign, y_benign, id_benign)
train_all = {key: np.concatenate([train_can[key], train_ben[key]]) for key in train_can}
test_all = {key: np.concatenate([test_can[key], test_ben[key]]) for key in test_can}

# 5. 保存为 .npz
np.savez_compressed(save_train_path, **train_all)
np.savez_compressed(save_test_path, **test_all)
print(f"\n✅ 数据保存完成:")
print(f"训练集: {save_train_path} (样本数: {len(train_all['y'])})")
print(f"测试集: {save_test_path} (样本数: {len(test_all['y'])})")

# 下面是分亚型

In [None]:
import os
import glob
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedShuffleSplit

# 参数设定
modalities = ["Frag", "CNV", "PFE", "NDR", "NDR2K"]
cf_base_dir = "/home/maweicheng/database/cfDNA"
ct_base_dir = "/home/maweicheng/database/khct/patch_output/split_libs_new_different"
save_train_path = os.path.join(cf_base_dir, "subtype/train.npz")
save_test_path = os.path.join(cf_base_dir, "subtype/test.npz")

# Library ID 列表
LUAD_ids = ["Lib-004","Lib-019", "Lib-020", "Lib-025", "Lib-028", "Lib-037", "Lib-038", "Lib-039", "Lib-040", "Lib-068", "Lib-069", "Lib-076",
"Lib-077", "Lib-082", "Lib-086", "Lib-087", "Lib-093", "Lib-094", "Lib-108", "Lib-110", "Lib-122", "Lib-124", "Lib-137", "Lib-150",
"Lib-157", "Lib-158", "Lib-159", "Lib-160"]

LUSC_ids = ["Lib-021", "Lib-022", "Lib-032", "Lib-083", "Lib-090", "Lib-126", "Lib-128", "Lib-140", "Lib-145", "Lib-149"]

print(f"LUAD样本ID数: {len(LUAD_ids)}")
print(f"LUSC样本ID数: {len(LUSC_ids)}")

# 1. 读取所有模态数据（合并癌症/良性）
cf_data = {}
for mod in modalities:
    df_all = pd.read_csv(os.path.join(cf_base_dir,"cancer", f"cancer_{mod}.csv"), index_col=0)
    df_all = df_all.fillna(df_all.mean())  # 缺失值填充
    cf_data[mod] = df_all

# 统计跳过的样本
skipped_samples = {"LUAD": {"no_ct_files": [], "multiple_ct_files": [], "ct_load_error": [], "shape_error": [], "missing_modality": []},
                  "LUSC": {"no_ct_files": [], "multiple_ct_files": [], "ct_load_error": [], "shape_error": [], "missing_modality": []}}

# 通用处理函数
def process_samples(id_list, label):
    X = {mod: [] for mod in modalities}
    CT = []
    y = []
    ids = []
    sample_type = "LUAD" if label == 1 else "LUSC"

    for pid in id_list:
        # 加载 CT 图像
        ct_pattern = os.path.join(ct_base_dir, pid, "*.npz")
        ct_files = glob.glob(ct_pattern)
        
        if len(ct_files) == 0:
            skipped_samples[sample_type]["no_ct_files"].append(pid)
            print(f"[跳过-{sample_type}] {pid}: 未找到CT文件")
            continue
        elif len(ct_files) > 1:
            skipped_samples[sample_type]["multiple_ct_files"].append(pid)
            print(f"[跳过-{sample_type}] {pid}: 找到多个CT文件 ({len(ct_files)}个)")
            continue

        try:
            ct_npz = np.load(ct_files[0])
            ct_data = ct_npz["data"]
            if ct_data.shape == (3, 64, 64):
                ct_data = np.transpose(ct_data, (1, 2, 0))  # (64, 64, 3)
                
            if ct_data.shape != (64, 64, 3):
                skipped_samples[sample_type]["shape_error"].append(pid)
                print(f"[跳过-{sample_type}] {pid}: CT形状异常 {ct_data.shape} (应为 (64, 64, 3))")
                continue
        except Exception as e:
            skipped_samples[sample_type]["ct_load_error"].append(pid)
            print(f"[跳过-{sample_type}] {pid}: CT加载失败 - {e}")
            continue

        # 检查 cfDNA 是否都有
        missing_modalities = [mod for mod in modalities if pid not in cf_data[mod].index]
        if missing_modalities:
            skipped_samples[sample_type]["missing_modality"].append(pid)
            print(f"[跳过-{sample_type}] {pid}: 缺失模态数据 {missing_modalities}")
            continue

        # 添加数据
        for mod in modalities:
            X[mod].append(cf_data[mod].loc[pid].values.astype(np.float32))
        CT.append(ct_data.astype(np.float32))
        y.append(label)
        ids.append(pid)

        # # 可视化
        # plt.imshow(ct_data / np.max(ct_data))
        # plt.title(f"{sample_type} ID: {pid}")
        # plt.axis('off')
        # plt.show()

    print(f"成功处理{sample_type}样本: {len(y)}/{len(id_list)}")
    return X, CT, y, ids

X_cancer, CT_cancer, y_cancer, id_cancer = process_samples(LUAD_ids, label=1)
X_benign, CT_benign, y_benign, id_benign = process_samples(LUSC_ids, label=0)

print(f"\n=== 数据统计 ===")
print(f"LUAD样本数: {len(y_cancer)}")
print(f"LUSC样本数: {len(y_benign)}")

# 打印跳过统计
print(f"\n=== 跳过样本统计 ===")
for sample_type in ["LUAD", "LUSC"]:
    print(f"{sample_type}样本跳过原因:")
    for reason, samples in skipped_samples[sample_type].items():
        if len(samples) > 0:
            print(f"  {reason}: {len(samples)}个 - {samples[:5]}{'...' if len(samples) > 5 else ''}")

# 3. 训练测试划分函数
def stratified_split_data(X_dict, CT_list, y_list, id_list, test_size=0.1, seed=42):
    sss = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=seed)
    train_idx, test_idx = next(sss.split(np.zeros(len(y_list)), y_list))

    def subset(data, idx): return [data[i] for i in idx]

    train = {mod: np.stack(subset(X_dict[mod], train_idx)) for mod in modalities}
    test = {mod: np.stack(subset(X_dict[mod], test_idx)) for mod in modalities}
    train["CT"] = np.stack(subset(CT_list, train_idx))
    test["CT"] = np.stack(subset(CT_list, test_idx))
    train["y"] = np.array([y_list[i] for i in train_idx])
    test["y"] = np.array([y_list[i] for i in test_idx])
    train["id"] = np.array([id_list[i] for i in train_idx])
    test["id"] = np.array([id_list[i] for i in test_idx])
    return train, test

# 4. 拆分并合并训练测试集
train_can, test_can = stratified_split_data(X_cancer, CT_cancer, y_cancer, id_cancer)
train_ben, test_ben = stratified_split_data(X_benign, CT_benign, y_benign, id_benign)
train_all = {key: np.concatenate([train_can[key], train_ben[key]]) for key in train_can}
test_all = {key: np.concatenate([test_can[key], test_ben[key]]) for key in test_can}

# 5. 保存为 .npz
np.savez_compressed(save_train_path, **train_all)
np.savez_compressed(save_test_path, **test_all)
print(f"\n✅ 数据保存完成:")
print(f"训练集: {save_train_path} (样本数: {len(train_all['y'])})")
print(f"测试集: {save_test_path} (样本数: {len(test_all['y'])})")