In [1]:
import os
import h5py
import nibabel as nib
import numpy as np
from tqdm import tqdm

# Define the base directories
preprocessed_dir = '../../data/1_preprocessed'
output_dir = '../../data/2_final_h5'

# Create the output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Collect the 'sub-*' subject directories. Sorted so that the processing order
# (and therefore the log output) does not depend on filesystem listing order.
subject_dirs = sorted(
    d for d in os.listdir(preprocessed_dir)
    if d.startswith('sub-') and os.path.isdir(os.path.join(preprocessed_dir, d))
)

# HDF5 dataset name -> filename suffix of the NIfTI volume it is read from.
# Insertion order is the order in which the datasets are written.
modality_suffixes = {
    'CT': '_ct.nii.gz',
    'PET': '_pet_desc-suv.nii.gz',
    'Lesion_mask': '_seg-lesion.nii.gz',
}

# Subjects whose conversion raised; reported once the loop has finished.
failed_subjects = []

# Process each subject
for subject_dir in tqdm(subject_dirs, desc="Processing Subjects"):
    subject_id_with_prefix = subject_dir
    # Strip only the leading 'sub-' (guaranteed by the filter above);
    # str.replace would also remove any later occurrence inside the ID.
    subject_id = subject_id_with_prefix[len('sub-'):]

    # Build the expected input path for every modality
    modality_paths = {
        dataset_name: os.path.join(
            preprocessed_dir, subject_dir, f'{subject_id_with_prefix}{suffix}'
        )
        for dataset_name, suffix in modality_suffixes.items()
    }

    # Skip subjects that do not have all three volumes
    if not all(os.path.exists(p) for p in modality_paths.values()):
        print(f"Skipping {subject_id}: Missing one or more required files.")
        continue

    # Final output path, plus a temporary sibling that is written first.
    # The temporary name does not end in '.h5', so a half-written file can
    # never be picked up by code that lists '*.h5' in the output directory.
    output_h5_path = os.path.join(output_dir, f'{subject_id}.h5')
    tmp_h5_path = output_h5_path + '.tmp'

    try:
        with h5py.File(tmp_h5_path, 'w') as hf:
            for dataset_name, nifti_path in modality_paths.items():
                # get_fdata() yields a floating-point array, so every dataset
                # (including the lesion mask) is stored as floats.
                volume = nib.load(nifti_path).get_fdata()
                hf.create_dataset(dataset_name, data=volume, compression="gzip")
        # Publish the file only after all datasets were written and the
        # HDF5 handle has been closed.
        os.replace(tmp_h5_path, output_h5_path)
    except Exception as e:
        print(f"Error processing {subject_id}: {e}")
        failed_subjects.append(subject_id)
        # Remove the partial temporary file; an '.h5' left by an earlier
        # successful run (if any) is deliberately left untouched.
        if os.path.exists(tmp_h5_path):
            os.remove(tmp_h5_path)

print("\nDataset creation complete.")
print(f"HDF5 files are saved in: {os.path.abspath(output_dir)}")
if failed_subjects:
    print(f"Failed subjects ({len(failed_subjects)}): {failed_subjects}")

Processing Subjects: 100%|██████████| 1030/1030 [12:42<00:00,  1.35it/s]


Dataset creation complete.
HDF5 files are saved in: /home/yaobo/Project/Lung-Cancer-Subtyping-Classification-V4.0/data/2_final_h5





In [15]:

import pandas as pd
import json
from sklearn.model_selection import StratifiedKFold
import os

# Define file paths
participants_path = '../../metadata/participants.tsv'
splits_json_path = '../../metadata/splits.json'
h5_dir = '../../data/2_final_h5'

# 1. Load and prepare the data
try:
    # Read subject_id as text: it is compared against HDF5 file stems (strings),
    # so letting pandas infer an integer dtype would make every match fail and
    # would also drop leading zeros.
    df_metadata = pd.read_csv(participants_path, sep='\t', dtype={'subject_id': str})
except FileNotFoundError:
    raise FileNotFoundError(f"无法找到文件: {participants_path}")

# Make sure the key columns exist; report exactly which ones are missing
required_cols = ['subject_id', 'center', 'Pathology']
missing_cols = [col for col in required_cols if col not in df_metadata.columns]
if missing_cols:
    raise ValueError(f"'{participants_path}' 文件中缺少以下一列或多列: {missing_cols}")

try:
    # Derive subject_id from each .h5 file name (e.g. 'ID123.h5' -> 'ID123').
    # splitext removes only the trailing extension, unlike str.replace which
    # would also delete '.h5' occurring elsewhere in the name.
    h5_subject_ids = {
        os.path.splitext(f)[0] for f in os.listdir(h5_dir) if f.endswith('.h5')
    }
except FileNotFoundError:
    raise FileNotFoundError(f"无法找到HDF5数据目录: {h5_dir}。请先运行第一个单元格生成HDF5文件。")

print(f"从 '{participants_path}' 中读取到 {len(df_metadata)} 个受试者。")
print(f"在 '{h5_dir}' 目录中找到 {len(h5_subject_ids)} 个HDF5文件。")

# Keep only subjects that have both a TSV record and an HDF5 file
df_filtered = df_metadata[df_metadata['subject_id'].isin(h5_subject_ids)].copy()
print(f"元数据和实际数据取交集后，用于划分的受试者总数为: {len(df_filtered)}")

if len(df_filtered) == 0:
    raise ValueError("没有找到任何共有的受试者ID，无法进行划分。请检查ID格式是否一致。")

# A subject listed twice could land in two different validation folds,
# i.e. the folds would no longer be disjoint.
duplicated_ids = (
    df_filtered.loc[df_filtered['subject_id'].duplicated(), 'subject_id'].unique().tolist()
)
if duplicated_ids:
    raise ValueError(
        f"'{participants_path}' 中存在重复的 subject_id，无法保证各折互不重叠: {duplicated_ids}"
    )

# A missing center or pathology would produce a NaN stratum label; fail with a
# clear message instead of letting the splitter choke on it.
missing_label_rows = df_filtered[['center', 'Pathology']].isna().any(axis=1)
if missing_label_rows.any():
    affected_ids = df_filtered.loc[missing_label_rows, 'subject_id'].tolist()
    raise ValueError(f"以下受试者缺少 center 或 Pathology 信息，无法分层: {affected_ids}")

# 2. Build the stratification label: one stratum per (center, pathology) pair.
# astype(str) keeps the concatenation valid even if a column was parsed as numeric.
df_filtered['strata_group'] = (
    df_filtered['center'].astype(str) + '_' + df_filtered['Pathology'].astype(str)
)

# 3. Stratified K-fold split (fixed seed for reproducibility)
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Split on the filtered table only
X = df_filtered['subject_id']
y = df_filtered['strata_group']

# Only the validation IDs of each fold are stored; the training indices
# returned by the splitter are discarded.
splits_data = [
    {"fold": fold_idx, "val": X.iloc[val_idx].tolist()}
    for fold_idx, (_, val_idx) in enumerate(skf.split(X, y))
]

# 4. Write the result to the JSON file
with open(splits_json_path, 'w', encoding='utf-8') as f:
    json.dump(splits_data, f, indent=2)

print(f"成功创建并更新了 '{splits_json_path}'，包含 {n_splits} 折的分层划分。")
print("每个折都包含了该折对应的验证集受试者ID。")


从 '../../metadata/participants.tsv' 中读取到 1061 个受试者。
在 '../../data/2_final_h5' 目录中找到 1030 个HDF5文件。
元数据和实际数据取交集后，用于划分的受试者总数为: 1030
成功创建并更新了 '../../metadata/splits.json'，包含 5 折的分层划分。
每个折都包含了该折对应的验证集受试者ID。


In [17]:
# 修正后的统计单元格代码
import pandas as pd
import json

# Load participants.tsv and splits.json
participants_path = '../../metadata/participants.tsv'
splits_json_path = '../../metadata/splits.json'

# subject_id is read as text so it compares equal to the IDs stored in the JSON
# regardless of whether the IDs look numeric.
df_participants = pd.read_csv(participants_path, sep='\t', dtype={'subject_id': str})
df_participants['strata_group'] = df_participants['center'] + '_' + df_participants['Pathology']

with open(splits_json_path, 'r', encoding='utf-8') as f:
    splits_data = json.load(f)

# One record per fold: {strata_group: count, ..., 'fold': fold index}
stats_list = []

# Walk through every fold
for fold_info in splits_data:
    fold_num = fold_info['fold']
    # IDs carry no 'sub-' prefix; normalised to str to match the column dtype
    val_subjects = [str(subject) for subject in fold_info['val']]

    # Match directly on the raw 'subject_id' column (no prefix on either side)
    val_df = df_participants[df_participants['subject_id'].isin(val_subjects)]

    # Count validation subjects per strata group
    counts = val_df['strata_group'].value_counts().to_dict()

    # Tag the record with its fold index
    counts['fold'] = fold_num
    stats_list.append(counts)

# Turn the records into a fold-indexed table for display
stats_df = pd.DataFrame(stats_list).set_index('fold')

# A strata group absent from some fold has no key in that fold's record and
# therefore shows up as NaN (which also turns the column into floats);
# report it as an explicit integer 0 instead.
stats_df = stats_df.fillna(0).astype(int)

print("Distribution of strata groups across folds (validation sets):")
print(stats_df)

Distribution of strata groups across folds (validation sets):
      AKH_ADC  Neimeng_ADC  AKH_SCC  Neimeng_SCC  AKH_SCLC  Neimeng_SCLC
fold                                                                    
0          68           47       40           26        16             9
1          68           47       40           26        16             9
2          68           46       40           27        16             9
3          68           47       40           27        15             9
4          68           47       39           26        16            10
