In [None]:
'''
 # @ Author: Yaobo Jia
 # @ Create Time: 2025-05-27 14:15:28
 # @ Modified by: Yaobo Jia
 # @ Modified time: 2025-05-27 14:48:57
 # @ Description: 不同模态的nifti数据的预处理工作；
 # 完成数据的预处理，resample使得所有数据的spacing统一；
 # 同时对同一组数据的不同模态（CT、PET）取得相同的size
 '''
import os
import numpy as np
import nibabel as nib
import pandas as pd
from scipy.ndimage import zoom

def load_nifti(file_path):
    """Load a NIfTI file and return its voxel data, affine and voxel sizes.

    Args:
        file_path: Path handed to ``nib.load``.

    Returns:
        A ``(data, affine, spacing)`` tuple: the array from ``get_fdata()``,
        the image affine, and the header zooms.
    """
    image = nib.load(file_path)
    voxels = image.get_fdata()
    return voxels, image.affine, image.header.get_zooms()

def save_nifti(data, affine, path):
    """Wrap ``data`` and ``affine`` in a ``Nifti1Image`` and write it to ``path``."""
    nib.save(nib.Nifti1Image(data, affine), path)

def resample(data, in_spacing, tgt_spacing, affine, order=1):
    """Resample a volume from ``in_spacing`` to ``tgt_spacing``.

    Args:
        data: Array to resample; one axis per spacing entry.
        in_spacing: Current voxel size per axis (tuple, list or array).
        tgt_spacing: Desired voxel size per axis (tuple, list or array).
        affine: 4x4 voxel-to-world matrix that belongs to ``data``.
        order: Spline order forwarded to ``scipy.ndimage.zoom``
            (callers in this file pass 0 for label masks).

    Returns:
        ``(resampled_data, new_affine)``. When the two spacings already
        match, ``data`` and ``affine`` are returned as-is (no copy).
    """
    # Header zooms arrive as a plain tuple; coerce both sides so the
    # element-wise division below works for any sequence type.
    in_spacing = np.asarray(in_spacing, dtype=float)
    tgt_spacing = np.asarray(tgt_spacing, dtype=float)
    if np.allclose(in_spacing, tgt_spacing):
        return data, affine

    zoom_factors = in_spacing / tgt_spacing
    out = zoom(data, zoom_factors, order=order)

    # Work on a float copy so an integer-typed affine is not truncated when
    # the rescaled columns are written back.
    new_affine = np.array(affine, dtype=float)
    # Keep each axis direction (unit-length columns) and the translation;
    # only the column lengths change to the target voxel size.
    directions = new_affine[:3, :3] / np.linalg.norm(new_affine[:3, :3], axis=0)
    new_affine[:3, :3] = directions @ np.diag(tgt_spacing)
    return out, new_affine

def get_bb_min_max(data, threshold=0):
    """Return the bounding box of all voxels strictly above ``threshold``.

    Returns:
        ``(lower, upper)`` index arrays with one entry per axis of ``data``;
        both corners are inclusive.
    """
    voxel_indices = np.argwhere(data > threshold)
    lower = voxel_indices.min(axis=0)
    upper = voxel_indices.max(axis=0)
    return lower, upper
    
def transform_coordinates(center, source_affine, target_affine):
    """Map voxel coordinates from one image grid to another via world space.

    ``center`` is taken through ``source_affine`` into world coordinates and
    then through the inverse of ``target_affine``. The result is rounded to
    integer voxel indices of the target grid. The direction is decided purely
    by the argument order (this file calls it both PET->CT and CT->PET).
    """
    in_world = nib.affines.apply_affine(source_affine, center)
    world_to_target = np.linalg.inv(target_affine)
    in_target = nib.affines.apply_affine(world_to_target, in_world)
    return np.rint(in_target).astype(int)

def crop(pet, ct, pet_affine, ct_affine):
    """Crop PET and CT to the overlap of their foreground bounding boxes.

    The PET box (voxels > 0) is mapped into CT voxel indices, intersected with
    the CT box (voxels > -1000), and the intersection is mapped back into PET
    indices for the PET crop.

    Args:
        pet: PET volume.
        ct: CT volume.
        pet_affine: Voxel-to-world affine of ``pet``.
        ct_affine: Voxel-to-world affine of ``ct``.

    Returns:
        ``(pet_crop, ct_crop, over_min, over_max, min_ct2pet, max_ct2pet)``:
        the two cropped arrays, the inclusive overlap corners in CT indices,
        and the same corners expressed in PET indices.

    Raises:
        ValueError: If the two bounding boxes do not overlap.
    """
    min_pet, max_pet = get_bb_min_max(pet, threshold=0)
    min_ct, max_ct = get_bb_min_max(ct, threshold=-1000)

    # Express the PET box in CT voxel indices. If an axis runs in opposite
    # directions in the two grids the transformed corners swap roles, so they
    # are re-sorted per axis.
    corner_a = transform_coordinates(min_pet, pet_affine, ct_affine)
    corner_b = transform_coordinates(max_pet, pet_affine, ct_affine)
    min_pet_ct = np.minimum(corner_a, corner_b)
    max_pet_ct = np.maximum(corner_a, corner_b)

    # Overlap of the two boxes, in CT indices (both corners inclusive).
    over_min = np.maximum(min_pet_ct, min_ct)
    over_max = np.minimum(max_pet_ct, max_ct)
    if np.any(over_max < over_min):
        # Without this guard the slices below would come back empty, or wrap
        # around through negative indices and return the wrong region.
        raise ValueError(
            f"PET and CT bounding boxes do not overlap "
            f"(overlap min={over_min.tolist()}, max={over_max.tolist()})"
        )

    ct_crop = ct[
        over_min[0]:over_max[0] + 1,
        over_min[1]:over_max[1] + 1,
        over_min[2]:over_max[2] + 1
    ]

    # Map the overlap box back into PET indices, again re-sorting the corners.
    corner_a = transform_coordinates(over_min, ct_affine, pet_affine)
    corner_b = transform_coordinates(over_max, ct_affine, pet_affine)
    # Clamp at 0: a negative start index would wrap around from the array end.
    min_ct2pet = np.maximum(np.minimum(corner_a, corner_b), 0)
    max_ct2pet = np.maximum(corner_a, corner_b)

    pet_crop = pet[
        min_ct2pet[0]:max_ct2pet[0] + 1,
        min_ct2pet[1]:max_ct2pet[1] + 1,
        min_ct2pet[2]:max_ct2pet[2] + 1
    ]

    return pet_crop, ct_crop, over_min, over_max, min_ct2pet, max_ct2pet
    

def process_case(ct_path, pet_path, lung_path, mask_path, target_spacing=None):
    """Resample and crop one case (CT, PET, lung mask, lesion mask).

    Args:
        ct_path: Path of the CT NIfTI file.
        pet_path: Path of the PET NIfTI file.
        lung_path: Path of the lung segmentation.
        mask_path: Path of the lesion segmentation.
        target_spacing: Voxel size to resample to. When omitted, the
            module-level ``TARGET_SPACING`` is used, which is only defined
            when the file runs as a script.

    Returns:
        ``(pet_cropped, ct_cropped, lung_cropped, mask_cropped, affine)``.
        ``affine`` is the resampled PET affine with its translation zeroed.
    """
    if target_spacing is None:
        # Backward-compatible fallback to the constant set in the script entry point.
        target_spacing = TARGET_SPACING

    pet, pet_affine, pet_spacing = load_nifti(pet_path)
    ct, ct_affine, ct_spacing = load_nifti(ct_path)
    lung, _, _ = load_nifti(lung_path)
    mask, _, _ = load_nifti(mask_path)

    pet_resampled, pet_resampled_affine = resample(pet, pet_spacing, target_spacing, pet_affine)
    ct_resampled, ct_resampled_affine = resample(ct, ct_spacing, target_spacing, ct_affine)
    # The masks are resampled with the CT / PET geometry rather than their own
    # headers, using nearest-neighbour (order=0) so labels stay discrete.
    # NOTE(review): assumes the lung mask shares the CT grid and the lesion
    # mask shares the PET grid - confirm against how the masks were produced.
    lung_resampled, _ = resample(lung, ct_spacing, target_spacing, ct_affine, order=0)
    mask_resampled, _ = resample(mask, pet_spacing, target_spacing, pet_affine, order=0)

    pet_cropped, ct_cropped, ct_min, ct_max, pet_min, pet_max = crop(
        pet_resampled, ct_resampled, pet_resampled_affine, ct_resampled_affine
    )
    # The lung mask follows the CT crop indices, the lesion mask the PET ones.
    lung_cropped = lung_resampled[
        ct_min[0]:ct_max[0] + 1,
        ct_min[1]:ct_max[1] + 1,
        ct_min[2]:ct_max[2] + 1
    ]
    mask_cropped = mask_resampled[
        pet_min[0]:pet_max[0] + 1,
        pet_min[1]:pet_max[1] + 1,
        pet_min[2]:pet_max[2] + 1
    ]
    # Output affine: resampled PET orientation/spacing with the origin set to 0.
    affine = pet_resampled_affine.copy()
    affine[:3, 3] = 0

    return pet_cropped, ct_cropped, lung_cropped, mask_cropped, affine


if __name__ == '__main__':
    # Script entry point: preprocess every PID listed in each center's Excel sheet.
    BASE = '/mnt/HDD_1/FDG/LungCancer_Subtyping'
    TARGET_SPACING = np.array([4.07283, 4.07283, 3.0], dtype=float)
    centers = ['Neimeng_nifti_425', 'AKH_nifti_637']

    for center in centers:
        df = pd.read_excel(f'data/{center}_image_info.xlsx', sheet_name='Sheet1', dtype={'PID': str})

        # Lesion-mask file names are not fixed, so they are looked up by PID
        # prefix. List the directory once per center instead of once per PID,
        # and sort it so the chosen file is reproducible.
        lesion_dir = f'{BASE}/data/lesion_seg/{center}'
        lesion_files = sorted(f for f in os.listdir(lesion_dir) if f.endswith('.nii.gz'))

        for pid in df['PID']:
            print(f'[INFO] Processing {center} {pid}')
            root = f'{BASE}/data/nifti/{center}/{pid}'
            ct_path = f'{root}/CT.nii.gz'
            pet_path = f'{root}/PET.nii.gz'
            lung_path = f'data/lung-seg/{center}/{pid}/lung.nii.gz'

            # A bare prefix test would let PID "15" pick up "156....nii.gz".
            # Reject names where the PID is directly followed by another digit,
            # i.e. where the file belongs to a longer PID.
            mask_name = next(
                (f for f in lesion_files
                 if f.startswith(pid) and not f[len(pid):len(pid) + 1].isdigit()),
                None,
            )
            if mask_name is None:
                print(f'  [WARN] mask missing, skip')
                continue
            mask_path = os.path.join(lesion_dir, mask_name)

            pet_cropped, ct_cropped, lung_cropped, mask_cropped, affine = process_case(ct_path, pet_path, lung_path, mask_path)

            # All four outputs are written with the same affine.
            out_dir = f'data/preprocessed/{center}/{pid}'
            os.makedirs(out_dir, exist_ok=True)
            save_nifti(ct_cropped, affine, f'{out_dir}/CT_rs.nii.gz')
            save_nifti(pet_cropped, affine, f'{out_dir}/PET_rs.nii.gz')
            save_nifti(mask_cropped, affine, f'{out_dir}/Mask_rs.nii.gz')
            save_nifti(lung_cropped, affine, f'{out_dir}/Lung_rs.nii.gz')


## 重采样之后的数据位置
CT： /mnt/HDD_1/FDG/LungCancer_Subtyping/data/processed_yaobo/resampled_nifti/{center}/{PID}/CT.nii.gz   
PET： /mnt/HDD_1/FDG/LungCancer_Subtyping/data/processed_yaobo/resampled_nifti/{center}/{PID}/PET.nii.gz  
lung mask:  /mnt/HDD_1/FDG/LungCancer_Subtyping/data/processed_yaobo/lung_seg/{center}/{PID}/lung_seg.nii.gz  
lesion mask:  /mnt/HDD_1/FDG/LungCancer_Subtyping/data/processed_yaobo/lesion_seg/{center}/{PID}/lesion_seg.nii.gz  

## 完成文件完整性检查

In [1]:
# 文件完整性检查
import pandas as pd
import os
from pathlib import Path
from collections import defaultdict

def check_file_integrity():
    """
    Check that every sample in the filtered metadata has all resampled files.

    Reads ``metadata/metadata_filtered.csv`` (columns ``center`` and ``PID``)
    and tests, per sample, whether the CT, PET, lung-mask and lesion-mask
    files exist at their fixed locations. Prints a summary overall, per
    modality and per center.

    Returns:
        ``(complete_samples, incomplete_samples, missing_files)``. The first
        two are lists of ``{'center', 'pid', 'files'}`` dicts; ``missing_files``
        maps a modality name to the ``"center/pid"`` keys whose file is absent.
    """
    def percent(part, whole):
        # An empty metadata table would otherwise raise ZeroDivisionError.
        return part / whole * 100 if whole else 0.0

    print("=== 重采样数据文件完整性检查 ===")

    # Load the sample list; PID is forced to str so leading zeros survive.
    metadata_path = "metadata/metadata_filtered.csv"
    metadata_df = pd.read_csv(metadata_path, dtype={'PID': str})
    total = len(metadata_df)

    print(f"Metadata中的样本数: {total}")
    print(f"包含的中心: {metadata_df['center'].value_counts().to_dict()}")

    # Path template per modality.
    data_paths = {
        'CT': '/mnt/HDD_1/FDG/LungCancer_Subtyping/data/processed_yaobo/resampled_nifti/{center}/{pid}/CT.nii.gz',
        'PET': '/mnt/HDD_1/FDG/LungCancer_Subtyping/data/processed_yaobo/resampled_nifti/{center}/{pid}/PET.nii.gz',
        'lung_mask': '/mnt/HDD_1/FDG/LungCancer_Subtyping/data/processed_yaobo/lung_seg/{center}/{pid}/lung_seg.nii.gz',
        'lesion_mask': '/mnt/HDD_1/FDG/LungCancer_Subtyping/data/processed_yaobo/lesion_seg/{center}/{pid}/lesion_seg.nii.gz'
    }

    complete_samples = []
    incomplete_samples = []
    missing_files = defaultdict(list)

    print(f"\n=== 检查metadata中的样本 ===")

    for center, pid in zip(metadata_df['center'], metadata_df['PID']):
        files = {
            modality: os.path.exists(template.format(center=center, pid=pid))
            for modality, template in data_paths.items()
        }
        for modality, exists in files.items():
            if not exists:
                missing_files[modality].append(f"{center}/{pid}")

        sample_status = {'center': center, 'pid': pid, 'files': files}
        if all(files.values()):
            complete_samples.append(sample_status)
        else:
            incomplete_samples.append(sample_status)

    # Overall summary.
    print(f"\n=== 完整性统计 ===")
    print(f"完整样本数: {len(complete_samples)} / {total} ({percent(len(complete_samples), total):.1f}%)")
    print(f"不完整样本数: {len(incomplete_samples)}")

    # Missing files per modality (at most 10 listed in full, otherwise the first 5).
    print(f"\n=== 各模态缺失统计 ===")
    for modality, missing_list in missing_files.items():
        print(f"{modality}: 缺失 {len(missing_list)} 个文件")
        if len(missing_list) <= 10:
            for missing in missing_list:
                print(f"  - {missing}")
        else:
            for missing in missing_list[:5]:
                print(f"  - {missing}")
            print(f"  ... 还有 {len(missing_list) - 5} 个缺失文件")

    # Per-center summary; complete samples are counted once up front.
    print(f"\n=== 各中心完整性统计 ===")
    complete_per_center = defaultdict(int)
    for sample in complete_samples:
        complete_per_center[sample['center']] += 1
    for center in metadata_df['center'].unique():
        n_center = int((metadata_df['center'] == center).sum())
        n_complete = complete_per_center[center]
        print(f"{center}: {n_complete} / {n_center} 完整 ({percent(n_complete, n_center):.1f}%)")

    return complete_samples, incomplete_samples, missing_files

# Run the file-integrity check and keep the results for the report cell further down.
complete_samples, incomplete_samples, missing_files = check_file_integrity()

=== 重采样数据文件完整性检查 ===
Metadata中的样本数: 1030
包含的中心: {'AKH_nifti_637': 618, 'Neimeng_nifti_425': 412}

=== 检查metadata中的样本 ===

=== 完整性统计 ===
完整样本数: 1030 / 1030 (100.0%)
不完整样本数: 0

=== 各模态缺失统计 ===

=== 各中心完整性统计 ===
AKH_nifti_637: 618 / 618 完整 (100.0%)
Neimeng_nifti_425: 412 / 412 完整 (100.0%)


In [2]:
# 检查文件系统中存在但metadata中缺失的样本
def check_extra_files():
    """
    Find sample directories on disk that the filtered metadata does not list.

    For each output root (resampled images, lung masks, lesion masks) and each
    of the two centers, every sub-directory name is treated as a PID and
    compared against the ``center/PID`` keys from
    ``metadata/metadata_filtered.csv``.

    Returns:
        A ``defaultdict(set)`` mapping the root name to the set of
        ``"center/pid"`` keys found on disk but absent from the metadata.
    """
    print(f"\n=== 检查额外文件 (存在于文件系统但不在metadata中) ===")

    # Keys of all samples the metadata knows about.
    listed = pd.read_csv("metadata/metadata_filtered.csv", dtype={'PID': str})
    metadata_samples = {
        f"{center}/{pid}" for center, pid in zip(listed['center'], listed['PID'])
    }

    # Root directories to scan.
    base_dirs = {
        'resampled_nifti': '/mnt/HDD_1/FDG/LungCancer_Subtyping/data/processed_yaobo/resampled_nifti',
        'lung_seg': '/mnt/HDD_1/FDG/LungCancer_Subtyping/data/processed_yaobo/lung_seg',
        'lesion_seg': '/mnt/HDD_1/FDG/LungCancer_Subtyping/data/processed_yaobo/lesion_seg'
    }
    center_names = ['AKH_nifti_637', 'Neimeng_nifti_425']

    extra_samples = defaultdict(set)

    for dir_type, base_dir in base_dirs.items():
        if not os.path.exists(base_dir):
            print(f"⚠ 目录不存在: {base_dir}")
            continue

        print(f"\n检查目录: {dir_type}")

        for center in center_names:
            center_dir = os.path.join(base_dir, center)
            if not os.path.exists(center_dir):
                print(f"  ⚠ 中心目录不存在: {center_dir}")
                continue

            # Each sub-directory of a center is one patient.
            try:
                for entry in os.listdir(center_dir):
                    if not os.path.isdir(os.path.join(center_dir, entry)):
                        continue
                    sample_key = f"{center}/{entry}"
                    if sample_key in metadata_samples:
                        continue
                    extra_samples[dir_type].add(sample_key)
                    print(f"    + 额外样本: {sample_key}")
            except PermissionError:
                print(f"  ❌ 无权限访问: {center_dir}")
            except Exception as e:
                print(f"  ❌ 读取错误: {center_dir} - {e}")

    # Union over all roots: a sample counts once even if several roots contain it.
    all_extra_samples = set()
    for found in extra_samples.values():
        all_extra_samples |= found

    print(f"\n=== 额外样本汇总 ===")
    if not all_extra_samples:
        print("✓ 没有发现额外样本，所有文件系统中的样本都在metadata中有记录")
        return extra_samples

    print(f"发现 {len(all_extra_samples)} 个额外样本 (存在于文件系统但不在metadata中):")
    for sample in sorted(all_extra_samples):
        print(f"  - {sample}")

    # Tally per center (the part of the key before the slash).
    print(f"\n按中心统计额外样本:")
    center_extra = defaultdict(int)
    for sample in all_extra_samples:
        center_extra[sample.split('/')[0]] += 1

    for center, count in center_extra.items():
        print(f"  {center}: {count} 个额外样本")

    return extra_samples

# Run the extra-sample scan; `extra_samples` is reused by the later cells.
extra_samples = check_extra_files()


=== 检查额外文件 (存在于文件系统但不在metadata中) ===

检查目录: resampled_nifti
    + 额外样本: AKH_nifti_637/GRUJA-LINA20230323
    + 额外样本: AKH_nifti_637/HOFER-EMMERICH20150224
    + 额外样本: AKH_nifti_637/AHAMER-GERHARD20240213
    + 额外样本: AKH_nifti_637/STARCH-JOHANN-DI20230810
    + 额外样本: AKH_nifti_637/SAVIC-PERO20090924
    + 额外样本: AKH_nifti_637/CZERNY-JOHANN20190801
    + 额外样本: AKH_nifti_637/GEBAUER-GABRIELE20100929
    + 额外样本: AKH_nifti_637/HOBUSCH-DIETHARD20100817
    + 额外样本: AKH_nifti_637/REINER-ERNST20220504
    + 额外样本: AKH_nifti_637/TORDAY-ILSE-MARIA20110308
    + 额外样本: AKH_nifti_637/PUTZ-THERESIA20230831
    + 额外样本: AKH_nifti_637/GINDL-PFLANZ-DIANA20230814
    + 额外样本: AKH_nifti_637/HAJRULA-CENGIZ20210223
    + 额外样本: AKH_nifti_637/ERISTAVI-DAVID20091022
    + 额外样本: AKH_nifti_637/VASAROS-BRIGITTA20100315
    + 额外样本: Neimeng_nifti_425/406
    + 额外样本: Neimeng_nifti_425/320
    + 额外样本: Neimeng_nifti_425/270
    + 额外样本: Neimeng_nifti_425/156
    + 额外样本: Neimeng_nifti_425/257

检查目录: lung_seg
    + 额外样本: AKH

In [3]:
# 生成详细的完整性报告
def generate_integrity_report(complete_samples, incomplete_samples, missing_files, extra_samples):
    """
    Build and save the per-sample file-integrity report.

    Args:
        complete_samples: ``{'center', 'pid', 'files'}`` dicts of samples with
            all four files present.
        incomplete_samples: Same structure for samples missing at least one file.
        missing_files: Accepted for call compatibility; not used here.
        extra_samples: Mapping of directory type to a collection of
            ``"center/pid"`` keys that exist on disk but not in the metadata.

    Returns:
        The report ``DataFrame`` (one row per sample). It is also written to
        ``metadata/file_integrity_report.csv``; if there are extra samples they
        are written to ``metadata/extra_files_report.csv``.
    """
    def percent(part, whole):
        # With no samples at all the ratios would divide by zero.
        return part / whole * 100 if whole else 0.0

    print(f"\n=== 生成完整性报告 ===")

    modalities = ['CT', 'PET', 'lung_mask', 'lesion_mask']
    columns = ['center', 'PID', 'status', *modalities, 'missing_modalities']

    all_samples = []

    # Complete samples: every modality is present by definition.
    for sample in complete_samples:
        all_samples.append({
            'center': sample['center'],
            'PID': sample['pid'],
            'status': 'complete',
            'CT': True,
            'PET': True,
            'lung_mask': True,
            'lesion_mask': True,
            'missing_modalities': ''
        })

    # Incomplete samples: copy the per-modality flags and list what is missing.
    for sample in incomplete_samples:
        missing_mods = [modality for modality, exists in sample['files'].items() if not exists]
        all_samples.append({
            'center': sample['center'],
            'PID': sample['pid'],
            'status': 'incomplete',
            'CT': sample['files']['CT'],
            'PET': sample['files']['PET'],
            'lung_mask': sample['files']['lung_mask'],
            'lesion_mask': sample['files']['lesion_mask'],
            'missing_modalities': ', '.join(missing_mods)
        })

    # Explicit columns keep the frame usable (and the CSV headed) even with zero rows.
    report_df = pd.DataFrame(all_samples, columns=columns)
    report_path = "metadata/file_integrity_report.csv"
    report_df.to_csv(report_path, index=False)

    print(f"✓ 完整性报告已保存: {report_path}")

    # Summary statistics.
    print(f"\n=== 最终统计汇总 ===")
    total_metadata_samples = len(complete_samples) + len(incomplete_samples)
    print(f"Metadata中的总样本数: {total_metadata_samples}")
    print(f"完整样本数: {len(complete_samples)} ({percent(len(complete_samples), total_metadata_samples):.1f}%)")
    print(f"不完整样本数: {len(incomplete_samples)} ({percent(len(incomplete_samples), total_metadata_samples):.1f}%)")

    # Share of samples for which each modality file exists.
    print(f"\n各模态文件存在率:")
    for modality in modalities:
        existing_count = report_df[modality].sum()
        print(f"  {modality}: {existing_count}/{total_metadata_samples} ({percent(existing_count, total_metadata_samples):.1f}%)")

    # Extra samples: summed per directory type, so a sample present under
    # several roots is counted once per root.
    total_extra = sum(len(samples) for samples in extra_samples.values())
    if total_extra > 0:
        print(f"\n额外文件 (不在metadata中): {total_extra} 个样本")

        extra_df_data = []
        for dir_type, samples in extra_samples.items():
            for sample in samples:
                center, pid = sample.split('/', 1)
                extra_df_data.append({
                    'center': center,
                    'PID': pid,
                    'found_in_directory': dir_type
                })

        extra_df = pd.DataFrame(extra_df_data)
        extra_path = "metadata/extra_files_report.csv"
        extra_df.to_csv(extra_path, index=False)
        print(f"✓ 额外文件报告已保存: {extra_path}")

    return report_df

# Build the report from the results of the two previous cells.
integrity_report = generate_integrity_report(complete_samples, incomplete_samples, missing_files, extra_samples)

# Preview the first rows of the report.
print(f"\n=== 完整性报告预览 ===")
print(integrity_report.head(10))


=== 生成完整性报告 ===
✓ 完整性报告已保存: metadata/file_integrity_report.csv

=== 最终统计汇总 ===
Metadata中的总样本数: 1030
完整样本数: 1030 (100.0%)
不完整样本数: 0 (0.0%)

各模态文件存在率:
  CT: 1030/1030 (100.0%)
  PET: 1030/1030 (100.0%)
  lung_mask: 1030/1030 (100.0%)
  lesion_mask: 1030/1030 (100.0%)

额外文件 (不在metadata中): 60 个样本
✓ 额外文件报告已保存: metadata/extra_files_report.csv

=== 完整性报告预览 ===
          center                          PID    status    CT   PET  \
0  AKH_nifti_637   ABDALLA-ADEL-AHMED20091023  complete  True  True   
1  AKH_nifti_637         ABT-BRIGITTE20160818  complete  True  True   
2  AKH_nifti_637         ADAMEK-KARIN20211124  complete  True  True   
3  AKH_nifti_637        AHMED-MOHAMED20230731  complete  True  True   
4  AKH_nifti_637         AHMEDI-NAZIF20230111  complete  True  True   
5  AKH_nifti_637  ALIMANY-RAMOS-MARIA20141007  complete  True  True   
6  AKH_nifti_637     ANDABAKA-MARIJAN20091229  complete  True  True   
7  AKH_nifti_637    ANSARI-MEHR-AZITA20141216  complete  True  True   
8  A

In [4]:
# 检查额外文件的完整性并在原始metadata中查找
def check_extra_files_completeness(extra_by_dir=None):
    """
    Check whether the extra samples have all four files and look them up in
    the original metadata.

    Args:
        extra_by_dir: Mapping of directory type to a collection of
            ``"center/pid"`` keys. When omitted, the notebook-level
            ``extra_samples`` variable is used.

    Returns:
        ``(complete_extra_samples, found_in_original, not_found_in_original)``.
        The last two lists are empty when ``metadata/metadata.csv`` cannot be
        read.
    """
    print("=== 检查额外文件的完整性 ===")

    # Fall back to the global produced by the extra-file scan when no mapping is passed.
    source = extra_samples if extra_by_dir is None else extra_by_dir
    all_extra_samples = set()
    for samples in source.values():
        all_extra_samples.update(samples)

    print(f"发现的额外样本总数: {len(all_extra_samples)}")

    # Path template per modality.
    data_paths = {
        'CT': '/mnt/HDD_1/FDG/LungCancer_Subtyping/data/processed_yaobo/resampled_nifti/{center}/{pid}/CT.nii.gz',
        'PET': '/mnt/HDD_1/FDG/LungCancer_Subtyping/data/processed_yaobo/resampled_nifti/{center}/{pid}/PET.nii.gz',
        'lung_mask': '/mnt/HDD_1/FDG/LungCancer_Subtyping/data/processed_yaobo/lung_seg/{center}/{pid}/lung_seg.nii.gz',
        'lesion_mask': '/mnt/HDD_1/FDG/LungCancer_Subtyping/data/processed_yaobo/lesion_seg/{center}/{pid}/lesion_seg.nii.gz'
    }

    complete_extra_samples = []
    incomplete_extra_samples = []

    print(f"\n=== 检查额外样本的四模态完整性 ===")

    for sample in sorted(all_extra_samples):
        center, pid = sample.split('/')

        sample_files = {
            modality: os.path.exists(template.format(center=center, pid=pid))
            for modality, template in data_paths.items()
        }
        record = {'center': center, 'pid': pid, 'files': sample_files}

        if all(sample_files.values()):
            complete_extra_samples.append(record)
            print(f"✓ 完整: {sample} (四个模态都存在)")
        else:
            incomplete_extra_samples.append(record)
            missing = [mod for mod, exists in sample_files.items() if not exists]
            print(f"✗ 不完整: {sample} (缺少: {', '.join(missing)})")

    print(f"\n=== 额外样本完整性统计 ===")
    print(f"完整的额外样本: {len(complete_extra_samples)}")
    print(f"不完整的额外样本: {len(incomplete_extra_samples)}")

    # Look up the complete extra samples in the unfiltered metadata.
    print(f"\n=== 在原始metadata中查找完整的额外样本 ===")

    try:
        original_metadata = pd.read_csv("metadata/metadata.csv", dtype={'PID': str})
        print(f"原始metadata样本数: {len(original_metadata)}")

        found_in_original = []
        not_found_in_original = []

        for sample in complete_extra_samples:
            center = sample['center']
            pid = sample['pid']

            matches = original_metadata[
                (original_metadata['center'] == center) &
                (original_metadata['PID'] == pid)
            ]

            if len(matches) > 0:
                found_in_original.append({
                    'center': center,
                    'pid': pid,
                    'metadata_rows': matches
                })
                # Dump every column of each matching row for manual inspection.
                print(f"\n✓ 在原始metadata中找到: {center}/{pid}")
                print("对应的完整行信息:")
                for idx, row in matches.iterrows():
                    print(f"  行 {idx}:")
                    for col in matches.columns:
                        print(f"    {col}: {row[col]}")
                    print("  " + "-"*50)
            else:
                not_found_in_original.append({
                    'center': center,
                    'pid': pid
                })
                print(f"\n✗ 在原始metadata中未找到: {center}/{pid}")

        print(f"\n=== 原始metadata查找结果汇总 ===")
        print(f"在原始metadata中找到的完整额外样本: {len(found_in_original)}")
        print(f"在原始metadata中未找到的完整额外样本: {len(not_found_in_original)}")

        if found_in_original:
            print(f"\n找到的样本列表:")
            for item in found_in_original:
                print(f"  - {item['center']}/{item['pid']}")

        if not_found_in_original:
            print(f"\n未找到的样本列表:")
            for item in not_found_in_original:
                print(f"  - {item['center']}/{item['pid']}")

        return complete_extra_samples, found_in_original, not_found_in_original

    except FileNotFoundError:
        print("❌ 原始metadata文件不存在: metadata/metadata.csv")
        return complete_extra_samples, [], []
    except Exception as e:
        # Best-effort lookup: any other failure is reported, not raised.
        print(f"❌ 读取原始metadata时出错: {e}")
        return complete_extra_samples, [], []

# Run the completeness check on the extra samples found earlier.
complete_extra, found_in_meta, not_found_in_meta = check_extra_files_completeness()

=== 检查额外文件的完整性 ===
发现的额外样本总数: 20

=== 检查额外样本的四模态完整性 ===
✓ 完整: AKH_nifti_637/AHAMER-GERHARD20240213 (四个模态都存在)
✓ 完整: AKH_nifti_637/CZERNY-JOHANN20190801 (四个模态都存在)
✓ 完整: AKH_nifti_637/ERISTAVI-DAVID20091022 (四个模态都存在)
✓ 完整: AKH_nifti_637/GEBAUER-GABRIELE20100929 (四个模态都存在)
✓ 完整: AKH_nifti_637/GINDL-PFLANZ-DIANA20230814 (四个模态都存在)
✓ 完整: AKH_nifti_637/GRUJA-LINA20230323 (四个模态都存在)
✓ 完整: AKH_nifti_637/HAJRULA-CENGIZ20210223 (四个模态都存在)
✓ 完整: AKH_nifti_637/HOBUSCH-DIETHARD20100817 (四个模态都存在)
✓ 完整: AKH_nifti_637/HOFER-EMMERICH20150224 (四个模态都存在)
✓ 完整: AKH_nifti_637/PUTZ-THERESIA20230831 (四个模态都存在)
✓ 完整: AKH_nifti_637/REINER-ERNST20220504 (四个模态都存在)
✓ 完整: AKH_nifti_637/SAVIC-PERO20090924 (四个模态都存在)
✓ 完整: AKH_nifti_637/STARCH-JOHANN-DI20230810 (四个模态都存在)
✓ 完整: AKH_nifti_637/TORDAY-ILSE-MARIA20110308 (四个模态都存在)
✓ 完整: AKH_nifti_637/VASAROS-BRIGITTA20100315 (四个模态都存在)
✓ 完整: Neimeng_nifti_425/156 (四个模态都存在)
✓ 完整: Neimeng_nifti_425/257 (四个模态都存在)
✓ 完整: Neimeng_nifti_425/270 (四个模态都存在)
✓ 完整: Neimeng_nifti_425/320 (四个模

重采样后的文件相比 metadata_filtered.csv 中的 PID 多出 20 个，其中 AKH 多 15 个，Neimeng 多 5 个。经检查，这些样本在原始 metadata 中的 pathology 属于 ADC、SCC、SCLC 三个类别之外的亚型，故予以排除。因此后续以 metadata_filtered.csv 为核心目录处理数据。

## 检查组间的spacing是否一致，检查组内的shape是否一致。

In [6]:
# 检查每个PID的四个模态数据的shape和spacing一致性
import pandas as pd
import numpy as np
import nibabel as nib
from pathlib import Path
from collections import defaultdict

def check_shape_spacing_consistency():
    """
    Check, per sample, that the four volumes share one shape and that each has
    the target spacing.

    Samples come from ``metadata/metadata_filtered.csv``. Shape and spacing are
    read from the image object and header only, so voxel data is not loaded.
    A sample with a missing or unreadable file is reported and skipped for the
    shape/spacing comparison.

    Returns:
        ``(shape_inconsistent_samples, spacing_incorrect_samples,
        file_missing_samples)`` - three lists of dicts describing the problems.
    """
    print("=== 检查重采样数据的shape和spacing一致性 ===")

    metadata_df = pd.read_csv("metadata/metadata_filtered.csv", dtype={'PID': str})
    total = len(metadata_df)

    print(f"需要检查的样本数: {total}")

    TARGET_SPACING = np.array([4.07283, 4.07283, 3.0])
    print(f"目标spacing: {TARGET_SPACING}")

    # Path template per modality.
    data_paths = {
        'CT': '/mnt/HDD_1/FDG/LungCancer_Subtyping/data/processed_yaobo/resampled_nifti/{center}/{pid}/CT.nii.gz',
        'PET': '/mnt/HDD_1/FDG/LungCancer_Subtyping/data/processed_yaobo/resampled_nifti/{center}/{pid}/PET.nii.gz',
        'lung_mask': '/mnt/HDD_1/FDG/LungCancer_Subtyping/data/processed_yaobo/lung_seg/{center}/{pid}/lung_seg.nii.gz',
        'lesion_mask': '/mnt/HDD_1/FDG/LungCancer_Subtyping/data/processed_yaobo/lesion_seg/{center}/{pid}/lesion_seg.nii.gz'
    }

    shape_inconsistent_samples = []
    spacing_incorrect_samples = []
    file_missing_samples = []

    print(f"\n=== 开始逐个检查样本 ===")

    rows = zip(metadata_df['center'], metadata_df['PID'])
    for position, (center, pid) in enumerate(rows, start=1):
        print(f"检查 {position}/{total}: {center}/{pid}", end="")

        modality_data = {}
        sample_has_issues = False

        for modality, path_template in data_paths.items():
            file_path = path_template.format(center=center, pid=pid)

            if not os.path.exists(file_path):
                file_missing_samples.append({
                    'center': center,
                    'pid': pid,
                    'missing_modality': modality,
                    'file_path': file_path
                })
                print(f" - 缺少{modality}文件")
                sample_has_issues = True
                continue

            try:
                # The image object exposes the array shape without reading
                # voxel data, which avoids loading every full volume.
                nii_img = nib.load(file_path)
                modality_data[modality] = {
                    'shape': tuple(nii_img.shape),
                    'spacing': np.array(nii_img.header.get_zooms()),
                    'file_path': file_path
                }
            except Exception as e:
                print(f" - 读取{modality}文件错误: {e}")
                sample_has_issues = True

        # Only compare samples for which all four files were read.
        if sample_has_issues:
            continue

        # Spacing: a zoom vector of a different length (e.g. an extra axis)
        # is reported as an issue instead of breaking np.allclose.
        spacing_issues = [
            {
                'modality': modality,
                'actual_spacing': data['spacing'],
                'target_spacing': TARGET_SPACING
            }
            for modality, data in modality_data.items()
            if data['spacing'].shape != TARGET_SPACING.shape
            or not np.allclose(data['spacing'], TARGET_SPACING, atol=1e-3)
        ]

        if spacing_issues:
            spacing_incorrect_samples.append({
                'center': center,
                'pid': pid,
                'spacing_issues': spacing_issues
            })

        # Shape: all modalities of one sample must agree.
        shapes_differ = len({data['shape'] for data in modality_data.values()}) > 1
        if shapes_differ:
            shape_inconsistent_samples.append({
                'center': center,
                'pid': pid,
                'shape_info': [
                    {'modality': modality, 'shape': data['shape']}
                    for modality, data in modality_data.items()
                ]
            })

        if spacing_issues or shapes_differ:
            print(" - 有问题")
        else:
            print(" - OK")

    return shape_inconsistent_samples, spacing_incorrect_samples, file_missing_samples

# Run the shape/spacing check. Note: this rebinds `missing_files`, which an
# earlier cell had assigned from check_file_integrity().
shape_issues, spacing_issues, missing_files = check_shape_spacing_consistency()

=== 检查重采样数据的shape和spacing一致性 ===
需要检查的样本数: 1030
目标spacing: [4.07283 4.07283 3.     ]

=== 开始逐个检查样本 ===
检查 1/1030: AKH_nifti_637/ABDALLA-ADEL-AHMED20091023 - OK
检查 2/1030: AKH_nifti_637/ABT-BRIGITTE20160818 - OK
检查 3/1030: AKH_nifti_637/ADAMEK-KARIN20211124 - OK
检查 4/1030: AKH_nifti_637/AHMED-MOHAMED20230731 - OK
检查 5/1030: AKH_nifti_637/AHMEDI-NAZIF20230111 - OK
检查 6/1030: AKH_nifti_637/ALIMANY-RAMOS-MARIA20141007 - OK
检查 7/1030: AKH_nifti_637/ANDABAKA-MARIJAN20091229 - OK
检查 8/1030: AKH_nifti_637/ANSARI-MEHR-AZITA20141216 - OK
检查 9/1030: AKH_nifti_637/ANTIC-PRVOSLAV20150826 - OK
检查 10/1030: AKH_nifti_637/ANTIC-SLOBODAN20150908 - OK
检查 11/1030: AKH_nifti_637/ANVIEH-GOLPASHIN-DIANA20240108 - OK
检查 12/1030: AKH_nifti_637/ARNAUER-LEOPOLD20120820 - OK
检查 13/1030: AKH_nifti_637/ARSIC-DUSAN20120103 - OK
检查 14/1030: AKH_nifti_637/ARSLAN-SAMIL-DI20190715 - OK
检查 15/1030: AKH_nifti_637/ASCHENBRENNER-MARLENE20150430 - OK
检查 16/1030: AKH_nifti_637/AYDIN-ALI20220803 - OK
检查 17/1030: AKH_nifti_637/

In [7]:
# 生成详细的检查结果报告
def generate_consistency_report(shape_issues, spacing_issues, missing_files):
    """
    Print and save the results of the shape/spacing consistency check.

    Args:
        shape_issues: Dicts with ``center``, ``pid`` and ``shape_info``.
        spacing_issues: Dicts with ``center``, ``pid`` and ``spacing_issues``.
        missing_files: Dicts with ``center``, ``pid``, ``missing_modality``
            and ``file_path`` (one entry per missing file).

    Returns:
        A ``DataFrame`` with one row per sample of
        ``metadata/metadata_filtered.csv`` and the columns ``center``, ``PID``,
        ``status`` (``OK``/``ISSUES``) and ``issues``. It is also written to
        ``metadata/data_consistency_status.csv``; one CSV per non-empty
        problem list is written next to it.
    """
    def sample_keys(items):
        return {f"{item['center']}/{item['pid']}" for item in items}

    print("\n" + "="*60)
    print("             数据一致性检查结果汇总")
    print("="*60)

    # Read the metadata once; it supplies both the total and the row order.
    metadata_df = pd.read_csv("metadata/metadata_filtered.csv", dtype={'PID': str})
    total_samples = len(metadata_df)

    # One key per affected sample, so a sample missing several files counts once.
    missing_samples = sample_keys(missing_files)
    shape_problem_samples = sample_keys(shape_issues)
    spacing_problem_samples = sample_keys(spacing_issues)

    print(f"\n1. 总体统计:")
    print(f"   - 检查的总样本数: {total_samples}")
    print(f"   - 文件缺失问题: {len(missing_samples)} 个样本")
    print(f"   - Shape不一致问题: {len(shape_issues)} 个样本")
    print(f"   - Spacing不正确问题: {len(spacing_issues)} 个样本")

    problem_samples = missing_samples | shape_problem_samples | spacing_problem_samples
    successful_samples = total_samples - len(problem_samples)
    # Guard the ratio against an empty metadata table.
    success_rate = successful_samples / total_samples * 100 if total_samples else 0.0

    print(f"   - 完全正确的样本: {successful_samples} ({success_rate:.1f}%)")

    # Detail sections; each is printed only when it has content.
    if missing_files:
        print(f"\n2. 文件缺失问题详情:")
        missing_by_sample = defaultdict(list)
        for item in missing_files:
            key = f"{item['center']}/{item['pid']}"
            missing_by_sample[key].append(item['missing_modality'])

        for sample, missing_modalities in missing_by_sample.items():
            print(f"   - {sample}: 缺少 {', '.join(missing_modalities)}")

    if shape_issues:
        print(f"\n3. Shape不一致问题详情:")
        for item in shape_issues:
            sample = f"{item['center']}/{item['pid']}"
            print(f"   - {sample}:")
            for shape_info in item['shape_info']:
                print(f"     {shape_info['modality']}: {shape_info['shape']}")

    if spacing_issues:
        print(f"\n4. Spacing不正确问题详情:")
        for item in spacing_issues:
            sample = f"{item['center']}/{item['pid']}"
            print(f"   - {sample}:")
            for spacing_issue in item['spacing_issues']:
                modality = spacing_issue['modality']
                actual = spacing_issue['actual_spacing']
                target = spacing_issue['target_spacing']
                print(f"     {modality}: 实际 {actual} vs 目标 {target}")

    print(f"\n5. 生成详细报告文件:")

    # Shape report: one row per modality of each affected sample.
    if shape_issues:
        shape_report_data = [
            {
                'center': item['center'],
                'PID': item['pid'],
                'modality': shape_info['modality'],
                'shape': str(shape_info['shape']),
                'issue_type': 'shape_inconsistent'
            }
            for item in shape_issues
            for shape_info in item['shape_info']
        ]

        shape_df = pd.DataFrame(shape_report_data)
        shape_report_path = "metadata/shape_consistency_issues.csv"
        shape_df.to_csv(shape_report_path, index=False)
        print(f"   ✓ Shape不一致报告: {shape_report_path}")

    # Spacing report: one row per offending modality.
    if spacing_issues:
        spacing_report_data = [
            {
                'center': item['center'],
                'PID': item['pid'],
                'modality': spacing_issue['modality'],
                'actual_spacing': str(spacing_issue['actual_spacing']),
                'target_spacing': str(spacing_issue['target_spacing']),
                'issue_type': 'spacing_incorrect'
            }
            for item in spacing_issues
            for spacing_issue in item['spacing_issues']
        ]

        spacing_df = pd.DataFrame(spacing_report_data)
        spacing_report_path = "metadata/spacing_consistency_issues.csv"
        spacing_df.to_csv(spacing_report_path, index=False)
        print(f"   ✓ Spacing不正确报告: {spacing_report_path}")

    # Missing-file report: the input dicts are written as they are.
    if missing_files:
        missing_df = pd.DataFrame(missing_files)
        missing_report_path = "metadata/missing_files_issues.csv"
        missing_df.to_csv(missing_report_path, index=False)
        print(f"   ✓ 文件缺失报告: {missing_report_path}")

    # Combined status: one row per metadata sample, tagged with its issues.
    all_samples_status = []
    for center, pid in zip(metadata_df['center'], metadata_df['PID']):
        sample_key = f"{center}/{pid}"

        issues = []
        if sample_key in missing_samples:
            issues.append('missing_files')
        if sample_key in shape_problem_samples:
            issues.append('shape_inconsistent')
        if sample_key in spacing_problem_samples:
            issues.append('spacing_incorrect')

        all_samples_status.append({
            'center': center,
            'PID': pid,
            'status': 'OK' if not issues else 'ISSUES',
            'issues': ', '.join(issues) if issues else ''
        })

    # Explicit columns keep the frame and CSV well-formed with zero rows.
    status_df = pd.DataFrame(all_samples_status, columns=['center', 'PID', 'status', 'issues'])
    status_report_path = "metadata/data_consistency_status.csv"
    status_df.to_csv(status_report_path, index=False)
    print(f"   ✓ 综合状态报告: {status_report_path}")

    print(f"\n6. 总结:")
    if not shape_issues and not spacing_issues and not missing_files:
        print("   🎉 所有数据都通过一致性检查！")
    else:
        print("   ⚠️  发现数据一致性问题，请查看详细报告文件")

    return status_df

# Build the consistency report from the results of the previous cell.
consistency_status = generate_consistency_report(shape_issues, spacing_issues, missing_files)


             数据一致性检查结果汇总

1. 总体统计:
   - 检查的总样本数: 1030
   - 文件缺失问题: 0 个样本
   - Shape不一致问题: 0 个样本
   - Spacing不正确问题: 0 个样本
   - 完全正确的样本: 1030 (100.0%)

5. 生成详细报告文件:
   ✓ 综合状态报告: metadata/data_consistency_status.csv

6. 总结:
   🎉 所有数据都通过一致性检查！


In [8]:
# Verify the consistency-check results and show summary statistics
def verify_and_show_stats(
    *,
    centers=('AKH_nifti_637', 'Neimeng_nifti_425'),
    n_per_center=2,
    target_spacing=(4.07283, 4.07283, 3.0),
    data_root='/mnt/HDD_1/FDG/LungCancer_Subtyping/data/processed_yaobo',
):
    """
    Spot-check the consistency report against the files on disk.

    Reads ``metadata/data_consistency_status.csv``, prints the status
    distribution, then draws a reproducible random sample from each center
    and prints the shape and voxel spacing of every modality file found for
    those samples. Finally prints per-center counts and any non-OK rows.

    Args:
        centers: Center directory names to sample from.
        n_per_center: Maximum number of samples drawn per center. Fewer are
            drawn when the center has fewer rows in the status report.
        target_spacing: Expected voxel spacing, compared with ``atol=1e-3``.
        data_root: Directory containing the ``resampled_nifti``, ``lung_seg``
            and ``lesion_seg`` sub-trees.

    Returns:
        A list with one dict per inspected sample. Each dict holds ``center``,
        ``pid`` and, for every modality file that exists,
        ``<modality>_shape`` and ``<modality>_spacing``.
    """
    print("=== 验证检查结果 ===")

    # Load the status report; PID is forced to str so IDs keep their exact text.
    status_df = pd.read_csv("metadata/data_consistency_status.csv", dtype={'PID': str})

    print(f"状态报告中的样本数: {len(status_df)}")
    print("状态分布:")
    print(status_df['status'].value_counts())

    print("\n=== 随机抽样验证 ===")

    # Built once; as an ndarray so it prints the same way as before.
    target = np.asarray(target_spacing, dtype=float)
    sample_data = []

    for center in centers:
        center_rows = status_df[status_df['center'] == center]
        if center_rows.empty:
            # Nothing to sample; DataFrame.sample would otherwise fail below.
            print(f"\n中心 {center}: 状态报告中没有样本，跳过抽样")
            continue

        # Cap the sample size so small centers do not raise ValueError.
        n_draw = min(n_per_center, len(center_rows))
        center_samples = center_rows.sample(n_draw, random_state=42)

        for pid in center_samples['PID']:
            print(f"\n验证样本: {center}/{pid}")

            data_paths = {
                'CT': f'{data_root}/resampled_nifti/{center}/{pid}/CT.nii.gz',
                'PET': f'{data_root}/resampled_nifti/{center}/{pid}/PET.nii.gz',
                'lung_mask': f'{data_root}/lung_seg/{center}/{pid}/lung_seg.nii.gz',
                'lesion_mask': f'{data_root}/lesion_seg/{center}/{pid}/lesion_seg.nii.gz',
            }

            sample_info = {'center': center, 'pid': pid}
            # modality -> (shape, spacing), only for files that exist
            found = {}

            for modality, file_path in data_paths.items():
                if not os.path.exists(file_path):
                    print(f"  {modality}: 文件不存在")
                    continue

                nii_img = nib.load(file_path)
                # Take the shape from the image object instead of get_fdata(),
                # so the full voxel array is not loaded just to read its shape.
                shape = tuple(nii_img.shape)
                spacing = nii_img.header.get_zooms()

                print(f"  {modality}: shape={shape}, spacing={np.round(spacing, 5)}")
                found[modality] = (shape, spacing)

                sample_info[f'{modality}_shape'] = shape
                sample_info[f'{modality}_spacing'] = spacing

            sample_data.append(sample_info)

            if not found:
                # Without this guard the empty case would print an
                # "inconsistent" shape list and a vacuously "correct" spacing.
                print("  ✗ 未找到任何文件，无法验证shape和spacing")
                continue

            # Shape consistency across the modalities that were found
            shapes = [shape for shape, _ in found.values()]
            if len(set(shapes)) == 1:
                print(f"  ✓ Shape一致: {shapes[0]}")
            else:
                print(f"  ✗ Shape不一致: {shapes}")

            # Spacing must match the target for every modality found
            all_spacing_correct = all(
                np.allclose(spacing, target, atol=1e-3) for _, spacing in found.values()
            )

            if all_spacing_correct:
                print(f"  ✓ Spacing正确: 所有模态都为 {target}")
            else:
                print("  ✗ Spacing不正确")
                # Label by modality name: a positional index is ambiguous
                # whenever one of the files was missing.
                for modality, (_, spacing) in found.items():
                    print(f"    {modality}: {spacing}")

    # Overall statistics
    print("\n=== 整体数据统计 ===")

    # Sample counts per center
    center_stats = status_df.groupby('center').size()
    print("各中心样本数:")
    for center, count in center_stats.items():
        print(f"  {center}: {count} 个样本")

    # List problem samples, if any
    problem_samples = status_df[status_df['status'] != 'OK']
    if len(problem_samples) > 0:
        print("\n有问题的样本:")
        print(problem_samples)
    else:
        print(f"\n✓ 所有 {len(status_df)} 个样本都通过了一致性检查")

    return sample_data

# Run the verification and keep the per-sample details it returns
verification_data = verify_and_show_stats()

=== 验证检查结果 ===
状态报告中的样本数: 1030
状态分布:
status
OK    1030
Name: count, dtype: int64

=== 随机抽样验证 ===

验证样本: AKH_nifti_637/BRUKIC-MISA20210317
  CT: shape=(153, 153, 374), spacing=[4.07283 4.07283 3.     ]
  PET: shape=(153, 153, 374), spacing=[4.07283 4.07283 3.     ]
  lung_mask: shape=(153, 153, 374), spacing=[4.07283 4.07283 3.     ]
  lesion_mask: shape=(153, 153, 374), spacing=[4.07283 4.07283 3.     ]
  ✓ Shape一致: (153, 153, 374)
  ✓ Spacing正确: 所有模态都为 [4.07283 4.07283 3.     ]

验证样本: AKH_nifti_637/WEIDEMANN-MICHAEL20211019
  CT: shape=(175, 175, 323), spacing=[4.07283 4.07283 3.     ]
  PET: shape=(175, 175, 323), spacing=[4.07283 4.07283 3.     ]
  lung_mask: shape=(175, 175, 323), spacing=[4.07283 4.07283 3.     ]
  lesion_mask: shape=(175, 175, 323), spacing=[4.07283 4.07283 3.     ]
  ✓ Shape一致: (175, 175, 323)
  ✓ Spacing正确: 所有模态都为 [4.07283 4.07283 3.     ]

验证样本: Neimeng_nifti_425/288
  CT: shape=(123, 123, 290), spacing=[4.07283 4.07283 3.     ]
  PET: shape=(123, 123, 290), s