In [1]:
import pandas as pd
import numpy as np
import nibabel as nib
from pathlib import Path
from tqdm import tqdm
import json

# 读取筛选后的metadata
metadata_path = Path("metadata/metadata_filtered.csv")
filtered_metadata = pd.read_csv(metadata_path, dtype={'PID': str})

print(f"读取筛选后的metadata: {len(filtered_metadata)} 个样本")
print(f"包含的中心: {filtered_metadata['center'].value_counts().to_dict()}")

读取筛选后的metadata: 1030 个样本
包含的中心: {'AKH_nifti_637': 618, 'Neimeng_nifti_425': 412}


In [2]:
def get_nifti_fingerprint(file_path):
    """
    获取NIfTI文件的指纹信息
    
    Parameters:
    file_path: str or Path, NIfTI文件路径
    
    Returns:
    dict: 包含shape和spacing信息的字典
    """
    try:
        # 加载NIfTI文件
        nii_img = nib.load(str(file_path))
        
        # 获取图像数据和头信息
        img_data = nii_img.get_fdata()
        header = nii_img.header
        
        # 获取shape信息
        shape = img_data.shape
        
        # 获取spacing信息 (voxel size)
        spacing = header.get_zooms()
        
        return {
            'shape': list(shape),
            'spacing': list(spacing),
            'file_exists': True,
            'error': None
        }
    
    except Exception as e:
        return {
            'shape': None,
            'spacing': None, 
            'file_exists': False,
            'error': str(e)
        }

# 测试函数
print("测试指纹获取函数...")
test_sample = filtered_metadata.iloc[0]
test_pet_path = test_sample['pet_path']
test_ct_path = test_sample['ct_path']

print(f"测试样本: {test_sample['PID']}")
print(f"PET路径: {test_pet_path}")
print(f"CT路径: {test_ct_path}")

# 测试PET文件
pet_fingerprint = get_nifti_fingerprint(test_pet_path)
print(f"PET指纹: {pet_fingerprint}")

# 测试CT文件
ct_fingerprint = get_nifti_fingerprint(test_ct_path)
print(f"CT指纹: {ct_fingerprint}")

测试指纹获取函数...
测试样本: ABDALLA-ADEL-AHMED20091023
PET路径: /mnt/HDD_1/FDG/LungCancer_Subtyping/data/nifti/AKH_nifti_637/ABDALLA-ADEL-AHMED20091023/PET.nii.gz
CT路径: /mnt/HDD_1/FDG/LungCancer_Subtyping/data/nifti/AKH_nifti_637/ABDALLA-ADEL-AHMED20091023/CT.nii.gz
PET指纹: {'shape': [168, 168, 324], 'spacing': [np.float32(4.07283), np.float32(4.07283), np.float32(3.0)], 'file_exists': True, 'error': None}
CT指纹: {'shape': [512, 512, 324], 'spacing': [np.float32(1.3671875), np.float32(1.3671875), np.float32(3.0)], 'file_exists': True, 'error': None}


In [3]:
# 批量获取所有样本的指纹信息
print("开始批量获取所有样本的指纹信息...")

fingerprint_data = []

for idx, row in tqdm(filtered_metadata.iterrows(), total=len(filtered_metadata), desc="Processing samples"):
    center = row['center']
    pid = row['PID']
    pet_path = row['pet_path']
    ct_path = row['ct_path']
    pathology = row['Pathology']
    suv_ratio = row['SUV_ratio']
    
    # 获取PET指纹
    pet_fingerprint = get_nifti_fingerprint(pet_path)
    
    # 获取CT指纹
    ct_fingerprint = get_nifti_fingerprint(ct_path)
    
    # 组织数据
    sample_data = {
        'center': center,
        'PID': pid,
        'Pathology': pathology,
        'SUV_ratio': suv_ratio,
        
        # PET信息
        'pet_path': pet_path,
        'pet_shape': pet_fingerprint['shape'],
        'pet_spacing': pet_fingerprint['spacing'],
        'pet_file_exists': pet_fingerprint['file_exists'],
        'pet_error': pet_fingerprint['error'],
        
        # CT信息
        'ct_path': ct_path,
        'ct_shape': ct_fingerprint['shape'],
        'ct_spacing': ct_fingerprint['spacing'],
        'ct_file_exists': ct_fingerprint['file_exists'],
        'ct_error': ct_fingerprint['error']
    }
    
    fingerprint_data.append(sample_data)

# 转换为DataFrame
fingerprint_df = pd.DataFrame(fingerprint_data)

print(f"\n指纹信息收集完成!")
print(f"成功处理的样本数: {len(fingerprint_df)}")
print(f"PET文件正常的样本: {fingerprint_df['pet_file_exists'].sum()}")
print(f"CT文件正常的样本: {fingerprint_df['ct_file_exists'].sum()}")

# 保存指纹信息
fingerprint_output_path = Path("metadata/data_fingerprints.csv")
fingerprint_df.to_csv(fingerprint_output_path, index=False)
print(f"指纹信息已保存到: {fingerprint_output_path}")

开始批量获取所有样本的指纹信息...


Processing samples: 100%|██████████| 1030/1030 [29:10<00:00,  1.70s/it]


指纹信息收集完成!
成功处理的样本数: 1030
PET文件正常的样本: 1030
CT文件正常的样本: 1030
指纹信息已保存到: metadata/data_fingerprints.csv





In [4]:
# 指纹信息统计分析
print("=== 指纹信息统计分析 ===")

# 读取保存的指纹数据
fingerprint_df = pd.read_csv("metadata/data_fingerprints.csv", dtype={'PID': str})

# 1. 文件存在性检查
print(f"\n1. 文件完整性统计:")
print(f"   PET文件正常: {fingerprint_df['pet_file_exists'].sum()}/{len(fingerprint_df)}")
print(f"   CT文件正常: {fingerprint_df['ct_file_exists'].sum()}/{len(fingerprint_df)}")

# 检查是否有错误
pet_errors = fingerprint_df[fingerprint_df['pet_file_exists'] == False]
ct_errors = fingerprint_df[fingerprint_df['ct_file_exists'] == False]

if len(pet_errors) > 0:
    print(f"   PET文件错误数: {len(pet_errors)}")
    print(f"   错误样本: {pet_errors[['center', 'PID']].values.tolist()}")

if len(ct_errors) > 0:
    print(f"   CT文件错误数: {len(ct_errors)}")
    print(f"   错误样本: {ct_errors[['center', 'PID']].values.tolist()}")

# 2. Shape统计分析
print(f"\n2. Shape统计分析:")

# 处理shape信息 (将字符串转换为实际的list)
def parse_shape_spacing(df, column_name):
    """解析shape或spacing字符串为实际的数值列表"""
    import ast
    parsed_data = []
    for idx, row in df.iterrows():
        try:
            if pd.notna(row[column_name]) and row[column_name] != 'None':
                parsed = ast.literal_eval(row[column_name])
                parsed_data.append(parsed)
            else:
                parsed_data.append(None)
        except:
            parsed_data.append(None)
    return parsed_data

# 解析shape和spacing数据
fingerprint_df['pet_shape_parsed'] = parse_shape_spacing(fingerprint_df, 'pet_shape')
fingerprint_df['pet_spacing_parsed'] = parse_shape_spacing(fingerprint_df, 'pet_spacing')
fingerprint_df['ct_shape_parsed'] = parse_shape_spacing(fingerprint_df, 'ct_shape')
fingerprint_df['ct_spacing_parsed'] = parse_shape_spacing(fingerprint_df, 'ct_spacing')

# 统计有效数据
valid_data = fingerprint_df[
    (fingerprint_df['pet_file_exists'] == True) & 
    (fingerprint_df['ct_file_exists'] == True)
].copy()

print(f"   有效数据样本数: {len(valid_data)}")

# PET Shape统计
pet_shapes = [shape for shape in valid_data['pet_shape_parsed'] if shape is not None]
if pet_shapes:
    unique_pet_shapes = list(set([tuple(shape) for shape in pet_shapes]))
    print(f"\n   PET Shape统计 ({len(pet_shapes)}个有效样本):")
    for shape in unique_pet_shapes:
        count = sum(1 for s in pet_shapes if tuple(s) == shape)
        print(f"     {shape}: {count} 个样本")

# CT Shape统计
ct_shapes = [shape for shape in valid_data['ct_shape_parsed'] if shape is not None]
if ct_shapes:
    unique_ct_shapes = list(set([tuple(shape) for shape in ct_shapes]))
    print(f"\n   CT Shape统计 ({len(ct_shapes)}个有效样本):")
    for shape in unique_ct_shapes:
        count = sum(1 for s in ct_shapes if tuple(s) == shape)
        print(f"     {shape}: {count} 个样本")

# 3. Spacing统计分析
print(f"\n3. Spacing统计分析:")

# PET Spacing统计
pet_spacings = [spacing for spacing in valid_data['pet_spacing_parsed'] if spacing is not None]
if pet_spacings:
    unique_pet_spacings = list(set([tuple(spacing) for spacing in pet_spacings]))
    print(f"\n   PET Spacing统计 ({len(pet_spacings)}个有效样本):")
    for spacing in unique_pet_spacings:
        count = sum(1 for s in pet_spacings if tuple(s) == spacing)
        print(f"     {spacing}: {count} 个样本")

# CT Spacing统计
ct_spacings = [spacing for spacing in valid_data['ct_spacing_parsed'] if spacing is not None]
if ct_spacings:
    unique_ct_spacings = list(set([tuple(spacing) for spacing in ct_spacings]))
    print(f"\n   CT Spacing统计 ({len(ct_spacings)}个有效样本):")
    for spacing in unique_ct_spacings:
        count = sum(1 for s in ct_spacings if tuple(s) == spacing)
        print(f"     {spacing}: {count} 个样本")

print(f"\n=== 指纹分析完成 ===")
print(f"详细指纹信息已保存在: metadata/data_fingerprints.csv")

=== 指纹信息统计分析 ===

1. 文件完整性统计:
   PET文件正常: 1030/1030
   CT文件正常: 1030/1030

2. Shape统计分析:
   有效数据样本数: 1030

   PET Shape统计 (1030个有效样本):
     (220, 220, 531): 1 个样本
     (200, 200, 187): 14 个样本
     (150, 150, 345): 1 个样本
     (200, 200, 132): 1 个样本
     (220, 220, 421): 1 个样本
     (200, 200, 196): 13 个样本
     (220, 220, 485): 4 个样本
     (150, 150, 354): 6 个样本
     (200, 200, 205): 1 个样本
     (200, 200, 162): 3 个样本
     (220, 220, 451): 2 个样本
     (168, 168, 374): 15 个样本
     (220, 220, 515): 2 个样本
     (200, 200, 171): 2 个样本
     (200, 200, 180): 4 个样本
     (220, 220, 469): 2 个样本
     (220, 220, 533): 1 个样本
     (200, 200, 189): 9 个样本
     (200, 200, 70): 1 个样本
     (200, 200, 198): 7 个样本
     (220, 220, 487): 1 个样本
     (200, 200, 155): 1 个样本
     (220, 220, 444): 2 个样本
     (168, 168, 486): 1 个样本
     (200, 200, 164): 1 个样本
     (220, 220, 517): 1 个样本
     (200, 200, 54): 1 个样本
     (168, 168, 321): 1 个样本
     (200, 200, 173): 2 个样本
     (150, 150, 386): 1 个样本
     (220, 220, 526): 1 个

In [5]:
# 简化的指纹信息汇总
print("=== 数据指纹汇总 ===")

# 读取指纹数据
fingerprint_df = pd.read_csv("metadata/data_fingerprints.csv", dtype={'PID': str})

# 基本统计
print(f"\n总样本数: {len(fingerprint_df)}")
print(f"PET文件完整: {fingerprint_df['pet_file_exists'].sum()}")
print(f"CT文件完整: {fingerprint_df['ct_file_exists'].sum()}")

# 解析shape和spacing
import ast

def safe_eval(x):
    try:
        if pd.notna(x) and x != 'None':
            return ast.literal_eval(x)
        return None
    except:
        return None

# 获取有效数据的shape和spacing
valid_data = fingerprint_df[
    (fingerprint_df['pet_file_exists'] == True) & 
    (fingerprint_df['ct_file_exists'] == True)
].copy()

print(f"\n有效数据样本: {len(valid_data)}")

# 统计最常见的shape和spacing
pet_shapes = [safe_eval(x) for x in valid_data['pet_shape']]
pet_shapes = [tuple(x) for x in pet_shapes if x is not None]

ct_shapes = [safe_eval(x) for x in valid_data['ct_shape']]
ct_shapes = [tuple(x) for x in ct_shapes if x is not None]

pet_spacings = [safe_eval(x) for x in valid_data['pet_spacing']]
pet_spacings = [tuple(np.round(x, 4)) for x in pet_spacings if x is not None]

ct_spacings = [safe_eval(x) for x in valid_data['ct_spacing']]
ct_spacings = [tuple(np.round(x, 4)) for x in ct_spacings if x is not None]

print(f"\n=== PET数据统计 ===")
print(f"不同shape数量: {len(set(pet_shapes))}")
print(f"最常见的PET shape:")
from collections import Counter
pet_shape_counts = Counter(pet_shapes)
for shape, count in pet_shape_counts.most_common(5):
    print(f"  {shape}: {count} 个样本")

print(f"\n不同spacing数量: {len(set(pet_spacings))}")
print(f"最常见的PET spacing:")
pet_spacing_counts = Counter(pet_spacings)
for spacing, count in pet_spacing_counts.most_common(5):
    print(f"  {spacing}: {count} 个样本")

print(f"\n=== CT数据统计 ===")
print(f"不同shape数量: {len(set(ct_shapes))}")
print(f"最常见的CT shape:")
ct_shape_counts = Counter(ct_shapes)
for shape, count in ct_shape_counts.most_common(5):
    print(f"  {shape}: {count} 个样本")

print(f"\n不同spacing数量: {len(set(ct_spacings))}")
print(f"最常见的CT spacing:")
ct_spacing_counts = Counter(ct_spacings)
for spacing, count in ct_spacing_counts.most_common(5):
    print(f"  {spacing}: {count} 个样本")

# 按中心统计
print(f"\n=== 各中心数据分布 ===")
for center in ['AKH_nifti_637', 'Neimeng_nifti_425']:
    center_data = valid_data[valid_data['center'] == center]
    print(f"\n{center} ({len(center_data)} 样本):")
    
    # 该中心的PET shape分布
    center_pet_shapes = [safe_eval(x) for x in center_data['pet_shape']]
    center_pet_shapes = [tuple(x) for x in center_pet_shapes if x is not None]
    center_pet_shape_counts = Counter(center_pet_shapes)
    
    print(f"  主要PET shape:")
    for shape, count in center_pet_shape_counts.most_common(3):
        print(f"    {shape}: {count} 个")
    
    # 该中心的CT shape分布
    center_ct_shapes = [safe_eval(x) for x in center_data['ct_shape']]
    center_ct_shapes = [tuple(x) for x in center_ct_shapes if x is not None]
    center_ct_shape_counts = Counter(center_ct_shapes)
    
    print(f"  主要CT shape:")
    for shape, count in center_ct_shape_counts.most_common(3):
        print(f"    {shape}: {count} 个")

print(f"\n=== 指纹分析完成 ===")
print(f"详细数据已保存在: metadata/data_fingerprints.csv")

=== 数据指纹汇总 ===

总样本数: 1030
PET文件完整: 1030
CT文件完整: 1030

有效数据样本: 1030

=== PET数据统计 ===
不同shape数量: 148
最常见的PET shape:
  (168, 168, 324): 382 个样本
  (440, 440, 531): 83 个样本
  (168, 168, 274): 61 个样本
  (200, 200, 186): 36 个样本
  (150, 150, 348): 23 个样本

不同spacing数量: 0
最常见的PET spacing:

=== CT数据统计 ===
不同shape数量: 161
最常见的CT shape:
  (512, 512, 324): 381 个样本
  (512, 512, 531): 84 个样本
  (512, 512, 274): 61 个样本
  (512, 512, 232): 34 个样本
  (512, 512, 619): 23 个样本

不同spacing数量: 0
最常见的CT spacing:

=== 各中心数据分布 ===

AKH_nifti_637 (618 样本):
  主要PET shape:
    (168, 168, 324): 382 个
    (440, 440, 531): 83 个
    (168, 168, 274): 61 个
  主要CT shape:
    (512, 512, 324): 381 个
    (512, 512, 531): 84 个
    (512, 512, 274): 61 个

Neimeng_nifti_425 (412 样本):
  主要PET shape:
    (200, 200, 186): 36 个
    (150, 150, 348): 23 个
    (200, 200, 192): 16 个
  主要CT shape:
    (512, 512, 232): 34 个
    (512, 512, 619): 23 个
    (512, 512, 242): 12 个

=== 指纹分析完成 ===
详细数据已保存在: metadata/data_fingerprints.csv


In [6]:
# 修复spacing信息显示
print("=== 修复后的Spacing统计 ===")

def safe_eval_spacing(x):
    try:
        if pd.notna(x) and x != 'None':
            parsed = ast.literal_eval(x)
            # 处理numpy数组
            if isinstance(parsed, list):
                return [float(val) if not isinstance(val, str) else float(val.replace('np.float32(', '').replace(')', '')) for val in parsed]
            return parsed
        return None
    except Exception as e:
        return None

# 重新解析spacing数据
pet_spacings_fixed = []
ct_spacings_fixed = []

for idx, row in valid_data.iterrows():
    pet_spacing = safe_eval_spacing(row['pet_spacing'])
    ct_spacing = safe_eval_spacing(row['ct_spacing'])
    
    if pet_spacing:
        pet_spacings_fixed.append(tuple(np.round(pet_spacing, 3)))
    if ct_spacing:
        ct_spacings_fixed.append(tuple(np.round(ct_spacing, 3)))

print(f"\nPET Spacing统计 ({len(pet_spacings_fixed)} 个有效样本):")
pet_spacing_counts = Counter(pet_spacings_fixed)
for spacing, count in pet_spacing_counts.most_common(10):
    print(f"  {spacing}: {count} 个样本")

print(f"\nCT Spacing统计 ({len(ct_spacings_fixed)} 个有效样本):")
ct_spacing_counts = Counter(ct_spacings_fixed)
for spacing, count in ct_spacing_counts.most_common(10):
    print(f"  {spacing}: {count} 个样本")

# 按中心查看spacing分布
print(f"\n=== 各中心Spacing分布 ===")
for center in ['AKH_nifti_637', 'Neimeng_nifti_425']:
    center_data = valid_data[valid_data['center'] == center]
    print(f"\n{center}:")
    
    center_pet_spacings = []
    center_ct_spacings = []
    
    for idx, row in center_data.iterrows():
        pet_spacing = safe_eval_spacing(row['pet_spacing'])
        ct_spacing = safe_eval_spacing(row['ct_spacing'])
        
        if pet_spacing:
            center_pet_spacings.append(tuple(np.round(pet_spacing, 3)))
        if ct_spacing:
            center_ct_spacings.append(tuple(np.round(ct_spacing, 3)))
    
    print(f"  主要PET spacing:")
    center_pet_spacing_counts = Counter(center_pet_spacings)
    for spacing, count in center_pet_spacing_counts.most_common(3):
        print(f"    {spacing}: {count} 个")
    
    print(f"  主要CT spacing:")
    center_ct_spacing_counts = Counter(center_ct_spacings)
    for spacing, count in center_ct_spacing_counts.most_common(3):
        print(f"    {spacing}: {count} 个")

print(f"\n=== 数据指纹分析完成 ===")
print(f"Shape和Spacing变异很大，需要在预处理时统一标准化")

=== 修复后的Spacing统计 ===

PET Spacing统计 (0 个有效样本):

CT Spacing统计 (0 个有效样本):

=== 各中心Spacing分布 ===

AKH_nifti_637:
  主要PET spacing:
  主要CT spacing:

Neimeng_nifti_425:
  主要PET spacing:
  主要CT spacing:

=== 数据指纹分析完成 ===
Shape和Spacing变异很大，需要在预处理时统一标准化


In [7]:
# 检查原始spacing数据格式
print("=== 检查原始spacing数据格式 ===")

# 查看前几行的spacing数据
sample_data = fingerprint_df.head(3)
for idx, row in sample_data.iterrows():
    print(f"\n样本 {row['PID']}:")
    print(f"  PET spacing原始: {repr(row['pet_spacing'])}")
    print(f"  CT spacing原始: {repr(row['ct_spacing'])}")
    
    # 尝试不同的解析方法
    try:
        pet_spacing_str = str(row['pet_spacing'])
        # 移除np.float32包装
        import re
        pet_spacing_clean = re.sub(r'np\.float32\((.*?)\)', r'\1', pet_spacing_str)
        pet_spacing_parsed = ast.literal_eval(pet_spacing_clean)
        print(f"  PET spacing解析后: {pet_spacing_parsed}")
        
        ct_spacing_str = str(row['ct_spacing'])
        ct_spacing_clean = re.sub(r'np\.float32\((.*?)\)', r'\1', ct_spacing_str)
        ct_spacing_parsed = ast.literal_eval(ct_spacing_clean)
        print(f"  CT spacing解析后: {ct_spacing_parsed}")
        
    except Exception as e:
        print(f"  解析错误: {e}")

# 重新正确解析所有spacing数据
print(f"\n=== 重新解析Spacing数据 ===")

def parse_spacing_correctly(spacing_str):
    try:
        if pd.isna(spacing_str) or spacing_str == 'None':
            return None
        
        # 移除np.float32包装
        import re
        spacing_clean = re.sub(r'np\.float32\((.*?)\)', r'\1', str(spacing_str))
        spacing_parsed = ast.literal_eval(spacing_clean)
        
        # 转换为浮点数并四舍五入
        if isinstance(spacing_parsed, (list, tuple)):
            return tuple(round(float(x), 3) for x in spacing_parsed)
        return None
        
    except Exception as e:
        print(f"解析错误 {spacing_str}: {e}")
        return None

# 解析所有数据
pet_spacings_correct = []
ct_spacings_correct = []

for idx, row in fingerprint_df.iterrows():
    pet_spacing = parse_spacing_correctly(row['pet_spacing'])
    ct_spacing = parse_spacing_correctly(row['ct_spacing'])
    
    if pet_spacing:
        pet_spacings_correct.append(pet_spacing)
    if ct_spacing:
        ct_spacings_correct.append(ct_spacing)

print(f"\nPET Spacing统计 ({len(pet_spacings_correct)} 个有效样本):")
pet_spacing_counts = Counter(pet_spacings_correct)
for spacing, count in pet_spacing_counts.most_common(10):
    print(f"  {spacing}: {count} 个样本")

print(f"\nCT Spacing统计 ({len(ct_spacings_correct)} 个有效样本):")
ct_spacing_counts = Counter(ct_spacings_correct)
for spacing, count in ct_spacing_counts.most_common(10):
    print(f"  {spacing}: {count} 个样本")

print(f"\n=== Spacing范围统计 ===")
if pet_spacings_correct:
    pet_spacings_array = np.array(pet_spacings_correct)
    print(f"PET Spacing范围:")
    print(f"  X轴: {pet_spacings_array[:, 0].min():.3f} - {pet_spacings_array[:, 0].max():.3f}")
    print(f"  Y轴: {pet_spacings_array[:, 1].min():.3f} - {pet_spacings_array[:, 1].max():.3f}")
    print(f"  Z轴: {pet_spacings_array[:, 2].min():.3f} - {pet_spacings_array[:, 2].max():.3f}")

if ct_spacings_correct:
    ct_spacings_array = np.array(ct_spacings_correct)
    print(f"CT Spacing范围:")
    print(f"  X轴: {ct_spacings_array[:, 0].min():.3f} - {ct_spacings_array[:, 0].max():.3f}")
    print(f"  Y轴: {ct_spacings_array[:, 1].min():.3f} - {ct_spacings_array[:, 1].max():.3f}")
    print(f"  Z轴: {ct_spacings_array[:, 2].min():.3f} - {ct_spacings_array[:, 2].max():.3f}")

=== 检查原始spacing数据格式 ===

样本 ABDALLA-ADEL-AHMED20091023:
  PET spacing原始: '[np.float32(4.07283), np.float32(4.07283), np.float32(3.0)]'
  CT spacing原始: '[np.float32(1.3671875), np.float32(1.3671875), np.float32(3.0)]'
  PET spacing解析后: [4.07283, 4.07283, 3.0]
  CT spacing解析后: [1.3671875, 1.3671875, 3.0]

样本 ABT-BRIGITTE20160818:
  PET spacing原始: '[np.float32(4.07283), np.float32(4.07283), np.float32(3.0)]'
  CT spacing原始: '[np.float32(1.3671875), np.float32(1.3671875), np.float32(3.0)]'
  PET spacing解析后: [4.07283, 4.07283, 3.0]
  CT spacing解析后: [1.3671875, 1.3671875, 3.0]

样本 ADAMEK-KARIN20211124:
  PET spacing原始: '[np.float32(3.3), np.float32(3.3), np.float32(2.0)]'
  CT spacing原始: '[np.float32(1.5234375), np.float32(1.5234375), np.float32(2.0)]'
  PET spacing解析后: [3.3, 3.3, 2.0]
  CT spacing解析后: [1.5234375, 1.5234375, 2.0]

=== 重新解析Spacing数据 ===

PET Spacing统计 (1030 个有效样本):
  (4.073, 4.073, 3.0): 483 个样本
  (4.073, 4.073, 5.0): 329 个样本
  (1.65, 1.65, 2.0): 83 个样本
  (4.0, 4.0, 2.68): 83

## 数据指纹分析总结

### 数据完整性
- **总样本数**: 1030个
- **PET文件完整性**: 100% (1030/1030)
- **CT文件完整性**: 100% (1030/1030)

### Shape分布特点
**PET数据**:
- 不同shape数量: 148种
- 主要分布: 
  - AKH中心: (168, 168, 324) 占主导
  - Neimeng中心: 多样化的shape分布

**CT数据**:
- 不同shape数量: 161种
- 主要特点: X、Y维度多为512×512，Z维度变化较大

### Spacing分布特点
- **PET Spacing**: 变化范围较大，需要重采样标准化
- **CT Spacing**: 空间分辨率变化显著，需要统一处理

### 预处理建议
1. **重采样**: 统一spacing到标准分辨率
2. **裁剪/填充**: 统一shape尺寸
3. **配准**: 确保PET和CT在同一空间对齐
4. **标准化**: 强度值归一化

### 输出文件
- **详细指纹数据**: `metadata/data_fingerprints.csv`
- **包含字段**: shape, spacing, 文件路径, 病理信息等

In [8]:
# 计算各中心和全部数据的spacing中位数
print("=== Spacing中位数统计 ===")

# 读取指纹数据
fingerprint_df = pd.read_csv("metadata/data_fingerprints.csv", dtype={'PID': str})

# 解析spacing的函数
def parse_spacing_correctly(spacing_str):
    try:
        if pd.isna(spacing_str) or spacing_str == 'None':
            return None
        
        # 移除np.float32包装
        import re
        spacing_clean = re.sub(r'np\.float32\((.*?)\)', r'\1', str(spacing_str))
        spacing_parsed = ast.literal_eval(spacing_clean)
        
        # 转换为浮点数
        if isinstance(spacing_parsed, (list, tuple)):
            return [float(x) for x in spacing_parsed]
        return None
        
    except Exception as e:
        return None

# 解析所有spacing数据
all_pet_spacings = []
all_ct_spacings = []

akh_pet_spacings = []
akh_ct_spacings = []

neimeng_pet_spacings = []
neimeng_ct_spacings = []

for idx, row in fingerprint_df.iterrows():
    if row['pet_file_exists'] and row['ct_file_exists']:
        pet_spacing = parse_spacing_correctly(row['pet_spacing'])
        ct_spacing = parse_spacing_correctly(row['ct_spacing'])
        
        if pet_spacing:
            all_pet_spacings.append(pet_spacing)
            if row['center'] == 'AKH_nifti_637':
                akh_pet_spacings.append(pet_spacing)
            elif row['center'] == 'Neimeng_nifti_425':
                neimeng_pet_spacings.append(pet_spacing)
        
        if ct_spacing:
            all_ct_spacings.append(ct_spacing)
            if row['center'] == 'AKH_nifti_637':
                akh_ct_spacings.append(ct_spacing)
            elif row['center'] == 'Neimeng_nifti_425':
                neimeng_ct_spacings.append(ct_spacing)

# 转换为numpy数组以便计算中位数
all_pet_spacings_array = np.array(all_pet_spacings)
all_ct_spacings_array = np.array(all_ct_spacings)

akh_pet_spacings_array = np.array(akh_pet_spacings)
akh_ct_spacings_array = np.array(akh_ct_spacings)

neimeng_pet_spacings_array = np.array(neimeng_pet_spacings)
neimeng_ct_spacings_array = np.array(neimeng_ct_spacings)

print(f"\n=== 全部数据 (总计: {len(all_pet_spacings)} 样本) ===")
print(f"PET Spacing中位数:")
print(f"  X轴: {np.median(all_pet_spacings_array[:, 0]):.3f} mm")
print(f"  Y轴: {np.median(all_pet_spacings_array[:, 1]):.3f} mm")
print(f"  Z轴: {np.median(all_pet_spacings_array[:, 2]):.3f} mm")

print(f"\nCT Spacing中位数:")
print(f"  X轴: {np.median(all_ct_spacings_array[:, 0]):.3f} mm")
print(f"  Y轴: {np.median(all_ct_spacings_array[:, 1]):.3f} mm")
print(f"  Z轴: {np.median(all_ct_spacings_array[:, 2]):.3f} mm")

print(f"\n=== AKH_nifti_637 中心 ({len(akh_pet_spacings)} 样本) ===")
print(f"PET Spacing中位数:")
print(f"  X轴: {np.median(akh_pet_spacings_array[:, 0]):.3f} mm")
print(f"  Y轴: {np.median(akh_pet_spacings_array[:, 1]):.3f} mm")
print(f"  Z轴: {np.median(akh_pet_spacings_array[:, 2]):.3f} mm")

print(f"\nCT Spacing中位数:")
print(f"  X轴: {np.median(akh_ct_spacings_array[:, 0]):.3f} mm")
print(f"  Y轴: {np.median(akh_ct_spacings_array[:, 1]):.3f} mm")
print(f"  Z轴: {np.median(akh_ct_spacings_array[:, 2]):.3f} mm")

print(f"\n=== Neimeng_nifti_425 中心 ({len(neimeng_pet_spacings)} 样本) ===")
print(f"PET Spacing中位数:")
print(f"  X轴: {np.median(neimeng_pet_spacings_array[:, 0]):.3f} mm")
print(f"  Y轴: {np.median(neimeng_pet_spacings_array[:, 1]):.3f} mm")
print(f"  Z轴: {np.median(neimeng_pet_spacings_array[:, 2]):.3f} mm")

print(f"\nCT Spacing中位数:")
print(f"  X轴: {np.median(neimeng_ct_spacings_array[:, 0]):.3f} mm")
print(f"  Y轴: {np.median(neimeng_ct_spacings_array[:, 1]):.3f} mm")
print(f"  Z轴: {np.median(neimeng_ct_spacings_array[:, 2]):.3f} mm")

# 创建汇总表格
print(f"\n=== Spacing中位数汇总表 ===")
print(f"{'数据源':<20} {'模态':<5} {'X轴(mm)':<10} {'Y轴(mm)':<10} {'Z轴(mm)':<10}")
print("-" * 60)

# 全部数据
print(f"{'全部数据':<20} {'PET':<5} {np.median(all_pet_spacings_array[:, 0]):<10.3f} {np.median(all_pet_spacings_array[:, 1]):<10.3f} {np.median(all_pet_spacings_array[:, 2]):<10.3f}")
print(f"{'全部数据':<20} {'CT':<5} {np.median(all_ct_spacings_array[:, 0]):<10.3f} {np.median(all_ct_spacings_array[:, 1]):<10.3f} {np.median(all_ct_spacings_array[:, 2]):<10.3f}")

# AKH中心
print(f"{'AKH_nifti_637':<20} {'PET':<5} {np.median(akh_pet_spacings_array[:, 0]):<10.3f} {np.median(akh_pet_spacings_array[:, 1]):<10.3f} {np.median(akh_pet_spacings_array[:, 2]):<10.3f}")
print(f"{'AKH_nifti_637':<20} {'CT':<5} {np.median(akh_ct_spacings_array[:, 0]):<10.3f} {np.median(akh_ct_spacings_array[:, 1]):<10.3f} {np.median(akh_ct_spacings_array[:, 2]):<10.3f}")

# Neimeng中心
print(f"{'Neimeng_nifti_425':<20} {'PET':<5} {np.median(neimeng_pet_spacings_array[:, 0]):<10.3f} {np.median(neimeng_pet_spacings_array[:, 1]):<10.3f} {np.median(neimeng_pet_spacings_array[:, 2]):<10.3f}")
print(f"{'Neimeng_nifti_425':<20} {'CT':<5} {np.median(neimeng_ct_spacings_array[:, 0]):<10.3f} {np.median(neimeng_ct_spacings_array[:, 1]):<10.3f} {np.median(neimeng_ct_spacings_array[:, 2]):<10.3f}")

print(f"\n=== 建议的标准化spacing ===")
print(f"基于中位数，建议的标准spacing:")
print(f"  PET: ({np.median(all_pet_spacings_array[:, 0]):.1f}, {np.median(all_pet_spacings_array[:, 1]):.1f}, {np.median(all_pet_spacings_array[:, 2]):.1f}) mm")
print(f"  CT:  ({np.median(all_ct_spacings_array[:, 0]):.1f}, {np.median(all_ct_spacings_array[:, 1]):.1f}, {np.median(all_ct_spacings_array[:, 2]):.1f}) mm")
print(f"\n或者使用常用的标准spacing:")
print(f"  PET: (4.0, 4.0, 3.0) mm")
print(f"  CT:  (1.5, 1.5, 3.0) mm")

=== Spacing中位数统计 ===

=== 全部数据 (总计: 1030 样本) ===
PET Spacing中位数:
  X轴: 4.073 mm
  Y轴: 4.073 mm
  Z轴: 3.000 mm

CT Spacing中位数:
  X轴: 1.367 mm
  Y轴: 1.367 mm
  Z轴: 3.000 mm

=== AKH_nifti_637 中心 (618 样本) ===
PET Spacing中位数:
  X轴: 4.073 mm
  Y轴: 4.073 mm
  Z轴: 3.000 mm

CT Spacing中位数:
  X轴: 1.367 mm
  Y轴: 1.367 mm
  Z轴: 3.000 mm

=== Neimeng_nifti_425 中心 (412 样本) ===
PET Spacing中位数:
  X轴: 4.073 mm
  Y轴: 4.073 mm
  Z轴: 5.000 mm

CT Spacing中位数:
  X轴: 0.979 mm
  Y轴: 0.979 mm
  Z轴: 4.000 mm

=== Spacing中位数汇总表 ===
数据源                  模态    X轴(mm)     Y轴(mm)     Z轴(mm)    
------------------------------------------------------------
全部数据                 PET   4.073      4.073      3.000     
全部数据                 CT    1.367      1.367      3.000     
AKH_nifti_637        PET   4.073      4.073      3.000     
AKH_nifti_637        CT    1.367      1.367      3.000     
Neimeng_nifti_425    PET   4.073      4.073      5.000     
Neimeng_nifti_425    CT    0.979      0.979      4.000     

=== 建议

In [9]:
# 统计各中心在spacing中位数下的样本数量
print("=== 各中心Spacing中位数对应的样本数量统计 ===")

# 计算各中心的spacing中位数
akh_pet_median = [np.median(akh_pet_spacings_array[:, 0]), 
                  np.median(akh_pet_spacings_array[:, 1]), 
                  np.median(akh_pet_spacings_array[:, 2])]

akh_ct_median = [np.median(akh_ct_spacings_array[:, 0]), 
                 np.median(akh_ct_spacings_array[:, 1]), 
                 np.median(akh_ct_spacings_array[:, 2])]

neimeng_pet_median = [np.median(neimeng_pet_spacings_array[:, 0]), 
                      np.median(neimeng_pet_spacings_array[:, 1]), 
                      np.median(neimeng_pet_spacings_array[:, 2])]

neimeng_ct_median = [np.median(neimeng_ct_spacings_array[:, 0]), 
                     np.median(neimeng_ct_spacings_array[:, 1]), 
                     np.median(neimeng_ct_spacings_array[:, 2])]

print(f"\n=== AKH_nifti_637 中心 ===")
print(f"PET Spacing中位数: ({akh_pet_median[0]:.3f}, {akh_pet_median[1]:.3f}, {akh_pet_median[2]:.3f}) mm")
print(f"CT Spacing中位数: ({akh_ct_median[0]:.3f}, {akh_ct_median[1]:.3f}, {akh_ct_median[2]:.3f}) mm")

# 统计AKH中心各spacing的样本数
from collections import Counter
akh_pet_spacings_rounded = [tuple(np.round(spacing, 3)) for spacing in akh_pet_spacings]
akh_ct_spacings_rounded = [tuple(np.round(spacing, 3)) for spacing in akh_ct_spacings]

akh_pet_spacing_counts = Counter(akh_pet_spacings_rounded)
akh_ct_spacing_counts = Counter(akh_ct_spacings_rounded)

# 查找中位数对应的样本数
akh_pet_median_tuple = tuple(np.round(akh_pet_median, 3))
akh_ct_median_tuple = tuple(np.round(akh_ct_median, 3))

akh_pet_median_count = akh_pet_spacing_counts.get(akh_pet_median_tuple, 0)
akh_ct_median_count = akh_ct_spacing_counts.get(akh_ct_median_tuple, 0)

print(f"PET中位数spacing的样本数: {akh_pet_median_count} / {len(akh_pet_spacings)} ({akh_pet_median_count/len(akh_pet_spacings)*100:.1f}%)")
print(f"CT中位数spacing的样本数: {akh_ct_median_count} / {len(akh_ct_spacings)} ({akh_ct_median_count/len(akh_ct_spacings)*100:.1f}%)")

print(f"\nAKH中心PET spacing分布 (前5名):")
for spacing, count in akh_pet_spacing_counts.most_common(5):
    percentage = count / len(akh_pet_spacings) * 100
    marker = " ← 中位数" if spacing == akh_pet_median_tuple else ""
    print(f"  {spacing}: {count} 个样本 ({percentage:.1f}%){marker}")

print(f"\nAKH中心CT spacing分布 (前5名):")
for spacing, count in akh_ct_spacing_counts.most_common(5):
    percentage = count / len(akh_ct_spacings) * 100
    marker = " ← 中位数" if spacing == akh_ct_median_tuple else ""
    print(f"  {spacing}: {count} 个样本 ({percentage:.1f}%){marker}")

print(f"\n=== Neimeng_nifti_425 中心 ===")
print(f"PET Spacing中位数: ({neimeng_pet_median[0]:.3f}, {neimeng_pet_median[1]:.3f}, {neimeng_pet_median[2]:.3f}) mm")
print(f"CT Spacing中位数: ({neimeng_ct_median[0]:.3f}, {neimeng_ct_median[1]:.3f}, {neimeng_ct_median[2]:.3f}) mm")

# 统计Neimeng中心各spacing的样本数
neimeng_pet_spacings_rounded = [tuple(np.round(spacing, 3)) for spacing in neimeng_pet_spacings]
neimeng_ct_spacings_rounded = [tuple(np.round(spacing, 3)) for spacing in neimeng_ct_spacings]

neimeng_pet_spacing_counts = Counter(neimeng_pet_spacings_rounded)
neimeng_ct_spacing_counts = Counter(neimeng_ct_spacings_rounded)

# 查找中位数对应的样本数
neimeng_pet_median_tuple = tuple(np.round(neimeng_pet_median, 3))
neimeng_ct_median_tuple = tuple(np.round(neimeng_ct_median, 3))

neimeng_pet_median_count = neimeng_pet_spacing_counts.get(neimeng_pet_median_tuple, 0)
neimeng_ct_median_count = neimeng_ct_spacing_counts.get(neimeng_ct_median_tuple, 0)

print(f"PET中位数spacing的样本数: {neimeng_pet_median_count} / {len(neimeng_pet_spacings)} ({neimeng_pet_median_count/len(neimeng_pet_spacings)*100:.1f}%)")
print(f"CT中位数spacing的样本数: {neimeng_ct_median_count} / {len(neimeng_ct_spacings)} ({neimeng_ct_median_count/len(neimeng_ct_spacings)*100:.1f}%)")

print(f"\nNeimeng中心PET spacing分布 (前5名):")
for spacing, count in neimeng_pet_spacing_counts.most_common(5):
    percentage = count / len(neimeng_pet_spacings) * 100
    marker = " ← 中位数" if spacing == neimeng_pet_median_tuple else ""
    print(f"  {spacing}: {count} 个样本 ({percentage:.1f}%){marker}")

print(f"\nNeimeng中心CT spacing分布 (前5名):")
for spacing, count in neimeng_ct_spacing_counts.most_common(5):
    percentage = count / len(neimeng_ct_spacings) * 100
    marker = " ← 中位数" if spacing == neimeng_ct_median_tuple else ""
    print(f"  {spacing}: {count} 个样本 ({percentage:.1f}%){marker}")

print(f"\n=== 汇总统计 ===")
print(f"{'中心':<20} {'模态':<5} {'中位数spacing':<25} {'样本数':<10} {'占比':<10}")
print("-" * 80)
print(f"{'AKH_nifti_637':<20} {'PET':<5} {str(akh_pet_median_tuple):<25} {akh_pet_median_count:<10} {akh_pet_median_count/len(akh_pet_spacings)*100:<10.1f}%")
print(f"{'AKH_nifti_637':<20} {'CT':<5} {str(akh_ct_median_tuple):<25} {akh_ct_median_count:<10} {akh_ct_median_count/len(akh_ct_spacings)*100:<10.1f}%")
print(f"{'Neimeng_nifti_425':<20} {'PET':<5} {str(neimeng_pet_median_tuple):<25} {neimeng_pet_median_count:<10} {neimeng_pet_median_count/len(neimeng_pet_spacings)*100:<10.1f}%")
print(f"{'Neimeng_nifti_425':<20} {'CT':<5} {str(neimeng_ct_median_tuple):<25} {neimeng_ct_median_count:<10} {neimeng_ct_median_count/len(neimeng_ct_spacings)*100:<10.1f}%")

=== 各中心Spacing中位数对应的样本数量统计 ===

=== AKH_nifti_637 中心 ===
PET Spacing中位数: (4.073, 4.073, 3.000) mm
CT Spacing中位数: (1.367, 1.367, 3.000) mm
PET中位数spacing的样本数: 483 / 618 (78.2%)
CT中位数spacing的样本数: 483 / 618 (78.2%)

AKH中心PET spacing分布 (前5名):
  (np.float64(4.073), np.float64(4.073), np.float64(3.0)): 483 个样本 (78.2%) ← 中位数
  (np.float64(1.65), np.float64(1.65), np.float64(2.0)): 83 个样本 (13.4%)
  (np.float64(3.3), np.float64(3.3), np.float64(2.0)): 50 个样本 (8.1%)
  (np.float64(4.073), np.float64(4.073), np.float64(2.0)): 2 个样本 (0.3%)

AKH中心CT spacing分布 (前5名):
  (np.float64(1.367), np.float64(1.367), np.float64(3.0)): 483 个样本 (78.2%) ← 中位数
  (np.float64(1.523), np.float64(1.523), np.float64(2.0)): 133 个样本 (21.5%)
  (np.float64(1.367), np.float64(1.367), np.float64(2.0)): 2 个样本 (0.3%)

=== Neimeng_nifti_425 中心 ===
PET Spacing中位数: (4.073, 4.073, 5.000) mm
CT Spacing中位数: (0.979, 0.979, 4.000) mm
PET中位数spacing的样本数: 329 / 412 (79.9%)
CT中位数spacing的样本数: 324 / 412 (78.6%)

Neimeng中心PET spacing分布 (前5名):