## Train 資料集處理

In [None]:
"""
# ============================================================================
# 完整資料前處理：創建乾淨的 track1_fixed 資料夾
# 處理完整的資料結構：train/val -> covid/non-covid -> ct_scan_i -> *.jpg
# ============================================================================

資料結構:
track1/
├── train/
│   ├── annotations/          # CSV 檔案，直接複製
│   ├── covid/
│   │   ├── ct_scan_0/        # 包含 *.jpg
│   │   ├── ct_scan_1/
│   │   └── ...
│   └── non-covid/
│       ├── ct_scan_0/
│       ├── ct_scan_1/
│       └── ...
└── val/
    ├── annotations/          # CSV 檔案，直接複製
    ├── covid/
    │   ├── ct_scan_0/
    │   └── ...
    └── non-covid/
        ├── ct_scan_0/
        └── ...

Args:
    source_path: 原始 track1 路徑
    target_path: 目標 track1_fixed 路徑


import os
import shutil
from pathlib import Path

# Build a cleaned copy of the dataset rooted at ``source_path`` under
# ``target_path``.
#
# For each split ('train', 'val') that exists under ``source_path``:
#   * the ``annotations`` directory is copied wholesale (an existing copy in
#     the target is removed first);
#   * ``covid`` and ``non-covid`` directories are created in the target;
#   * every ``ct_scan_*`` sub-directory is mirrored, copying only ``*.jpg``
#     files whose name does not start with "._" (macOS sidecar files) and
#     whose size is at least ``min_file_size`` bytes.
#
# Args:
#     source_path: root of the original dataset (str or Path).
#     target_path: root of the cleaned dataset; it is created together with
#         any missing parent directories.
#     min_file_size: files smaller than this many bytes are skipped.
#         Defaults to 10000, the threshold that was previously hard-coded.
#
# Returns:
#     Tuple ``(images_copied, files_skipped, csv_files_copied)``.
#
# Note: images already present in the target from an earlier run are
# overwritten when copied again but are never deleted.
def create_clean_dataset_complete(source_path, target_path, min_file_size=10000):
    source_path = Path(source_path)
    target_path = Path(target_path)

    print(f"開始處理完整資料集...")
    print(f"來源路徑: {source_path}")
    print(f"目標路徑: {target_path}")

    # parents=True: the target may live under directories that do not exist yet.
    target_path.mkdir(parents=True, exist_ok=True)

    total_copied = 0
    total_skipped = 0
    total_csv_copied = 0

    for split in ['train', 'val']:
        source_split_path = source_path / split
        target_split_path = target_path / split

        if not source_split_path.exists():
            print(f"警告: {source_split_path} 不存在，跳過")
            continue

        print(f"\n{'='*50}")
        print(f"處理 {split.upper()} 資料夾")
        print(f"{'='*50}")

        target_split_path.mkdir(exist_ok=True)

        # 1. Annotations: plain copy of the CSV directory.
        total_csv_copied += _copy_annotations(
            source_split_path, target_split_path, split)

        # 2. Both category directories always exist in the target, even when
        #    the source is missing one of them.
        for category in ['covid', 'non-covid']:
            (target_split_path / category).mkdir(exist_ok=True)

        # 3. Filter and copy the images of every CT scan.
        for category in ['covid', 'non-covid']:
            source_category_path = source_split_path / category
            target_category_path = target_split_path / category

            if not source_category_path.exists():
                print(f"⚠️  {split}/{category} 不存在，跳過")
                continue

            print(f"\n--- 處理 {split}/{category} ---")

            split_copied = 0
            split_skipped = 0
            ct_folder_count = 0  # only scans that yielded at least one image

            ct_folders = sorted(
                d for d in source_category_path.iterdir()
                if d.is_dir() and d.name.startswith('ct_scan_'))

            for ct_folder in ct_folders:
                target_ct_folder = target_category_path / ct_folder.name
                target_ct_folder.mkdir(exist_ok=True)

                copied_in_folder, skipped_in_folder = _copy_valid_images(
                    ct_folder, target_ct_folder, min_file_size)

                split_copied += copied_in_folder
                split_skipped += skipped_in_folder

                if copied_in_folder > 0:
                    ct_folder_count += 1
                    print(f"  ✅ {ct_folder.name}: {copied_in_folder} 個檔案 "
                          f"(跳過 {skipped_in_folder} 個)")
                else:
                    print(f"  ⚠️  {ct_folder.name}: 沒有有效檔案")

            total_copied += split_copied
            total_skipped += split_skipped

            print(f"--- {split}/{category} 總結 ---")
            print(f"  CT 資料夾數: {ct_folder_count}")
            print(f"  有效圖片: {split_copied} 個")
            print(f"  跳過檔案: {split_skipped} 個")

    print(f"\n{'='*50}")
    print(f"處理完成!")
    print(f"{'='*50}")
    print(f"總共複製圖片: {total_copied} 個")
    print(f"總共跳過檔案: {total_skipped} 個")
    print(f"總共複製 CSV: {total_csv_copied} 個")

    return total_copied, total_skipped, total_csv_copied


# Replace ``<target_split_path>/annotations`` with a copy of the source
# annotations directory and return the number of ``*.csv`` files now in it
# (0 when the source has no annotations directory).
def _copy_annotations(source_split_path, target_split_path, split):
    source_annotations = source_split_path / 'annotations'
    if not source_annotations.exists():
        print(f"⚠️  {split}/annotations 不存在")
        return 0

    target_annotations = target_split_path / 'annotations'
    if target_annotations.exists():
        # copytree refuses to write into an existing directory by default.
        shutil.rmtree(target_annotations)
    shutil.copytree(source_annotations, target_annotations)

    csv_count = len(list(target_annotations.glob('*.csv')))
    print(f"✅ 複製 annotations: {csv_count} 個 CSV 檔案")
    return csv_count


# Copy the usable ``*.jpg`` files of one CT-scan directory and return
# ``(copied, skipped)``. Files are visited in sorted order so the log output
# is deterministic.
def _copy_valid_images(ct_folder, target_ct_folder, min_file_size):
    copied = 0
    skipped = 0

    for jpg_file in sorted(ct_folder.glob('*.jpg')):
        # macOS sidecar files ("._name.jpg") match the glob but are not images.
        if jpg_file.name.startswith('._'):
            skipped += 1
            continue

        # Very small files are treated as suspect and left out.
        file_size = jpg_file.stat().st_size
        if file_size < min_file_size:
            print(f"    跳過小檔案: {ct_folder.name}/{jpg_file.name} ({file_size} bytes)")
            skipped += 1
            continue

        shutil.copy2(jpg_file, target_ct_folder / jpg_file.name)
        copied += 1

    return copied, skipped

# Print a summary of a cleaned dataset: for each split ('train', 'val') the
# number of annotation CSV files and, per category ('covid', 'non-covid'),
# the number of sub-directories and the number of *.jpg files inside them.
# Missing directories are reported instead of raising. Returns nothing.
def verify_clean_dataset(dataset_path):
    root = Path(dataset_path)

    print(f"\n{'='*50}")
    print(f"驗證資料集: {root}")
    print(f"{'='*50}")

    for split_name in ('train', 'val'):
        split_dir = root / split_name
        if not split_dir.exists():
            print(f"⚠️  {split_name} 資料夾不存在")
            continue

        print(f"\n--- {split_name.upper()} ---")

        # Annotation CSV count.
        annotation_dir = split_dir / 'annotations'
        if not annotation_dir.exists():
            print(f"  ⚠️  annotations 不存在")
        else:
            csv_count = sum(1 for _ in annotation_dir.glob('*.csv'))
            print(f"  📄 annotations: {csv_count} 個 CSV 檔案")

        # Scan-directory and image counts per category.
        for label in ('covid', 'non-covid'):
            label_dir = split_dir / label
            if not label_dir.exists():
                print(f"  ⚠️  {label} 不存在")
                continue

            scan_dirs = [child for child in label_dir.iterdir() if child.is_dir()]
            image_count = 0
            for scan_dir in scan_dirs:
                image_count += sum(1 for _ in scan_dir.glob('*.jpg'))

            print(f"  🏥 {label}: {len(scan_dirs)} 個 CT 資料夾, "
                  f"{image_count} 張圖片")

# ============================================================================
# Run the data preprocessing
# ============================================================================

# Configure paths: the original dataset root and the cleaned copy to create.
source_data_path = '/ssd7/ICCV2025_COVID19/track1'
target_data_path = '/ssd7/ICCV2025_COVID19/track1_fixed'

print("開始創建乾淨的資料集...")

# Create the cleaned dataset. Any exception raised by the copy or by the
# verification step is reported with a traceback instead of propagating.
try:
    copied_files, skipped_files, csv_files = create_clean_dataset_complete(
        source_data_path, target_data_path)
    
    # Verify the result by printing a summary of the new directory tree.
    verify_clean_dataset(target_data_path)
    
    print(f"\n🎉 成功! 乾淨的資料集已創建在: {target_data_path}")
    print("現在可以使用 track1_fixed 路徑進行後續處理！")
    
except Exception as e:
    print(f"❌ 處理過程中發生錯誤: {e}")
    import traceback
    traceback.print_exc()
"""

## 4醫院分開處理

In [None]:
"""
import os
import shutil
import pandas as pd
from pathlib import Path
from tqdm import tqdm

def reorganize_by_hospital(source_dir, target_dir):
    """
    Reorganize the dataset into one directory tree per medical centre.

    For each split ('train', 'val') and category ('covid', 'non-covid') the
    annotation file ``<source_dir>/<split>/annotations/<split>_<category>.csv``
    (with '-' replaced by '_' in the category) is read. Every CT-scan
    directory named in its ``ct_scan_name`` column is copied to
    ``<target_dir>/hospital_<data_centre>/<split>/<category>/<ct_scan_name>``.

    Scans whose target directory already exists are skipped, scans whose
    source directory is missing are reported, and a missing CSV file skips
    that split/category. Finally the resulting tree is printed with
    ``show_directory_structure``.

    Args:
        source_dir: root of the source dataset (e.g. track1_fixed).
        target_dir: root of the reorganized dataset; created together with
            any missing parent directories.

    Raises:
        KeyError: if a CSV lacks the ``data_centre`` or ``ct_scan_name`` column.
    """
    source_path = Path(source_dir)
    target_path = Path(target_dir)

    # parents=True: the target may live under directories that do not exist yet.
    target_path.mkdir(parents=True, exist_ok=True)

    for split in ['train', 'val']:
        print(f"\n=== 處理 {split} 資料 ===")

        for category in ['covid', 'non-covid']:
            print(f"處理 {split}/{category}...")

            # The annotation file name uses '_' where the directory uses '-'.
            csv_file = source_path / split / 'annotations' / f'{split}_{category.replace("-", "_")}.csv'

            if not csv_file.exists():
                print(f"警告: CSV檔案不存在 - {csv_file}")
                continue

            df = pd.read_csv(csv_file)
            print(f"讀取到 {len(df)} 筆資料")

            hospitals = df['data_centre'].unique()
            print(f"發現醫學中心: {hospitals}")

            # Create every hospital/split/category directory up front so it
            # exists even if none of its scans can be copied.
            for hospital in hospitals:
                hospital_split_dir = target_path / f'hospital_{hospital}' / split / category
                hospital_split_dir.mkdir(parents=True, exist_ok=True)

            for hospital_id, group in df.groupby('data_centre'):
                print(f"  處理醫學中心 {hospital_id}: {len(group)} 個CT掃描")

                target_hospital_dir = target_path / f'hospital_{hospital_id}' / split / category

                # Iterating the column (a sized object) instead of the
                # iterrows() generator lets tqdm show the total and avoids
                # building a Series per row.
                scan_names = group['ct_scan_name']
                for ct_scan_name in tqdm(scan_names, desc=f"Hospital {hospital_id}", leave=False):
                    # str(): a non-string cell would make the path join raise
                    # TypeError; as text it is reported as a missing source.
                    ct_scan_name = str(ct_scan_name)

                    source_ct_dir = source_path / split / category / ct_scan_name
                    target_ct_dir = target_hospital_dir / ct_scan_name

                    if not source_ct_dir.exists():
                        print(f"    警告: 來源資料夾不存在 - {source_ct_dir}")
                        continue

                    if target_ct_dir.exists():
                        print(f"    跳過已存在的資料夾: {target_ct_dir}")
                        continue

                    # Copy the whole CT-scan directory.
                    shutil.copytree(source_ct_dir, target_ct_dir)

    print(f"\n=== 重新組織完成 ===")
    print(f"新的資料結構保存在: {target_path}")

    # Show the resulting layout.
    print("\n最終目錄結構:")
    show_directory_structure(target_path)

def show_directory_structure(path, level=0, max_level=3, max_items=10):
    """Print an indented listing of the directory tree rooted at ``path``.

    Entries are listed in sorted order, two spaces of indentation per level,
    with a trailing '/' on directories. Directories are descended into until
    ``max_level`` is reached. When a directory holds more than ``max_items``
    entries only the first ``max_items`` are shown, followed by a line giving
    the number of omitted entries.

    Args:
        path: directory to list (str or Path).
        level: current depth; callers normally leave this at 0.
        max_level: deepest level whose entries are printed.
        max_items: maximum entries shown per directory (default 10, the
            previously hard-coded limit); ``None`` shows everything.

    Raises:
        ValueError: if ``max_items`` is negative.
    """
    if max_items is not None and max_items < 0:
        raise ValueError("max_items must be non-negative or None")

    if level > max_level:
        return

    indent = "  " * level
    entries = sorted(Path(path).iterdir())
    shown = entries if max_items is None else entries[:max_items]

    for entry in shown:
        if entry.is_dir():
            print(f"{indent}{entry.name}/")
            if level < max_level:
                show_directory_structure(entry, level + 1, max_level, max_items)
        else:
            print(f"{indent}{entry.name}")

    omitted = len(entries) - len(shown)
    if omitted > 0:
        print(f"{indent}... (還有 {omitted} 個項目)")

def verify_reorganization(target_dir):
    """Print the CT-scan count of every hospital directory.

    For each sub-directory of ``target_dir`` whose name starts with
    ``hospital_``, prints one line per split/category combination giving the
    number of sub-directories found there, or a notice when that
    split/category directory does not exist.
    """
    root = Path(target_dir)

    print("\n=== 驗證重新組織結果 ===")

    # Fixed reporting order: train before val, covid before non-covid.
    combos = [(split_name, label)
              for split_name in ('train', 'val')
              for label in ('covid', 'non-covid')]

    for entry in root.iterdir():
        if not (entry.is_dir() and entry.name.startswith('hospital_')):
            continue

        print(f"\n{entry.name}:")

        for split_name, label in combos:
            label_dir = entry / split_name / label
            if not label_dir.exists():
                print(f"  {split_name}/{label}: 目錄不存在")
                continue

            scan_count = sum(1 for child in label_dir.iterdir() if child.is_dir())
            print(f"  {split_name}/{label}: {scan_count} CT掃描")

# Usage example: reorganize the cleaned dataset by hospital, then print a
# per-hospital summary of the result.
if __name__ == "__main__":
    # Configure paths
    source_directory = "/ssd7/ICCV2025_COVID19/track1_fixed"  # source data directory
    target_directory = "/ssd7/ICCV2025_COVID19/track1_by_hospital"  # directory for the reorganized layout
    
    # Run the reorganization
    reorganize_by_hospital(source_directory, target_directory)
    
    # Verify the result
    verify_reorganization(target_directory)
"""