# Phase A: Prepare advCheX_Inter dataset

In [None]:
import pandas as pd
import os, re, shutil
from pathlib import Path
from PIL import Image
from sklearn.model_selection import train_test_split


In [None]:
# 1. Load annotation and parse multi-label field
# Read the annotation workbook into a DataFrame.
anno_path = Path('data/注释文件.xlsx')
df = pd.read_excel(anno_path)
# Replace the sheet's own headers positionally with fixed ASCII names;
# the assignment raises if the sheet does not have exactly four columns.
df.columns = [
    'index',
    'patient_id',
    'sex',
    'diseases',
]
def parse_label(s):
    """Parse one raw ``diseases`` cell into a list of integer disease codes.

    Args:
        s: Raw cell value. May be missing (NaN/None), a number, or a string
            of codes separated by ``,`` or ``/`` (the full-width ``，`` and
            ``、`` separators are accepted as well).

    Returns:
        The integer codes in the order they appear. Missing cells and tokens
        that are not plain decimal integers yield no entries.
    """
    if pd.isna(s):
        return []
    # A numeric spreadsheet column that contains blanks is read as float, so
    # a single code arrives as e.g. 2.0; str() would give '2.0', which is not
    # a digit string and would silently drop the label.
    if isinstance(s, float) and s.is_integer():
        s = int(s)
    parts = re.split(r'[,/，、]', str(s))
    # isdecimal() (unlike isdigit()) only accepts characters int() can parse,
    # so stray symbols such as a superscript '²' are skipped instead of
    # raising ValueError.
    return [int(p) for p in parts if p.strip().isdecimal()]
# Element-wise parse of every raw cell into a list of integer codes.
df['disease_list'] = df['diseases'].map(parse_label)


In [None]:
# 2. Map diseases to binary labels: CHD vs nonCHD
# Disease codes that count as CHD.
chd_set = {1, 2, 3}


def to_binary(labels):
    """Collapse a list of disease codes into a ``(CHD, nonCHD)`` pair.

    Returns ``(1, 0)`` when at least one code is in ``chd_set`` and
    ``(0, 1)`` otherwise, which includes an empty list.
    """
    has_chd = not chd_set.isdisjoint(labels)
    return (1, 0) if has_chd else (0, 1)
# Expand each (CHD, nonCHD) tuple into two indicator columns.
binary_flags = df['disease_list'].apply(lambda labels: pd.Series(to_binary(labels)))
df[['CHD', 'nonCHD']] = binary_flags

# Output root for the files written by this notebook; created if missing.
out_dir = Path('advCheX_Inter')
out_dir.mkdir(parents=True, exist_ok=True)

# Persist the parsed table, including the derived label columns.
df.to_csv(out_dir / 'parsed_annotations.csv', index=False)


In [None]:
# 3. Stratified train/test split (7:3)
# Stratify on the CHD flag and fix the seed so the partition is reproducible;
# each part gets a fresh 0..n-1 index.
train_df, test_df = (
    subset.reset_index(drop=True)
    for subset in train_test_split(
        df,
        test_size=0.3,
        stratify=df['CHD'],
        random_state=42,
    )
)


In [None]:
# 4. Convert source images to JPEG under the output tree and build the split CSVs
def locate_image(pid):
    """Return the path of the source image for patient *pid*.

    The image is looked up as
    ``data/<pid>/胸片_<pid>_正位/<something>_1/<file>``.

    Args:
        pid: Patient identifier as it appears in the directory names.

    Returns:
        Path of the first file (in sorted order) inside the first directory
        (in sorted order) whose name ends in ``_1``.

    Raises:
        FileNotFoundError: If no ``*_1`` directory exists for the patient or
            that directory contains no files.
    """
    root = Path('data') / str(pid) / f'胸片_{pid}_正位'
    # glob() yields entries in an unspecified order; sorting makes the choice
    # reproducible when more than one candidate matches.
    study_dirs = sorted(p for p in root.glob('*_1') if p.is_dir())
    if not study_dirs:
        # Previously a bare StopIteration escaped from next(), giving no hint
        # about which patient or path was at fault.
        raise FileNotFoundError(f'no "*_1" directory found under {root}')
    sub = study_dirs[0]
    candidates = sorted(p for p in sub.glob('*') if p.is_file())
    if not candidates:
        raise FileNotFoundError(f'no image file found under {sub}')
    return candidates[0]
def save_image(src, dst):
    """Re-encode the image at *src* as an RGB JPEG written to *dst*.

    Args:
        src: Path of the source image, in any format PIL can open.
        dst: Destination path (a ``Path``); missing parent directories are
            created.
    """
    dst.parent.mkdir(parents=True, exist_ok=True)
    # Image.open keeps the underlying file open; the context manager releases
    # it deterministically instead of leaving it to garbage collection, which
    # matters when this runs once per patient.
    with Image.open(src) as img:
        img.convert('RGB').save(dst, format='JPEG')
def process_split(split_df, split_name):
    """Export the images of one split and write its label CSV.

    For every row of *split_df* the patient's source image is converted to
    ``<out_dir>/<split_name>/patient<id>/study1/view1_frontal.jpg``. A CSV
    named ``<split_name>.csv`` with the columns ``Path`` (relative to
    ``out_dir``), ``CHD`` and ``nonCHD`` is then written into ``out_dir``.

    Args:
        split_df: DataFrame with ``patient_id``, ``CHD`` and ``nonCHD``
            columns.
        split_name: Name of the split; used as sub-directory and CSV stem.
    """
    split_root = Path(split_name)
    records = []
    for _, record in split_df.iterrows():
        patient = record['patient_id']
        relative = split_root / f'patient{patient}/study1/view1_frontal.jpg'
        save_image(locate_image(patient), out_dir / relative)
        records.append(
            {
                'Path': str(relative),
                'CHD': record['CHD'],
                'nonCHD': record['nonCHD'],
            }
        )
    manifest = pd.DataFrame(records)
    manifest.to_csv(out_dir / f'{split_name}.csv', index=False)
# Export both partitions; train is processed first, then test.
for split_name, split_df in (('train', train_df), ('test', test_df)):
    process_split(split_df, split_name)


In [None]:
# 5. Compute sample weights and QA report
# Inverse class frequency: every sample is weighted by 1 / (size of its class
# in the training split).
counts = train_df['CHD'].value_counts()
w_chd = 1.0 / counts[1]
w_non = 1.0 / counts[0]
weights = [
    {
        'Path': str(Path('train') / f'patient{row.patient_id}/study1/view1_frontal.jpg'),
        'sample_weight': w_chd if row.CHD == 1 else w_non,
    }
    for _, row in train_df.iterrows()
]
pd.DataFrame(weights).to_csv(out_dir / 'train_weights.csv', index=False)
# One-row summary of the split sizes and CHD counts, for a quick sanity check.
qa = {
    name: [value]
    for name, value in (
        ('total', len(df)),
        ('train_total', len(train_df)),
        ('test_total', len(test_df)),
        ('train_CHD', train_df['CHD'].sum()),
        ('test_CHD', test_df['CHD'].sum()),
    )
}
pd.DataFrame(qa).to_csv(out_dir / 'qa_report.csv', index=False)
# Leave a short plain-text note describing how the output was produced.
# Each line ends with an explicit '\n' escape: a raw line break inside a
# single-quoted string literal is a SyntaxError.
with open(out_dir/'README_phaseA.txt', 'w', encoding='utf-8') as f:
    f.writelines([
        'Generated by first.ipynb\n',
        'Split ratio 7:3 with CHD stratification.\n',
        'Images stored under advCheX_Inter/{train,test}/patient<id>/study1/view1_frontal.jpg\n',
    ])
