# Phase B: Build 5-fold advCheX_kf dataset

In [None]:
import pandas as pd
import shutil, os
from pathlib import Path
from sklearn.model_selection import StratifiedKFold


In [None]:
# Dataset roots: `inter_dir` is read from, `out_dir` is where the k-fold
# dataset is written. The output root is created if it is missing.
inter_dir = Path('advCheX_Inter')
out_dir = Path('advCheX_kf')
out_dir.mkdir(exist_ok=True)

# Load the train/test manifests that live next to the split directories.
train_df = pd.read_csv(inter_dir / 'train.csv')
test_df = pd.read_csv(inter_dir / 'test.csv')

# Mirror each split directory into the output root exactly once: a split
# whose destination already exists is skipped rather than copied again
# (shutil.copytree would raise on an existing destination).
for split in ('train', 'test'):
    src = inter_dir / split
    dst = out_dir / split
    if dst.exists():
        continue
    shutil.copytree(src, dst)

# The test manifest is written out unchanged; only the training manifest
# is partitioned into folds.
test_df.to_csv(out_dir / 'test.csv', index=False)


In [None]:
# 5-fold stratified split on training set.
#
# For every fold this writes, under out_dir/fold{k}/:
#   train.csv          rows of train_df selected for training in this fold
#   valid.csv          rows of train_df held out for validation in this fold
#   train_weights.csv  one row per training sample: Path, sample_weight
#
# Stratification is on the 'CHD' column, and the fixed random_state makes the
# shuffled fold assignment reproducible across runs.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for fold, (tr_idx, val_idx) in enumerate(skf.split(train_df, train_df['CHD'])):
    fold_dir = out_dir / f'fold{fold}'
    fold_dir.mkdir(parents=True, exist_ok=True)

    # skf.split yields positional indices, hence iloc; the index is reset so
    # each fold's frame is numbered from 0.
    tr_df = train_df.iloc[tr_idx].reset_index(drop=True)
    val_df = train_df.iloc[val_idx].reset_index(drop=True)
    tr_df.to_csv(fold_dir / 'train.csv', index=False)
    val_df.to_csv(fold_dir / 'valid.csv', index=False)

    # Inverse-frequency class weights computed on this fold's training rows
    # only. `.loc` makes the lookup explicitly label-based (the count for
    # class value 1 / 0) instead of relying on how an integer key happens to
    # be resolved against the value_counts index.
    counts = tr_df['CHD'].value_counts()
    w_chd = 1.0 / counts.loc[1]
    w_non = 1.0 / counts.loc[0]

    # Vectorised replacement for a row-by-row iterrows() loop: rows whose CHD
    # equals 1 get w_chd, every other row (including missing values, which
    # compare unequal to 1) gets w_non.
    is_chd = tr_df['CHD'].eq(1)
    weights_df = pd.DataFrame({
        'Path': tr_df['Path'],
        'sample_weight': is_chd.map({True: w_chd, False: w_non}),
    })
    weights_df.to_csv(fold_dir / 'train_weights.csv', index=False)
