### Libraries and Setup

In [19]:
# Standard library
import argparse
import json
import os
import random
import sys
from collections import defaultdict

# Make the project sources importable BEFORE importing from them. Previously
# this ran after the project imports, so it could not help a fresh kernel.
# The membership check keeps sys.path from growing when the cell is re-run.
if './src' not in sys.path:
    sys.path.append('./src')

# Project modules (kept in their original relative order)
from models import ASTModel
import dataloader
from traintest import train, validate

# Third-party
import torch
import matplotlib.pyplot as plt
import numpy as np

# Where torch caches/looks up pretrained weights. Deliberately left AFTER the
# project imports, exactly as before, in case those modules touch TORCH_HOME
# at import time -- TODO confirm.
os.environ['TORCH_HOME'] = '../pretrained_models'

# Use the GPU when one is visible, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


### Data Splitting

In [20]:
def prepare_ast_5fold_patient_split(root_path, k_folds=5, output_dir="."):
    """Write patient-level k-fold train/eval JSON manifests and print a summary.

    Every ``.wav`` file under ``<root_path>/Cough_PTB`` (label ``"1"``) and
    ``<root_path>/Cough_Non-PTB`` (label ``"0"``) is grouped by patient ID,
    taken as the part of the file name before the first ``_``. Patients (not
    files) are shuffled with a fixed seed and cut into ``k_folds`` groups, so
    all recordings of one patient always land on the same side of a split.

    For fold ``i`` (1-based) two files are written into ``output_dir``:
    ``train_data_fold_{i}.json`` and ``eval_data_fold_{i}.json``, each of the
    form ``{"data": [{"wav": <absolute path>, "labels": "0" | "1"}, ...]}``.

    Args:
        root_path: Directory containing the two class folders.
        k_folds: Number of folds. The last fold absorbs any remainder when the
            patient count is not divisible by ``k_folds``.
        output_dir: Directory that receives the JSON manifests (created if
            missing). Defaults to the current working directory.

    Returns:
        None. Results are delivered through the JSON files and stdout.

    Note:
        Calls ``random.seed(42)``, i.e. it reseeds the global ``random`` state.
    """
    class_map = {"Cough_PTB": "1", "Cough_Non-PTB": "0"}

    # 1. Group file paths by class folder, then by patient ID.
    patient_groups = {folder_name: defaultdict(list) for folder_name in class_map}

    for folder_name in class_map:
        folder_path = os.path.join(root_path, folder_name)
        if not os.path.isdir(folder_path):
            # A missing class folder yields a single-class dataset; say so
            # instead of skipping silently.
            print(f"WARNING: class folder not found, skipping: {folder_path}")
            continue
        # os.listdir order is arbitrary; sort so manifests are stable.
        for file in sorted(os.listdir(folder_path)):
            if file.endswith(".wav"):
                patient_id = file.split('_')[0]
                full_path = os.path.abspath(os.path.join(folder_path, file))
                patient_groups[folder_name][patient_id].append(full_path)

    # Collect the unique patient IDs across both classes.
    unique_patients = set()
    for groups in patient_groups.values():
        unique_patients.update(groups.keys())
    # Set iteration order of strings changes between interpreter runs (hash
    # randomization), so sort first; otherwise the fixed seed below would not
    # give the same folds after a kernel restart.
    all_patients = sorted(unique_patients)

    # Fixed seed so the patient shuffle is repeatable.
    random.seed(42)
    random.shuffle(all_patients)

    # Cut the shuffled patients into k_folds consecutive groups.
    fold_size = len(all_patients) // k_folds
    folds = []
    for i in range(k_folds):
        start_idx = i * fold_size
        # The last fold takes whatever is left over.
        end_idx = (i + 1) * fold_size if i < k_folds - 1 else len(all_patients)
        folds.append(all_patients[start_idx:end_idx])

    print("="*50)
    print(f"AST {k_folds}-FOLD CV DATA PREPARATION SUMMARY")
    print("="*50)

    os.makedirs(output_dir, exist_ok=True)

    # 2. Build the manifests and per-class sample counts for each fold.
    for fold_idx, eval_ids in enumerate(folds, 1):
        eval_lookup = set(eval_ids)
        # Eval = current fold; train = every other patient.
        train_ids = [pid for pid in all_patients if pid not in eval_lookup]

        train_list, eval_list = [], []
        stats = {
            "train": {"ptb": 0, "non_ptb": 0},
            "eval": {"ptb": 0, "non_ptb": 0}
        }

        for folder_name, groups in patient_groups.items():
            label = class_map[folder_name]
            class_key = "ptb" if label == "1" else "non_ptb"
            for p_id, paths in groups.items():
                if p_id in eval_lookup:
                    split_name, target = "eval", eval_list
                else:
                    split_name, target = "train", train_list
                for path in paths:
                    target.append({"wav": path, "labels": label})
                stats[split_name][class_key] += len(paths)

        # 3. Save this fold's manifests.
        train_json_path = os.path.join(output_dir, f'train_data_fold_{fold_idx}.json')
        eval_json_path = os.path.join(output_dir, f'eval_data_fold_{fold_idx}.json')
        with open(train_json_path, 'w') as f:
            json.dump({"data": train_list}, f, indent=4)
        with open(eval_json_path, 'w') as f:
            json.dump({"data": eval_list}, f, indent=4)

        # 4. Print this fold's summary.
        print(f"\n[ FOLD {fold_idx} ]")
        print(f"  TRAIN SET (Patients: {len(train_ids)} ‡∏Ñ‡∏ô) -> IDs: {', '.join(sorted(train_ids))}")
        print(f"      Samples: PTB (Class 1) = {stats['train']['ptb']:3d} | Non-PTB (Class 0) = {stats['train']['non_ptb']:3d}")
        print(f"  EVAL SET  (Patients: {len(eval_ids)} ‡∏Ñ‡∏ô) -> IDs: {', '.join(sorted(eval_ids))}")
        print(f"      Samples: PTB (Class 1) = {stats['eval']['ptb']:3d} | Non-PTB (Class 0) = {stats['eval']['non_ptb']:3d}")

# Run the patient-level split on the ./Data folder with 5 folds.
prepare_ast_5fold_patient_split("./Data", k_folds=5)

AST 5-FOLD CV DATA PREPARATION SUMMARY

[ FOLD 1 ]
  TRAIN SET (Patients: 12 ‡∏Ñ‡∏ô) -> IDs: 001, 002, 003, 005, 006, 007, 008, 009, 011, 012, 013, 016
      Samples: PTB (Class 1) = 150 | Non-PTB (Class 0) = 234
  EVAL SET  (Patients: 3 ‡∏Ñ‡∏ô) -> IDs: 004, 014, 015
      Samples: PTB (Class 1) =  37 | Non-PTB (Class 0) =  91

[ FOLD 2 ]
  TRAIN SET (Patients: 12 ‡∏Ñ‡∏ô) -> IDs: 001, 002, 003, 004, 006, 007, 008, 009, 011, 014, 015, 016
      Samples: PTB (Class 1) = 167 | Non-PTB (Class 0) = 253
  EVAL SET  (Patients: 3 ‡∏Ñ‡∏ô) -> IDs: 005, 012, 013
      Samples: PTB (Class 1) =  20 | Non-PTB (Class 0) =  72

[ FOLD 3 ]
  TRAIN SET (Patients: 12 ‡∏Ñ‡∏ô) -> IDs: 001, 002, 003, 004, 005, 008, 009, 012, 013, 014, 015, 016
      Samples: PTB (Class 1) = 134 | Non-PTB (Class 0) = 314
  EVAL SET  (Patients: 3 ‡∏Ñ‡∏ô) -> IDs: 006, 007, 011
      Samples: PTB (Class 1) =  53 | Non-PTB (Class 0) =  11

[ FOLD 4 ]
  TRAIN SET (Patients: 12 ‡∏Ñ‡∏ô) -> IDs: 001, 002, 004, 005, 006, 007, 009, 01

In [21]:

# ---------------------------------------------------------------------------
# Cross-validation settings (everything a reader might tune lives here).
# N_FOLDS must match the k_folds used when the train/eval JSON manifests were
# generated.
# ---------------------------------------------------------------------------
N_FOLDS = 5
N_EPOCHS = 30
N_CLASSES = 2
NUM_MEL_BINS = 128
TARGET_LENGTH = 100
BATCH_SIZE = 8
SEED = 42

# Resolved once; used to map saved checkpoints onto the available hardware.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Pinned host memory only benefits host-to-GPU copies, so request it only when
# CUDA is actually available.
use_pinned_memory = torch.cuda.is_available()

# Audio front-end settings shared by both loaders; only 'mode' differs.
base_audio_conf = {
    'num_mel_bins': NUM_MEL_BINS, 'target_length': TARGET_LENGTH,
    'freqm': 0, 'timem': 0, 'mixup': 0.0, 'dataset': 'audioset',
    'mean': -3.3831, 'std': 5.1156, 'noise': False, 'skip_norm': False,
}
train_audio_conf = {**base_audio_conf, 'mode': 'train'}
eval_audio_conf = {**base_audio_conf, 'mode': 'evaluation'}

# Architecture arguments shared by the trained model and the reloaded one, so
# the checkpoint's state dict always matches the module it is loaded into.
model_kwargs = dict(
    label_dim=N_CLASSES, fstride=10, tstride=10,
    input_fdim=NUM_MEL_BINS, input_tdim=TARGET_LENGTH, model_size='base384',
)

# Per-fold metrics, averaged after the loop.
fold_results = {"acc": [], "auc": []}

# Iterate over folds 1..N_FOLDS.
for fold in range(1, N_FOLDS + 1):
    print(f"\n{'='*40}")
    print(f"üöÄ STARTING FOLD {fold}/{N_FOLDS}")
    print(f"{'='*40}")

    # Seed torch per fold so weight initialisation and the shuffle order are
    # repeatable across kernel restarts.
    torch.manual_seed(SEED + fold)

    # 1. JSON manifests of the current fold.
    train_json = f'train_data_fold_{fold}.json'
    eval_json = f'eval_data_fold_{fold}.json'

    # Separate checkpoint directory per fold so folds do not overwrite each other.
    exp_dir = f'./exp/tb_ast_p_fold_{fold}'
    os.makedirs(f'{exp_dir}/models', exist_ok=True)

    # 2. Data loaders of the current fold.
    train_loader = torch.utils.data.DataLoader(
        dataloader.AudiosetDataset(train_json, label_csv='class_labels_indices.csv', audio_conf=train_audio_conf),
        batch_size=BATCH_SIZE, shuffle=True, num_workers=0, pin_memory=use_pinned_memory
    )
    eval_loader = torch.utils.data.DataLoader(
        dataloader.AudiosetDataset(eval_json, label_csv='class_labels_indices.csv', audio_conf=eval_audio_conf),
        batch_size=BATCH_SIZE * 2, shuffle=False, num_workers=0, pin_memory=use_pinned_memory
    )

    # 3. Fresh AST-P model for every fold so no weights carry over between
    #    folds; starts from the ImageNet + AudioSet pretrained weights.
    ast_model = ASTModel(imagenet_pretrain=True, audioset_pretrain=True, **model_kwargs)

    # 4. Training arguments.
    args = argparse.Namespace(
        exp_dir=exp_dir, dataset='audioset', n_class=N_CLASSES, lr=1e-5, n_epochs=N_EPOCHS, batch_size=BATCH_SIZE,
        n_print_steps=10, save_model=True, loss='CE', metrics='acc',
        lrscheduler_start=10, lrscheduler_step=5, lrscheduler_decay=0.5,
        warmup=True, wa=True, wa_start=1, wa_end=N_EPOCHS
    )

    # 5. Train.
    train(ast_model, train_loader, eval_loader, args)

    # 6. Evaluate with the best checkpoint of this fold.
    # NOTE(review): the best checkpoint is presumably selected by train() on
    # eval_loader, the same data scored below, so these numbers would be
    # optimistic -- confirm against traintest.train.
    best_model_path = f'{args.exp_dir}/models/best_audio_model.pth'

    # Rebuild the architecture without pretrained weights, then load the
    # checkpoint. The DataParallel wrapper is applied before loading, as in the
    # original cell -- presumably to match the saved key names; TODO confirm.
    best_model = ASTModel(imagenet_pretrain=False, audioset_pretrain=False, verbose=False, **model_kwargs)
    best_model = torch.nn.DataParallel(best_model)
    best_model.load_state_dict(torch.load(best_model_path, map_location=device))

    stats, eval_loss = validate(best_model, eval_loader, args, epoch='best')
    acc = stats[0]['acc']
    auc = stats[0]['auc']

    # Keep the fold metrics for the final average.
    fold_results["acc"].append(acc)
    fold_results["auc"].append(auc)

    print(f"\n[ RESULT FOLD {fold} ] Accuracy: {acc:.4f} | AUC: {auc:.4f}")

# 7. Cross-validation summary: mean and (population) standard deviation.
print(f"\n{'='*40}")
print(f"üèÜ FINAL {N_FOLDS}-FOLD CV RESULTS")
print(f"{'='*40}")
print(f"Averaged Accuracy : {np.mean(fold_results['acc']):.4f} ¬± {np.std(fold_results['acc']):.4f}")
print(f"Averaged AUC      : {np.mean(fold_results['auc']):.4f} ¬± {np.std(fold_results['auc']):.4f}")


üöÄ STARTING FOLD 1/5
---------------the train dataloader---------------
now using following mask: 0 freq, 0 time
now using mix-up with rate 0.000000
now process audioset
use dataset mean -3.383 and std 5.116 to normalize the input.
number of classes is 2
---------------the evaluation dataloader---------------
now using following mask: 0 freq, 0 time
now using mix-up with rate 0.000000
now process audioset
use dataset mean -3.383 and std 5.116 to normalize the input.
number of classes is 2
---------------AST Model Summary---------------
ImageNet pretraining: True, AudioSet pretraining: True
frequncey stride=10, time stride=10
number of patches=108
running on cpu
Total parameter number is : 86.880 million
Total trainable parameter number is : 86.880 million
now training with audioset, main metrics: acc, loss function: CrossEntropyLoss(), learning rate scheduler: <torch.optim.lr_scheduler.MultiStepLR object at 0x000001E284ECB820>
The learning rate scheduler starts at 10 epoch with deca



warm-up learning rate is 0.000000


KeyboardInterrupt: 