In [1]:
# ============================================================================
# Cell 1: Setup
# ============================================================================
import os
import sys
from pathlib import Path

# Put the parent of the current working directory at the front of the import
# path (presumably the project root, so that the `src` package resolves).
project_root = os.path.abspath(os.pardir)
sys.path.insert(0, project_root)

from src.data.balance_data import run_balance_pipeline

print("‚úÖ Imports successful!")

‚úÖ Imports successful!


In [2]:
# ============================================================================
# Cell 2: Analyze current distribution
# ============================================================================
def analyze_distribution(data_dir):
    """Print the number of images per class for each dataset split.

    For each of the ``train``, ``val`` and ``test`` sub-directories of
    ``data_dir``, every immediate sub-directory is treated as one class and
    its ``*.jpg``, ``*.png`` and ``*.jpeg`` entries are counted. One line is
    printed per class (in sorted order), followed by the split total.

    Args:
        data_dir: Path (str or path-like) of the directory that contains the
            split sub-directories.

    Returns:
        None. The report is written to standard output.
    """
    image_patterns = ('*.jpg', '*.png', '*.jpeg')
    data_path = Path(data_dir)

    for split in ['train', 'val', 'test']:
        split_dir = data_path / split
        print(f"\n{split.upper()}:")

        # iterdir() raises FileNotFoundError on a missing directory; report
        # the gap and move on so the remaining splits are still shown.
        if not split_dir.is_dir():
            print(f"  (directory not found: {split_dir})")
            continue

        total = 0
        for class_dir in sorted(split_dir.iterdir()):
            if not class_dir.is_dir():
                continue
            count = sum(len(list(class_dir.glob(pattern)))
                        for pattern in image_patterns)
            total += count
            print(f"  {class_dir.name:20s}: {count:5d}")

        print(f"  {'TOTAL':20s}: {total:5d}")

# Report the per-class image counts of the processed dataset.
banner = "=" * 70
print(banner)
print("ORIGINAL DATA DISTRIBUTION")
print(banner)
analyze_distribution('../data/processed')

ORIGINAL DATA DISTRIBUTION

TRAIN:
  COVID-19            :   404
  Normal              :  4668
  Pneumonia           :  5984
  Tuberculosis        :   490
  TOTAL               : 11546

VAL:
  COVID-19            :    86
  Normal              :   999
  Pneumonia           :  1281
  Tuberculosis        :   105
  TOTAL               :  2471

TEST:
  COVID-19            :    86
  Normal              :   999
  Pneumonia           :  1281
  Tuberculosis        :   105
  TOTAL               :  2471


In [4]:
# ============================================================================
# Cell 3: Run balancing
# ============================================================================
"""
Strategy options:
- 'hybrid': Undersample majority + Oversample minority (RECOMMENDED)
- 'oversample': Only oversample (large dataset)
- 'undersample': Only undersample (lose data)

Target samples:
- None: Auto (median for hybrid, max for oversample, min for undersample)
- Number: Specific target (e.g., 2500)
"""

STRATEGY = 'hybrid'
TARGET_SAMPLES = 2500  # Train: 2500, Val/Test: 625 per class

print(f"Strategy: {STRATEGY}")
if TARGET_SAMPLES is None:
    # None is a documented setting (see the option notes above), but
    # `None // 4` raises TypeError, so describe the auto mode instead.
    print("Target: auto (median for hybrid, max for oversample, min for undersample)")
else:
    print(f"Target: {TARGET_SAMPLES} per class (train)")
    # NOTE(review): the quarter-size val/test figure is computed here for
    # display only; confirm it matches what run_balance_pipeline applies.
    print(f"        {TARGET_SAMPLES // 4} per class (val/test)")

Strategy: hybrid
Target: 2500 per class (train)
        625 per class (val/test)


In [5]:
# ============================================================================
# Cell 4: Execute balancing
# ============================================================================
# Collect the pipeline arguments in one place, then run the balancing
# pipeline and keep whatever it returns in `results`.
pipeline_options = dict(
    input_dir='../data/processed',
    output_dir='../data/balanced',
    strategy=STRATEGY,
    target_samples=TARGET_SAMPLES,
    random_seed=42,
)
results = run_balance_pipeline(**pipeline_options)

print("\n‚úÖ Balancing completed!")
print("üìÅ Balanced data: data/balanced/")


üöÄ DATA BALANCING PIPELINE
Input: ../data/processed
Output: ../data/balanced
Strategy: hybrid

BALANCING TRAIN SPLIT

üìä Original Distribution:
  COVID-19            :   404
  Normal              :  4668
  Pneumonia           :  5984
  Tuberculosis        :   490

üéØ Target samples per class: 2500

  COVID-19:
    Original: 404
    ‚Üí Need 2096 augmented


                                                                    

    ‚Üí Oversampled to: 2500

  Normal:
    Original: 4668


                                                                

    ‚Üí Undersampled to: 2500

  Pneumonia:
    Original: 5984


                                                                

    ‚Üí Undersampled to: 2500

  Tuberculosis:
    Original: 490
    ‚Üí Need 2010 augmented


                                                                   

    ‚Üí Oversampled to: 2500

BALANCING VAL SPLIT

üìä Original Distribution:
  COVID-19            :    86
  Normal              :   999
  Pneumonia           :  1281
  Tuberculosis        :   105

üéØ Target samples per class: 625

  COVID-19:
    Original: 86
    ‚Üí Need 539 augmented


                                                                 

    ‚Üí Oversampled to: 625

  Normal:
    Original: 999


                                                              

    ‚Üí Undersampled to: 625

  Pneumonia:
    Original: 1281


                                                              

    ‚Üí Undersampled to: 625

  Tuberculosis:
    Original: 105
    ‚Üí Need 520 augmented


                                                                 

    ‚Üí Oversampled to: 625

BALANCING TEST SPLIT

üìä Original Distribution:
  COVID-19            :    86
  Normal              :   999
  Pneumonia           :  1281
  Tuberculosis        :   105

üéØ Target samples per class: 625

  COVID-19:
    Original: 86
    ‚Üí Need 539 augmented


                                                                 

    ‚Üí Oversampled to: 625

  Normal:
    Original: 999


                                                              

    ‚Üí Undersampled to: 625

  Pneumonia:
    Original: 1281


                                                              

    ‚Üí Undersampled to: 625

  Tuberculosis:
    Original: 105
    ‚Üí Need 520 augmented


                                                                 

    ‚Üí Oversampled to: 625

‚úÖ BALANCING COMPLETED

TRAIN: 10,000 samples
  COVID-19            :  2500 (25.00%)
  Normal              :  2500 (25.00%)
  Pneumonia           :  2500 (25.00%)
  Tuberculosis        :  2500 (25.00%)

VAL: 2,500 samples
  COVID-19            :   625 (25.00%)
  Normal              :   625 (25.00%)
  Pneumonia           :   625 (25.00%)
  Tuberculosis        :   625 (25.00%)

TEST: 2,500 samples
  COVID-19            :   625 (25.00%)
  Normal              :   625 (25.00%)
  Pneumonia           :   625 (25.00%)
  Tuberculosis        :   625 (25.00%)

üíæ Summary: ../data/balanced/balance_summary.json

‚úÖ Balancing completed!
üìÅ Balanced data: data/balanced/




In [6]:
# ============================================================================
# Cell 5: Verify balanced data
# ============================================================================
print("\n" + "=" * 70)
print("BALANCED DATA VERIFICATION")
print("=" * 70)
analyze_distribution('../data/balanced')

# Calculate balance ratio (smallest class count / largest class count)
from pathlib import Path  # repeated here so the cell does not rely on an earlier import

# Count the same image types that analyze_distribution() reports, so the
# ratio is derived from the numbers printed by the call above. The previous
# `*.*` pattern matched every entry whose name contains a dot, including
# non-image files.
image_patterns = ('*.jpg', '*.png', '*.jpeg')

for split in ['train', 'val', 'test']:
    split_dir = Path('../data/balanced') / split

    # iterdir() raises FileNotFoundError on a missing directory; report it
    # and continue with the remaining splits.
    if not split_dir.is_dir():
        print(f"\n{split.upper()} balance ratio: n/a (directory not found: {split_dir})")
        continue

    counts = [
        sum(len(list(class_dir.glob(pattern))) for pattern in image_patterns)
        for class_dir in split_dir.iterdir()
        if class_dir.is_dir()
    ]

    # No class directories at all: nothing to report for this split.
    if not counts:
        continue

    largest = max(counts)
    if largest == 0:
        # Every class directory is empty; min/max would divide by zero.
        print(f"\n{split.upper()} balance ratio: n/a (no images found)")
        continue

    balance_ratio = min(counts) / largest
    print(f"\n{split.upper()} balance ratio: {balance_ratio:.3f} (1.0 = perfect)")
    if balance_ratio >= 0.9:
        print("  ‚úÖ Well balanced!")
    elif balance_ratio >= 0.7:
        print("  ‚ö†Ô∏è  Moderately balanced")
    else:
        print("  ‚ùå Still imbalanced")


BALANCED DATA VERIFICATION

TRAIN:
  COVID-19            :  2500
  Normal              :  2500
  Pneumonia           :  2500
  Tuberculosis        :  2500
  TOTAL               : 10000

VAL:
  COVID-19            :   625
  Normal              :   625
  Pneumonia           :   625
  Tuberculosis        :   625
  TOTAL               :  2500

TEST:
  COVID-19            :   625
  Normal              :   625
  Pneumonia           :   625
  Tuberculosis        :   625
  TOTAL               :  2500

TRAIN balance ratio: 1.000 (1.0 = perfect)
  ‚úÖ Well balanced!

VAL balance ratio: 1.000 (1.0 = perfect)
  ‚úÖ Well balanced!

TEST balance ratio: 1.000 (1.0 = perfect)
  ‚úÖ Well balanced!
