# Diabetes Prediction Model Pipeline (Multi-Target)

Machine learning pipeline for predicting diabetes incidence in gallstone patients.

**Targets:**
- `outA`: Primary outcome
- `out2`: Secondary outcome

**Execution Order:**
1. Environment Setup (Section 0)
2. Generate Dummy Data (Optional; Section 1)
3. For each target (Section 2):
   - Baseline Characteristics (Table 1)
   - Data Preprocessing
   - Model Training (GridSearchCV)
   - Model Evaluation & SHAP Analysis
   - Performance Comparison Table
   - Model Comparison Figures
4. Results Summary and Output Files Summary (Sections 3–4)

## 0. Environment Setup

In [None]:
import os
import sys
import warnings
import numpy as np
import pandas as pd
import glob
# Suppress every warning for the remainder of the session.
warnings.filterwarnings('ignore')

# Resolve the project root: when the session starts inside a directory
# named 'notebooks', step one level up before recording the root.
PROJECT_ROOT = os.getcwd()
if os.path.basename(PROJECT_ROOT) == 'notebooks':
    os.chdir(os.pardir)
    PROJECT_ROOT = os.getcwd()

# Put <project root>/code at the front of the import search path,
# unless it is already listed there.
CODE_DIR = os.path.join(PROJECT_ROOT, 'code')
if CODE_DIR not in sys.path:
    sys.path.insert(0, CODE_DIR)

print(f"Project Root: {PROJECT_ROOT}")
print(f"Code Directory: {CODE_DIR}")

In [None]:
# Pipeline-wide settings read by the cells below.
CONFIG = {
    # Location of the input CSV
    'data_path': os.path.join(PROJECT_ROOT, 'data', 'dummy_diabetes_data.csv'),

    # Outcome columns; the pipeline is executed once per entry
    'targets': [
        'outA',
        'out2',
    ],

    # Preprocessing options
    'add_missing_indicator': True,
    'missing_threshold': 0.05,

    # Training options
    'models': [
        'decision_tree',
        'random_forest',
        'xgboost',
        'catboost',
        'ann',
    ],
    'cv_folds': 5,
    'scoring': 'roc_auc',
    'small_grid': True,  # True: quick test, False: full grid

    # Bootstrap options
    'n_bootstrap': 1000,
}

def get_paths(target):
    """Return the output directory layout for one target.

    Args:
        target: Name of the target; used as a sub-directory name under
            each output location.

    Returns:
        dict mapping 'processed_dir', 'models_dir', 'results_dir',
        'tables_dir' and 'comparison_dir' to paths below PROJECT_ROOT.
        The directories are not created here.
    """
    # Tables and comparison figures both live under the target's results folder.
    results_root = os.path.join(PROJECT_ROOT, 'results', target)
    return {
        'processed_dir': os.path.join(PROJECT_ROOT, 'data', 'processed', target),
        'models_dir': os.path.join(PROJECT_ROOT, 'models', target),
        'results_dir': results_root,
        'tables_dir': os.path.join(results_root, 'tables'),
        'comparison_dir': os.path.join(results_root, 'comparison'),
    }

# Echo the key settings so the run is self-describing in the notebook output.
print(
    "Configuration:",
    f"  Targets: {CONFIG['targets']}",
    f"  Models: {CONFIG['models']}",
    f"  Small grid: {CONFIG['small_grid']}",
    f"  Bootstrap: {CONFIG['n_bootstrap']}",
    sep="\n",
)

## 1. Generate Dummy Data (Optional)

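Skip this step if you have real data: set `GENERATE_DUMMY_DATA = False` in the cell below and point `CONFIG['data_path']` at your own CSV file.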

In [None]:
GENERATE_DUMMY_DATA = True  # Set False to skip

# Optionally create a synthetic dataset and write it to CONFIG['data_path'].
if GENERATE_DUMMY_DATA:
    from make_dummy import generate_dummy_data

    df = generate_dummy_data(n_samples=10000)

    # The destination folder may not exist yet (e.g. on a fresh checkout);
    # create it first so that writing the CSV does not fail.
    data_dir = os.path.dirname(CONFIG['data_path'])
    if data_dir:
        os.makedirs(data_dir, exist_ok=True)
    df.to_csv(CONFIG['data_path'], index=False)

    print(f"Dummy data saved: {CONFIG['data_path']}")
    print(f"  Samples: {len(df):,}")
    print(f"  Features: {len(df.columns)}")
    print(f"\n  Target distribution:")
    # Mean of each target column, printed as that target's distribution.
    for target in CONFIG['targets']:
        print(f"  - {target}: {df[target].mean():.3f}")
else:
    print("Skipping dummy data generation")

---
## 2. Run Pipeline for Each Target

The following cells will run the complete pipeline for each target variable.

In [None]:
# Import all modules
# NOTE(review): these are project-local modules, presumably resolved from the
# code/ directory that the environment-setup cell puts on sys.path -- run that
# cell first.
from create_table1 import create_all_tables  # Step 2: baseline tables
from preprocessing import preprocess_and_save  # Step 3: preprocessing
from train_gridsearch import ModelTrainer  # Step 4: training
from evaluate import evaluate_model  # Step 5: evaluation
from create_performance_table import create_performance_table  # Step 6
from create_comparison_figures import create_comparison_figures  # Step 7

print("All modules imported successfully")

In [None]:
# Run the complete pipeline (Steps 2-7) once per target and collect the
# outcome of every target in `all_results`, keyed by target name.
# Each step reports its own errors; only a preprocessing failure aborts the
# remaining steps for that target.
all_results = {}

for target in CONFIG['targets']:
    print(f"\n{'='*70}")
    print(f"🎯 TARGET: {target}")
    print(f"{'='*70}")

    # Get paths for this target
    paths = get_paths(target)

    # Create directories
    for dir_path in paths.values():
        os.makedirs(dir_path, exist_ok=True)

    target_results = {
        'paths': paths,
        'training': {},
        'evaluation': {},
    }

    # =========================================================================
    # Step 2: Table 1
    # =========================================================================
    print(f"\n--- [{target}] Step 2: Baseline Characteristics ---")
    try:
        tables = create_all_tables(
            data_path=CONFIG['data_path'],
            output_dir=paths['tables_dir'],
            target_col=target
        )
        print(f"  Table 1 saved to: {paths['tables_dir']}")
    except Exception as e:
        # Best effort: a failed Table 1 does not stop the modelling steps.
        print(f"  Error: {e}")

    # =========================================================================
    # Step 3: Preprocessing
    # =========================================================================
    print(f"\n--- [{target}] Step 3: Preprocessing ---")
    try:
        result = preprocess_and_save(
            data_path=CONFIG['data_path'],
            output_dir=paths['processed_dir'],
            target_col=target,
            add_missing_indicator=CONFIG['add_missing_indicator'],
            missing_threshold=CONFIG['missing_threshold']
        )
        print(f"  Preprocessing complete: {paths['processed_dir']}")
    except Exception as e:
        print(f"  Error: {e}")
        # Every later step needs the preprocessed arrays, so skip this
        # target; it will then be absent from all_results.
        continue

    # Load preprocessed data
    X_train = np.load(os.path.join(paths['processed_dir'], 'X_train.npy'))
    X_test = np.load(os.path.join(paths['processed_dir'], 'X_test.npy'))
    y_train = np.load(os.path.join(paths['processed_dir'], 'y_train.npy'))
    y_test = np.load(os.path.join(paths['processed_dir'], 'y_test.npy'))

    # One feature name per line.
    with open(os.path.join(paths['processed_dir'], 'feature_names.txt'), 'r') as f:
        feature_names = [line.strip() for line in f.readlines()]

    print(f"  Train: {X_train.shape[0]:,} samples, {X_train.shape[1]} features")
    print(f"  Test:  {X_test.shape[0]:,} samples")
    print(f"  Positive rate - Train: {y_train.mean():.3f}, Test: {y_test.mean():.3f}")

    # =========================================================================
    # Step 4: Model Training
    # =========================================================================
    print(f"\n--- [{target}] Step 4: Model Training ---")

    trainer = ModelTrainer(
        cv=CONFIG['cv_folds'],
        scoring=CONFIG['scoring'],
        use_small_grid=CONFIG['small_grid']
    )

    for model_name in CONFIG['models']:
        print(f"\n  Training {model_name}...")
        try:
            best_model, best_params = trainer.train_model(model_name, X_train, y_train)
            eval_result = trainer.evaluate_model(model_name, X_test, y_test)
            target_results['training'][model_name] = eval_result
            print(f"    AUROC: {eval_result['auroc']:.4f}, AUPRC: {eval_result['auprc']:.4f}")
        except Exception as e:
            # A failing model is reported and skipped; the others still train.
            print(f"    Error: {e}")

    # Save models
    saved_paths = trainer.save_all_models(paths['models_dir'], feature_names)
    print(f"\n  Models saved: {list(saved_paths.keys())}")

    # =========================================================================
    # Step 5: Model Evaluation & SHAP
    # =========================================================================
    print(f"\n--- [{target}] Step 5: Model Evaluation & SHAP ---")

    # glob returns matches in arbitrary order; sort them so that the
    # evaluation order is reproducible between runs.
    model_files = sorted(glob.glob(os.path.join(paths['models_dir'], '*_best_model.*')))

    for model_file in model_files:
        # '<name>_best_model.<ext>' -> '<name>'
        model_name = os.path.basename(model_file).replace('_best_model', '').split('.')[0]
        print(f"\n  Evaluating {model_name}...")
        try:
            result = evaluate_model(
                model_path=model_file,
                data_dir=paths['processed_dir'],
                output_dir=paths['results_dir'],
                model_name=model_name
            )
            target_results['evaluation'][model_name] = result
        except Exception as e:
            print(f"    Error: {e}")

    # =========================================================================
    # Step 6: Performance Table
    # =========================================================================
    print(f"\n--- [{target}] Step 6: Performance Table ---")
    try:
        # Sorted for the same reason as in Step 5: a stable model order.
        model_paths = sorted(glob.glob(os.path.join(paths['models_dir'], '*_best_model.*')))
        performance_table = create_performance_table(
            model_paths=model_paths,
            data_dir=paths['processed_dir'],
            n_bootstrap=CONFIG['n_bootstrap'],
            output_path=os.path.join(paths['tables_dir'], 'model_performance.xlsx')
        )
        target_results['performance_table'] = performance_table
        print(f"  Performance table saved: {paths['tables_dir']}/model_performance.xlsx")
    except Exception as e:
        print(f"  Error: {e}")

    # =========================================================================
    # Step 7: Comparison Figures
    # =========================================================================
    print(f"\n--- [{target}] Step 7: Comparison Figures ---")
    try:
        create_comparison_figures(
            models_dir=paths['models_dir'],
            data_dir=paths['processed_dir'],
            output_dir=paths['comparison_dir']
        )
        print(f"  Comparison figures saved: {paths['comparison_dir']}")
    except Exception as e:
        print(f"  Error: {e}")

    # Store results
    all_results[target] = target_results
    print(f"\n✅ Target {target} complete!")

print(f"\n{'='*70}")
print(f"✅ All targets complete!")
print(f"{'='*70}")

## 3. Results Summary

In [None]:
# Summary of training results: one table per target, listing each model's
# AUROC and AUPRC, best AUROC first.
print("Training Results Summary")
print("=" * 70)

for target in CONFIG['targets']:
    # Targets that did not finish the pipeline have no entry in all_results.
    if target not in all_results:
        continue

    print(f"\n🎯 Target: {target}")
    print("-" * 50)
    print(f"{'Model':<20} {'AUROC':>10} {'AUPRC':>10}")
    print("-" * 50)

    training_results = all_results[target].get('training', {})
    # Missing metrics are treated as 0, both for sorting and for printing.
    for model_name, result in sorted(training_results.items(),
                                      key=lambda x: x[1].get('auroc', 0),
                                      reverse=True):
        auroc = result.get('auroc', 0)
        auprc = result.get('auprc', 0)
        print(f"{model_name:<20} {auroc:>10.4f} {auprc:>10.4f}")

print("\n" + "=" * 70)

In [None]:
# Display performance tables
# Only targets that produced a performance table are shown.
# `display` is not imported in this cell; it relies on the notebook
# environment providing it.
for target in CONFIG['targets']:
    if target in all_results and 'performance_table' in all_results[target]:
        print(f"\n🎯 Performance Table: {target}")
        print("=" * 100)
        display(all_results[target]['performance_table'])

In [None]:
# Display comparison figures
# For every completed target, show the saved comparison images if they exist.
from IPython.display import Image, display

for target in CONFIG['targets']:
    # Targets that did not finish the pipeline have no entry in all_results.
    if target not in all_results:
        continue

    paths = all_results[target]['paths']

    print(f"\n🎯 Comparison Figures: {target}")
    print("=" * 70)

    # Combined figure
    combined_fig = os.path.join(paths['comparison_dir'], 'comparison_combined.png')
    if os.path.exists(combined_fig):
        print(f"\nROC, PR, Calibration:")
        display(Image(filename=combined_fig, width=1200))

    # SHAP comparison
    shap_fig = os.path.join(paths['comparison_dir'], 'comparison_shap.png')
    if os.path.exists(shap_fig):
        print(f"\nSHAP Comparison:")
        display(Image(filename=shap_fig, width=1200))

## 4. Output Files Summary

In [None]:
# Final report: list where the raw data and each target's outputs live.
print("="*70, "Pipeline Complete!", "="*70, sep="\n")

print("\nOutput Files:")
print(f"\n  [Data]")
print(f"    - Raw: {CONFIG['data_path']}")

for target in CONFIG['targets']:
    # Only targets recorded in all_results are listed.
    if target not in all_results:
        continue

    paths = all_results[target]['paths']
    print(
        f"\n  [{target}]",
        f"    - Processed: {paths['processed_dir']}",
        f"    - Models: {paths['models_dir']}",
        f"    - Results: {paths['results_dir']}",
        f"    - Tables: {paths['tables_dir']}",
        f"    - Comparison: {paths['comparison_dir']}",
        sep="\n",
    )

print("\n" + "="*70)