# Supervised Classification & Model Training (Reference Exploration Notebook)

This notebook demonstrates supervised classification model training on balanced data.

**Note**: For final results, use the `from_scratch.py` scripts instead:
- `decision_tree_from_scratch.py`
- `random_forest_from_scratch.py`
- `knn_from_scratch.py`

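This notebook uses a pre-processed, class-rebalanced dataset (`engineered_features_tomek_enn_balanced.csv`) and is for interactive exploration only. The rebalancing is only partial: the recorded output of the loading cell below shows roughly 23% fire vs. 77% no-fire samples.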

In [7]:
# Load Pre-processed Data (already balanced)
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, precision_recall_fscore_support

# Location of the pre-processed feature table used by every cell below.
# NOTE(review): absolute, machine-specific path; it has to be adjusted when the
# notebook is run on another machine.
DATA_PATH = Path('f:/DATA/DATA_CLEANED/processed/engineered_features_tomek_enn_balanced.csv')

# Read the table and split it into a feature matrix and a label vector.
df = pd.read_csv(DATA_PATH)
X = df.drop(columns=['fire']).values  # every column except the label, as a NumPy array
y = df['fire'].values                 # label column; 1 is reported as "Fire", 0 as "No-fire"

# Sanity summary: sample count, feature count and class counts/percentages.
# The check mark below replaces a mis-encoded (mojibake) sequence in the original string.
print(f"✓ Loaded balanced dataset: {len(df)} samples")
print(f"  Features: {X.shape[1]}")
# bincount needs integer input, hence the cast of the label vector.
print(f"  Class distribution: {np.bincount(y.astype(int))}")
print(f"  Fire: {(y==1).sum()} ({100*(y==1).mean():.1f}%), No-fire: {(y==0).sum()} ({100*(y==0).mean():.1f}%)")

✓ Loaded balanced dataset: 61524 samples
  Features: 22
  Class distribution: [47314 14210]
  Fire: 14210 (23.1%), No-fire: 47314 (76.9%)


## SECTION 3: Train Models with Cross-Validation

In [8]:
# SECTION 3: Setup Cross-Validation (run once)
# Builds the fold splitter shared by all model cells and makes sure the
# dictionary that collects their results exists.
print("="*80)
print("SETTING UP CROSS-VALIDATION")
print("="*80)

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc, precision_score, recall_score, f1_score

# Five stratified, shuffled folds. The fixed integer seed means each model cell
# that calls cv.split(X, y) iterates over the same train/validation partitions.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Create the results dictionary only when it does not exist yet, so re-running
# this cell does not wipe results already stored by the model cells.
# NOTE(review): the same guard also keeps entries left over from earlier runs in
# the current kernel session; restart the kernel (or delete `sklearn_results`)
# to be sure that only freshly computed results get exported.
if 'sklearn_results' not in globals():
    sklearn_results = {}

# The check marks below replace mis-encoded (mojibake) sequences in the original strings.
print("✓ Cross-validation setup complete")
print(f"✓ Results dictionary initialized: {len(sklearn_results)} models loaded so far")


SETTING UP CROSS-VALIDATION
✓ Cross-validation setup complete
✓ Results dictionary initialized: 2 models loaded so far


### Decision Tree with Cross-Validation

In [9]:
# Train Decision Tree with Cross-Validation
# For every fold of `cv`: fit the tree on the training split, score the
# validation split (ROC-AUC, PR-AUC), pick the F1-maximising probability
# threshold, and record precision/recall/F1 at that threshold. Per-fold and
# aggregated metrics are stored in sklearn_results['DecisionTree'].
print("\n" + "="*80)
print("TRAINING: DECISION TREE")
print("="*80)

model_name = 'DecisionTree'
model = DecisionTreeClassifier(max_depth=15, min_samples_leaf=5, random_state=42)

print(f"\n{model_name}:")
metrics_per_fold = []

for fold_num, (train_idx, val_idx) in enumerate(cv.split(X, y), start=1):
    X_train_fold, y_train_fold = X[train_idx], y[train_idx]
    X_val_fold, y_val_fold = X[val_idx], y[val_idx]

    # The same estimator object is refit on each fold's training split.
    model.fit(X_train_fold, y_train_fold)

    # Probability of the second class in model.classes_ (label 1 for 0/1 targets).
    # Hard predictions from model.predict() were never used, so that call is gone.
    y_proba = model.predict_proba(X_val_fold)[:, 1]

    # Threshold-free ranking metrics.
    roc_auc = roc_auc_score(y_val_fold, y_proba)
    precision, recall, pr_thresholds = precision_recall_curve(y_val_fold, y_proba)
    pr_auc = auc(recall, precision)

    # Pick the threshold with the highest F1 along the PR curve. The epsilon
    # guards against 0/0. precision/recall hold one more entry than
    # pr_thresholds, so an argmax on that final point has no matching
    # threshold and falls back to 0.5.
    # NOTE(review): the threshold is chosen with this validation fold's own
    # labels and then scored on the same fold, so the precision/recall/F1
    # below are optimistically biased. Kept as-is so the numbers stay
    # comparable with previously exported results.
    f1_scores = 2 * (precision * recall) / (precision + recall + 1e-10)
    best_threshold_idx = np.argmax(f1_scores)
    best_threshold = pr_thresholds[best_threshold_idx] if best_threshold_idx < len(pr_thresholds) else 0.5

    y_pred_tuned = (y_proba >= best_threshold).astype(int)

    metrics = {
        'roc_auc': roc_auc,
        'pr_auc': pr_auc,
        'precision': precision_score(y_val_fold, y_pred_tuned, zero_division=0),
        'recall': recall_score(y_val_fold, y_pred_tuned, zero_division=0),
        'f1': f1_score(y_val_fold, y_pred_tuned, zero_division=0),
        'threshold': best_threshold
    }

    metrics_per_fold.append(metrics)
    print(f"  Fold {fold_num}: ROC-AUC={roc_auc:.4f}, F1={metrics['f1']:.4f}, Threshold={best_threshold:.4f}")

# Mean and (population, ddof=0) standard deviation of every metric across folds.
# Keys are inserted as <metric>_mean, <metric>_std in the order listed here.
agg_metrics = {}
for metric_key in ('roc_auc', 'pr_auc', 'precision', 'recall', 'f1', 'threshold'):
    fold_values = [m[metric_key] for m in metrics_per_fold]
    agg_metrics[f'{metric_key}_mean'] = np.mean(fold_values)
    agg_metrics[f'{metric_key}_std'] = np.std(fold_values)

# Store configuration and metrics under the model's name for the export cell.
sklearn_results[model_name] = {
    'model_config': {
        'model': model_name,
        'source': 'sklearn',
        'params': model.get_params()
    },
    'metrics_per_fold': metrics_per_fold,
    'aggregated': agg_metrics
}

# The ± and check mark below replace mis-encoded (mojibake) sequences in the original strings.
print(f"  Mean ROC-AUC: {agg_metrics['roc_auc_mean']:.4f} ± {agg_metrics['roc_auc_std']:.4f}")
print(f"  Mean F1: {agg_metrics['f1_mean']:.4f} ± {agg_metrics['f1_std']:.4f}")
print(f"\n✓ {model_name} training complete!")


TRAINING: DECISION TREE

DecisionTree:
  Fold 1: ROC-AUC=0.8841, F1=0.7618, Threshold=0.5000
  Fold 2: ROC-AUC=0.8945, F1=0.7637, Threshold=0.4375
  Fold 3: ROC-AUC=0.8805, F1=0.7587, Threshold=0.6154
  Fold 4: ROC-AUC=0.8907, F1=0.7656, Threshold=0.5000
  Fold 5: ROC-AUC=0.8793, F1=0.7615, Threshold=0.5556
  Mean ROC-AUC: 0.8858 ± 0.0059
  Mean F1: 0.7622 ± 0.0023

✓ DecisionTree training complete!


In [10]:
# Train Random Forest with Cross-Validation
# For every fold of `cv`: fit the forest on the training split, score the
# validation split (ROC-AUC, PR-AUC), pick the F1-maximising probability
# threshold, and record precision/recall/F1 at that threshold. Per-fold and
# aggregated metrics are stored in sklearn_results['RandomForest'].
print("\n" + "="*80)
print("TRAINING: RANDOM FOREST")
print("="*80)

model_name = 'RandomForest'
model = RandomForestClassifier(n_estimators=100, max_depth=15, min_samples_leaf=5, random_state=42, n_jobs=-1)

print(f"\n{model_name}:")
metrics_per_fold = []

for fold_num, (train_idx, val_idx) in enumerate(cv.split(X, y), start=1):
    X_train_fold, y_train_fold = X[train_idx], y[train_idx]
    X_val_fold, y_val_fold = X[val_idx], y[val_idx]

    # The same estimator object is refit on each fold's training split.
    model.fit(X_train_fold, y_train_fold)

    # Probability of the second class in model.classes_ (label 1 for 0/1 targets).
    # Hard predictions from model.predict() were never used, so that call is gone.
    y_proba = model.predict_proba(X_val_fold)[:, 1]

    # Threshold-free ranking metrics.
    roc_auc = roc_auc_score(y_val_fold, y_proba)
    precision, recall, pr_thresholds = precision_recall_curve(y_val_fold, y_proba)
    pr_auc = auc(recall, precision)

    # Pick the threshold with the highest F1 along the PR curve. The epsilon
    # guards against 0/0. precision/recall hold one more entry than
    # pr_thresholds, so an argmax on that final point has no matching
    # threshold and falls back to 0.5.
    # NOTE(review): the threshold is chosen with this validation fold's own
    # labels and then scored on the same fold, so the precision/recall/F1
    # below are optimistically biased. Kept as-is so the numbers stay
    # comparable with previously exported results.
    f1_scores = 2 * (precision * recall) / (precision + recall + 1e-10)
    best_threshold_idx = np.argmax(f1_scores)
    best_threshold = pr_thresholds[best_threshold_idx] if best_threshold_idx < len(pr_thresholds) else 0.5

    y_pred_tuned = (y_proba >= best_threshold).astype(int)

    metrics = {
        'roc_auc': roc_auc,
        'pr_auc': pr_auc,
        'precision': precision_score(y_val_fold, y_pred_tuned, zero_division=0),
        'recall': recall_score(y_val_fold, y_pred_tuned, zero_division=0),
        'f1': f1_score(y_val_fold, y_pred_tuned, zero_division=0),
        'threshold': best_threshold
    }

    metrics_per_fold.append(metrics)
    print(f"  Fold {fold_num}: ROC-AUC={roc_auc:.4f}, F1={metrics['f1']:.4f}, Threshold={best_threshold:.4f}")

# Mean and (population, ddof=0) standard deviation of every metric across folds.
# Keys are inserted as <metric>_mean, <metric>_std in the order listed here.
agg_metrics = {}
for metric_key in ('roc_auc', 'pr_auc', 'precision', 'recall', 'f1', 'threshold'):
    fold_values = [m[metric_key] for m in metrics_per_fold]
    agg_metrics[f'{metric_key}_mean'] = np.mean(fold_values)
    agg_metrics[f'{metric_key}_std'] = np.std(fold_values)

# Store configuration and metrics under the model's name for the export cell.
sklearn_results[model_name] = {
    'model_config': {
        'model': model_name,
        'source': 'sklearn',
        'params': model.get_params()
    },
    'metrics_per_fold': metrics_per_fold,
    'aggregated': agg_metrics
}

# The ± and check mark below replace mis-encoded (mojibake) sequences in the original strings.
print(f"  Mean ROC-AUC: {agg_metrics['roc_auc_mean']:.4f} ± {agg_metrics['roc_auc_std']:.4f}")
print(f"  Mean F1: {agg_metrics['f1_mean']:.4f} ± {agg_metrics['f1_std']:.4f}")
print(f"\n✓ {model_name} training complete!")


TRAINING: RANDOM FOREST

RandomForest:
  Fold 1: ROC-AUC=0.9025, F1=0.8089, Threshold=0.3603
  Fold 2: ROC-AUC=0.9095, F1=0.8144, Threshold=0.3353
  Fold 3: ROC-AUC=0.8957, F1=0.8046, Threshold=0.3442
  Fold 4: ROC-AUC=0.9118, F1=0.8208, Threshold=0.3540
  Fold 5: ROC-AUC=0.8986, F1=0.8124, Threshold=0.4136
  Mean ROC-AUC: 0.9036 ± 0.0062
  Mean F1: 0.8122 ± 0.0054

✓ RandomForest training complete!


In [11]:
# Train KNN with Cross-Validation
# For every fold of `cv`: fit k-nearest-neighbours on the training split, score
# the validation split (ROC-AUC, PR-AUC), pick the F1-maximising probability
# threshold, and record precision/recall/F1 at that threshold. Per-fold and
# aggregated metrics are stored in sklearn_results['KNN'].
print("\n" + "="*80)
print("TRAINING: KNN")
print("="*80)

model_name = 'KNN'
# NOTE(review): X is passed to the distance-based model exactly as loaded;
# no scaling happens in this notebook. Confirm the features were standardised
# upstream, otherwise large-range columns dominate the Manhattan distance.
model = KNeighborsClassifier(n_neighbors=11, metric='manhattan', n_jobs=-1)

print(f"\n{model_name}:")
metrics_per_fold = []

for fold_num, (train_idx, val_idx) in enumerate(cv.split(X, y), start=1):
    X_train_fold, y_train_fold = X[train_idx], y[train_idx]
    X_val_fold, y_val_fold = X[val_idx], y[val_idx]

    # The same estimator object is refit on each fold's training split.
    model.fit(X_train_fold, y_train_fold)

    # Probability of the second class in model.classes_ (label 1 for 0/1 targets).
    # Hard predictions from model.predict() were never used; dropping that call
    # avoids a second neighbour search over the validation split.
    y_proba = model.predict_proba(X_val_fold)[:, 1]

    # Threshold-free ranking metrics.
    roc_auc = roc_auc_score(y_val_fold, y_proba)
    precision, recall, pr_thresholds = precision_recall_curve(y_val_fold, y_proba)
    pr_auc = auc(recall, precision)

    # Pick the threshold with the highest F1 along the PR curve. The epsilon
    # guards against 0/0. precision/recall hold one more entry than
    # pr_thresholds, so an argmax on that final point has no matching
    # threshold and falls back to 0.5.
    # NOTE(review): the threshold is chosen with this validation fold's own
    # labels and then scored on the same fold, so the precision/recall/F1
    # below are optimistically biased. Kept as-is so the numbers stay
    # comparable with previously exported results.
    f1_scores = 2 * (precision * recall) / (precision + recall + 1e-10)
    best_threshold_idx = np.argmax(f1_scores)
    best_threshold = pr_thresholds[best_threshold_idx] if best_threshold_idx < len(pr_thresholds) else 0.5

    y_pred_tuned = (y_proba >= best_threshold).astype(int)

    metrics = {
        'roc_auc': roc_auc,
        'pr_auc': pr_auc,
        'precision': precision_score(y_val_fold, y_pred_tuned, zero_division=0),
        'recall': recall_score(y_val_fold, y_pred_tuned, zero_division=0),
        'f1': f1_score(y_val_fold, y_pred_tuned, zero_division=0),
        'threshold': best_threshold
    }

    metrics_per_fold.append(metrics)
    print(f"  Fold {fold_num}: ROC-AUC={roc_auc:.4f}, F1={metrics['f1']:.4f}, Threshold={best_threshold:.4f}")

# Mean and (population, ddof=0) standard deviation of every metric across folds.
# Keys are inserted as <metric>_mean, <metric>_std in the order listed here.
agg_metrics = {}
for metric_key in ('roc_auc', 'pr_auc', 'precision', 'recall', 'f1', 'threshold'):
    fold_values = [m[metric_key] for m in metrics_per_fold]
    agg_metrics[f'{metric_key}_mean'] = np.mean(fold_values)
    agg_metrics[f'{metric_key}_std'] = np.std(fold_values)

# Store configuration and metrics under the model's name for the export cell.
sklearn_results[model_name] = {
    'model_config': {
        'model': model_name,
        'source': 'sklearn',
        'params': model.get_params()
    },
    'metrics_per_fold': metrics_per_fold,
    'aggregated': agg_metrics
}

# The ± and check mark below replace mis-encoded (mojibake) sequences in the original strings.
print(f"  Mean ROC-AUC: {agg_metrics['roc_auc_mean']:.4f} ± {agg_metrics['roc_auc_std']:.4f}")
print(f"  Mean F1: {agg_metrics['f1_mean']:.4f} ± {agg_metrics['f1_std']:.4f}")
print(f"\n✓ {model_name} training complete!")


TRAINING: KNN

KNN:
  Fold 1: ROC-AUC=0.8934, F1=0.7737, Threshold=0.6364
  Fold 2: ROC-AUC=0.9038, F1=0.7936, Threshold=0.6364
  Fold 3: ROC-AUC=0.8914, F1=0.7823, Threshold=0.6364
  Fold 4: ROC-AUC=0.9053, F1=0.7960, Threshold=0.6364
  Fold 5: ROC-AUC=0.8933, F1=0.7849, Threshold=0.6364
  Mean ROC-AUC: 0.8974 ± 0.0059
  Mean F1: 0.7861 ± 0.0080

✓ KNN training complete!


## SECTION 3D: Train KNN (Run independently)

## SECTION 3C: Train Random Forest (Run independently)

## SECTION 3B: Train Decision Tree (Run independently)

In [12]:
import json
from pathlib import Path

# Write one JSON summary per entry of `sklearn_results` and print its headline
# metrics. Nothing is written when the dictionary is missing or empty.
print("\n" + "="*80)
print("EXPORTING SKLEARN RESULTS FOR COMPARISON WITH FROM-SCRATCH")
print("="*80)

if 'sklearn_results' in globals() and sklearn_results:
    # NOTE(review): absolute, machine-specific output directory.
    output_dir = Path('f:/DATA/results/supervised')
    output_dir.mkdir(parents=True, exist_ok=True)

    # Non-ASCII symbols in the messages below replace mis-encoded (mojibake)
    # sequences in the original strings.
    print("\n📊 Exporting sklearn model results...")

    exported_count = 0
    # One file per model: <lower-cased model name>_sklearn_summary.json
    # (intended to match the from-scratch summary format; not verifiable from here).
    # NOTE: this loop rebinds the notebook-level name `model_name`.
    for exported_count, (model_name, results) in enumerate(sklearn_results.items(), start=1):
        filename = output_dir / f'{model_name.lower()}_sklearn_summary.json'

        # Explicit encoding so the file does not depend on the platform default.
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2)

        agg = results['aggregated']
        print(f"\n✓ {model_name} exported")
        print(f"  ROC-AUC: {agg['roc_auc_mean']:.4f} ± {agg['roc_auc_std']:.4f}")
        print(f"  PR-AUC:  {agg['pr_auc_mean']:.4f} ± {agg['pr_auc_std']:.4f}")
        print(f"  F1:      {agg['f1_mean']:.4f} ± {agg['f1_std']:.4f}")
        print(f"  → {filename.name}")

    print(f"\n✓ All {exported_count} sklearn results exported to: {output_dir}")
    print("\n✅ Ready for comparison with from-scratch implementations!")
else:
    print("⚠ sklearn_results not found or empty.")
    print("   Make sure to run all 3 model training cells first:")
    # The original message stopped after the colon; name the cells it refers to.
    print("   DecisionTree, RandomForest, KNN")


EXPORTING SKLEARN RESULTS FOR COMPARISON WITH FROM-SCRATCH

📊 Exporting sklearn model results...

✓ DecisionTree exported
  ROC-AUC: 0.8858 ± 0.0059
  PR-AUC:  0.8315 ± 0.0064
  F1:      0.7622 ± 0.0023
  → decisiontree_sklearn_summary.json

✓ RandomForest exported
  ROC-AUC: 0.9036 ± 0.0062
  PR-AUC:  0.8699 ± 0.0065
  F1:      0.8122 ± 0.0054
  → randomforest_sklearn_summary.json

✓ KNN exported
  ROC-AUC: 0.8974 ± 0.0059
  PR-AUC:  0.8567 ± 0.0071
  F1:      0.7861 ± 0.0080
  → knn_sklearn_summary.json

✓ All 3 sklearn results exported to: f:\DATA\results\supervised

✅ Ready for comparison with from-scratch implementations!


## SECTION 4: Export Sklearn Results