# YOLO Model Testing and Analysis Notebook
This notebook tests the latest trained YOLO models on the test dataset with comprehensive confidence score analysis, prediction filtering, and detailed visualizations.

## 1. Import Required Libraries

In [1]:
import os

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from ultralytics import YOLO
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.calibration import calibration_curve
import warnings
warnings.filterwarnings('ignore')

# Set random seeds for reproducibility
np.random.seed(42)

# Set up matplotlib defaults.
# The seaborn style sheets are named "seaborn-v0_8-*" from matplotlib 3.6 on;
# older releases only know the unprefixed name, so fall back to it instead of
# aborting the whole notebook in the first cell.
try:
    plt.style.use('seaborn-v0_8-darkgrid')
except OSError:
    plt.style.use('seaborn-darkgrid')
sns.set_palette("husl")

print("Libraries imported successfully!")

: 

## 2. Load Trained Model and Test Dataset

In [None]:
# Configuration: location of the training runs and of the test split listing
MODELS_DIR = Path("DeTect-BMMS") / "runs"
DATASET_PATH = Path("..") / "dataset" / "csvs" / "splits"
TEST_FILE = DATASET_PATH.joinpath("test.txt")

# Class names from DeTect dataset, keyed by the integer class id (0..6)
CLASS_NAMES = dict(enumerate((
    'bat',
    'bird',
    'insect',
    'drone',
    'plane',
    'other',
    'unknown',
)))

# Find the latest trained model
def find_latest_model(models_dir):
    """Return the most recently modified ``best.pt`` checkpoint.

    Searches ``models_dir`` for files matching ``*/weights/best.pt`` and picks
    the one with the largest modification time. When nothing matches, a notice
    is printed and ``None`` is returned.
    """
    checkpoints = [ckpt for ckpt in models_dir.glob("*/weights/best.pt")]
    if checkpoints:
        # Newest file wins, judged by filesystem mtime
        return max(checkpoints, key=lambda ckpt: ckpt.stat().st_mtime)

    print("No trained models found!")
    return None

latest_model_path = find_latest_model(MODELS_DIR)
if latest_model_path is None:
    # Stop here with an actionable message; otherwise the string "None" would
    # be handed to YOLO() and the failure would surface far from its cause.
    raise FileNotFoundError(
        f"No '*/weights/best.pt' checkpoint found under {MODELS_DIR.resolve()}"
    )
print(f"Latest model found: {latest_model_path}")

# Load the model
model = YOLO(str(latest_model_path))
print("Model loaded successfully!")

# Load test image paths (one path per line, blank lines ignored)
with open(TEST_FILE, 'r') as f:
    test_images = [Path(line.strip()) for line in f if line.strip()]

print(f"Test dataset loaded: {len(test_images)} images")

## 3. Generate Predictions on Test Data

In [None]:
def load_annotations(image_path):
    """Read the YOLO-format label file that sits next to an image.

    The label file is ``image_path`` with its suffix replaced by ``.txt``.
    Every line with at least five whitespace-separated fields becomes a dict
    with ``class_id``, ``class_name``, ``x_center``, ``y_center``, ``width``
    and ``height``; shorter lines are skipped.

    Returns the list of dicts, or ``None`` when the label file is missing or
    contains no usable line.
    """
    label_file = image_path.with_suffix('.txt')
    if not label_file.exists():
        return None

    boxes = []
    with open(label_file, 'r') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split()
            if len(fields) < 5:
                continue
            class_id = int(fields[0])
            x_center, y_center, width, height = (float(value) for value in fields[1:5])
            boxes.append({
                'class_id': class_id,
                'class_name': CLASS_NAMES.get(class_id, 'unknown'),
                'x_center': x_center,
                'y_center': y_center,
                'width': width,
                'height': height
            })

    return boxes or None

# Run inference on test dataset
predictions_data = []
total_images = len(test_images)

print(f"Running inference on {total_images} test images...")

for position, img_path in enumerate(test_images, start=1):
    # Progress heartbeat every 100 images
    if position % 100 == 0:
        print(f"Processed {position}/{total_images} images")

    # Run prediction and keep the first result object
    result = model.predict(source=str(img_path), conf=0.1, verbose=False)[0]

    # Load ground truth annotations (None when the image has no label file)
    gt_annotations = load_annotations(img_path)

    shared_fields = {'image_path': str(img_path), 'image_name': img_path.name}
    detections = result.boxes

    if detections is None or len(detections) == 0:
        # Image with no detections: record a single placeholder row
        predictions_data.append({
            **shared_fields,
            'pred_class_id': -1,
            'pred_class_name': 'no_detection',
            'confidence': 0.0,
            'gt_annotations': gt_annotations,
        })
        continue

    # One row per detected box
    for box in detections:
        pred_class_id = int(box.cls)
        predictions_data.append({
            **shared_fields,
            'pred_class_id': pred_class_id,
            'pred_class_name': CLASS_NAMES.get(pred_class_id, 'unknown'),
            'confidence': float(box.conf),
            'gt_annotations': gt_annotations,
        })

# Create DataFrame
predictions_df = pd.DataFrame(predictions_data)
print(f"\nInference complete! Generated {len(predictions_df)} predictions")
print(f"DataFrame shape: {predictions_df.shape}")
predictions_df.head(10)

## 4. Calculate Confidence Scores and Extract Ground Truth

In [None]:
# Extract ground truth class (take the first annotation if multiple exist)
def extract_ground_truth(annotations):
    """Return ``(class_id, class_name)`` of the first annotation.

    ``annotations`` is a list of annotation dicts or ``None``. When it is
    ``None`` or empty, the sentinel pair ``(-1, 'no_annotation')`` is returned.
    """
    if annotations is not None and len(annotations) > 0:
        first = annotations[0]
        return first['class_id'], first['class_name']
    return -1, 'no_annotation'

# Resolve the ground-truth class once per row. Building a pd.Series inside
# apply() allocates one Series per prediction; collecting plain tuples and
# assigning the two columns from lists yields the same columns far cheaper.
gt_pairs = [extract_ground_truth(annotations) for annotations in predictions_df['gt_annotations']]
predictions_df['gt_class_id'] = [class_id for class_id, _ in gt_pairs]
predictions_df['gt_class_name'] = [class_name for _, class_name in gt_pairs]

# Determine if prediction is correct: classes must match AND a ground truth
# must exist (gt_class_id is -1 when the image has no annotation).
predictions_df['is_correct'] = (predictions_df['pred_class_id'] == predictions_df['gt_class_id']) & \
                                (predictions_df['gt_class_id'] >= 0)

# Confidence ranges
def assign_confidence_range(conf):
    """Map a confidence score to its range label.

    Labels are ``'<0.5'``, ``'0.5-0.7'``, ``'0.7-0.9'`` and ``'>0.9'``; each
    lower bound is inclusive, so 0.9 itself falls in ``'>0.9'``.
    """
    upper_bounds = ((0.5, '<0.5'), (0.7, '0.5-0.7'), (0.9, '0.7-0.9'))
    for upper, label in upper_bounds:
        if conf < upper:
            return label
    return '>0.9'

predictions_df['confidence_range'] = predictions_df['confidence'].map(assign_confidence_range)

# Statistics
banner = "=" * 60
outcomes = predictions_df['is_correct']
scores = predictions_df['confidence']

print(banner)
print("PREDICTION STATISTICS")
print(banner)
print(f"\nTotal predictions: {len(predictions_df)}")
print(f"Correct predictions: {outcomes.sum()}")
print(f"Incorrect predictions: {(~outcomes).sum()}")
print(f"Accuracy: {outcomes.mean():.4f}")

print("\nConfidence score statistics:")
confidence_summary = (
    ("Mean", scores.mean()),
    ("Median", scores.median()),
    ("Min", scores.min()),
    ("Max", scores.max()),
    ("Std", scores.std()),
)
for stat_label, stat_value in confidence_summary:
    print(f"  {stat_label}: {stat_value:.4f}")

print("\nPredictions by confidence range:")
print(predictions_df['confidence_range'].value_counts().sort_index())

predictions_df.head(15)

## 5. Filter Predictions by Confidence Thresholds

In [None]:
def filter_by_confidence(df, min_conf=0.0, max_conf=1.0):
    """Filter predictions by confidence threshold range.

    Parameters:
    - df: DataFrame with a numeric ``confidence`` column
    - min_conf: inclusive lower bound
    - max_conf: exclusive upper bound, except that a bound of 1.0 or more is
      inclusive so a prediction with confidence exactly 1.0 is not silently
      dropped from "conf >= x" style queries

    Returns a copy of the matching rows.
    """
    confidence = df['confidence']
    if max_conf >= 1.0:
        below_upper = confidence <= max_conf
    else:
        below_upper = confidence < max_conf
    return df[(confidence >= min_conf) & below_upper].copy()

# Interactive confidence filtering
def get_filtered_stats(df, min_conf=0.0, max_conf=1.0):
    """Summarise the predictions that fall inside a confidence window.

    Delegates the selection to ``filter_by_confidence`` and returns a dict with
    ``count``, ``accuracy``, ``correct``, ``incorrect`` and ``mean_confidence``.
    An empty dict is returned when no prediction matches.
    """
    subset = filter_by_confidence(df, min_conf, max_conf)
    if not len(subset):
        return {}

    hits = subset['is_correct']
    return dict(
        count=len(subset),
        accuracy=hits.mean(),
        correct=hits.sum(),
        incorrect=(~hits).sum(),
        mean_confidence=subset['confidence'].mean(),
    )

# Show filtering examples: (heading, lower bound, upper bound)
threshold_examples = (
    ("Conf >= 0.5", 0.5, 1.0),
    ("Conf >= 0.7", 0.7, 1.0),
    ("Conf >= 0.9", 0.9, 1.0),
    ("Conf < 0.5", 0.0, 0.5),
)

print("Filtering examples by confidence thresholds:")
for heading, lower, upper in threshold_examples:
    print(f"\n{heading}:")
    print(get_filtered_stats(predictions_df, lower, upper))

# Function to display predictions
def display_predictions(df, title="", n=10):
    """Print the first ``n`` rows of ``df`` as a plain-text table.

    Only the image name, predicted class, confidence, ground-truth class and
    correctness flag are shown, framed by two 120-character rules and preceded
    by a header line stating how many of the records are displayed.
    """
    rule = "=" * 120
    shown = min(n, len(df))
    columns = ['image_name', 'pred_class_name', 'confidence', 'gt_class_name', 'is_correct']

    print(f"\n{title} (showing {shown} of {len(df)} records)")
    print(rule)
    preview = df[columns].head(n)
    print(preview.to_string(index=False))
    print(rule)

## 6. Identify Correct and Incorrect Predictions

In [None]:
# Separate correct and incorrect predictions
correct_mask = predictions_df['is_correct']
correct_predictions = predictions_df[correct_mask].copy()
incorrect_predictions = predictions_df[~correct_mask].copy()

section_rule = "=" * 60
n_total = len(predictions_df)
n_correct = len(correct_predictions)
n_incorrect = len(incorrect_predictions)

print(section_rule)
print("PREDICTION ACCURACY BREAKDOWN")
print(section_rule)
print(f"\nCorrect Predictions: {n_correct} ({n_correct/n_total*100:.2f}%)")
print(f"Incorrect Predictions: {n_incorrect} ({n_incorrect/n_total*100:.2f}%)")

# Per-range counts for each outcome group
for outcome_label, outcome_df in (("Correct", correct_predictions),
                                  ("Incorrect", incorrect_predictions)):
    print(f"\n{outcome_label} predictions by confidence range:")
    print(outcome_df['confidence_range'].value_counts().sort_index())

print(f"\nMean confidence for correct predictions: {correct_predictions['confidence'].mean():.4f}")
print(f"Mean confidence for incorrect predictions: {incorrect_predictions['confidence'].mean():.4f}")

# Show some examples, least confident first
display_predictions(
    correct_predictions.sort_values('confidence'),
    title="SAMPLE CORRECT PREDICTIONS (sorted by confidence)",
    n=10,
)
display_predictions(
    incorrect_predictions.sort_values('confidence'),
    title="SAMPLE INCORRECT PREDICTIONS (sorted by confidence)",
    n=10,
)

## 7. Plot Lowest Confidence Score Predictions

In [None]:
from PIL import Image
import os

def plot_image_with_info(ax, image_path, pred_class, pred_conf, gt_class, is_correct, title_suffix=""):
    """Draw one image on ``ax`` with a title describing the prediction.

    Parameters:
    - ax: matplotlib axes to draw on
    - image_path: path of the image file to show
    - pred_class: predicted class name
    - pred_conf: prediction confidence (formatted with three decimals)
    - gt_class: ground-truth class name
    - is_correct: truthy for a correct prediction (green title), else red
    - title_suffix: extra line appended to the title

    If the image cannot be loaded, the error message is written onto the axes
    instead of raising, so one bad file does not abort a whole figure.
    """
    try:
        # The context manager closes the underlying file once the pixels have
        # been handed to matplotlib; a bare Image.open() would leave one open
        # file handle per plotted image.
        with Image.open(image_path) as img:
            ax.imshow(img)
    except Exception as e:
        ax.text(0.5, 0.5, f"Error loading image:\n{str(e)}",
                ha='center', va='center', transform=ax.transAxes)

    # Create title with prediction info
    title_color = 'green' if is_correct else 'red'
    title = f"Pred: {pred_class} ({pred_conf:.3f})\nGT: {gt_class}\n{title_suffix}"
    ax.set_title(title, fontsize=10, fontweight='bold', color=title_color)
    ax.axis('off')

# Get lowest confidence predictions.
# Images without any detection are stored as placeholder rows
# (pred_class_id == -1, confidence == 0.0). They are not model outputs and
# would otherwise occupy every "lowest confidence" slot, so exclude them.
detected_predictions = predictions_df[predictions_df['pred_class_id'] >= 0]
lowest_conf_predictions = detected_predictions.nsmallest(9, 'confidence')

print(f"Visualizing {len(lowest_conf_predictions)} lowest confidence predictions...")
print(lowest_conf_predictions[['image_name', 'pred_class_name', 'confidence', 'gt_class_name', 'is_correct']].to_string(index=False))

fig, axes = plt.subplots(3, 3, figsize=(15, 12))
fig.suptitle('Lowest Confidence Score Predictions', fontsize=16, fontweight='bold')

for ax, (_, row) in zip(axes.flat, lowest_conf_predictions.iterrows()):
    image_path = row['image_path']
    if os.path.exists(image_path):
        plot_image_with_info(
            ax, image_path,
            row['pred_class_name'], row['confidence'],
            row['gt_class_name'], row['is_correct'],
            f"Conf: {row['confidence']:.3f}"
        )
    else:
        ax.text(0.5, 0.5, f"Image not found:\n{row['image_name']}",
                ha='center', va='center', transform=ax.transAxes)
        ax.axis('off')

# Blank the panels that stay unused when fewer than nine predictions exist
for unused_ax in axes.ravel()[len(lowest_conf_predictions):]:
    unused_ax.axis('off')

plt.tight_layout()
plt.show()

print("Lowest confidence predictions visualization complete!")

## 8. Plot Highest Confidence Score Predictions

In [None]:
# Get highest confidence predictions
highest_conf_predictions = predictions_df.nlargest(9, 'confidence')
summary_columns = ['image_name', 'pred_class_name', 'confidence', 'gt_class_name', 'is_correct']

print(f"Visualizing {len(highest_conf_predictions)} highest confidence predictions...")
print(highest_conf_predictions[summary_columns].to_string(index=False))

fig, axes = plt.subplots(3, 3, figsize=(15, 12))
fig.suptitle('Highest Confidence Score Predictions', fontsize=16, fontweight='bold')

# One panel per prediction, filled row by row
for ax, record in zip(axes.flat, highest_conf_predictions.to_dict('records')):
    image_path = record['image_path']
    if not os.path.exists(image_path):
        ax.text(0.5, 0.5, f"Image not found:\n{record['image_name']}",
                ha='center', va='center', transform=ax.transAxes)
        ax.axis('off')
        continue

    plot_image_with_info(
        ax, image_path,
        record['pred_class_name'], record['confidence'],
        record['gt_class_name'], record['is_correct'],
        f"Conf: {record['confidence']:.3f}"
    )

plt.tight_layout()
plt.show()

print("Highest confidence predictions visualization complete!")

## 9. Plot Incorrect Predictions

In [None]:
# Get sample of incorrect predictions (sorted by confidence - highest first)
sample_incorrect = incorrect_predictions.nlargest(9, 'confidence')
error_columns = ['image_name', 'pred_class_name', 'confidence', 'gt_class_name']

print(f"Visualizing {len(sample_incorrect)} incorrect predictions (highest confidence errors)...")
print(sample_incorrect[error_columns].to_string(index=False))

fig, axes = plt.subplots(3, 3, figsize=(15, 12))
fig.suptitle('Incorrect Predictions (High Confidence Errors)', fontsize=16, fontweight='bold')

# Every row here is an error, so the title is always drawn in red
for ax, pred in zip(axes.flat, sample_incorrect.itertuples(index=False)):
    image_path = pred.image_path
    if not os.path.exists(image_path):
        ax.text(0.5, 0.5, f"Image not found:\n{pred.image_name}",
                ha='center', va='center', transform=ax.transAxes)
        ax.axis('off')
        continue

    plot_image_with_info(
        ax, image_path,
        pred.pred_class_name, pred.confidence,
        pred.gt_class_name, False,
        f"ERROR: Conf {pred.confidence:.3f}"
    )

plt.tight_layout()
plt.show()

print("Incorrect predictions visualization complete!")

## 10. Plot Correct Predictions

In [None]:
# Get sample of correct predictions (sorted by confidence - lowest first to see uncertain correct ones)
sample_correct_uncertain = correct_predictions.nsmallest(9, 'confidence')
correct_columns = ['image_name', 'pred_class_name', 'confidence', 'gt_class_name']

print(f"Visualizing {len(sample_correct_uncertain)} correct predictions (uncertain but correct)...")
print(sample_correct_uncertain[correct_columns].to_string(index=False))

fig, axes = plt.subplots(3, 3, figsize=(15, 12))
fig.suptitle('Correct Predictions (Uncertain but Correct)', fontsize=16, fontweight='bold')

# Fill panels in order; stop at whichever runs out first (panels or rows)
panels = axes.ravel()
for slot in range(min(len(panels), len(sample_correct_uncertain))):
    ax = panels[slot]
    row = sample_correct_uncertain.iloc[slot]
    image_path = row['image_path']

    if not os.path.exists(image_path):
        ax.text(0.5, 0.5, f"Image not found:\n{row['image_name']}",
                ha='center', va='center', transform=ax.transAxes)
        ax.axis('off')
        continue

    # All rows are correct predictions, so the title is always green
    plot_image_with_info(
        ax, image_path,
        row['pred_class_name'], row['confidence'],
        row['gt_class_name'], True,
        f"CORRECT: Conf {row['confidence']:.3f}"
    )

plt.tight_layout()
plt.show()

print("Correct predictions visualization complete!")

## 11. Visualize Ground Truth vs Predictions - Confidence Distribution

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
fig.suptitle('Ground Truth vs Predictions Analysis', fontsize=16, fontweight='bold')

# Logical (ascending) order of the range labels. Sorting the labels as strings
# yields '0.5-0.7', '0.7-0.9', '<0.5', '>0.9', which puts the lowest range in
# third place and shifts the colours whenever a range is absent. Reindexing
# pins every range to a fixed position and colour.
range_order = ['<0.5', '0.5-0.7', '0.7-0.9', '>0.9']
colors = ['#ff6b6b', '#ffa500', '#4ecdc4', '#45b7d1']

# 1. Confidence distribution by correctness
ax = axes[0, 0]
correct_predictions['confidence'].hist(bins=30, alpha=0.6, label='Correct', ax=ax, color='green')
incorrect_predictions['confidence'].hist(bins=30, alpha=0.6, label='Incorrect', ax=ax, color='red')
ax.set_xlabel('Confidence Score')
ax.set_ylabel('Frequency')
ax.set_title('Confidence Score Distribution: Correct vs Incorrect')
ax.legend()
ax.grid(True, alpha=0.3)

# 2. Predictions by confidence range
ax = axes[0, 1]
conf_range_counts = (
    predictions_df['confidence_range']
    .value_counts()
    .reindex(range_order, fill_value=0)
)
bars = ax.bar(range(len(conf_range_counts)), conf_range_counts.values, color=colors)
ax.set_xticks(range(len(conf_range_counts)))
ax.set_xticklabels(conf_range_counts.index, rotation=0)
ax.set_ylabel('Count')
ax.set_title('Predictions by Confidence Range')
ax.grid(True, alpha=0.3, axis='y')

# Add value labels on bars
for bar in bars:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{int(height)}',
            ha='center', va='bottom', fontweight='bold')

# 3. Accuracy by confidence range
ax = axes[1, 0]
accuracy_by_range = (
    predictions_df.groupby('confidence_range')['is_correct']
    .agg(['sum', 'count'])
    .reindex(range_order)
)
# Ranges without predictions have no defined accuracy (NaN after reindex)
accuracy_by_range['accuracy'] = accuracy_by_range['sum'] / accuracy_by_range['count']
x_pos = range(len(accuracy_by_range))
bars = ax.bar(x_pos, accuracy_by_range['accuracy'].fillna(0).values, color=colors)
ax.set_xticks(x_pos)
ax.set_xticklabels(accuracy_by_range.index, rotation=0)
ax.set_ylabel('Accuracy')
ax.set_ylim([0, 1])
ax.set_title('Accuracy by Confidence Range')
ax.grid(True, alpha=0.3, axis='y')
ax.axhline(y=predictions_df['is_correct'].mean(), color='r', linestyle='--', label='Overall Accuracy')
ax.legend()

# Add value labels on bars; empty ranges are marked instead of shown as 0%
for bar, accuracy in zip(bars, accuracy_by_range['accuracy'].values):
    label = f'{accuracy:.2%}' if pd.notna(accuracy) else 'n/a'
    ax.text(bar.get_x() + bar.get_width()/2., bar.get_height(),
            label,
            ha='center', va='bottom', fontweight='bold')

# 4. Confidence score statistics
ax = axes[1, 1]
ax.axis('off')

stats_text = f"""
CONFIDENCE STATISTICS

Overall:
  Total Predictions: {len(predictions_df)}
  Mean Confidence: {predictions_df['confidence'].mean():.4f}
  Median Confidence: {predictions_df['confidence'].median():.4f}
  Std Dev: {predictions_df['confidence'].std():.4f}

Correct Predictions:
  Count: {len(correct_predictions)}
  Mean Confidence: {correct_predictions['confidence'].mean():.4f}
  Accuracy: {len(correct_predictions)/len(predictions_df):.2%}

Incorrect Predictions:
  Count: {len(incorrect_predictions)}
  Mean Confidence: {incorrect_predictions['confidence'].mean():.4f}
"""

ax.text(0.1, 0.9, stats_text, transform=ax.transAxes, fontsize=11,
        verticalalignment='top', fontfamily='monospace',
        bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

plt.tight_layout()
plt.show()

## 12. Create Confidence Score Calibration Plots

In [None]:
# Prepare data for calibration (only valid predictions with ground truth)
valid_predictions = predictions_df[predictions_df['gt_class_id'] >= 0].copy()

# Calibrate on real detections only. Images without detections are stored as
# placeholder rows (pred_class_id == -1, confidence == 0.0, never correct);
# they are not model confidences and would add a spurious, perfectly
# calibrated point at the origin that deflates the calibration error.
calibration_rows = valid_predictions[valid_predictions['pred_class_id'] >= 0]

# Defined before the branch so later cells can always reference it.
expected_calibration_error = np.nan

if len(calibration_rows) > 0:
    # Convert is_correct to binary (1 for correct, 0 for incorrect)
    y_true = calibration_rows['is_correct'].astype(int).values
    y_scores = calibration_rows['confidence'].values

    # Calculate calibration curve
    n_bins = 10
    prob_true, prob_pred = calibration_curve(y_true, y_scores, n_bins=n_bins, strategy='uniform')

    # calibration_curve only returns the non-empty bins, so position i of its
    # output is NOT bin i. Recompute bin membership on the same uniform edges
    # to recover which bin each returned point belongs to and how many
    # samples it holds.
    bin_edges = np.linspace(0.0, 1.0, n_bins + 1)
    bin_ids = np.searchsorted(bin_edges[1:-1], y_scores)
    bin_counts = np.bincount(bin_ids, minlength=n_bins)
    occupied_bins = np.flatnonzero(bin_counts)
    assert len(occupied_bins) == len(prob_pred), \
        "local binning disagrees with calibration_curve; check the sklearn version"

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    fig.suptitle('Confidence Score Calibration Analysis', fontsize=16, fontweight='bold')

    # 1. Calibration curve
    ax = axes[0]
    ax.plot([0, 1], [0, 1], 'k--', label='Perfectly calibrated')
    ax.plot(prob_pred, prob_true, 's-', label='Model', linewidth=2, markersize=8)
    ax.set_xlabel('Mean Predicted Confidence', fontsize=12)
    ax.set_ylabel('Empirical Probability', fontsize=12)
    ax.set_title('Calibration Curve\n(Expected Calibration Error)')
    ax.legend(loc='lower right', fontsize=11)
    ax.grid(True, alpha=0.3)
    ax.set_xlim([0, 1])
    ax.set_ylim([0, 1])

    # 2. Reliability diagram: sample counts per bin (left axis) with the
    #    per-bin accuracy overlaid on its own 0-1 axis (right)
    ax = axes[1]
    bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
    bars = ax.bar(bin_centers, bin_counts, width=1.0 / n_bins, alpha=0.3,
                  edgecolor='black', label='Sample distribution')

    # Color each occupied bin by how far its accuracy is from its confidence
    for bin_idx, pred, true in zip(occupied_bins, prob_pred, prob_true):
        gap = abs(pred - true)
        if gap < 0.1:
            bars[bin_idx].set_facecolor('green')
        elif gap < 0.2:
            bars[bin_idx].set_facecolor('yellow')
        else:
            bars[bin_idx].set_facecolor('red')

    # Accuracy lives on a twin axis: x and y must have the same length, and
    # accuracies in [0, 1] would be invisible on the count scale.
    accuracy_ax = ax.twinx()
    accuracy_ax.plot(prob_pred, prob_true, 'rs-', label='Bin accuracy', markersize=8)
    accuracy_ax.set_ylim([0, 1])
    accuracy_ax.set_ylabel('Accuracy', fontsize=12)
    accuracy_ax.grid(False)

    ax.set_xlim([0, 1])
    ax.set_xlabel('Confidence Score Bins', fontsize=12)
    ax.set_ylabel('Count', fontsize=12)
    ax.set_title('Reliability Diagram')
    count_handles, count_labels = ax.get_legend_handles_labels()
    accuracy_handles, accuracy_labels = accuracy_ax.get_legend_handles_labels()
    ax.legend(count_handles + accuracy_handles, count_labels + accuracy_labels,
              loc='upper left', fontsize=11)
    ax.grid(True, alpha=0.3, axis='y')

    plt.tight_layout()
    plt.show()

    # Calculate calibration metrics.
    # ECE weights each bin's |confidence - accuracy| gap by the share of
    # samples in that bin; an unweighted mean would let a bin holding a
    # single sample count as much as one holding thousands.
    bin_weights = bin_counts[occupied_bins] / len(y_scores)
    expected_calibration_error = float(np.sum(bin_weights * np.abs(prob_pred - prob_true)))

    print("=" * 60)
    print("CALIBRATION METRICS")
    print("=" * 60)
    print(f"Expected Calibration Error (ECE): {expected_calibration_error:.4f}")
    print(f"\nCalibration by bin:")
    print(f"{'Bin':<15} {'Mean Pred':<15} {'Accuracy':<15} {'Difference':<15}")
    print("-" * 60)
    for bin_idx, pred, true in zip(occupied_bins, prob_pred, prob_true):
        # Report the real bin number (1-based) so skipped empty bins are visible
        print(f"Bin {bin_idx + 1:<10} {pred:<15.4f} {true:<15.4f} {abs(pred-true):<15.4f}")
else:
    print("No valid predictions with ground truth available for calibration analysis")

## 13. Generate Confusion Matrix

In [None]:
# Generate confusion matrix for valid predictions
valid_preds = predictions_df[predictions_df['gt_class_id'] >= 0].copy()

if len(valid_preds) == 0:
    print("No valid predictions with ground truth for confusion matrix")
else:
    y_true = valid_preds['gt_class_id'].values
    y_pred = valid_preds['pred_class_id'].values

    # Create confusion matrix
    cm = confusion_matrix(y_true, y_pred)

    # Tick labels: every class id seen in either the truth or the predictions
    unique_classes = np.unique(np.concatenate([y_true, y_pred]))
    class_labels = [CLASS_NAMES.get(class_id, f'Class {class_id}') for class_id in unique_classes]

    # Plot confusion matrix as an annotated heatmap
    fig, ax = plt.subplots(figsize=(12, 10))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        ax=ax,
        xticklabels=class_labels,
        yticklabels=class_labels,
        cbar_kws={'label': 'Count'},
    )

    ax.set_xlabel('Predicted Label', fontsize=12, fontweight='bold')
    ax.set_ylabel('True Label', fontsize=12, fontweight='bold')
    ax.set_title('Confusion Matrix - All Predictions', fontsize=14, fontweight='bold')

    plt.tight_layout()
    plt.show()

    wide_rule = "=" * 80

    # Print detailed classification report
    print("\n" + wide_rule)
    print("DETAILED CLASSIFICATION REPORT")
    print(wide_rule)
    print(classification_report(y_true, y_pred, target_names=class_labels, digits=4))

    # Print overall statistics
    print(wide_rule)
    print("OVERALL STATISTICS")
    print(wide_rule)
    print(f"Total valid predictions: {len(valid_preds)}")
    print(f"Overall Accuracy: {accuracy_score(y_true, y_pred):.4f}")

    # Macro block first, then weighted; precision, recall, F1 within each
    scorers = (("Precision", precision_score), ("Recall", recall_score), ("F1", f1_score))
    for averaging in ("macro", "weighted"):
        for metric_name, scorer in scorers:
            metric_value = scorer(y_true, y_pred, average=averaging, zero_division=0)
            print(f"{averaging.capitalize()} {metric_name}: {metric_value:.4f}")

## 14. Calculate Performance Metrics by Confidence Range

In [None]:
# Calculate metrics for each confidence range
confidence_ranges = ['<0.5', '0.5-0.7', '0.7-0.9', '>0.9']
metrics_by_range = []

for conf_range in confidence_ranges:
    range_data = predictions_df[predictions_df['confidence_range'] == conf_range]

    # Ranges without any prediction are left out of the table entirely
    if len(range_data) == 0:
        continue

    # Filter for valid predictions with ground truth
    valid_range_data = range_data[range_data['gt_class_id'] >= 0]

    if len(valid_range_data) == 0:
        # Same keys as the regular record below (including valid_count) so the
        # resulting DataFrame has one consistent schema.
        metrics_by_range.append({
            'confidence_range': conf_range,
            'count': len(range_data),
            'valid_count': 0,
            'accuracy': np.nan,
            'precision': np.nan,
            'recall': np.nan,
            'f1': np.nan,
        })
        continue

    y_true_range = valid_range_data['gt_class_id'].values
    y_pred_range = valid_range_data['pred_class_id'].values

    metrics_by_range.append({
        'confidence_range': conf_range,
        'count': len(range_data),
        'valid_count': len(valid_range_data),
        'accuracy': accuracy_score(y_true_range, y_pred_range),
        'precision': precision_score(y_true_range, y_pred_range, average='weighted', zero_division=0),
        'recall': recall_score(y_true_range, y_pred_range, average='weighted', zero_division=0),
        'f1': f1_score(y_true_range, y_pred_range, average='weighted', zero_division=0),
    })

metrics_df = pd.DataFrame(metrics_by_range)

# Create visualization: one panel per metric, laid out row by row
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
fig.suptitle('Performance Metrics by Confidence Range', fontsize=16, fontweight='bold')

# (DataFrame column, display name, bar color)
metric_panels = [
    ('accuracy', 'Accuracy', 'steelblue'),
    ('precision', 'Precision', 'seagreen'),
    ('recall', 'Recall', 'coral'),
    ('f1', 'F1-Score', 'mediumpurple'),
]

for ax, (column, display_name, bar_color) in zip(axes.flat, metric_panels):
    ax.bar(metrics_df['confidence_range'], metrics_df[column], color=bar_color, alpha=0.7)
    ax.set_ylabel(display_name, fontsize=11)
    ax.set_title(f'{display_name} by Confidence Range')
    ax.set_ylim([0, 1])
    ax.grid(True, alpha=0.3, axis='y')
    # Value labels above the bars; undefined metrics (NaN) get no label
    for i, v in enumerate(metrics_df[column]):
        if not np.isnan(v):
            ax.text(i, v + 0.02, f'{v:.3f}', ha='center', fontweight='bold')

plt.tight_layout()
plt.show()

# Print detailed table
print("\n" + "=" * 100)
print("PERFORMANCE METRICS BY CONFIDENCE RANGE")
print("=" * 100)
print(metrics_df.to_string(index=False))
print("=" * 100)

## 15. Interactive Prediction Filtering Tool

In [None]:
# Function to easily filter and display predictions
def filter_and_analyze(min_confidence=0.5, max_confidence=1.0,
                       correct_only=False, incorrect_only=False,
                       class_filter=None, show_count=10):
    """
    Filter the global ``predictions_df`` and print statistics for the result.

    Parameters:
    - min_confidence: minimum confidence threshold (inclusive)
    - max_confidence: maximum confidence threshold; exclusive, except that a
      value of 1.0 or more is inclusive so predictions with confidence
      exactly 1.0 are kept by "high confidence" queries
    - correct_only: show only correct predictions
    - incorrect_only: show only incorrect predictions (combining it with
      correct_only leaves no rows)
    - class_filter: filter by predicted class name (e.g., 'bird', 'bat');
      matched after lower-casing the given name
    - show_count: number of samples to display

    Returns the filtered DataFrame.
    """

    # Apply confidence filter
    confidence = predictions_df['confidence']
    if max_confidence >= 1.0:
        below_upper = confidence <= max_confidence
    else:
        below_upper = confidence < max_confidence
    filtered = predictions_df[(confidence >= min_confidence) & below_upper].copy()

    # Apply correctness filter
    if correct_only:
        filtered = filtered[filtered['is_correct']]
    if incorrect_only:
        filtered = filtered[~filtered['is_correct']]

    # Apply class filter
    if class_filter:
        filtered = filtered[filtered['pred_class_name'] == class_filter.lower()]

    print(f"\nFiltered Results:")
    print(f"{'=' * 100}")
    print(f"Total matching predictions: {len(filtered)}")
    if len(filtered) > 0:
        print(f"Mean confidence: {filtered['confidence'].mean():.4f}")
        print(f"Accuracy: {filtered['is_correct'].mean():.4f}")
        print(f"\nPrediction distribution:")
        print(filtered['pred_class_name'].value_counts())
        print(f"\n{'=' * 100}")
        print(f"Sample predictions (showing {min(show_count, len(filtered))} of {len(filtered)}):")
        cols = ['image_name', 'pred_class_name', 'confidence', 'gt_class_name', 'is_correct']
        # Most confident matches first
        display_df = filtered[cols].sort_values('confidence', ascending=False).head(show_count)
        print(display_df.to_string(index=False))
    print(f"{'=' * 100}\n")

    return filtered

# Example usage: each call prints its own statistics block and returns the
# filtered DataFrame, which is kept in example1..example4 for further use.

# Predictions with confidence of at least 0.9
print("Example 1: High confidence predictions (>0.9)")
example1 = filter_and_analyze(min_confidence=0.9)

# Correct predictions in the 0.5-0.7 confidence window
print("\nExample 2: Uncertain but correct predictions (0.5-0.7, correct only)")
example2 = filter_and_analyze(min_confidence=0.5, max_confidence=0.7, correct_only=True)

# Wrong predictions the model was confident about; shows up to 15 samples
print("\nExample 3: High confidence errors (>0.7, incorrect only)")
example3 = filter_and_analyze(min_confidence=0.7, incorrect_only=True, show_count=15)

# NOTE: min_confidence is not passed here, so its default of 0.5 applies and
# bird predictions below 0.5 are not included.
print("\nExample 4: Bird predictions")
example4 = filter_and_analyze(class_filter='bird', show_count=10)

## 16. Summary Report and Export Options

In [None]:
# Generate comprehensive summary report
from datetime import datetime

# One timestamp shared by the printed report and the exported file
report_timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

print("\n" + "=" * 100)
print(" " * 30 + "TEST RESULTS SUMMARY REPORT")
print("=" * 100)
print(f"Report generated: {report_timestamp}")
print(f"Model path: {latest_model_path}")
print(f"Test dataset size: {len(test_images)} images")
print("\n" + "-" * 100)

print("\nOVERALL PERFORMANCE METRICS")
print("-" * 100)

# Metrics are computed once and reused for the printout and the file export.
# The dict stays empty when no prediction has ground truth, so the export
# below never touches undefined y_true_all / y_pred_all.
valid_preds_for_metrics = predictions_df[predictions_df['gt_class_id'] >= 0]
overall_metrics = {}
if len(valid_preds_for_metrics) > 0:
    y_true_all = valid_preds_for_metrics['gt_class_id'].values
    y_pred_all = valid_preds_for_metrics['pred_class_id'].values

    overall_metrics = {
        'Overall Accuracy': accuracy_score(y_true_all, y_pred_all),
        'Precision (weighted)': precision_score(y_true_all, y_pred_all, average='weighted', zero_division=0),
        'Recall (weighted)': recall_score(y_true_all, y_pred_all, average='weighted', zero_division=0),
        'F1-Score (weighted)': f1_score(y_true_all, y_pred_all, average='weighted', zero_division=0),
    }

    print(f"Total valid predictions: {len(valid_preds_for_metrics)}")
    for metric_label, metric_value in overall_metrics.items():
        print(f"{metric_label}: {metric_value:.4f}")

print("\n" + "-" * 100)
print("\nCONFIDENCE SCORE ANALYSIS")
print("-" * 100)
print(f"Mean confidence: {predictions_df['confidence'].mean():.4f}")
print(f"Median confidence: {predictions_df['confidence'].median():.4f}")
print(f"Min confidence: {predictions_df['confidence'].min():.4f}")
print(f"Max confidence: {predictions_df['confidence'].max():.4f}")
print(f"Std deviation: {predictions_df['confidence'].std():.4f}")

print(f"\nCorrect predictions mean confidence: {correct_predictions['confidence'].mean():.4f}")
print(f"Incorrect predictions mean confidence: {incorrect_predictions['confidence'].mean():.4f}")
print(f"Confidence difference: {correct_predictions['confidence'].mean() - incorrect_predictions['confidence'].mean():.4f}")

# valid_predictions / expected_calibration_error come from the calibration cell
if len(valid_predictions) > 0:
    print(f"\nCalibration Error (ECE): {expected_calibration_error:.4f}")

print("\n" + "-" * 100)
print("\nCLASS-WISE PERFORMANCE")
print("-" * 100)
if len(valid_preds_for_metrics) > 0:
    for class_id, class_name in sorted(CLASS_NAMES.items()):
        # Rows whose ground truth is this class
        mask = y_true_all == class_id
        if mask.sum() > 0:
            class_accuracy = accuracy_score(y_true_all[mask], y_pred_all[mask])
            class_count = mask.sum()
            print(f"{class_name:<15}: {class_count:>4} samples, Accuracy: {class_accuracy:.4f}")

print("\n" + "=" * 100)

# Export predictions to CSV
output_file = Path("test_predictions.csv")
predictions_df.to_csv(output_file, index=False)
print(f"\nPredictions exported to: {output_file}")

# Export summary statistics (explicit encoding keeps the file identical
# across platforms with different default encodings)
summary_file = Path("test_summary.txt")
with open(summary_file, 'w', encoding='utf-8') as f:
    f.write("=" * 100 + "\n")
    f.write(" " * 30 + "TEST RESULTS SUMMARY\n")
    f.write("=" * 100 + "\n")
    f.write(f"Report generated: {report_timestamp}\n")
    f.write(f"Model path: {latest_model_path}\n")
    f.write(f"Test dataset: {len(test_images)} images\n")
    if overall_metrics:
        f.write("\n")
        for metric_label, metric_value in overall_metrics.items():
            f.write(f"{metric_label}: {metric_value:.4f}\n")
    else:
        f.write("\nNo valid predictions with ground truth; metrics unavailable\n")

print(f"Summary exported to: {summary_file}")
print("\n✓ Test analysis complete!")