In [3]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (classification_report, confusion_matrix,
                           roc_auc_score, average_precision_score,
                           matthews_corrcoef, balanced_accuracy_score,
                           cohen_kappa_score, roc_curve, auc)
from model import get_model
from data_prep import get_dataloaders, custom_collate
import os
import json
from torch.utils.data import DataLoader


# Configuration
class Config:
    MODEL_CHECKPOINT = "checkpoints/best_model_fold1.pth"  # best model path
    PLOTS_DIR = "plots_test"
    BATCH_SIZE = 32
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Setup directories
os.makedirs(Config.PLOTS_DIR, exist_ok=True)

def load_model(model_path: str) -> torch.nn.Module:
    """Load trained model from checkpoint"""
    model = get_model()
    model.load_state_dict(torch.load(model_path, map_location=Config.DEVICE))
    model.to(Config.DEVICE)
    model.eval()
    return model

def evaluate_model(model: torch.nn.Module, test_loader: DataLoader) -> dict:
    """Run evaluation on test set and return metrics"""
    all_preds = []
    all_labels = []
    all_probs = []
    
    with torch.no_grad():
        for images, labels in test_loader:
            images = images.to(Config.DEVICE)
            labels = labels.to(Config.DEVICE)
            
            outputs = model(images)
            probs = torch.softmax(outputs, dim=1)
            _, preds = torch.max(outputs, 1)
            
            all_preds.append(preds.cpu().numpy())
            all_labels.append(labels.cpu().numpy())
            all_probs.append(probs.cpu().numpy())
    
    y_true = np.concatenate(all_labels)
    y_pred = np.concatenate(all_preds)
    y_probs = np.concatenate(all_probs)

    # Calculate metrics
    accuracy = balanced_accuracy_score(y_true, y_pred)
    roc_auc = roc_auc_score(y_true, y_probs[:, 1])
    pr_auc = average_precision_score(y_true, y_probs[:, 1])
    mcc = matthews_corrcoef(y_true, y_pred)
    kappa = cohen_kappa_score(y_true, y_pred)
    
    # Confusion matrix metrics
    cm = confusion_matrix(y_true, y_pred)
    tn, fp, fn, tp = cm.ravel()
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    ppv = tp / (tp + fp)
    npv = tn / (tn + fn)
    plr = sensitivity / (1 - specificity)
    nlr = (1 - sensitivity) / specificity
    
    return {
        'y_true': y_true.tolist(),
        'y_pred': y_pred.tolist(),
        'y_probs': y_probs.tolist(),
        'accuracy': accuracy,
        'balanced_accuracy': balanced_accuracy_score(y_true, y_pred),
        'roc_auc': roc_auc,
        'pr_auc': pr_auc,
        'mcc': mcc,
        'kappa': kappa,
        'sensitivity': sensitivity,
        'specificity': specificity,
        'ppv': ppv,
        'npv': npv,
        'plr': plr,
        'nlr': nlr,
        'confusion_matrix': cm.tolist(),
        'classification_report': classification_report(y_true, y_pred, target_names=['Normal', 'Pneumonia'], output_dict=True)
    }

def generate_plots(metrics: dict):
    """Generate evaluation visualizations"""
    y_true = np.array(metrics['y_true'])
    y_probs = np.array(metrics['y_probs'])
    
    # Confusion Matrix
    plt.figure(figsize=(8, 6))
    sns.heatmap(metrics['confusion_matrix'], annot=True, fmt='d', cmap='Blues',
                xticklabels=['Normal', 'Pneumonia'],
                yticklabels=['Normal', 'Pneumonia'])
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')
    plt.savefig(f'{Config.PLOTS_DIR}/confusion_matrix.png')
    plt.close()
    
    # ROC Curve
    fpr, tpr, _ = roc_curve(y_true, y_probs[:, 1])
    roc_auc = auc(fpr, tpr)
    
    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.savefig(f'{Config.PLOTS_DIR}/roc_curve.png')
    plt.close()
    
    # Probability Distribution
    plt.figure(figsize=(10, 6))
    for i, class_name in enumerate(['Normal', 'Pneumonia']):
        sns.kdeplot(y_probs[y_true == i, 1], label=class_name, shade=True)
    plt.xlabel('Predicted Probability of Pneumonia')
    plt.ylabel('Density')
    plt.title('Predicted Probability Distribution by True Class')
    plt.legend()
    plt.savefig(f'{Config.PLOTS_DIR}/probability_distribution.png')
    plt.close()

def print_results(metrics: dict):
    """Print formatted evaluation results"""
    print("\n=== Evaluation Results ===")
    print(f"\nModel: {Config.MODEL_CHECKPOINT}")
    print(f"\nOverall Accuracy: {metrics['accuracy']:.4f}")
    print(f"Balanced Accuracy: {metrics['balanced_accuracy']:.4f}")
    print(f"ROC AUC: {metrics['roc_auc']:.4f}")
    print(f"PR AUC: {metrics['pr_auc']:.4f}")
    print(f"Matthew's CC: {metrics['mcc']:.4f}")
    print(f"Cohen's Kappa: {metrics['kappa']:.4f}")
    
    print("\nClinical Metrics:")
    print(f"Sensitivity/Recall: {metrics['sensitivity']:.4f}")
    print(f"Specificity: {metrics['specificity']:.4f}")
    print(f"PPV: {metrics['ppv']:.4f}")
    print(f"NPV: {metrics['npv']:.4f}")
    print(f"Positive LR: {metrics['plr']:.2f}")
    print(f"Negative LR: {metrics['nlr']:.2f}")
    
    print("\nClassification Report:")
    print(classification_report(
        metrics['y_true'],
        metrics['y_pred'],
        target_names=['Normal', 'Pneumonia']
    ))
    
    print("\nConfusion Matrix:")
    print(metrics['confusion_matrix'])

def save_results(metrics: dict, filename: str = "evaluation_results.json"):
    """Save evaluation metrics to JSON file"""
    with open(filename, 'w') as f:
        json.dump(metrics, f, indent=4)
    print(f"\nResults saved to {filename}")

def main():
    # Load data and model
    _, _, test_loader = get_dataloaders()
    model = load_model(Config.MODEL_CHECKPOINT)
    
    # Run evaluation
    metrics = evaluate_model(model, test_loader)
    
    # Generate outputs
    generate_plots(metrics)
    print_results(metrics)
    save_results(metrics)
    
    print("\nEvaluation complete. Plots saved to 'plots/' directory.")

if __name__ == "__main__":
    main()

✅ DataLoaders created successfully!

=== Evaluation Results ===

Model: checkpoints/best_model_fold1.pth

Overall Accuracy: 0.9966
Balanced Accuracy: 0.9966
ROC AUC: 0.9999
PR AUC: 1.0000
Matthew's CC: 0.9932
Cohen's Kappa: 0.9932

Clinical Metrics:
Sensitivity/Recall: 0.9974
Specificity: 0.9957
PPV: 0.9974
NPV: 0.9957
Positive LR: 233.40
Negative LR: 0.00

Classification Report:
              precision    recall  f1-score   support

      Normal       1.00      1.00      1.00       234
   Pneumonia       1.00      1.00      1.00       390

    accuracy                           1.00       624
   macro avg       1.00      1.00      1.00       624
weighted avg       1.00      1.00      1.00       624


Confusion Matrix:
[[233, 1], [1, 389]]

Results saved to evaluation_results.json

Evaluation complete. Plots saved to 'plots/' directory.



`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(y_probs[y_true == i, 1], label=class_name, shade=True)

`shade` is now deprecated in favor of `fill`; setting `fill=True`.
This will become an error in seaborn v0.14.0; please update your code.

  sns.kdeplot(y_probs[y_true == i, 1], label=class_name, shade=True)


# Evluate 2

In [8]:
import torch
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (classification_report, confusion_matrix,
                           roc_auc_score, average_precision_score,
                           matthews_corrcoef, balanced_accuracy_score,
                           cohen_kappa_score, roc_curve, auc,
                           precision_recall_curve)
from model import get_model
from data_prep import get_dataloaders, custom_collate
import os
import json
from typing import Dict, List
import pandas as pd

class Evaluator:
    def __init__(self, model_path: str = "checkpoints/best_model_fold1.pth"):
        self.config = {
            "model_path": model_path,
            "plots_dir": "evaluation_plots",
            "batch_size": 32,
            "device": torch.device("cuda" if torch.cuda.is_available() else "cpu")
        }
        os.makedirs(self.config["plots_dir"], exist_ok=True)
        
    def load_model(self) -> torch.nn.Module:
        model = get_model()
        model.load_state_dict(torch.load(self.config["model_path"], 
                                      map_location=self.config["device"]))
        model.to(self.config["device"])
        model.eval()
        return model
    
    def evaluate(self) -> Dict:
        model = self.load_model()
        _, _, test_loader = get_dataloaders()
        
        results = self._get_predictions(model, test_loader)
        metrics = self._calculate_metrics(results)
        self._generate_visualizations(results, metrics)
        self._save_results(metrics)
        
        return metrics
    
    def _get_predictions(self, model: torch.nn.Module, 
                       test_loader: DataLoader) -> Dict:
        all_preds = []
        all_labels = []
        all_probs = []
        all_images = []
        
        with torch.no_grad():
            for images, labels in test_loader:
                images = images.to(self.config["device"])
                outputs = model(images)
                probs = torch.softmax(outputs, dim=1)
                _, preds = torch.max(outputs, 1)
                
                all_preds.append(preds.cpu().numpy())
                all_labels.append(labels.cpu().numpy())
                all_probs.append(probs.cpu().numpy())
                all_images.append(images.cpu().numpy())
        
        return {
            "y_true": np.concatenate(all_labels),
            "y_pred": np.concatenate(all_preds),
            "y_probs": np.concatenate(all_probs),
            "images": np.concatenate(all_images)
        }
    
    def _calculate_metrics(self, results: Dict) -> Dict:
        y_true = results["y_true"]
        y_pred = results["y_pred"]
        y_probs = results["y_probs"]
        
        metrics = {
            "accuracy": balanced_accuracy_score(y_true, y_pred),
            "roc_auc": roc_auc_score(y_true, y_probs[:, 1]),
            "pr_auc": average_precision_score(y_true, y_probs[:, 1]),
            "mcc": matthews_corrcoef(y_true, y_pred),
            "kappa": cohen_kappa_score(y_true, y_pred),
            "y_true": y_true.tolist(),
            "y_pred": y_pred.tolist()
        }
        
        cm = confusion_matrix(y_true, y_pred)
        tn, fp, fn, tp = cm.ravel()
        
        metrics.update({
            "confusion_matrix": cm.tolist(),
            "sensitivity": tp / (tp + fn),
            "specificity": tn / (tn + fp),
            "ppv": tp / (tp + fp),
            "npv": tn / (tn + fn),
            "plr": (tp / (tp + fn)) / (1 - (tn / (tn + fp))),
            "nlr": (1 - (tp / (tp + fn))) / (tn / (tn + fp)),
            "classification_report": classification_report(
                y_true, y_pred, 
                target_names=['Normal', 'Pneumonia'], 
                output_dict=True
            )
        })
        
        return metrics
    
    def _generate_visualizations(self, results: Dict, metrics: Dict):
        y_true = results["y_true"]
        y_pred = results["y_pred"]
        y_probs = results["y_probs"]
        
        # Confusion Matrix
        plt.figure(figsize=(8, 6))
        sns.heatmap(metrics["confusion_matrix"], 
                   annot=True, fmt='d', cmap='Blues',
                   xticklabels=['Normal', 'Pneumonia'],
                   yticklabels=['Normal', 'Pneumonia'])
        plt.xlabel('Predicted')
        plt.ylabel('True')
        plt.title('Confusion Matrix')
        plt.savefig(os.path.join(self.config["plots_dir"], 'confusion_matrix.png'), 
                   bbox_inches='tight', dpi=300)
        plt.close()
        
        # ROC Curve
        fpr, tpr, _ = roc_curve(y_true, y_probs[:, 1])
        roc_auc = auc(fpr, tpr)
        
        plt.figure()
        plt.plot(fpr, tpr, color='darkorange', lw=2, 
                label=f'ROC (AUC = {roc_auc:.4f})')
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver Operating Characteristic')
        plt.legend(loc="lower right")
        plt.savefig(os.path.join(self.config["plots_dir"], 'roc_curve.png'), 
                   bbox_inches='tight', dpi=300)
        plt.close()
        
        # Precision-Recall Curve
        precision, recall, _ = precision_recall_curve(y_true, y_probs[:, 1])
        
        plt.figure()
        plt.plot(recall, precision, color='blue', lw=2, 
                label=f'PR (AUC = {metrics["pr_auc"]:.4f})')
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.title('Precision-Recall Curve')
        plt.legend(loc="lower left")
        plt.savefig(os.path.join(self.config["plots_dir"], 'pr_curve.png'), 
                   bbox_inches='tight', dpi=300)
        plt.close()
        
        # Probability Distribution
        plt.figure(figsize=(10, 6))
        for i, class_name in enumerate(['Normal', 'Pneumonia']):
            sns.kdeplot(y_probs[y_true == i, 1], 
                       label=class_name, 
                       fill=True)
        plt.xlabel('Predicted Probability of Pneumonia')
        plt.ylabel('Density')
        plt.title('Predicted Probability Distribution by True Class')
        plt.legend()
        plt.savefig(os.path.join(self.config["plots_dir"], 'probability_distribution.png'), 
                   bbox_inches='tight', dpi=300)
        plt.close()
        
        # Misclassified Examples (if any)
        if (y_true != y_pred).sum() > 0:
            self._plot_misclassified(results)
    
    def _plot_misclassified(self, results: Dict):
        incorrect = results["y_true"] != results["y_pred"]
        num_samples = min(5, incorrect.sum())
        indices = np.where(incorrect)[0][:num_samples]
        
        plt.figure(figsize=(15, 3 * num_samples))
        for i, idx in enumerate(indices):
            img = results["images"][idx].transpose(1, 2, 0)
            img = (img - img.min()) / (img.max() - img.min())
            
            plt.subplot(num_samples, 1, i+1)
            plt.imshow(img)
            plt.title(
                f"True: {'Normal' if results['y_true'][idx]==0 else 'Pneumonia'} | "
                f"Pred: {'Normal' if results['y_pred'][idx]==0 else 'Pneumonia'} | "
                f"Prob: {results['y_probs'][idx, 1]:.3f}"
            )
            plt.axis('off')
        
        plt.tight_layout()
        plt.savefig(os.path.join(self.config["plots_dir"], 'misclassified_examples.png'), 
                   bbox_inches='tight', dpi=300)
        plt.close()
    
    def _save_results(self, metrics: Dict):
        """Save evaluation results to JSON and CSV"""
        # Save full metrics to JSON
        with open(os.path.join(self.config["plots_dir"], 'evaluation_results.json'), 'w') as f:
            json.dump(metrics, f, indent=4)
        
        # Save key metrics to CSV
        metrics_df = pd.DataFrame({
            "Metric": [
                "Accuracy", "Balanced Accuracy", "ROC AUC", "PR AUC",
                "Matthew's CC", "Cohen's Kappa", "Sensitivity", "Specificity",
                "PPV", "NPV", "Positive LR", "Negative LR"
            ],
            "Value": [
                metrics["accuracy"], metrics["accuracy"],
                metrics["roc_auc"], metrics["pr_auc"],
                metrics["mcc"], metrics["kappa"],
                metrics["sensitivity"], metrics["specificity"],
                metrics["ppv"], metrics["npv"],
                metrics["plr"], metrics["nlr"]
            ]
        })
        metrics_df.to_csv(os.path.join(self.config["plots_dir"], 'key_metrics.csv'), index=False)
    
    def print_results(self, metrics: Dict):
        print("\n=== Evaluation Results ===")
        print(f"\nModel: {self.config['model_path']}")
        
        print("\nPerformance Metrics:")
        print(f"Accuracy: {metrics['accuracy']:.4f}")
        print(f"ROC AUC: {metrics['roc_auc']:.4f}")
        print(f"PR AUC: {metrics['pr_auc']:.4f}")
        print(f"Matthew's CC: {metrics['mcc']:.4f}")
        print(f"Cohen's Kappa: {metrics['kappa']:.4f}")
        
        print("\nClinical Metrics:")
        print(f"Sensitivity/Recall: {metrics['sensitivity']:.4f}")
        print(f"Specificity: {metrics['specificity']:.4f}")
        print(f"PPV: {metrics['ppv']:.4f}")
        print(f"NPV: {metrics['npv']:.4f}")
        print(f"Positive LR: {metrics['plr']:.2f}")
        print(f"Negative LR: {metrics['nlr']:.2f}")
        
        print("\nClassification Report:")
        print(classification_report(
            metrics["y_true"],
            metrics["y_pred"],
            target_names=['Normal', 'Pneumonia']
        ))
        
        print("\nConfusion Matrix:")
        print(np.array(metrics["confusion_matrix"]))

if __name__ == "__main__":
    evaluator = Evaluator()
    metrics = evaluator.evaluate()
    evaluator.print_results(metrics)
    print(f"\nEvaluation complete. Results saved to '{evaluator.config['plots_dir']}/'")



✅ DataLoaders created successfully!

=== Evaluation Results ===

Model: checkpoints/best_model_fold1.pth

Performance Metrics:
Accuracy: 0.9966
ROC AUC: 0.9999
PR AUC: 1.0000
Matthew's CC: 0.9932
Cohen's Kappa: 0.9932

Clinical Metrics:
Sensitivity/Recall: 0.9974
Specificity: 0.9957
PPV: 0.9974
NPV: 0.9957
Positive LR: 233.40
Negative LR: 0.00

Classification Report:
              precision    recall  f1-score   support

      Normal       1.00      1.00      1.00       234
   Pneumonia       1.00      1.00      1.00       390

    accuracy                           1.00       624
   macro avg       1.00      1.00      1.00       624
weighted avg       1.00      1.00      1.00       624


Confusion Matrix:
[[233   1]
 [  1 389]]

Evaluation complete. Results saved to 'evaluation_plots/'
