# In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification, load_digits
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_curve, auc,
    roc_auc_score, average_precision_score, precision_recall_curve
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from scipy import stats
import itertools
import warnings
# Suppress all warnings for the rest of the session to keep output readable.
warnings.filterwarnings('ignore')

# Set style and random seed
# 'seaborn-v0_8' only exists on matplotlib >= 3.6; older releases ship the
# same style sheet under the name 'seaborn', so fall back to that.
try:
    plt.style.use('seaborn-v0_8')
except OSError:
    plt.style.use('seaborn')
np.random.seed(42)

print("Libraries imported successfully!")


# In [None]:
# Create comprehensive evaluation framework
class ImageClassificationEvaluator:
    """Comprehensive evaluation suite for image classification models.

    Typical workflow: ``prepare_data`` -> ``train_models`` ->
    ``evaluate_single_model`` (once per model) -> plotting / reporting
    helpers.

    Attributes:
        results: Maps model name to the metrics dict stored by
            ``evaluate_single_model``.
        models: Maps model name to the fitted estimator stored by
            ``train_models``.
    """

    # Fraction of samples kept per class (in sorted label order) when
    # ``prepare_data(imbalanced=True)`` is called without explicit ratios.
    DEFAULT_KEEP_RATIOS = (1.0, 0.8, 0.6, 0.4, 0.3, 0.2, 0.15, 0.1, 0.05, 0.02)

    def __init__(self):
        self.results = {}
        self.models = {}

    def prepare_data(self, imbalanced=False, keep_ratios=None, test_size=0.3,
                     random_state=42):
        """Load the digits dataset, optionally imbalance it, split and scale.

        Args:
            imbalanced: When True, keep only a fraction of each class.
            keep_ratios: Fraction of samples to keep per class, one entry per
                class in sorted label order. Only used when ``imbalanced`` is
                True; defaults to ``DEFAULT_KEEP_RATIOS``.
            test_size: Fraction of the data held out for testing.
            random_state: Seed for the stratified train/test split.

        Returns:
            Tuple ``(X_train_scaled, X_test_scaled, y_train, y_test)``. The
            scaler is fit on the training split only.

        Raises:
            ValueError: If ``keep_ratios`` does not have one entry per class.
        """
        # Use digits dataset as our "image" data
        digits = load_digits()
        X, y = digits.data, digits.target

        if imbalanced:
            print("Creating imbalanced dataset...")

            class_labels = np.unique(y)
            if keep_ratios is None:
                keep_ratios = self.DEFAULT_KEEP_RATIOS
            if len(keep_ratios) != len(class_labels):
                raise ValueError(
                    f"keep_ratios must have {len(class_labels)} entries "
                    f"(one per class), got {len(keep_ratios)}")

            # Keep the first n_keep samples of each class, class by class.
            kept = []
            for label, ratio in zip(class_labels, keep_ratios):
                class_indices = np.flatnonzero(y == label)
                n_keep = int(len(class_indices) * ratio)
                kept.append(class_indices[:n_keep])
            indices_to_keep = np.concatenate(kept)

            X = X[indices_to_keep]
            y = y[indices_to_keep]

            print("Class distribution after imbalancing:")
            unique, counts = np.unique(y, return_counts=True)
            for cls, count in zip(unique, counts):
                print(f"  Class {cls}: {count} samples")

        # Split data (stratified so class proportions match in both splits)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, stratify=y, random_state=random_state)

        # Standardize features using training statistics only
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        return X_train_scaled, X_test_scaled, y_train, y_test

    def train_models(self, X_train, y_train):
        """Fit a Random Forest, an SVM and a Logistic Regression model.

        Args:
            X_train: Training feature matrix.
            y_train: Training labels.

        Returns:
            Dict mapping model name to the fitted estimator; the same dict is
            stored on ``self.models``.
        """
        models = {
            'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
            # probability=True is required so predict_proba is available.
            'SVM': SVC(probability=True, random_state=42),
            'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42)
        }

        print("Training models...")
        trained_models = {}

        for name, model in models.items():
            print(f"  Training {name}...")
            model.fit(X_train, y_train)
            trained_models[name] = model

        self.models = trained_models
        return trained_models

    def evaluate_single_model(self, model, model_name, X_test, y_test):
        """Compute and store a full set of metrics for one fitted model.

        Args:
            model: Fitted classifier exposing ``predict`` and
                ``predict_proba``.
            model_name: Key under which results are stored in
                ``self.results``.
            X_test: Test feature matrix.
            y_test: True test labels.

        Returns:
            Dict with overall metrics (weighted precision/recall/F1, accuracy,
            AUC, top-3 accuracy), per-class metrics, the confusion matrix,
            the class labels, and the raw predictions/probabilities.
        """
        y_test = np.asarray(y_test)

        # Basic predictions
        y_pred = model.predict(X_test)
        y_prob = model.predict_proba(X_test)

        # predict_proba columns follow model.classes_; use that ordering for
        # everything that must line up with y_prob.
        classes = np.asarray(getattr(model, 'classes_', np.unique(y_test)))

        # Basic metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
        recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
        f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

        # Per-class metrics, aligned with `classes` even if a class is absent
        # from this particular test split.
        precision_per_class = precision_score(
            y_test, y_pred, average=None, labels=classes, zero_division=0)
        recall_per_class = recall_score(
            y_test, y_pred, average=None, labels=classes, zero_division=0)
        f1_per_class = f1_score(
            y_test, y_pred, average=None, labels=classes, zero_division=0)

        # AUC: binary uses the positive-class column, multi-class uses a
        # one-vs-rest indicator matrix whose columns match y_prob.
        auc_score = np.nan
        try:
            if len(classes) == 2:
                auc_score = roc_auc_score(y_test, y_prob[:, 1])
            elif len(classes) > 2:
                y_test_bin = label_binarize(y_test, classes=classes)
                auc_score = roc_auc_score(y_test_bin, y_prob, average='weighted')
        except ValueError:
            # Undefined when some class has no positive or no negative
            # samples in y_test; report N/A instead of failing.
            auc_score = np.nan

        # Confusion matrix
        cm = confusion_matrix(y_test, y_pred, labels=classes)

        # Top-k accuracy (k=3)
        top3_accuracy = self._top_k_accuracy(y_test, y_prob, k=3, classes=classes)

        results = {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'auc_score': auc_score,
            'top3_accuracy': top3_accuracy,
            'precision_per_class': precision_per_class,
            'recall_per_class': recall_per_class,
            'f1_per_class': f1_per_class,
            'confusion_matrix': cm,
            'classes': classes,
            'y_pred': y_pred,
            'y_prob': y_prob
        }

        self.results[model_name] = results
        return results

    def _top_k_accuracy(self, y_true, y_prob, k=3, classes=None):
        """Return the fraction of samples whose true label is in the top k.

        Args:
            y_true: True labels, one per row of ``y_prob``.
            y_prob: Array of shape (n_samples, n_classes) with class scores.
            k: Number of highest-scoring classes to consider (>= 1). Values
                above the number of classes are clipped.
            classes: Optional label for each column of ``y_prob``. When
                omitted, column indices are used as labels.

        Returns:
            Top-k accuracy as a float, or NaN when ``y_true`` is empty.

        Raises:
            ValueError: If ``k`` is smaller than 1.
        """
        if k < 1:
            # A slice of [-0:] would select every column and score 1.0.
            raise ValueError("k must be a positive integer")

        y_true = np.asarray(y_true)
        y_prob = np.asarray(y_prob)
        if y_true.size == 0:
            return float('nan')

        k = min(k, y_prob.shape[1])
        top_k_cols = np.argsort(y_prob, axis=1)[:, -k:]
        if classes is None:
            top_k_labels = top_k_cols
        else:
            top_k_labels = np.asarray(classes)[top_k_cols]

        hits = (top_k_labels == y_true[:, np.newaxis]).any(axis=1)
        return float(hits.mean())

    def plot_confusion_matrices(self, class_names=None):
        """Plot a row-normalized confusion matrix for every evaluated model.

        Args:
            class_names: Optional tick labels, one per class. When omitted,
                the class labels stored with the results are used if present.

        Raises:
            ValueError: If ``class_names`` does not match the matrix size.
        """
        n_models = len(self.results)
        if n_models == 0:
            print("No evaluation results to plot.")
            return

        # squeeze=False gives a 2-D array regardless of n_models.
        fig, axes = plt.subplots(1, n_models, figsize=(6*n_models, 5), squeeze=False)
        axes = axes.ravel()

        for ax, (model_name, results) in zip(axes, self.results.items()):
            cm = results['confusion_matrix']

            # Normalize each row by its total; rows with no samples stay 0
            # instead of becoming NaN.
            row_sums = cm.sum(axis=1, keepdims=True)
            cm_normalized = np.divide(
                cm.astype('float'), row_sums,
                out=np.zeros(cm.shape, dtype=float), where=row_sums != 0)

            # Plot
            im = ax.imshow(cm_normalized, interpolation='nearest', cmap=plt.cm.Blues)
            ax.set_title(f'{model_name}\nAccuracy: {results["accuracy"]:.3f}')

            # Add colorbar
            plt.colorbar(im, ax=ax)

            # Tick labels
            tick_labels = class_names if class_names is not None else results.get('classes')
            if class_names is not None and len(class_names) != cm.shape[0]:
                raise ValueError(
                    f"class_names has {len(class_names)} entries but the "
                    f"confusion matrix has {cm.shape[0]} classes")
            if tick_labels is not None and len(tick_labels) == cm.shape[0]:
                ticks = np.arange(cm.shape[0])
                ax.set_xticks(ticks)
                ax.set_xticklabels(tick_labels, rotation=45)
                ax.set_yticks(ticks)
                ax.set_yticklabels(tick_labels)

            # Add text annotations (white text on dark cells for contrast)
            thresh = cm_normalized.max() / 2
            for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
                ax.text(j, i, f'{cm[i, j]}\n({cm_normalized[i, j]:.2f})',
                        horizontalalignment="center",
                        verticalalignment="center",
                        color="white" if cm_normalized[i, j] > thresh else "black",
                        fontsize=8)

            ax.set_ylabel('True Label')
            ax.set_xlabel('Predicted Label')

        plt.tight_layout()
        plt.show()

    def plot_roc_curves(self, X_test, y_test):
        """Plot one-vs-rest ROC curves per class for every trained model.

        Args:
            X_test: Test feature matrix passed to ``predict_proba``.
            y_test: True test labels.
        """
        if not self.models:
            print("No trained models to plot.")
            return

        y_test = np.asarray(y_test)

        # Plot for each model
        fig, axes = plt.subplots(
            1, len(self.models), figsize=(6*len(self.models), 5), squeeze=False)
        axes = axes.ravel()

        for ax, (model_name, model) in zip(axes, self.models.items()):
            y_prob = model.predict_proba(X_test)
            # Columns of y_prob follow model.classes_.
            classes = getattr(model, 'classes_', np.unique(y_test))

            colors = plt.cm.Set1(np.linspace(0, 1, len(classes)))
            for col, (label, color) in enumerate(zip(classes, colors)):
                is_positive = (y_test == label)
                # ROC is undefined without both positives and negatives.
                if is_positive.all() or not is_positive.any():
                    continue
                fpr, tpr, _ = roc_curve(is_positive.astype(int), y_prob[:, col])
                ax.plot(fpr, tpr, color=color, lw=2,
                        label=f'Class {label} (AUC = {auc(fpr, tpr):.2f})')

            # Chance-level diagonal
            ax.plot([0, 1], [0, 1], 'k--', lw=2)
            ax.set_xlim([0.0, 1.0])
            ax.set_ylim([0.0, 1.05])
            ax.set_xlabel('False Positive Rate')
            ax.set_ylabel('True Positive Rate')
            ax.set_title(f'{model_name} - ROC Curves')
            ax.legend(loc="lower right", fontsize=8)
            ax.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

    def cross_validation_analysis(self, X, y, cv_folds=5):
        """Run stratified k-fold cross-validation for every trained model.

        Accuracy and weighted F1 are computed from a single cross-validation
        pass per model, then summarized in two bar charts.

        Args:
            X: Feature matrix.
            y: Labels.
            cv_folds: Number of stratified folds.

        Returns:
            Dict mapping model name to per-fold accuracy/F1 arrays and their
            mean and standard deviation.
        """
        # Local import: cross_validate is not among the module-level imports.
        from sklearn.model_selection import cross_validate

        print(f"\n=== {cv_folds}-Fold Cross-Validation Analysis ===")

        cv_results = {}
        skf = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)

        for model_name, model in self.models.items():
            print(f"\nEvaluating {model_name}...")

            # One pass scores both metrics, so each fold is fit only once.
            scores = cross_validate(
                model, X, y, cv=skf, scoring=('accuracy', 'f1_weighted'))
            cv_scores = scores['test_accuracy']
            cv_f1 = scores['test_f1_weighted']

            cv_results[model_name] = {
                'accuracy_scores': cv_scores,
                'f1_scores': cv_f1,
                'accuracy_mean': cv_scores.mean(),
                'accuracy_std': cv_scores.std(),
                'f1_mean': cv_f1.mean(),
                'f1_std': cv_f1.std()
            }

            print(f"  Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
            print(f"  F1-Score: {cv_f1.mean():.4f} (+/- {cv_f1.std() * 2:.4f})")

        # Plot cross-validation results
        fig, axes = plt.subplots(1, 2, figsize=(15, 5))

        model_names = list(cv_results.keys())
        accuracy_means = [cv_results[name]['accuracy_mean'] for name in model_names]
        accuracy_stds = [cv_results[name]['accuracy_std'] for name in model_names]
        f1_means = [cv_results[name]['f1_mean'] for name in model_names]
        f1_stds = [cv_results[name]['f1_std'] for name in model_names]

        # Accuracy comparison
        x = np.arange(len(model_names))
        axes[0].bar(x, accuracy_means, yerr=accuracy_stds, capsize=5, alpha=0.7)
        axes[0].set_xlabel('Models')
        axes[0].set_ylabel('Accuracy')
        axes[0].set_title('Cross-Validation Accuracy Comparison')
        axes[0].set_xticks(x)
        axes[0].set_xticklabels(model_names, rotation=45)
        axes[0].grid(True, alpha=0.3)

        # F1-score comparison
        axes[1].bar(x, f1_means, yerr=f1_stds, capsize=5, alpha=0.7, color='orange')
        axes[1].set_xlabel('Models')
        axes[1].set_ylabel('F1-Score')
        axes[1].set_title('Cross-Validation F1-Score Comparison')
        axes[1].set_xticks(x)
        axes[1].set_xticklabels(model_names, rotation=45)
        axes[1].grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

        return cv_results

    def statistical_significance_test(self, cv_results, alpha=0.05):
        """Run pairwise paired t-tests on per-fold accuracy scores.

        Args:
            cv_results: Dict as returned by ``cross_validation_analysis``;
                only the ``'accuracy_scores'`` entry of each model is read.
            alpha: Significance threshold applied to each p-value. No
                multiple-comparison correction is applied.

        Returns:
            Dict keyed by ``(model1, model2)`` with the mean difference,
            t-statistic, p-value and a boolean ``significant`` flag.
        """
        print("\n=== Statistical Significance Testing ===")

        outcomes = {}

        # Perform pairwise t-tests
        for model1, model2 in itertools.combinations(cv_results, 2):
            scores1 = np.asarray(cv_results[model1]['accuracy_scores'])
            scores2 = np.asarray(cv_results[model2]['accuracy_scores'])

            # Paired t-test: folds are shared, so scores are matched pairs.
            t_stat, p_value = stats.ttest_rel(scores1, scores2)
            mean_diff = scores1.mean() - scores2.mean()
            # A NaN p-value (e.g. identical scores) compares False here and
            # is therefore reported as not significant.
            significant = bool(p_value < alpha)

            print(f"\n{model1} vs {model2}:")
            print(f"  Mean difference: {mean_diff:.4f}")
            print(f"  t-statistic: {t_stat:.4f}")
            print(f"  p-value: {p_value:.4f}")
            print(f"  Significant (p < {alpha}): {'Yes' if significant else 'No'}")

            outcomes[(model1, model2)] = {
                'mean_difference': float(mean_diff),
                't_statistic': float(t_stat),
                'p_value': float(p_value),
                'significant': significant
            }

        return outcomes

    def generate_comprehensive_report(self):
        """Print and return a summary table of all evaluated models.

        Returns:
            DataFrame with one row per model and metrics formatted as
            strings; empty (columns only) when nothing has been evaluated.
        """
        print("\n" + "="*60)
        print("COMPREHENSIVE MODEL EVALUATION REPORT")
        print("="*60)

        columns = ['Model', 'Accuracy', 'Precision', 'Recall',
                   'F1-Score', 'AUC', 'Top-3 Accuracy']

        if not self.results:
            # max() below would raise on an empty mapping.
            print("No models have been evaluated yet.")
            return pd.DataFrame(columns=columns)

        # Create summary dataframe
        summary_data = []
        for model_name, results in self.results.items():
            summary_data.append({
                'Model': model_name,
                'Accuracy': f"{results['accuracy']:.4f}",
                'Precision': f"{results['precision']:.4f}",
                'Recall': f"{results['recall']:.4f}",
                'F1-Score': f"{results['f1_score']:.4f}",
                'AUC': f"{results['auc_score']:.4f}" if not np.isnan(results['auc_score']) else 'N/A',
                'Top-3 Accuracy': f"{results['top3_accuracy']:.4f}"
            })

        summary_df = pd.DataFrame(summary_data, columns=columns)
        print(summary_df.to_string(index=False))

        # Best performing model (first one wins on ties)
        best_model = max(self.results.items(), key=lambda x: x[1]['accuracy'])
        print(f"\nBest Performing Model: {best_model[0]} (Accuracy: {best_model[1]['accuracy']:.4f})")

        return summary_df

# Demonstrate evaluation framework
# Runs the full workflow end to end: data preparation, training, per-model
# evaluation, plots, cross-validation, significance tests and the report.
print("=== Image Classification Model Evaluation Framework ===")

# Initialize evaluator
evaluator = ImageClassificationEvaluator()

# Prepare data (balanced)
print("\n1. Preparing balanced dataset...")
X_train, X_test, y_train, y_test = evaluator.prepare_data(imbalanced=False)
print(f"Training set: {X_train.shape}, Test set: {X_test.shape}")

# Train models
print("\n2. Training models...")
models = evaluator.train_models(X_train, y_train)

# Evaluate each model
print("\n3. Evaluating models...")
for name, model in models.items():
    print(f"\nEvaluating {name}...")
    evaluator.evaluate_single_model(model, name, X_test, y_test)

# Generate visualizations
print("\n4. Generating evaluation visualizations...")
evaluator.plot_confusion_matrices()
evaluator.plot_roc_curves(X_test, y_test)

# Cross-validation analysis
# NOTE: the splits are recombined after scaling, so every CV fold sees
# features standardized with statistics from the original training split.
print("\n5. Cross-validation analysis...")
X_full = np.vstack([X_train, X_test])
y_full = np.concatenate([y_train, y_test])
cv_results = evaluator.cross_validation_analysis(X_full, y_full)

# Statistical significance testing
evaluator.statistical_significance_test(cv_results)

# Comprehensive report
print("\n6. Generating comprehensive report...")
summary_report = evaluator.generate_comprehensive_report()

print("\nEvaluation framework demonstration completed!")
