# Model Evaluation and Error Analysis

This notebook demonstrates interactive evaluation of the trained text classification model: overall and per-class metrics, confusion-matrix and ROC plots, error analysis, and report generation.




In [1]:

import json
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support, 
    confusion_matrix, classification_report,
    roc_curve, auc, roc_auc_score
)
from sklearn.preprocessing import label_binarize

from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    pipeline, Trainer
)
from datasets import load_dataset

# Visualization
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Set style: select the 'seaborn-v0_8' Matplotlib style sheet and the 'husl'
# seaborn colour palette for the figures drawn in this notebook.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Banner confirming that the import/setup cell ran to completion.
print("🔍 Evaluation Analysis Environment Ready!")


🔍 Evaluation Analysis Environment Ready!


In [2]:

class ComprehensiveEvaluator:
    """Advanced evaluation system for text classification models.

    Wraps a Hugging Face sequence-classification model in a
    ``text-classification`` pipeline, runs it over a dataset, and offers
    metrics, plots, error analysis and a JSON report for the most recent run.

    Attributes:
        model_path: Location of the trained model, stored as a ``Path``.
        tokenizer_path: Location of the tokenizer, stored as a ``Path``.
        model: The loaded model (``None`` until ``load_model`` succeeds).
        tokenizer: The loaded tokenizer (``None`` until ``load_model`` succeeds).
        classifier: The pipeline built from ``model`` and ``tokenizer``.
        evaluation_results: Dict written by ``evaluate_on_dataset``; the
            attribute does not exist before the first evaluation.
    """

    def __init__(self, model_path, tokenizer_path):
        """Remember both paths and load the model right away.

        Args:
            model_path: Directory (or identifier) of the trained model.
            tokenizer_path: Directory (or identifier) of the tokenizer.
        """
        self.model_path = Path(model_path)
        self.tokenizer_path = Path(tokenizer_path)
        self.model = None
        self.tokenizer = None
        self.classifier = None
        self.load_model()

    def load_model(self):
        """Load trained model and tokenizer and build the inference pipeline.

        The pipeline is placed on GPU 0 when CUDA is available, otherwise on
        the CPU (``device=-1``).
        """
        print(f"Loading model from {self.model_path}...")

        self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_path)
        self.model = AutoModelForSequenceClassification.from_pretrained(self.model_path)
        self.classifier = pipeline(
            'text-classification',
            model=self.model,
            tokenizer=self.tokenizer,
            device=0 if torch.cuda.is_available() else -1
        )

        print("Model loaded successfully!")

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    def _require_results(self):
        """Return the stored evaluation results or raise if none exist yet."""
        results = getattr(self, 'evaluation_results', None)
        if results is None:
            raise ValueError("Run evaluation first!")
        return results

    def _labels_to_ids(self, predictions):
        """Translate pipeline label strings into integer class ids.

        The model's own ``config.id2label`` table is preferred because it is
        the table the pipeline uses to produce the strings. When it is missing
        or does not cover every prediction, the ``NEGATIVE``/``POSITIVE``
        convention is tried. Labels that cannot be translated at all are
        returned unchanged.
        """
        config = getattr(getattr(self, 'model', None), 'config', None)
        id2label = getattr(config, 'id2label', None) or {}
        label_map = {label: int(idx) for idx, label in id2label.items()}

        if not label_map or any(p not in label_map for p in predictions):
            label_map = {'NEGATIVE': 0, 'POSITIVE': 1}

        if all(p in label_map for p in predictions):
            return [label_map[p] for p in predictions]
        return predictions

    @staticmethod
    def _positive_class_scores(y_pred, scores, positive_label):
        """Convert predicted-label confidences into positive-class scores.

        The pipeline reports the confidence of whichever label it predicted,
        but ROC analysis needs a score for one fixed (positive) class. For a
        prediction of the other class the score is taken as ``1 - confidence``.
        NOTE(review): this assumes the two class probabilities sum to one
        (two-class softmax output) - confirm for the deployed model.
        """
        return [
            score if pred == positive_label else 1.0 - score
            for pred, score in zip(y_pred, scores)
        ]

    @staticmethod
    def _prepare_output_path(path):
        """Create the parent directory of ``path`` if needed and return ``path``."""
        Path(path).parent.mkdir(parents=True, exist_ok=True)
        return path

    # ------------------------------------------------------------------
    # Evaluation
    # ------------------------------------------------------------------
    def evaluate_on_dataset(self, dataset, dataset_name="test", batch_size=32):
        """Comprehensive evaluation on dataset.

        Args:
            dataset: Mapping-like object exposing ``'text'`` and ``'label'``
                columns of equal length.
            dataset_name: Human-readable name stored with the results.
            batch_size: Number of texts handed to the pipeline per call.

        Returns:
            The dict also stored as ``self.evaluation_results`` with keys
            ``dataset_name``, ``predictions``, ``true_labels``,
            ``prediction_scores``, ``metrics`` and ``texts``.
        """
        print(f"Evaluating on {dataset_name} dataset...")

        # Materialise the columns so slicing and indexing behave like lists.
        texts = list(dataset['text'])
        true_labels = list(dataset['label'])

        # The progress bar is optional: fall back to plain iteration when
        # tqdm is not installed instead of failing with a NameError.
        try:
            from tqdm.auto import tqdm as progress
        except ImportError:
            def progress(iterable):
                return iterable

        predictions = []
        prediction_scores = []

        for start in progress(range(0, len(texts), batch_size)):
            batch_texts = texts[start:start + batch_size]
            # truncation=True is forwarded to the tokenizer so that texts
            # longer than the model's maximum input length are cut instead of
            # making the forward pass fail.
            batch_preds = self.classifier(batch_texts, truncation=True)

            for pred in batch_preds:
                predictions.append(pred['label'])
                prediction_scores.append(pred['score'])

        # Convert string labels to numeric ids so they compare with the
        # dataset's labels.
        pred_labels = self._labels_to_ids(predictions)

        metrics = self.calculate_comprehensive_metrics(true_labels, pred_labels, prediction_scores)

        self.evaluation_results = {
            'dataset_name': dataset_name,
            'predictions': pred_labels,
            'true_labels': true_labels,
            'prediction_scores': prediction_scores,
            'metrics': metrics,
            'texts': texts
        }

        return self.evaluation_results

    def calculate_comprehensive_metrics(self, y_true, y_pred, scores):
        """Calculate comprehensive evaluation metrics.

        Args:
            y_true: Ground-truth labels.
            y_pred: Predicted labels.
            scores: Confidence of each *predicted* label, as returned by the
                pipeline.

        Returns:
            Dict with weighted ``accuracy``/``precision``/``recall``/``f1``,
            ``roc_auc`` (``None`` unless exactly two classes occur in
            ``y_true``) and the per-class precision, recall, F1 and support.
        """
        accuracy = accuracy_score(y_true, y_pred)
        precision, recall, f1, _ = precision_recall_fscore_support(
            y_true, y_pred, average='weighted'
        )

        # Per-class metrics
        per_class_metrics = precision_recall_fscore_support(
            y_true, y_pred, average=None
        )

        # ROC AUC for binary classification. The larger label is treated as
        # the positive class and the confidences are converted accordingly;
        # feeding raw predicted-label confidences would rank confident
        # negatives as if they were positives.
        roc_auc = None
        classes = sorted(set(y_true))
        if len(classes) == 2:
            positive_scores = self._positive_class_scores(y_pred, scores, classes[1])
            roc_auc = roc_auc_score(y_true, positive_scores)

        metrics = {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'roc_auc': roc_auc,
            'per_class_precision': per_class_metrics[0],
            'per_class_recall': per_class_metrics[1],
            'per_class_f1': per_class_metrics[2],
            'support': per_class_metrics[3]
        }

        return metrics

    # ------------------------------------------------------------------
    # Visualisation
    # ------------------------------------------------------------------
    def create_confusion_matrix_plot(self, class_names=None):
        """Create advanced confusion matrix visualization.

        Draws the raw and the row-normalised confusion matrix side by side,
        saves the figure to ``reports/confusion_matrix.png`` and shows it.

        Args:
            class_names: Optional tick labels, one per class. When omitted
                seaborn picks the tick labels itself.

        Returns:
            Tuple ``(cm, cm_normalized)``.

        Raises:
            ValueError: If no evaluation has been run yet.
        """
        results = self._require_results()

        y_true = results['true_labels']
        y_pred = results['predictions']

        cm = confusion_matrix(y_true, y_pred)

        # Row-normalise; rows without any true samples stay at zero instead
        # of turning into NaN through a division by zero.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm_normalized = np.divide(
            cm.astype('float'), row_totals,
            out=np.zeros(cm.shape, dtype=float),
            where=row_totals != 0
        )

        # seaborn cannot take None as tick labels; "auto" is its default.
        tick_labels = class_names if class_names is not None else "auto"

        fig, axes = plt.subplots(1, 2, figsize=(15, 6))

        # Raw confusion matrix
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                   xticklabels=tick_labels, yticklabels=tick_labels,
                   ax=axes[0])
        axes[0].set_title('Confusion Matrix (Raw Counts)')
        axes[0].set_xlabel('Predicted')
        axes[0].set_ylabel('Actual')

        # Normalized confusion matrix
        sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='Blues',
                   xticklabels=tick_labels, yticklabels=tick_labels,
                   ax=axes[1])
        axes[1].set_title('Confusion Matrix (Normalized)')
        axes[1].set_xlabel('Predicted')
        axes[1].set_ylabel('Actual')

        plt.tight_layout()
        plt.savefig(self._prepare_output_path('reports/confusion_matrix.png'),
                    dpi=300, bbox_inches='tight')
        plt.show()

        return cm, cm_normalized

    def create_roc_curve_analysis(self):
        """Create ROC curve analysis for binary classification.

        Saves the plot to ``reports/roc_curve.png`` and shows it.

        Returns:
            Tuple ``(fpr, tpr, thresholds)``, or ``None`` when the stored
            metrics carry no ROC AUC (i.e. the task was not binary).

        Raises:
            ValueError: If no evaluation has been run yet.
        """
        results = self._require_results()

        if results['metrics']['roc_auc'] is None:
            print("⚠️ ROC analysis only available for binary classification")
            return

        y_true = results['true_labels']
        # Same positive-class convention as calculate_comprehensive_metrics.
        positive_label = sorted(set(y_true))[1]
        scores = self._positive_class_scores(
            results['predictions'], results['prediction_scores'], positive_label
        )

        fpr, tpr, thresholds = roc_curve(y_true, scores)
        roc_auc = auc(fpr, tpr)

        plt.figure(figsize=(10, 8))
        plt.plot(fpr, tpr, color='darkorange', lw=2,
                label=f'ROC curve (AUC = {roc_auc:.3f})')
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver Operating Characteristic (ROC) Curve')
        plt.legend(loc="lower right")
        plt.grid(True, alpha=0.3)
        plt.savefig(self._prepare_output_path('reports/roc_curve.png'),
                    dpi=300, bbox_inches='tight')
        plt.show()

        return fpr, tpr, thresholds

    # ------------------------------------------------------------------
    # Error analysis and reporting
    # ------------------------------------------------------------------
    def error_analysis(self, num_examples=10):
        """Detailed error analysis with examples.

        Prints the number and rate of misclassifications, a count per
        ``true -> predicted`` pattern and up to ``num_examples`` sample texts.

        Args:
            num_examples: Maximum number of misclassified texts to print.

        Returns:
            Tuple ``(misclassified_indices, error_patterns)``.

        Raises:
            ValueError: If no evaluation has been run yet.
        """
        results = self._require_results()
        true_labels = results['true_labels']
        pred_labels = results['predictions']

        misclassified_indices = [
            i for i, (true, pred) in enumerate(zip(true_labels, pred_labels))
            if true != pred
        ]

        # An empty evaluation set has an error rate of zero, not a
        # ZeroDivisionError.
        total = len(true_labels)
        error_rate = len(misclassified_indices) / total if total else 0.0

        print(f"Error Analysis: {len(misclassified_indices)} misclassified examples")
        print(f"Error rate: {error_rate:.2%}")

        # Count how often each (true -> predicted) confusion occurs.
        error_patterns = {}
        for idx in misclassified_indices:
            pattern = f"{true_labels[idx]} -> {pred_labels[idx]}"
            error_patterns[pattern] = error_patterns.get(pattern, 0) + 1

        print("\nError Patterns:")
        for pattern, count in sorted(error_patterns.items(), key=lambda x: x[1], reverse=True):
            print(f"  {pattern}: {count} examples")

        print(f"\nSample Misclassified Examples (showing {num_examples}):")
        for i, idx in enumerate(misclassified_indices[:num_examples]):
            text = results['texts'][idx]
            confidence = results['prediction_scores'][idx]

            print(f"\nExample {i+1}:")
            print(f"Text: {text[:200]}...")
            print(f"True: {true_labels[idx]}, Predicted: {pred_labels[idx]}, Confidence: {confidence:.3f}")

        return misclassified_indices, error_patterns

    def save_evaluation_report(self, output_path='reports/evaluation_metrics.json'):
        """Save comprehensive evaluation report as JSON.

        Args:
            output_path: Destination file; missing parent directories are
                created.

        Returns:
            The report dict that was written.

        Raises:
            ValueError: If no evaluation has been run yet.
        """
        results = self._require_results()
        metrics = results['metrics']

        # Compare against None explicitly: an AUC of exactly 0.0 is a valid
        # (if terrible) value and must not be reported as missing.
        roc_auc = metrics['roc_auc']

        report_data = {
            'dataset_info': {
                'name': results['dataset_name'],
                'size': len(results['true_labels'])
            },
            'metrics': {
                'accuracy': float(metrics['accuracy']),
                'precision': float(metrics['precision']),
                'recall': float(metrics['recall']),
                'f1': float(metrics['f1']),
                'roc_auc': float(roc_auc) if roc_auc is not None else None
            },
            'per_class_metrics': {
                'precision': [float(x) for x in metrics['per_class_precision']],
                'recall': [float(x) for x in metrics['per_class_recall']],
                'f1': [float(x) for x in metrics['per_class_f1']],
                'support': [int(x) for x in metrics['support']]
            }
        }

        with open(self._prepare_output_path(output_path), 'w') as f:
            json.dump(report_data, f, indent=2)

        print(f"Evaluation report saved to {output_path}")

        return report_data

# Build the shared evaluator instance; constructing it loads the model and
# tokenizer from the given locations.
evaluator = ComprehensiveEvaluator(
    model_path='./models/trained_model',
    tokenizer_path='./models/tokenizer',
)


Loading model from models\trained_model...


OSError: models\tokenizer does not appear to have a file named config.json. Checkout 'https://huggingface.co/models\tokenizer/tree/None' for available files.

In [None]:

def comprehensive_evaluation_session():
    """Interactive evaluation session with multiple analysis options.

    Loads the IMDB test split, evaluates it with the module-level
    ``evaluator``, prints a metric summary and then loops over a text menu
    until the user picks ``0`` (or no interactive input is available).
    """

    # Load test dataset
    print("Loading test dataset...")
    test_dataset = load_dataset('imdb')['test']

    # Run evaluation
    print("Running comprehensive evaluation...")
    results = evaluator.evaluate_on_dataset(test_dataset, "IMDB Test")

    # Display results
    metrics = results['metrics']
    print("\nModel Performance Summary:")
    print(f"{'='*50}")
    print(f"Accuracy:  {metrics['accuracy']:.4f}")
    print(f"Precision: {metrics['precision']:.4f}")
    print(f"Recall:    {metrics['recall']:.4f}")
    print(f"F1 Score:  {metrics['f1']:.4f}")
    # Explicit None check so that a ROC AUC of exactly 0.0 is still shown.
    if metrics['roc_auc'] is not None:
        print(f"ROC AUC:   {metrics['roc_auc']:.4f}")

    class_names = ['Negative', 'Positive']

    # Menu key -> action. Lambdas defer the name lookups until the option is
    # actually chosen.
    actions = {
        '1': lambda: evaluator.create_confusion_matrix_plot(class_names),
        '2': lambda: evaluator.create_roc_curve_analysis(),
        '3': lambda: evaluator.error_analysis(num_examples=15),
        '4': lambda: display_per_class_performance(results),
        '5': lambda: evaluator.save_evaluation_report(),
        '6': lambda: generate_all_visualizations(results),
    }

    # Interactive analysis menu
    while True:
        print("\nAnalysis Options:")
        print("1. Confusion Matrix Analysis")
        print("2. ROC Curve Analysis")
        print("3. Error Analysis")
        print("4. Per-Class Performance")
        print("5. Save Evaluation Report")
        print("6. Generate All Visualizations")
        print("0. Exit")

        try:
            choice = input("\nSelect option (0-6): ").strip()
        except (EOFError, NotImplementedError):
            # No usable stdin. EOFError is what input() raises at end of
            # input; NOTE(review): headless notebook runners are expected to
            # raise a NotImplementedError subclass instead - confirm.
            print("No interactive input available; leaving the analysis menu.")
            break

        if choice == '0':
            break

        action = actions.get(choice)
        if action is None:
            print("Invalid option!")
        else:
            action()

def display_per_class_performance(results, class_names=None):
    """Display detailed per-class performance metrics.

    Prints a table of precision, recall, F1 and support per class, then draws
    one bar chart per metric, saves the figure to
    ``reports/per_class_performance.png`` and shows it.

    Args:
        results: Evaluation results dict; only ``results['metrics']`` with
            its ``per_class_*`` and ``support`` entries is read.
        class_names: Optional display names, one per class. Defaults to
            ``['Negative', 'Positive']`` for two classes and to
            ``'Class <i>'`` otherwise.

    Raises:
        ValueError: If ``class_names`` does not have one entry per class.
    """
    metrics = results['metrics']
    n_classes = len(metrics['per_class_precision'])

    if class_names is None:
        if n_classes == 2:
            class_names = ['Negative', 'Positive']
        else:
            class_names = [f'Class {i}' for i in range(n_classes)]
    elif len(class_names) != n_classes:
        raise ValueError(
            f"Expected {n_classes} class names, got {len(class_names)}"
        )
    class_names = list(class_names)

    # Create DataFrame for better display
    per_class_df = pd.DataFrame({
        'Class': class_names,
        'Precision': metrics['per_class_precision'],
        'Recall': metrics['per_class_recall'],
        'F1-Score': metrics['per_class_f1'],
        'Support': metrics['support']
    })

    print("\n📈 Per-Class Performance:")
    print(per_class_df.round(4))

    # Visualize per-class metrics, one panel per metric
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))

    metrics_to_plot = ['Precision', 'Recall', 'F1-Score']
    colors = ['skyblue', 'lightcoral', 'lightgreen']

    for ax, metric, color in zip(axes, metrics_to_plot, colors):
        ax.bar(class_names, per_class_df[metric], color=color, alpha=0.7)
        ax.set_title(f'Per-Class {metric}')
        ax.set_ylabel(metric)
        ax.set_ylim(0, 1)

        # Add value labels just above each bar
        for j, v in enumerate(per_class_df[metric]):
            ax.text(j, v + 0.02, f'{v:.3f}', ha='center', va='bottom')

    plt.tight_layout()
    # Make sure the output directory exists before writing the figure.
    Path('reports').mkdir(parents=True, exist_ok=True)
    plt.savefig('reports/per_class_performance.png', dpi=300, bbox_inches='tight')
    plt.show()

def generate_all_visualizations(results):
    """Produce every visualization in one go.

    Runs, in order: the confusion-matrix plot, the ROC curve analysis, the
    per-class performance charts and the performance dashboard. The first two
    use the module-level ``evaluator``; the last two receive ``results``.
    """
    print(" Generating comprehensive visualization suite...")

    label_names = ['Negative', 'Positive']

    # Steps 1-2: plots drawn from the evaluator's stored results
    evaluator.create_confusion_matrix_plot(label_names)
    evaluator.create_roc_curve_analysis()

    # Steps 3-4: renderers that take the results dict directly
    for render in (display_per_class_performance, create_performance_dashboard):
        render(results)

    print("All visualizations generated and saved!")

def create_performance_dashboard(results):
    """Create comprehensive performance dashboard.

    Builds a 2x2 Plotly figure (overall metrics bar chart, prediction-score
    histogram, correct/incorrect pie chart, confidence-vs-correctness
    scatter), writes it to ``reports/performance_dashboard.html`` and shows it.

    Args:
        results: Evaluation results dict; reads ``metrics``,
            ``prediction_scores``, ``true_labels`` and ``predictions``.
    """
    metrics = results['metrics']
    scores = results['prediction_scores']

    # 1 where the prediction matches the true label, 0 otherwise; reused by
    # both the pie chart and the scatter plot.
    correctness = [
        1 if t == p else 0
        for t, p in zip(results['true_labels'], results['predictions'])
    ]
    correct = sum(correctness)
    incorrect = len(results['true_labels']) - correct

    # Create dashboard figure
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=('Overall Metrics', 'Score Distribution', 
                       'Error Analysis', 'Model Confidence'),
        specs=[[{"type": "bar"}, {"type": "histogram"}],
               [{"type": "pie"}, {"type": "scatter"}]]
    )

    # Overall metrics bar chart
    metric_names = ['Accuracy', 'Precision', 'Recall', 'F1']
    metric_values = [metrics['accuracy'], metrics['precision'], 
                    metrics['recall'], metrics['f1']]

    fig.add_trace(
        go.Bar(x=metric_names, y=metric_values, 
               marker_color=['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']),
        row=1, col=1
    )

    # Score distribution histogram
    fig.add_trace(
        go.Histogram(x=scores, nbinsx=30, marker_color='lightblue'),
        row=1, col=2
    )

    # Error analysis pie chart
    fig.add_trace(
        go.Pie(labels=['Correct', 'Incorrect'], values=[correct, incorrect],
               marker_colors=['lightgreen', 'lightcoral']),
        row=2, col=1
    )

    # Confidence vs Accuracy scatter
    fig.add_trace(
        go.Scatter(x=scores, y=correctness, mode='markers',
                  marker=dict(color=correctness, colorscale='RdYlBu')),
        row=2, col=2
    )

    # Update layout
    fig.update_layout(
        title_text="Model Performance Dashboard",
        showlegend=False,
        height=800
    )

    # Make sure the output directory exists before writing the HTML file.
    Path('reports').mkdir(parents=True, exist_ok=True)
    fig.write_html('reports/performance_dashboard.html')
    fig.show()

# Run comprehensive evaluation
# NOTE: the session ends in an input()-driven menu, so this cell blocks on
# keyboard input; choose option 0 to leave the menu.
print("Starting Comprehensive Evaluation Session...")
comprehensive_evaluation_session()
