# LLM Data Factory - Model Evaluation

This notebook evaluates the performance of our fine-tuned Phi-3-mini student model on customer support ticket classification.

## Evaluation Overview

We will:
1. Load the fine-tuned model
2. Load the test dataset
3. Generate predictions
4. Analyze performance metrics
5. Create visualizations
6. Compare with baseline models

In [None]:
# Import required libraries
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import precision_recall_fscore_support
import sys
import os
from pathlib import Path

# Add the parent of the current working directory to sys.path for imports.
# `Path().parent` evaluates to "." (the current directory itself), so the
# working directory is resolved to an absolute path before taking its parent.
PROJECT_ROOT = str(Path.cwd().parent)
if PROJECT_ROOT not in sys.path:  # keep the cell idempotent when it is re-run
    sys.path.append(PROJECT_ROOT)

# Set style for plots.
# The 'seaborn-v0_8' style sheet only exists in Matplotlib >= 3.6; older
# releases ship the same style under the name 'seaborn'. Use whichever one is
# installed instead of raising OSError, and keep Matplotlib's default style if
# neither is available.
for style_name in ('seaborn-v0_8', 'seaborn'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break
sns.set_palette("husl")

print(" Libraries imported successfully")

In [None]:
# Load the test dataset
test_data_path = "../data/test_data.json"


def load_test_records(path):
    """Read the JSON test set stored at `path` and return the parsed content.

    The file is decoded explicitly as UTF-8 so that loading does not depend on
    the platform's default text encoding.

    Raises:
        FileNotFoundError: if `path` does not exist.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


try:
    test_data = load_test_records(test_data_path)

    print(f" Loaded {len(test_data)} test samples")

    # Convert to DataFrame for easier analysis
    test_df = pd.DataFrame(test_data)

    # Display basic info about the test set
    print("\n Test Dataset Overview:")
    print(f"Total samples: {len(test_df)}")
    print(f"Categories: {test_df['category'].unique()}")
    print(f"\nCategory distribution:")
    print(test_df['category'].value_counts())

    # Display first few examples, numbered by position (1, 2, 3) so the
    # listing does not depend on the index labels being integers.
    print(f"\n Sample test tickets:")
    for number, (_, row) in enumerate(test_df.head(3).iterrows(), start=1):
        print(f"\n{number}. Category: {row['category']}")
        print(f"   Message: {row['customer_message'][:100]}...")

except FileNotFoundError:
    print(f"Test data file not found: {test_data_path}")
    print("Please ensure you have created the test dataset")
except Exception as e:
    # Best-effort cell: report the problem (bad JSON, missing columns, ...)
    # instead of aborting the notebook run.
    print(f" Error loading test data: {e}")

In [None]:
# Load the fine-tuned model and run one sample prediction as a smoke test.
# Any exception raised along the way (including a failed import) is reported
# and leaves `classifier` set to None.
try:
    from app.inference import load_classifier, predict_ticket_category

    print("üîÑ Loading the fine-tuned model...")
    classifier = load_classifier()

    if classifier is None:
        # load_classifier() returned None: tell the reader how to produce a model.
        for hint in (
            "‚ùå Failed to load model",
            "This might be because the model hasn't been trained yet.",
            "Please run: python scripts/02_finetune_student_model.py",
        ):
            print(hint)
    else:
        print("‚úÖ Model loaded successfully!")

        # Classify one hand-written ticket and show the returned fields.
        test_message = "The app keeps crashing when I try to save my work. This is very urgent!"
        result = predict_ticket_category(classifier, test_message)

        print("\nüß™ Test prediction:")
        print("Message: " + test_message)
        print("Predicted: {}".format(result['predicted_category']))
        print("Confidence: {:.3f}".format(result['confidence']))
        print("All probabilities: {}".format(result['probabilities']))

except Exception as err:
    print(f"‚ùå Error loading model: {err}")
    print("Make sure you have trained the model first.")
    classifier = None

In [None]:
# Generate predictions for all test samples
def collect_predictions(messages, predict_fn, progress_every=10):
    """Run `predict_fn` on every message and gather the results.

    Args:
        messages: pandas Series of message texts; its index labels are used
            in error messages.
        predict_fn: callable taking one message and returning a mapping with
            the keys 'predicted_category', 'confidence' and 'probabilities'.
        progress_every: print a progress line after this many samples.

    Returns:
        A tuple ``(predictions, confidences, all_probabilities)`` of three
        lists, each with exactly one entry per message. A sample whose
        prediction raises, or whose result lacks a key, is recorded as
        ``("Unknown", 0.0, {})``.
    """
    predictions, confidences, all_probabilities = [], [], []
    total = len(messages)

    for position, (idx, message) in enumerate(messages.items(), start=1):
        try:
            result = predict_fn(message)
            # Read every field BEFORE appending anything: if a key is missing
            # the sample falls back as a whole, so the three lists can never
            # drift out of step with each other.
            outcome = (
                result['predicted_category'],
                result['confidence'],
                result['probabilities'],
            )
        except Exception as e:
            print(f"Error processing sample {idx}: {e}")
            outcome = ("Unknown", 0.0, {})

        predictions.append(outcome[0])
        confidences.append(outcome[1])
        all_probabilities.append(outcome[2])

        # Count by loop position rather than by index label, so progress is
        # correct even when the index is not 0..n-1.
        if position % progress_every == 0:
            print(f"Processed {position}/{total} samples...")

    return predictions, confidences, all_probabilities


# Look both names up via globals() so the cell skips cleanly (instead of
# raising NameError) when an earlier cell did not define them.
if globals().get('classifier') is not None and globals().get('test_df') is not None:
    print("üîÑ Generating predictions for all test samples...")

    predictions, confidences, all_probabilities = collect_predictions(
        test_df['customer_message'],
        lambda message: predict_ticket_category(classifier, message),
    )

    # Add predictions to dataframe (later cells read these columns)
    test_df['predicted_category'] = predictions
    test_df['confidence'] = confidences
    test_df['probabilities'] = all_probabilities

    print(f"‚úÖ Generated predictions for {len(test_df)} samples")

    # Display some example predictions
    print(f"\nüìù Sample predictions:")
    for _, row in test_df.head(5).iterrows():
        correct = "‚úÖ" if row['category'] == row['predicted_category'] else "‚ùå"
        print(f"{correct} True: {row['category']} | Predicted: {row['predicted_category']} | Confidence: {row['confidence']:.3f}")

else:
    print("‚è≠Ô∏è Skipping predictions - model not loaded or test data not available")

In [None]:
# Calculate and display performance metrics.
# The guard also checks that test_df exists: when the test data could not be
# loaded there is no test_df at all, and the cell should print the "Skipping"
# message rather than fail with NameError.
if 'test_df' in globals() and 'predicted_category' in test_df.columns:
    print("üìä Performance Metrics")
    print("=" * 50)

    # Get true and predicted labels
    y_true = test_df['category'].tolist()
    y_pred = test_df['predicted_category'].tolist()

    # Overall accuracy
    accuracy = accuracy_score(y_true, y_pred)
    print(f"üéØ Overall Accuracy: {accuracy:.3f} ({accuracy*100:.1f}%)")

    # Detailed classification report.
    # zero_division=0 reports 0.0 for labels whose precision/recall is
    # undefined (e.g. the "Unknown" fallback label, which never occurs in
    # y_true) without emitting a warning for each of them.
    print(f"\nüìã Detailed Classification Report:")
    print("-" * 40)
    report = classification_report(y_true, y_pred, output_dict=True, zero_division=0)

    # Print formatted report: per-label rows only, the aggregate rows are
    # summarised separately below.
    for category, metrics in report.items():
        if category not in ['accuracy', 'macro avg', 'weighted avg']:
            print(f"\n{category}:")
            print(f"  Precision: {metrics['precision']:.3f}")
            print(f"  Recall:    {metrics['recall']:.3f}")
            print(f"  F1-Score:  {metrics['f1-score']:.3f}")
            print(f"  Support:   {metrics['support']}")

    # Macro and weighted averages
    print(f"\nüìà Average Metrics:")
    print(f"Macro Average F1:    {report['macro avg']['f1-score']:.3f}")
    print(f"Weighted Average F1: {report['weighted avg']['f1-score']:.3f}")

    # Row-wise correctness, computed once and reused below.
    correct_mask = test_df['category'] == test_df['predicted_category']

    # Per-category accuracy: share of correct rows among the rows whose true
    # label is that category.
    print(f"\nüéØ Per-Category Accuracy:")
    for category in test_df['category'].unique():
        mask = test_df['category'] == category
        cat_accuracy = correct_mask[mask].mean()
        cat_count = mask.sum()
        print(f"{category}: {cat_accuracy:.3f} ({cat_count} samples)")

    # Confidence analysis
    print(f"\nüîç Confidence Analysis:")
    print(f"Average Confidence: {test_df['confidence'].mean():.3f}")
    print(f"Min Confidence: {test_df['confidence'].min():.3f}")
    print(f"Max Confidence: {test_df['confidence'].max():.3f}")

    # Correct vs incorrect predictions confidence.
    # Each value is NaN when its group is empty (e.g. no incorrect predictions).
    correct_conf = test_df.loc[correct_mask, 'confidence'].mean()
    incorrect_conf = test_df.loc[~correct_mask, 'confidence'].mean()

    print(f"Avg Confidence (Correct): {correct_conf:.3f}")
    print(f"Avg Confidence (Incorrect): {incorrect_conf:.3f}")

else:
    print("‚è≠Ô∏è Skipping metrics calculation - predictions not available")

In [None]:
# Create confusion matrix visualization (2x2 figure: confusion matrix, class
# distribution, confidence histograms, per-category F1).
# The guard also checks that test_df exists so the cell skips cleanly instead
# of raising NameError when the test data was never loaded.
if 'test_df' in globals() and 'predicted_category' in test_df.columns:
    # Labels are taken straight from the dataframe so this cell does not rely
    # on variables left behind by another cell.
    y_true = test_df['category'].tolist()
    y_pred = test_df['predicted_category'].tolist()
    categories = sorted(test_df['category'].unique())

    # Calculate confusion matrix.
    # Pin the label order to `categories`: by default the matrix covers the
    # union of true and predicted labels, so a prediction outside the true
    # categories (e.g. the "Unknown" fallback) would add a row/column and make
    # the tick labels below point at the wrong cells. Such predictions are
    # left out of the matrix.
    cm = confusion_matrix(y_true, y_pred, labels=categories)

    # Create figure with subplots
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    fig.suptitle('Model Evaluation Results', fontsize=16, fontweight='bold')

    # 1. Confusion Matrix
    ax1 = axes[0, 0]
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=categories, yticklabels=categories, ax=ax1)
    ax1.set_title('Confusion Matrix')
    ax1.set_xlabel('Predicted')
    ax1.set_ylabel('Actual')

    # 2. Category Distribution
    ax2 = axes[0, 1]
    category_counts = test_df['category'].value_counts()
    ax2.bar(category_counts.index, category_counts.values)
    ax2.set_title('Test Set Category Distribution')
    ax2.set_xlabel('Category')
    ax2.set_ylabel('Count')
    ax2.tick_params(axis='x', rotation=45)

    # 3. Confidence Distribution
    # Use dedicated names for the per-row confidence Series: `correct_conf` /
    # `incorrect_conf` are scalar means in the metrics cell and are compared
    # as scalars in the final report, so they must not be overwritten here.
    ax3 = axes[1, 0]
    is_correct = test_df['category'] == test_df['predicted_category']
    conf_when_correct = test_df.loc[is_correct, 'confidence']
    conf_when_incorrect = test_df.loc[~is_correct, 'confidence']

    ax3.hist(conf_when_correct, alpha=0.7, label='Correct', bins=20, color='green')
    ax3.hist(conf_when_incorrect, alpha=0.7, label='Incorrect', bins=20, color='red')
    ax3.set_title('Confidence Distribution')
    ax3.set_xlabel('Confidence Score')
    ax3.set_ylabel('Frequency')
    ax3.legend()

    # 4. Per-category F1 scores
    # The report is the same for every category, so build it once instead of
    # once per loop iteration.
    ax4 = axes[1, 1]
    cat_report = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
    f1_scores = [cat_report[category]['f1-score'] for category in categories]

    bars = ax4.bar(categories, f1_scores)
    ax4.set_title('F1-Score by Category')
    ax4.set_xlabel('Category')
    ax4.set_ylabel('F1-Score')
    ax4.tick_params(axis='x', rotation=45)
    ax4.set_ylim(0, 1)

    # Add value labels on bars
    for bar, score in zip(bars, f1_scores):
        ax4.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
                f'{score:.3f}', ha='center', va='bottom')

    plt.tight_layout()
    plt.show()

    print("üìä Visualizations created successfully!")

else:
    print("‚è≠Ô∏è Skipping visualizations - predictions not available")

In [None]:
# Export results and generate final report.
# Relies on `accuracy` and `report` (metrics cell) and on `cm` and
# `categories` (visualization cell) having been computed earlier in the run.
# The guard also checks that test_df exists so the cell skips cleanly instead
# of raising NameError when the test data was never loaded.
if 'test_df' in globals() and 'predicted_category' in test_df.columns:
    # Save detailed results
    results_path = "../results"
    os.makedirs(results_path, exist_ok=True)

    # Row-wise correctness and the mean confidence of each group, computed
    # here as plain scalars. The report below compares the two means with `>`,
    # which only works on scalars, so they are not taken from whatever
    # `correct_conf` / `incorrect_conf` currently hold in the kernel.
    is_correct = test_df['category'] == test_df['predicted_category']
    mean_conf_correct = float(test_df.loc[is_correct, 'confidence'].mean())
    mean_conf_incorrect = float(test_df.loc[~is_correct, 'confidence'].mean())

    # Save predictions with details
    # NOTE(review): assumes the test data has a 'ticket_id' column — confirm.
    results_df = test_df[['ticket_id', 'customer_message', 'category',
                         'predicted_category', 'confidence']].copy()
    results_df['correct'] = results_df['category'] == results_df['predicted_category']

    results_df.to_csv(f"{results_path}/detailed_predictions.csv", index=False)
    print(f"‚úÖ Saved detailed predictions to {results_path}/detailed_predictions.csv")

    # Save summary metrics
    summary_metrics = {
        "overall_accuracy": float(accuracy),
        "total_samples": len(test_df),
        "correct_predictions": int(is_correct.sum()),
        "average_confidence": float(test_df['confidence'].mean()),
        "per_category_metrics": {},
        "confusion_matrix": cm.tolist(),
        "categories": categories
    }

    # Add per-category metrics (cast to built-in types so json can encode them)
    for category in categories:
        cat_metrics = report[category]
        summary_metrics["per_category_metrics"][category] = {
            "precision": float(cat_metrics['precision']),
            "recall": float(cat_metrics['recall']),
            "f1_score": float(cat_metrics['f1-score']),
            "support": int(cat_metrics['support'])
        }

    # Save summary metrics as JSON
    with open(f"{results_path}/evaluation_summary.json", 'w', encoding='utf-8') as f:
        json.dump(summary_metrics, f, indent=2)
    print(f"‚úÖ Saved evaluation summary to {results_path}/evaluation_summary.json")

    # Generate final report
    print("\n" + "="*60)
    print("üéâ FINAL EVALUATION REPORT")
    print("="*60)

    print(f"\nüìà Overall Performance:")
    print(f"   ‚Ä¢ Accuracy: {accuracy:.1%}")
    print(f"   ‚Ä¢ Total Samples: {len(test_df)}")
    print(f"   ‚Ä¢ Correct Predictions: {is_correct.sum()}")
    print(f"   ‚Ä¢ Average Confidence: {test_df['confidence'].mean():.3f}")

    print(f"\nüèÜ Best Performing Category:")
    best_category = max(categories, key=lambda x: report[x]['f1-score'])
    best_f1 = report[best_category]['f1-score']
    print(f"   ‚Ä¢ {best_category}: F1-Score = {best_f1:.3f}")

    print(f"\n‚ö†Ô∏è Category Needing Improvement:")
    worst_category = min(categories, key=lambda x: report[x]['f1-score'])
    worst_f1 = report[worst_category]['f1-score']
    print(f"   ‚Ä¢ {worst_category}: F1-Score = {worst_f1:.3f}")

    print(f"\nüí° Key Insights:")
    # If either group is empty its mean is NaN, the comparison is False and
    # the insight is simply omitted.
    if mean_conf_correct > mean_conf_incorrect:
        print(f"   ‚Ä¢ Model shows good confidence calibration")
        print(f"     (Correct: {mean_conf_correct:.3f} vs Incorrect: {mean_conf_incorrect:.3f})")

    if accuracy > 0.8:
        print(f"   ‚Ä¢ Excellent performance! Model ready for production")
    elif accuracy > 0.7:
        print(f"   ‚Ä¢ Good performance, consider additional tuning")
    else:
        print(f"   ‚Ä¢ Model needs improvement, review training data/process")

    print("\nüéØ Next Steps:")
    print("   ‚Ä¢ Deploy model to production if performance is satisfactory")
    print("   ‚Ä¢ Consider collecting more training data for underperforming categories")
    print("   ‚Ä¢ Monitor model performance in real-world scenarios")

else:
    print("‚è≠Ô∏è Skipping final report - predictions not available")