# LLM Data Factory - Model Evaluation

This notebook evaluates the performance of our fine-tuned Phi-3-mini student model on customer support ticket classification.

## Evaluation Overview

We will:
1. Load the fine-tuned model
2. Load the test dataset
3. Generate predictions
4. Analyze performance metrics
5. Create visualizations
6. Compare with baseline models

In [None]:
# Import required libraries
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import precision_recall_fscore_support
import sys
import os
from pathlib import Path

# Add the parent directory to path for imports.
# Path().parent is still "." (the parent of a bare relative "." is "."), so
# resolve the parent of the working directory explicitly. The membership check
# keeps sys.path free of duplicates when this cell is re-run.
PROJECT_ROOT = Path.cwd().parent
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

# Set style for plots: apply the 'seaborn-v0_8' matplotlib style sheet first,
# then select seaborn's "husl" palette.
# NOTE(review): keep this order - the palette call presumably overrides the
# colour cycle set by the style sheet; confirm before reordering.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Confirmation message so the reader can see the setup cell ran to completion
print(" Libraries imported successfully")

In [None]:
# Load the test dataset
#
# On success this cell defines `test_data` (the parsed JSON) and `test_df`
# (the same records as a DataFrame). On failure it only prints a message and
# leaves `test_df` undefined, which later cells check for before using it.
test_data_path = "../data/test_data.json"

try:
    # Read as UTF-8 explicitly: without `encoding`, open() uses the platform
    # locale, which can fail or garble non-ASCII ticket text.
    with open(test_data_path, 'r', encoding='utf-8') as f:
        test_data = json.load(f)

    print(f" Loaded {len(test_data)} test samples")

    # Convert to DataFrame for easier analysis
    test_df = pd.DataFrame(test_data)

    # Display basic info about the test set
    print("\n Test Dataset Overview:")
    print(f"Total samples: {len(test_df)}")
    print(f"Categories: {test_df['category'].unique()}")
    print(f"\nCategory distribution:")
    print(test_df['category'].value_counts())

    # Display first few examples. Number them by position (1, 2, 3) rather
    # than by index label, so the listing does not depend on the index type.
    print(f"\n Sample test tickets:")
    for sample_number, (_label, row) in enumerate(test_df.head(3).iterrows(), start=1):
        print(f"\n{sample_number}. Category: {row['category']}")
        message = row['customer_message']
        # Only show an ellipsis when the message was actually cut off
        preview = message[:100] + ("..." if len(message) > 100 else "")
        print(f"   Message: {preview}")

except FileNotFoundError:
    print(f"Test data file not found: {test_data_path}")
    print("Please ensure you have created the test dataset")
except Exception as e:
    # Covers malformed JSON, missing columns, and non-text messages
    print(f" Error loading test data: {e}")

In [None]:
# Load the fine-tuned model
#
# After this cell, `classifier` is either the object returned by
# load_classifier() or None. It is None when the loader returned None or when
# anything inside the try block raised, including the sample prediction.
try:
    from app.inference import load_classifier, predict_ticket_category

    print("🔄 Loading the fine-tuned model...")
    classifier = load_classifier()

    if classifier is None:
        # The loader reported failure without raising
        print("❌ Failed to load model")
        print("This might be because the model hasn't been trained yet.")
        print("Please run: python scripts/02_finetune_student_model.py")

    else:
        print("✅ Model loaded successfully!")

        # Run one hard-coded ticket through the model and show the result
        test_message = "The app keeps crashing when I try to save my work. This is very urgent!"
        result = predict_ticket_category(classifier, test_message)

        print("\n🧪 Test prediction:")
        print("Message: " + test_message)
        print(f"Predicted: {result['predicted_category']}")
        print("Confidence: {:.3f}".format(result['confidence']))
        print(f"All probabilities: {result['probabilities']}")

except Exception as load_error:
    print(f"❌ Error loading model: {load_error}")
    print("Make sure you have trained the model first.")
    # Mark the model as unavailable so later cells skip themselves
    classifier = None

In [None]:
# Generate predictions for all test samples
#
# Needs `classifier` and `predict_ticket_category` from the model-loading cell
# and `test_df` from the data-loading cell; otherwise the cell skips itself.
# Adds three columns to `test_df`: 'predicted_category', 'confidence' and
# 'probabilities'. A sample that fails is recorded as "Unknown" / 0.0 / {}.
if classifier is not None and 'test_df' in locals():
    print("🔄 Generating predictions for all test samples...")

    predictions = []
    confidences = []
    all_probabilities = []
    total_samples = len(test_df)

    # Count by position so progress reporting works for any index labels
    for position, (idx, row) in enumerate(test_df.iterrows(), start=1):
        try:
            result = predict_ticket_category(classifier, row['customer_message'])
            # Read every field before appending anything. If a key is missing,
            # the fallback below is the only entry recorded for this row.
            sample_outcome = (
                result['predicted_category'],
                result['confidence'],
                result['probabilities'],
            )
        except Exception as e:
            print(f"Error processing sample {idx}: {e}")
            sample_outcome = ("Unknown", 0.0, {})

        # Exactly one entry per row in each list, so all three stay the same
        # length as test_df and remain aligned with it.
        predictions.append(sample_outcome[0])
        confidences.append(sample_outcome[1])
        all_probabilities.append(sample_outcome[2])

        if position % 10 == 0:
            print(f"Processed {position}/{total_samples} samples...")

    # Add predictions to dataframe
    test_df['predicted_category'] = predictions
    test_df['confidence'] = confidences
    test_df['probabilities'] = all_probabilities

    print(f"✅ Generated predictions for {len(test_df)} samples")

    # Display some example predictions
    print(f"\n📝 Sample predictions:")
    for _label, row in test_df.head(5).iterrows():
        correct = "✅" if row['category'] == row['predicted_category'] else "❌"
        print(f"{correct} True: {row['category']} | Predicted: {row['predicted_category']} | Confidence: {row['confidence']:.3f}")

else:
    print("⏭️ Skipping predictions - model not loaded or test data not available")