# Model Evaluation for YouTube Summarizer with MLflow

This notebook evaluates the YouTube summarizer model using MLflow to track metrics and experiments.

In [None]:
import pandas as pd
import numpy as np
from transformers import pipeline
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
import mlflow
import json
from typing import Dict, List, Any
from youtube_transcript_api import YouTubeTranscriptApi

# Fetch NLTK's 'punkt' tokenizer data so it is available locally.
# NOTE(review): the metric code in this notebook tokenizes with str.split(),
# so 'punkt' may not actually be needed here — confirm before removing.
nltk.download('punkt')

## Load Model from MLflow

In [None]:
def load_model_from_mlflow(run_id: str):
    """Return the summarizer logged under ``youtube_summarizer`` in an MLflow run.

    Args:
        run_id: ID of the MLflow run whose ``youtube_summarizer`` artifact
            should be loaded.

    Returns:
        Whatever ``mlflow.pyfunc.load_model`` returns for the
        ``runs:/<run_id>/youtube_summarizer`` URI.
    """
    return mlflow.pyfunc.load_model(f"runs:/{run_id}/youtube_summarizer")

# Replace with your run_id from the training notebook.
# The value below is only a placeholder; the load call that follows uses it
# to build the "runs:/<run_id>/youtube_summarizer" model URI.
RUN_ID = "your_run_id_here"
# Loaded once at module level; the evaluation cell further down reuses `model`.
model = load_model_from_mlflow(RUN_ID)

## Prepare Test Data

In [None]:
def prepare_test_data() -> List[Dict[str, str]]:
    """Build the evaluation set of videos paired with reference summaries.

    Returns:
        A new list of dicts, each with a ``video_url`` and a
        ``reference_summary`` entry. The entries are placeholders and
        should be replaced with real test data.
    """
    # (video URL, reference summary) pairs — replace with your actual test data
    placeholder_pairs = (
        ("https://www.youtube.com/watch?v=example1", "Reference summary for video 1"),
        ("https://www.youtube.com/watch?v=example2", "Reference summary for video 2"),
    )
    return [
        {"video_url": url, "reference_summary": summary}
        for url, summary in placeholder_pairs
    ]

## Evaluation Metrics

In [None]:
def calculate_metrics(predicted_summary: str, reference_summary: str) -> Dict[str, float]:
    """Score a predicted summary against its reference.

    Args:
        predicted_summary: Summary text produced by the model.
        reference_summary: Reference summary text to compare against.

    Returns:
        A dict with ROUGE-1 precision/recall/F1, ROUGE-2 F1, ROUGE-L F1
        (all computed with stemming enabled), a sentence-level BLEU score
        over whitespace-split tokens, the predicted/reference word-count
        ratio (0 when the reference has no words), and both word counts.
    """
    # Whitespace tokens feed both BLEU and the length statistics
    predicted_tokens = predicted_summary.split()
    reference_tokens = reference_summary.split()

    # ROUGE works on the raw strings; the scorer takes (target, prediction)
    rouge = rouge_scorer.RougeScorer(
        ['rouge1', 'rouge2', 'rougeL'], use_stemmer=True
    ).score(reference_summary, predicted_summary)
    rouge1 = rouge['rouge1']

    # BLEU expects a list of tokenized references and one tokenized candidate
    bleu = sentence_bleu([reference_tokens], predicted_tokens)

    pred_length = len(predicted_tokens)
    ref_length = len(reference_tokens)
    if ref_length > 0:
        length_ratio = pred_length / ref_length
    else:
        # Avoid dividing by zero when the reference has no words
        length_ratio = 0

    return {
        'rouge1_precision': rouge1.precision,
        'rouge1_recall': rouge1.recall,
        'rouge1_f1': rouge1.fmeasure,
        'rouge2_f1': rouge['rouge2'].fmeasure,
        'rougeL_f1': rouge['rougeL'].fmeasure,
        'bleu_score': bleu,
        'summary_length_ratio': length_ratio,
        'predicted_length': pred_length,
        'reference_length': ref_length,
    }

## MLflow Evaluation Pipeline

In [None]:
def evaluate_model_with_mlflow(model, test_data: List[Dict[str, str]]):
    """Evaluate the summarizer on ``test_data`` and log the results to MLflow.

    Runs inside a new MLflow run named ``model_evaluation`` in the
    ``youtube-summarizer-evaluation`` experiment. For every example the
    per-example metrics are logged as ``example_<i>_<metric>`` and the
    predicted and reference summaries are stored as text artifacts under
    ``example_<i>/``. Averages over all examples are logged as
    ``avg_<metric>`` and the summary charts are produced.

    Args:
        model: Either a callable that takes a video URL and returns the
            summary text, or an object exposing ``predict`` (for example the
            model returned by ``mlflow.pyfunc.load_model``, which is not
            itself callable).
        test_data: Dicts with a ``video_url`` and a ``reference_summary``.

    Returns:
        A ``(run_id, avg_metrics)`` tuple. ``avg_metrics`` maps
        ``avg_<metric>`` to the mean over all examples; it is empty when
        ``test_data`` is empty.
    """
    mlflow.set_experiment("youtube-summarizer-evaluation")

    # Plain callables keep working; objects that only expose predict()
    # (such as MLflow pyfunc models) are used through that method.
    summarize = model if callable(model) else model.predict

    with mlflow.start_run(run_name="model_evaluation") as run:
        all_metrics = []

        # Log model parameters when the model offers them
        model_params = model.get_config() if hasattr(model, 'get_config') else {}
        mlflow.log_params(model_params)

        for i, example in enumerate(test_data):
            # NOTE(review): assumes the model returns the summary as a str —
            # confirm against the logged model's predict() output.
            predicted_summary = summarize(example['video_url'])

            metrics = calculate_metrics(predicted_summary, example['reference_summary'])
            all_metrics.append(metrics)

            prefix = f"example_{i}"
            mlflow.log_metrics(
                {f"{prefix}_{name}": value for name, value in metrics.items()}
            )

            # Store each example's summaries under its own artifact folder so
            # later examples do not overwrite earlier ones, and without
            # leaving scratch directories in the working directory.
            mlflow.log_text(predicted_summary, f"{prefix}/predicted_summary.txt")
            mlflow.log_text(
                example['reference_summary'], f"{prefix}/reference_summary.txt"
            )

        # Nothing to average or plot when no examples were supplied
        if not all_metrics:
            return run.info.run_id, {}

        avg_metrics = {
            f"avg_{name}": float(np.mean([m[name] for m in all_metrics]))
            for name in all_metrics[0]
        }
        mlflow.log_metrics(avg_metrics)

        # Create and log visualizations
        create_and_log_visualizations(all_metrics)

        return run.info.run_id, avg_metrics

## Create Visualizations

In [None]:
def create_and_log_visualizations(metrics_list: List[Dict[str, float]]):
    """Plot the per-example metrics and log each chart via MLflow.

    Three PNG files are written to the current working directory and
    passed to ``mlflow.log_artifact``: a bar chart of mean ROUGE F1 scores,
    a scatter of reference versus predicted summary lengths, and a box
    plot of the ROUGE F1 and BLEU distributions.

    Args:
        metrics_list: One metrics dict per evaluated example; must contain
            the ROUGE F1, BLEU, and length keys used below.
    """
    def _save_log_close(png_name):
        # Finalize layout, write the image, hand it to MLflow, free the figure
        plt.tight_layout()
        plt.savefig(png_name)
        mlflow.log_artifact(png_name)
        plt.close()

    frame = pd.DataFrame(metrics_list)
    rouge_columns = ['rouge1_f1', 'rouge2_f1', 'rougeL_f1']

    # Chart 1: mean ROUGE F1 per variant
    plt.figure(figsize=(10, 6))
    mean_rouge = frame[rouge_columns].mean()
    mean_rouge.plot(kind='bar')
    plt.title('Average ROUGE Scores')
    plt.ylabel('Score')
    _save_log_close('rouge_scores.png')

    # Chart 2: predicted vs. reference length, with a y = x guide line
    plt.figure(figsize=(10, 6))
    reference_lengths = frame['reference_length']
    plt.scatter(reference_lengths, frame['predicted_length'])
    diagonal_end = max(reference_lengths)
    plt.plot([0, diagonal_end], [0, diagonal_end], '--', color='red')
    plt.xlabel('Reference Summary Length')
    plt.ylabel('Predicted Summary Length')
    plt.title('Summary Length Comparison')
    _save_log_close('length_comparison.png')

    # Chart 3: spread of each score across examples
    plt.figure(figsize=(12, 6))
    frame[rouge_columns + ['bleu_score']].boxplot()
    plt.title('Distribution of Evaluation Metrics')
    plt.ylabel('Score')
    plt.xticks(rotation=45)
    _save_log_close('metrics_distribution.png')

## Run Evaluation

In [None]:
# Build the evaluation set and score the loaded model against it
test_data = prepare_test_data()
run_id, avg_metrics = evaluate_model_with_mlflow(model, test_data)

# Report the averaged metrics, four decimal places each
print("\nEvaluation Results:")
print("==================")
for metric_name in avg_metrics:
    print(f"{metric_name}: {avg_metrics[metric_name]:.4f}")

# Point the reader at the MLflow run that holds the full results
print(f"\nMLflow run ID: {run_id}")
print("View detailed results in the MLflow UI")

## View Results in MLflow UI

To view the detailed results and visualizations:
1. Start the MLflow UI by running `mlflow ui` in your terminal
2. Open http://localhost:5000 in your browser
3. Navigate to the experiment "youtube-summarizer-evaluation"
4. Click on the run ID printed above to see detailed metrics and artifacts