# YouTube Summarizer: Model Setup and Monitoring

This notebook demonstrates how to:
1. Set up the YouTube summarizer model
2. Implement monitoring using MLflow
3. Track model performance metrics
4. Monitor system resources and latency

In [None]:
# Import required libraries
import os
import time
import mlflow
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from youtube_transcript_api import YouTubeTranscriptApi
from transformers import pipeline
from rouge_score import rouge_scorer
import psutil
import plotly.express as px
import plotly.graph_objects as go

## 1. Model Setup

First, we'll set up our summarization model using the Hugging Face Transformers library.

In [None]:
def initialize_model(model_name="facebook/bart-large-cnn"):
    """Build the summarization pipeline used by the rest of the notebook.

    Args:
        model_name: Model identifier handed unchanged to
            ``pipeline("summarization", model=...)``. Defaults to
            ``"facebook/bart-large-cnn"``, so calling with no arguments
            behaves as before.

    Returns:
        The pipeline object returned by ``transformers.pipeline``.
    """
    return pipeline("summarization", model=model_name)

# Build the summarizer once as a notebook-level global; process_video() below
# reads this `summarizer` name when generating summaries.
summarizer = initialize_model()

## 2. MLflow Setup

Set up MLflow to track experiments and model performance.

In [None]:
# Configure MLflow
# Point tracking at the SQLite database named in the URI (mlflow.db).
mlflow.set_tracking_uri("sqlite:///mlflow.db")
# Choose the experiment name used for the runs logged by this notebook.
mlflow.set_experiment("youtube_summarizer_monitoring")

def log_metrics(metrics_dict, model_name="facebook/bart-large-cnn"):
    """Log a set of metrics to MLflow inside a freshly started run.

    Args:
        metrics_dict: Mapping of metric name to value, passed as-is to
            ``mlflow.log_metrics``.
        model_name: Value recorded as the ``model_name`` run parameter.
            Defaults to ``"facebook/bart-large-cnn"`` so existing callers
            keep logging the same value; pass the identifier of the model
            actually in use if it differs.
    """
    # Each call opens its own run, so every metrics_dict becomes a separate run.
    with mlflow.start_run():
        mlflow.log_metrics(metrics_dict)
        mlflow.log_param("model_name", model_name)

## 3. Performance Monitoring Functions

In [None]:
class PerformanceMonitor:
    """Gather latency, resource and ROUGE metrics and forward them to MLflow.

    Attributes are set in ``__init__``:
        scorer: ``RougeScorer`` configured for rouge1, rouge2 and rougeL with
            stemming enabled.
        metrics_history: List of metric dicts, one appended per
            ``log_performance`` call, in call order.
    """

    def __init__(self):
        self.scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        self.metrics_history = []
        # psutil documents that the first non-blocking cpu_percent() call
        # returns a meaningless 0.0. Prime it here so the first real reading
        # in measure_resource_usage() reflects usage since construction.
        psutil.cpu_percent(interval=None)

    def measure_latency(self, func, *args, **kwargs):
        """Call ``func(*args, **kwargs)`` and time it.

        Returns:
            A ``(result, seconds)`` tuple: the callable's return value and the
            elapsed time in seconds as a float.
        """
        # perf_counter is monotonic, so the duration cannot be skewed (or go
        # negative) if the system clock is adjusted during the call.
        started = time.perf_counter()
        result = func(*args, **kwargs)
        return result, time.perf_counter() - started

    def measure_resource_usage(self):
        """Sample CPU and memory usage.

        Returns:
            Dict with ``cpu_percent`` (from ``psutil.cpu_percent()``, i.e.
            measured since the previous call) and ``memory_mb`` (resident set
            size of the current process, converted from bytes to MiB).
        """
        cpu_percent = psutil.cpu_percent()
        memory_info = psutil.Process().memory_info()
        return {
            'cpu_percent': cpu_percent,
            'memory_mb': memory_info.rss / 1024 / 1024
        }

    def calculate_rouge_scores(self, prediction, reference):
        """Compute ROUGE F1 scores of ``prediction`` against ``reference``.

        Args:
            prediction: Generated summary text.
            reference: Reference (target) summary text.

        Returns:
            Dict with keys ``rouge1_f1``, ``rouge2_f1`` and ``rougeL_f1``.
        """
        # RougeScorer.score takes (target, prediction): the reference goes first.
        scores = self.scorer.score(reference, prediction)
        return {
            'rouge1_f1': scores['rouge1'].fmeasure,
            'rouge2_f1': scores['rouge2'].fmeasure,
            'rougeL_f1': scores['rougeL'].fmeasure
        }

    def log_performance(self, latency, rouge_scores, resource_usage):
        """Merge all metrics into one dict, store it and log it to MLflow.

        Args:
            latency: Elapsed seconds, stored under the ``latency`` key.
            rouge_scores: Dict of ROUGE metrics; may be empty.
            resource_usage: Dict of resource metrics.

        Returns:
            The merged metrics dict (the same object appended to
            ``metrics_history``).
        """
        # On a key clash, later mappings win: resource_usage over rouge_scores
        # over the latency entry.
        metrics = {
            'latency': latency,
            **rouge_scores,
            **resource_usage
        }
        self.metrics_history.append(metrics)
        log_metrics(metrics)
        return metrics

## 4. Visualization Functions

In [None]:
def plot_metrics_over_time(metrics_history):
    """Create interactive plots for the recorded metrics.

    Draws up to three figures: latency, resource usage (CPU % and memory) and
    ROUGE scores. A figure is drawn only for metric columns that are actually
    present, so histories recorded without a reference summary (no ROUGE
    keys) plot the remaining figures instead of raising ``KeyError``.

    Args:
        metrics_history: List of metric dicts, e.g.
            ``PerformanceMonitor.metrics_history``. The row index is used as
            the x-axis.

    Returns:
        None. Figures are displayed via ``.show()``.
    """
    df = pd.DataFrame(metrics_history)
    if df.empty:
        # Nothing has been logged yet; there are no columns to plot.
        print("No metrics recorded yet; nothing to plot.")
        return

    def _show_traces(columns_and_labels, title):
        """Show one figure with a trace per available column, if any."""
        available = [(col, label) for col, label in columns_and_labels if col in df.columns]
        if not available:
            return
        fig = go.Figure()
        for col, label in available:
            fig.add_trace(go.Scatter(y=df[col], name=label))
        fig.update_layout(title=title)
        fig.show()

    # Latency plot
    if 'latency' in df.columns:
        fig_latency = px.line(df, y='latency', title='Inference Latency Over Time')
        fig_latency.show()

    # Resource usage plot
    _show_traces(
        [('cpu_percent', 'CPU %'), ('memory_mb', 'Memory (MB)')],
        'Resource Usage Over Time',
    )

    # ROUGE scores plot (columns exist only if a reference summary was given)
    _show_traces(
        [(metric, metric) for metric in ('rouge1_f1', 'rouge2_f1', 'rougeL_f1')],
        'ROUGE Scores Over Time',
    )

## 5. Example Usage

In [None]:
# Create the single shared monitor used by process_video(); its
# metrics_history grows by one entry for every log_performance() call.
monitor = PerformanceMonitor()

def process_video(video_id, reference_summary=None):
    """Summarize a video's transcript and record monitoring metrics.

    Uses the notebook-level ``summarizer`` and ``monitor`` objects.

    Args:
        video_id: Video identifier passed to
            ``YouTubeTranscriptApi.get_transcript``.
        reference_summary: Optional reference text. When truthy, ROUGE scores
            of the generated summary against it are included in the metrics;
            otherwise no ROUGE keys are logged.

    Returns:
        A ``(summary, metrics)`` tuple: the generated summary text and the
        metrics dict returned by ``monitor.log_performance``.
    """
    # Get transcript and flatten its segments into one string.
    transcript = YouTubeTranscriptApi.get_transcript(video_id)
    text = ' '.join(segment['text'] for segment in transcript)

    # Generate summary with latency measurement.
    # truncation=True clips the input to the model's maximum length; without
    # it, transcripts longer than the model accepts fail during inference.
    # Only the leading part of a long transcript is therefore summarized.
    summary, latency = monitor.measure_latency(
        lambda: summarizer(
            text, max_length=130, min_length=30, truncation=True
        )[0]['summary_text']
    )

    # Measure resource usage (sampled after inference has finished)
    resource_usage = monitor.measure_resource_usage()

    # Calculate ROUGE scores if reference summary is provided
    rouge_scores = monitor.calculate_rouge_scores(summary, reference_summary) if reference_summary else {}

    # Log all metrics
    metrics = monitor.log_performance(latency, rouge_scores, resource_usage)

    return summary, metrics

# Example usage
# video_id = "YOUR_VIDEO_ID"
# summary, metrics = process_video(video_id)
# plot_metrics_over_time(monitor.metrics_history)