# YouTube Video Summarizer with MLflow Integration

This notebook demonstrates how to:
1. Create a YouTube video summarization chain
2. Track the chain and prompts using MLflow
3. Load and use the tracked model

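## Setup and Requirements

The notebook calls the Groq API, so a Groq API key must be available as `GROQ_API_KEY`, either in the environment or in a `.env` file that `load_dotenv()` can find.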

In [None]:
!pip install youtube-transcript-api groq python-dotenv mlflow

In [None]:
import json
import os
from typing import Any, Dict, Optional
from urllib.parse import parse_qs, urlparse

import groq
import mlflow
from dotenv import load_dotenv
from youtube_transcript_api import YouTubeTranscriptApi

# Read variables from a .env file into the process environment
load_dotenv()

# Shared Groq client, authenticated with the GROQ_API_KEY environment variable
client = groq.Groq(api_key=os.environ.get('GROQ_API_KEY'))

## Define Chain Components

In [None]:
class YouTubeSummaryChain:
    """Summarize a YouTube video by sending its transcript to a Groq chat model.

    Calling an instance with a video URL runs three steps: extract the video
    ID, download the transcript, and request a structured summary. Failures
    in any step are reported as a short message string rather than raised.
    """

    # Hosts and path prefixes recognised by extract_video_id
    _YOUTUBE_HOSTS = ('www.youtube.com', 'youtube.com', 'm.youtube.com', 'music.youtube.com')
    _ID_PATH_PREFIXES = ('embed', 'shorts', 'live', 'v')

    def __init__(self, model_name: str = "mixtral-8x7b-32768", temperature: float = 0.3):
        """Store the model settings and the summarization prompt.

        Args:
            model_name: Identifier of the Groq chat model to call.
            temperature: Sampling temperature passed to the chat completion.
        """
        self.model_name = model_name
        self.temperature = temperature
        # Created on first use by _get_client(); kept out of pickled state so
        # a serialized chain never carries a live HTTP client or an API key.
        self._client = None
        self.prompt_template = """
        Please provide a comprehensive summary of the following video transcript. 
        Focus on the main points, key insights, and important conclusions:

        {text}

        Please structure the summary with:
        1. Main Topic/Theme
        2. Key Points
        3. Important Details
        4. Conclusions
        """

    def __getstate__(self) -> Dict[str, Any]:
        """Return the picklable state: every attribute except the Groq client."""
        state = self.__dict__.copy()
        state["_client"] = None
        return state

    def _get_client(self):
        """Return this chain's Groq client, creating it on first use.

        The key is read from the GROQ_API_KEY environment variable at creation
        time, so a chain restored in another process picks up that process's key.
        """
        if getattr(self, "_client", None) is None:
            self._client = groq.Groq(api_key=os.getenv('GROQ_API_KEY'))
        return self._client

    def extract_video_id(self, url: str) -> Optional[str]:
        """Extract the YouTube video ID from a URL.

        Supported forms are ``youtu.be/<id>``, ``/watch?v=<id>`` and
        ``/embed|shorts|live|v/<id>`` on the known YouTube hosts.

        Returns:
            The video ID, or None when the URL is not a recognised YouTube
            video URL (including a ``/watch`` URL without a ``v`` parameter).
        """
        parsed_url = urlparse(url)
        host = parsed_url.hostname or ''
        segments = [part for part in parsed_url.path.split('/') if part]

        if host == 'youtu.be':
            return segments[0] if segments else None

        if host in self._YOUTUBE_HOSTS:
            if parsed_url.path == '/watch':
                # .get() instead of ['v']: a missing parameter means "no ID",
                # not a KeyError.
                ids = parse_qs(parsed_url.query).get('v')
                return ids[0] if ids else None
            if len(segments) >= 2 and segments[0] in self._ID_PATH_PREFIXES:
                return segments[1]
        return None

    def get_transcript(self, video_id: str) -> Optional[str]:
        """Download the transcript of a video and join it into one string.

        Returns:
            The transcript text, or None if it could not be retrieved (the
            error is printed).
        """
        try:
            if hasattr(YouTubeTranscriptApi, 'get_transcript'):
                # Older youtube-transcript-api: static method returning dicts.
                entries = YouTubeTranscriptApi.get_transcript(video_id)
                pieces = [entry['text'] for entry in entries]
            else:
                # Newer releases: instance API yielding snippets with `.text`.
                fetched = YouTubeTranscriptApi().fetch(video_id)
                pieces = [snippet.text for snippet in fetched]
            return ' '.join(pieces)
        except Exception as e:
            print(f"Error getting transcript: {e}")
            return None

    def summarize_text(self, text: str) -> Optional[str]:
        """Summarize text with the configured Groq chat model.

        Returns:
            The model's reply, or None if the request failed (the error is
            printed).
        """
        prompt = self.prompt_template.format(text=text)

        try:
            completion = self._get_client().chat.completions.create(
                model=self.model_name,
                messages=[
                    {"role": "user", "content": prompt}
                ],
                temperature=self.temperature,
                max_tokens=2048
            )
            return completion.choices[0].message.content
        except Exception as e:
            print(f"Error in summarization: {e}")
            return None

    def __call__(self, url: str) -> str:
        """Process a YouTube URL and return its summary.

        Returns:
            The summary, or one of the messages "Invalid YouTube URL",
            "Could not retrieve transcript" or "Could not generate summary"
            naming the step that failed.
        """
        video_id = self.extract_video_id(url)
        if not video_id:
            return "Invalid YouTube URL"

        transcript = self.get_transcript(video_id)
        if not transcript:
            return "Could not retrieve transcript"

        summary = self.summarize_text(transcript)
        if not summary:
            return "Could not generate summary"

        return summary

    def get_config(self) -> Dict[str, Any]:
        """Return the model name, temperature and prompt template for tracking."""
        return {
            "model_name": self.model_name,
            "temperature": self.temperature,
            "prompt_template": self.prompt_template
        }

## MLflow Integration

In [None]:
class _ChainPyfuncModel(mlflow.pyfunc.PythonModel):
    """Adapter that exposes a summary chain through the MLflow pyfunc interface."""

    def __init__(self, chain):
        self.chain = chain

    def predict(self, context, model_input, params=None):
        """Summarize a single URL string, or every URL in an iterable.

        A DataFrame-like input (anything with ``.iloc``) is read from its
        first column. A string yields one summary; anything else yields a list.
        """
        if isinstance(model_input, str):
            return self.chain(model_input)
        if hasattr(model_input, "iloc"):
            model_input = model_input.iloc[:, 0].tolist()
        return [self.chain(url) for url in model_input]


def log_chain_to_mlflow(chain: YouTubeSummaryChain, experiment_name: str = "youtube-summarizer") -> str:
    """Log the chain's configuration, prompt and model to a new MLflow run.

    The model name and temperature are logged as run parameters, the prompt
    template as the text artifact ``prompt_template.txt``, and the chain
    itself as a pyfunc model under ``youtube_summarizer``.

    Args:
        chain: The chain to record.
        experiment_name: MLflow experiment that receives the run.

    Returns:
        The ID of the run that was created.
    """
    mlflow.set_experiment(experiment_name)

    with mlflow.start_run() as run:
        # Log parameters
        config = chain.get_config()
        mlflow.log_params({
            "model_name": config["model_name"],
            "temperature": config["temperature"]
        })

        # Log the prompt straight from memory; no scratch file is left in the
        # working directory.
        mlflow.log_text(config["prompt_template"], "prompt_template.txt")

        # pyfunc models must be PythonModel instances, so the chain is wrapped
        # in an adapter. The working directory is deliberately NOT bundled as
        # model code: it may hold the .env file with the API key.
        mlflow.pyfunc.log_model(
            artifact_path="youtube_summarizer",
            python_model=_ChainPyfuncModel(chain)
        )

        return run.info.run_id


def load_chain_from_mlflow(run_id: str) -> YouTubeSummaryChain:
    """Load the chain that log_chain_to_mlflow stored in the given run.

    Returns:
        The restored chain, callable with a YouTube URL.

    Raises:
        TypeError: If the model stored in the run does not wrap a chain.
    """
    model_uri = f"runs:/{run_id}/youtube_summarizer"
    # load_model returns a generic PyFuncModel wrapper, which is not callable;
    # unwrap it to reach the adapter and the chain it carries.
    python_model = mlflow.pyfunc.load_model(model_uri).unwrap_python_model()
    chain = getattr(python_model, "chain", None)
    if chain is None:
        raise TypeError(
            f"Model at {model_uri} does not wrap a YouTubeSummaryChain"
        )
    return chain

## Example Usage

In [None]:
# Build a chain with the default model settings and record it in MLflow;
# the returned run_id identifies the run that holds the logged model.
chain = YouTubeSummaryChain()
run_id = log_chain_to_mlflow(chain)
print(f"Chain logged with run_id: {run_id}")

# Restore the chain from the run that was just created
loaded_chain = load_chain_from_mlflow(run_id)

# Summarize a video with the restored chain.
# Replace the placeholder video ID below with a real one before running.
# NOTE(review): the call below assumes load_chain_from_mlflow returns a callable
# chain; a raw mlflow PyFuncModel would need .predict(youtube_url) - confirm.
youtube_url = "https://www.youtube.com/watch?v=your_video_id"
summary = loaded_chain(youtube_url)
print(summary)

## View MLflow Experiment Results

You can view the tracked experiments by running:
```bash
mlflow ui
```

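This will start the MLflow UI server (by default at http://127.0.0.1:5000; run the command from the directory that contains the `mlruns` folder) where you can see: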
1. All experiment runs
2. Chain configurations
3. Prompt templates
4. Performance metrics (if added)