# YouTube RAG System with MLflow Tracking

This notebook implements a retrieval-augmented generation (RAG) system over YouTube video transcripts, with MLflow experiment tracking. Each run logs:
- Parameters: chunk size, overlap, number of videos, etc.
- Metrics: processing time, embedding dimensions, retrieval performance
- Artifacts: video metadata, sample questions and answers

In [None]:
import json
import os
import time
from datetime import datetime
from typing import Dict, List, Tuple
from urllib.parse import parse_qs, urlparse

import mlflow
import pandas as pd
from dotenv import load_dotenv
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from sklearn.metrics.pairwise import cosine_similarity
from youtube_transcript_api import YouTubeTranscriptApi

In [None]:
# Load environment variables before any of the clients below are constructed.
# NOTE(review): presumably this is how the OpenAI credentials reach
# OpenAIEmbeddings / ChatOpenAI (e.g. from a local .env file) — confirm.
load_dotenv()

# MLflow setup: select the experiment under which the run started later in
# this notebook is recorded.
mlflow.set_experiment("YouTube RAG System")

# Initialize OpenAI embeddings. This single shared instance is the one passed
# to Chroma.from_texts when the vector store is built.
embeddings = OpenAIEmbeddings()

In [None]:
def extract_video_id(url: str) -> str:
    """Extract the YouTube video ID from a video URL.

    Recognised forms:
      * ``https://youtu.be/<id>``
      * ``https://www.youtube.com/watch?v=<id>`` (also ``youtube.com``,
        ``m.youtube.com``, ``music.youtube.com``)
      * ``https://www.youtube.com/embed/<id>``, ``/shorts/<id>``,
        ``/live/<id>`` and ``/v/<id>`` (also on ``www.youtube-nocookie.com``)

    Args:
        url: The URL to inspect. It must include a scheme (``https://``),
            otherwise ``urlparse`` does not report a hostname.

    Returns:
        The video ID as a non-empty string.

    Raises:
        ValueError: If the URL is not a recognised YouTube video URL or does
            not contain a video ID (e.g. a ``/watch`` URL without ``v``).
    """
    parsed_url = urlparse(url)
    # urlparse lower-cases the hostname; it is None for scheme-less input.
    hostname = parsed_url.hostname or ''
    path_parts = [part for part in parsed_url.path.split('/') if part]
    video_id = ''

    if hostname in ('youtu.be', 'www.youtu.be'):
        # Short links carry the ID as the first path segment.
        if path_parts:
            video_id = path_parts[0]
    elif hostname in (
        'www.youtube.com',
        'youtube.com',
        'm.youtube.com',
        'music.youtube.com',
        'www.youtube-nocookie.com',
    ):
        if path_parts == ['watch']:
            # .get() instead of ['v'] so a missing parameter ends up as the
            # ValueError below rather than leaking a KeyError to the caller.
            video_id = parse_qs(parsed_url.query).get('v', [''])[0]
        elif len(path_parts) >= 2 and path_parts[0] in ('embed', 'shorts', 'live', 'v'):
            video_id = path_parts[1]

    if not video_id:
        raise ValueError(f'Invalid YouTube URL: {url}')
    return video_id

def get_transcript(video_id: str) -> Dict:
    """Fetch the transcript of one video and compute basic statistics.

    Args:
        video_id: YouTube video ID handed to ``YouTubeTranscriptApi``.

    Returns:
        A dict with the keys ``text`` (all segment texts joined by single
        spaces), ``processing_time`` (seconds spent in the transcript fetch),
        ``word_count`` (whitespace-separated tokens in ``text``) and
        ``segment_count`` (number of transcript segments). If anything goes
        wrong, the error is printed and ``None`` is returned instead.
    """
    try:
        fetch_started = time.time()
        segments = YouTubeTranscriptApi.get_transcript(video_id)
        # Only the API call itself is timed, not the text assembly below.
        elapsed = time.time() - fetch_started

        joined_text = ' '.join(segment['text'] for segment in segments)
        summary = {
            'text': joined_text,
            'processing_time': elapsed,
            'word_count': len(joined_text.split()),
            'segment_count': len(segments),
        }
        return summary
    except Exception as e:
        # Best-effort by design: callers treat None as "skip this video".
        print(f"Error getting transcript for video {video_id}: {str(e)}")
        return None

def process_videos(urls: List[str]) -> Tuple[List[dict], List[dict]]:
    """Fetch transcripts for several YouTube URLs and collect per-video metrics.

    Processing is best-effort per video: a URL that cannot be turned into a
    video ID, or whose transcript cannot be fetched, is reported on stdout and
    skipped, so one bad entry does not abort the whole batch.

    Args:
        urls: YouTube video URLs to process.

    Returns:
        A ``(documents, metrics)`` tuple of two parallel lists with one entry
        per successfully processed video:

        * ``documents``: dicts with ``video_id``, ``url`` and ``content``
          (the transcript text).
        * ``metrics``: dicts with ``video_id``, ``processing_time``,
          ``word_count`` and ``segment_count``.
    """
    documents = []
    metrics = []

    for url in urls:
        try:
            video_id = extract_video_id(url)
        except (ValueError, KeyError) as e:
            # ValueError is how extract_video_id rejects a URL; KeyError is
            # caught too in case the `v` query-parameter lookup fails.
            print(f"Skipping URL {url}: {e}")
            continue

        result = get_transcript(video_id)
        if not result:
            # get_transcript has already printed the reason for the failure.
            continue

        documents.append({
            'video_id': video_id,
            'url': url,
            'content': result['text'],
        })
        metrics.append({
            'video_id': video_id,
            'processing_time': result['processing_time'],
            'word_count': result['word_count'],
            'segment_count': result['segment_count'],
        })

    return documents, metrics

In [None]:
# Pipeline settings. The whole dict is logged to MLflow as run parameters and
# its entries configure the text splitter, the retriever and the chat model.
params = dict(
    chunk_size=1000,    # upper bound on chunk length handed to the splitter
    chunk_overlap=200,  # overlap between consecutive chunks
    retriever_k=3,      # "k" passed to the retriever's search_kwargs
    temperature=0,      # temperature passed to ChatOpenAI
)

# Example YouTube URLs; swap in the links of the videos to index.
youtube_urls = [
    "https://www.youtube.com/watch?v=example1",
    "https://www.youtube.com/watch?v=example2",
]

# Start MLflow run: builds the whole RAG pipeline (transcripts -> chunks ->
# vector store -> QA chain) and records parameters, metrics and artifacts.
with mlflow.start_run(run_name=f"rag_run_{datetime.now().strftime('%Y%m%d_%H%M%S')}") as run:
    # Remember the run ID. ask_question (defined below) is called after this
    # `with` block has exited, when this run is no longer the active one; the
    # ID lets it keep attaching its results to this run.
    rag_run_id = run.info.run_id

    # Log parameters
    mlflow.log_params(params)
    mlflow.log_param("num_videos", len(youtube_urls))

    # Process videos and log metrics
    start_time = time.time()
    documents, video_metrics = process_videos(youtube_urls)
    processing_time = time.time() - start_time

    # Log video processing metrics (one set of metrics per processed video,
    # keyed by the video ID)
    mlflow.log_metric("total_processing_time", processing_time)
    for vm in video_metrics:
        mlflow.log_metrics({
            f"processing_time_{vm['video_id']}": vm['processing_time'],
            f"word_count_{vm['video_id']}": vm['word_count'],
            f"segment_count_{vm['video_id']}": vm['segment_count']
        })

    # Split every transcript into overlapping chunks, measured in characters
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=params['chunk_size'],
        chunk_overlap=params['chunk_overlap'],
        length_function=len
    )

    # `chunks` and `metadatas` are parallel lists: metadatas[i] describes
    # chunks[i] (source video and position of the chunk within that video).
    chunks = []
    metadatas = []

    for doc in documents:
        doc_chunks = text_splitter.split_text(doc['content'])
        doc_metadatas = [{
            'video_id': doc['video_id'],
            'url': doc['url'],
            'chunk': i
        } for i in range(len(doc_chunks))]

        chunks.extend(doc_chunks)
        metadatas.extend(doc_metadatas)

    # Log chunking metrics
    mlflow.log_metric("total_chunks", len(chunks))
    if not chunks:
        # Without this guard the average below divides by zero whenever no
        # transcript could be fetched (e.g. with the placeholder URLs).
        raise RuntimeError(
            "No transcript text was retrieved for any of the given URLs; "
            "there is nothing to index."
        )
    mlflow.log_metric("avg_chunk_length", sum(len(c) for c in chunks) / len(chunks))

    # Create vector store (embedded with the shared `embeddings` instance and
    # persisted on disk under ./chroma_db)
    vectorstore = Chroma.from_texts(
        texts=chunks,
        embedding=embeddings,
        metadatas=metadatas,
        persist_directory="./chroma_db"
    )
    vectorstore.persist()

    # Create QA chain; source documents are returned so answers can be
    # attributed to the videos they came from
    qa_chain = RetrievalQA.from_chain_type(
        llm=ChatOpenAI(temperature=params['temperature']),
        chain_type="stuff",
        retriever=vectorstore.as_retriever(search_kwargs={"k": params['retriever_k']}),
        return_source_documents=True
    )

    # Log artifacts: the per-video metrics as a CSV file
    video_metadata = pd.DataFrame(video_metrics)
    video_metadata.to_csv("video_metadata.csv", index=False)
    mlflow.log_artifact("video_metadata.csv")

    # Function to ask questions and log results
    def ask_question(question: str, log_to_mlflow: bool = True):
        """Ask the QA chain a question, print the answer and optionally log it.

        Args:
            question: The question to put to the QA chain.
            log_to_mlflow: When true, the question, answer, source URLs and
                response time are written to ``qa_result_<n>.json`` in the
                working directory and logged to the RAG run as an artifact,
                together with a ``response_time_<n>`` metric.

        Returns:
            None. The question, answer, source video URLs and response time
            are printed to stdout.
        """
        start_time = time.time()
        result = qa_chain({"query": question})
        response_time = time.time() - start_time

        if log_to_mlflow:
            qa_result = {
                'question': question,
                'answer': result['result'],
                'sources': [doc.metadata['url'] for doc in result['source_documents']],
                'response_time': response_time
            }

            # Derive the index once so the file written, the artifact logged
            # and the metric name always agree; step past existing files
            # instead of overwriting them.
            result_index = len(os.listdir()) + 1
            while os.path.exists(f"qa_result_{result_index}.json"):
                result_index += 1
            result_path = f"qa_result_{result_index}.json"

            with open(result_path, 'w') as f:
                json.dump(qa_result, f)

            # Log by explicit run ID: by the time this function is called the
            # enclosing run has usually ended, and the fluent mlflow.log_*
            # calls would then record into a different run.
            client = mlflow.tracking.MlflowClient()
            client.log_artifact(rag_run_id, result_path)
            client.log_metric(rag_run_id, f"response_time_{result_index}", response_time)

        print(f"Question: {question}\n")
        print(f"Answer: {result['result']}\n")
        print("Sources:")
        for doc in result['source_documents']:
            print(f"- Video: {doc.metadata['url']}")
        print(f"\nResponse time: {response_time:.2f} seconds")

In [None]:
# Example questions put to the QA chain built above.
questions = [
    "What are the main topics discussed in these videos?",
    "Can you summarize the key points from all videos?",
    "What are the most important conclusions presented?"
]

# Ask each question in turn. ask_question prints the answer, the source video
# URLs and the response time, and (with its default log_to_mlflow=True) also
# records the result in MLflow.
for question in questions:
    ask_question(question)