# YouTube RAG System

This notebook demonstrates a proof of concept for a retrieval-augmented generation (RAG) system built on YouTube video transcripts. The system will:
1. Extract transcripts from provided YouTube videos
2. Process and chunk the transcripts
3. Store them in a ChromaDB vector database
4. Enable question-answering using the stored knowledge

In [None]:
import os
from typing import List
from youtube_transcript_api import YouTubeTranscriptApi
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA
from dotenv import load_dotenv
import pandas as pd
from urllib.parse import urlparse, parse_qs

In [None]:
# Load environment variables (make sure to create a .env file with your OPENAI_API_KEY)
load_dotenv()

# Initialize OpenAI embeddings; this object is passed to the vector store
# when the transcript chunks are indexed further down.
# NOTE(review): presumably reads OPENAI_API_KEY from the environment loaded
# above — confirm against the installed langchain version.
embeddings = OpenAIEmbeddings()

In [None]:
def extract_video_id(url: str) -> str:
    """Extract the YouTube video ID from a URL.

    Recognised forms:
        - ``https://youtu.be/<id>``
        - ``https://<host>/watch?v=<id>``
        - ``https://<host>/embed/<id>``, ``/shorts/<id>``, ``/live/<id>``,
          ``/v/<id>``

    where ``<host>`` is ``youtube.com``, ``www.youtube.com``,
    ``m.youtube.com`` or ``music.youtube.com``. Extra query parameters
    (``t=``, ``list=``, ...) and trailing slashes are ignored.

    Args:
        url: The video URL, including its scheme.

    Returns:
        The video ID as a non-empty string.

    Raises:
        ValueError: If the URL is not a recognised YouTube video URL or
            contains no video ID.
    """
    parsed_url = urlparse(url.strip())
    hostname = parsed_url.hostname or ''
    video_id = ''

    if hostname == 'youtu.be':
        # Short links carry the ID as the first path segment.
        video_id = parsed_url.path.lstrip('/').split('/', 1)[0]
    elif hostname in (
        'www.youtube.com',
        'youtube.com',
        'm.youtube.com',
        'music.youtube.com',
    ):
        if parsed_url.path.rstrip('/') == '/watch':
            # .get() keeps a missing 'v' parameter from leaking a KeyError;
            # the caller gets the ValueError raised below instead.
            video_id = parse_qs(parsed_url.query).get('v', [''])[0]
        else:
            segments = parsed_url.path.strip('/').split('/')
            if len(segments) >= 2 and segments[0] in ('embed', 'shorts', 'live', 'v'):
                video_id = segments[1]

    if not video_id:
        raise ValueError(f'Invalid YouTube URL: {url}')
    return video_id

def get_transcript(video_id: str) -> str:
    """Fetch a video's transcript and flatten it into a single string.

    The ``'text'`` field of every transcript entry is joined with single
    spaces. This is best-effort: any exception raised while fetching or
    joining is printed and an empty string is returned instead, so callers
    can treat ``""`` as "no transcript available".
    """
    flattened = ""
    try:
        entries = YouTubeTranscriptApi.get_transcript(video_id)
        flattened = ' '.join(entry['text'] for entry in entries)
    except Exception as error:
        print(f"Error getting transcript for video {video_id}: {str(error)}")
    return flattened

def process_videos(urls: List[str]) -> List[dict]:
    """Build one transcript record per YouTube URL.

    Each record is a dict with the keys ``'video_id'``, ``'url'`` and
    ``'content'`` (the flattened transcript). URLs whose transcript comes
    back empty are left out of the result. An exception raised by
    ``extract_video_id`` for a URL is not caught here.
    """
    records: List[dict] = []
    for video_url in urls:
        vid = extract_video_id(video_url)
        text = get_transcript(vid)
        if not text:
            # Nothing to index for this video.
            continue
        records.append(dict(video_id=vid, url=video_url, content=text))
    return records

In [None]:
# Example usage - replace with your YouTube URLs
# (the two entries below are placeholders, not real video IDs)
youtube_urls = [
    "https://www.youtube.com/watch?v=example1",
    "https://www.youtube.com/watch?v=example2"
]

# Process videos
# Videos whose transcript could not be fetched are skipped by process_videos,
# so `documents` may hold fewer entries than `youtube_urls`.
documents = process_videos(youtube_urls)

# Create text splitter
# Sizes are measured with `length_function`; with `len` that means characters.
# NOTE(review): presumably consecutive chunks share up to `chunk_overlap`
# characters — confirm against the splitter documentation.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len
)

In [None]:
# Split every transcript into chunks and build one metadata record per chunk
chunks = []
metadatas = []

for doc in documents:
    pieces = text_splitter.split_text(doc['content'])
    chunks.extend(pieces)
    # 'chunk' is the piece's position within its own video, starting at 0
    metadatas.extend([
        {'video_id': doc['video_id'], 'url': doc['url'], 'chunk': position}
        for position, _ in enumerate(pieces)
    ])

# Embed the chunks into a Chroma store under ./chroma_db and persist it
vectorstore = Chroma.from_texts(
    texts=chunks,
    metadatas=metadatas,
    embedding=embeddings,
    persist_directory="./chroma_db",
)
vectorstore.persist()

In [None]:
# Create QA chain
# - The retriever is asked for k=3 results per query from the vector store.
# - return_source_documents=True makes the chain output include
#   'source_documents', which ask_question reads to list the source videos.
# NOTE(review): chain_type="stuff" presumably places all retrieved chunks into
# a single prompt — confirm against the LangChain documentation.
qa_chain = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(temperature=0),
    chain_type="stuff",
    retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
    return_source_documents=True
)

def ask_question(question: str):
    """Run a question through the QA chain and print the outcome.

    Prints the question, the chain's answer, and then one line per returned
    source document showing the URL stored in its metadata. Nothing is
    returned.
    """
    response = qa_chain({"query": question})

    print(f"Question: {question}\n")
    answer = response['result']
    print(f"Answer: {answer}\n")
    print("Sources:")
    for source in response['source_documents']:
        source_url = source.metadata['url']
        print(f"- Video: {source_url}")

In [None]:
# Example question
# ask_question prints the question, the answer and the source video URLs;
# it does not return a value.
question = "What are the main topics discussed in these videos?"
ask_question(question)