In [11]:
import pandas as pd
import whisper

from langchain.tools import tool
from pytube import YouTube

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import chroma
from langchain.schema.runnable import RunnableLambda

from langchain.chains import RetrievalQA

In [9]:
@tool
def download_transcribe(video_url:str):
    """This extracts the text from the given video URL"""
    # NOTE: the docstring above is intentionally left unchanged; LangChain's
    # @tool decorator is documented to use it as the tool description.
    #
    # Steps: pick the highest-bitrate audio-only stream of the video, download
    # it to "audio.mp4" in the working directory, transcribe it with the
    # Whisper "base" model and return the transcript text.
    #
    # Raises ValueError when the video exposes no audio-only stream.
    yt = YouTube(video_url)

    # StreamQuery offers filter() (there is no load()), and the bitrate
    # attribute is spelled `abr`.
    audio = yt.streams.filter(only_audio=True).order_by('abr').desc().first()
    if audio is None:
        raise ValueError(f"No audio-only stream available for {video_url!r}")

    # download()'s first positional parameter is the output *directory*, so the
    # target file name has to be passed by keyword.
    audio_path = audio.download(filename="audio.mp4")

    model = whisper.load_model("base")
    result = model.transcribe(audio_path)
    return result['text']

In [12]:
def embed_store(text:str):
    """Split a transcript into chunks, embed them and index them in Chroma.

    Args:
        text: The full transcript to index.

    Returns:
        The Chroma vector store built from the chunks, stored under the
        collection name ``yt_chunks``.

    Raises:
        ValueError: If ``text`` yields no chunks (e.g. it is empty).
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 30)
    chunks = splitter.split_text(text)
    if not chunks:
        # Fail early with a readable message instead of handing an empty
        # batch to the vector store.
        raise ValueError("Cannot build a vector store from an empty transcript")

    embed_model = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    # `chroma` is the imported *module*; from_texts lives on its Chroma class.
    db = chroma.Chroma.from_texts(chunks, embed_model, collection_name='yt_chunks')

    return db

# Wrap embed_store in a RunnableLambda (imported from langchain.schema.runnable).
# None of the cells shown here references `runnable` afterwards.
runnable = RunnableLambda(embed_store)

In [14]:
from langchain_mistralai.chat_models import ChatMistralAI
import os

# SECURITY: the API key must never be written into the notebook. It is read
# from the MISTRAL_API_KEY environment variable, which has to be set before
# the kernel / app is started. Any key that was previously committed in this
# cell should be treated as leaked and revoked.
if not os.environ.get("MISTRAL_API_KEY"):
    raise RuntimeError(
        "MISTRAL_API_KEY is not set; export it in the environment before running this cell."
    )

# Shared chat model used by run_query.
llm = ChatMistralAI(model="mistral-small")

In [15]:
def run_query(query: str, db, k: int = 5):
    """Answer a question using the chunks stored in a vector store.

    Args:
        query: The question to answer.
        db: Vector store exposing ``as_retriever`` (what ``embed_store`` returns).
        k: Number of most similar chunks to retrieve as context.

    Returns:
        Whatever the retrieval QA chain's ``run`` returns for ``query``.
    """
    # The number of results belongs in search_kwargs rather than being passed
    # as a bare keyword to as_retriever.
    retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": k})
    # RetrievalQA needs a combine-documents chain; from_chain_type builds it
    # from the LLM instead of calling the constructor directly.
    chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)
    return chain.run(query)


In [16]:
import streamlit as st

# Streamlit front end: take a video URL and a question, then run the whole
# pipeline (download + transcribe -> embed + index -> retrieve + answer).
# NOTE(review): the captured output below this cell suggests Streamlit expects
# to be launched via `streamlit run`; this UI presumably belongs in a
# standalone script rather than a notebook cell -- confirm.
st.title("YouTube RAG Explorer")

video_url = st.text_input("Enter YouTube URL")
query = st.text_input("Ask a question")

if st.button("Process & Query"):
    if not video_url.strip() or not query.strip():
        # Avoid starting a download / LLM call with blank inputs.
        st.warning("Please enter both a YouTube URL and a question.")
    else:
        # download_transcribe is wrapped by @tool, so it is invoked through the
        # tool's run() method; the helpers are named download_transcribe and
        # embed_store in this notebook.
        transcript = download_transcribe.run(video_url)
        db = embed_store(transcript)
        response = run_query(query, db)
        st.markdown("### Answer")
        st.write(response)

2025-08-04 20:57:52.082 
  command:

    streamlit run C:\Users\JothiRama\AppData\Local\Programs\Python\Python310\lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
