In [48]:
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate
from langchain_groq import ChatGroq
from langchain_community.embeddings import HuggingFaceEmbeddings

In [None]:
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Read API key from environment variable
groq_api_key = os.getenv("GROQ_API_KEY")

### Step 1a - Indexing (Document Ingestion)

In [50]:
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled

video_id = "PGUdWfB8nLg"
try:
    api = YouTubeTranscriptApi()
    transcript_chunks_list = api.fetch(video_id, languages=["en"])

    # Use .text instead of ["text"]
    transcript = " ".join(chunk.text for chunk in transcript_chunks_list)
    print(transcript)

except TranscriptsDisabled:
    print("No captions available for this video.")


when I was young my family lived overseas uh I lived in Indonesia for a few years and my mother she didn't have the money to send me where all the American kids went to school but she thought it was important for me to keep up with an American education so she decided to teach me extra lessons herself Monday through Friday but because she had to go to work the only time she could do it was at 4:30 in the morning but whenever I'd complain my mother would just give me one looks and she'd say this is no picnic for me either Buster so I know that some of you are still adjusting to being back at school but I'm here today because I have something important to discuss with you my father left my family when I was 2 years old and I was raised by a single mom who had to work and who struggled at times to pay the bills and wasn't always able to give us the things that other kids had there were times when I missed having a father in my life there were times when I was lonely and I felt like I didn

In [51]:
transcript_chunks_list

FetchedTranscript(snippets=[FetchedTranscriptSnippet(text='when I was young my family lived', start=0.28, duration=4.8), FetchedTranscriptSnippet(text='overseas uh I lived in Indonesia for a', start=2.72, duration=5.199), FetchedTranscriptSnippet(text="few years and my mother she didn't have", start=5.08, duration=4.559), FetchedTranscriptSnippet(text='the money to send me where all the', start=7.919, duration=3.76), FetchedTranscriptSnippet(text='American kids went to school but she', start=9.639, duration=4.12), FetchedTranscriptSnippet(text='thought it was important for me to keep', start=11.679, duration=5.281), FetchedTranscriptSnippet(text='up with an American education so she', start=13.759, duration=6.081), FetchedTranscriptSnippet(text='decided to teach me extra lessons', start=16.96, duration=5.88), FetchedTranscriptSnippet(text='herself Monday through Friday but', start=19.84, duration=4.599), FetchedTranscriptSnippet(text='because she had to go to work the only', start=22.8

### Step 1b - Indexing (Text Splitting)

In [52]:
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = splitter.create_documents([transcript])

In [53]:
len(chunks)

12

In [54]:
chunks[0]

Document(metadata={}, page_content="when I was young my family lived overseas uh I lived in Indonesia for a few years and my mother she didn't have the money to send me where all the American kids went to school but she thought it was important for me to keep up with an American education so she decided to teach me extra lessons herself Monday through Friday but because she had to go to work the only time she could do it was at 4:30 in the morning but whenever I'd complain my mother would just give me one looks and she'd say this is no picnic for me either Buster so I know that some of you are still adjusting to being back at school but I'm here today because I have something important to discuss with you my father left my family when I was 2 years old and I was raised by a single mom who had to work and who struggled at times to pay the bills and wasn't always able to give us the things that other kids had there were times when I missed having a father in my life there were times when

### Step 1c & 1d - Indexing (Embedding Generation and Storing in Vector Store)

In [55]:
# Embeddings
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector_store = FAISS.from_documents(chunks, embeddings)

In [56]:
vector_store.index_to_docstore_id

{0: '8fc85cbd-3a88-4167-a2fb-e423f9e9dd71',
 1: 'aa70736b-44a5-44dc-ae29-955c92f65245',
 2: 'fb95cdf9-0de2-4278-a603-ad231a7dbb92',
 3: '25afaf92-e146-4741-aeb1-c46d71267778',
 4: '5826bbc2-4ee1-43b0-a8fc-b3e15010e15a',
 5: '8bee3d4a-d268-49a5-9fcf-9cf441901cc8',
 6: '4a4db1ab-5bc7-4a59-927f-d11cc75d2734',
 7: 'a5f3fb7f-b982-4d9c-9bcb-ef8b1621b022',
 8: '2d205d3d-1870-4f04-b93a-2f4d87fd1b8d',
 9: '7718213d-7c65-4e51-a8f0-eacfe193f848',
 10: 'addb7ca2-7614-4e43-8e7d-373fde35ba1a',
 11: '8fda2ffc-7747-4397-bc35-f4e2cb96eb20'}

In [57]:
vector_store.get_by_ids(['317d5af0-4329-4162-8847-ca52a003294d'])

[]

### Step 2 - Retrieval

In [58]:
# Retrieval
# What is similarity search? It finds the top k most similar documents to the query based on vector similarity (e.g., cosine similarity).
# What is maximal marginal relevance (MMR)? It balances relevance and diversity in the retrieved documents.
retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 4})

In [59]:
retriever

VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x000002331B7E72F0>, search_kwargs={'k': 4})

In [60]:
retriever.invoke('Who is Barack Obama?')

[Document(id='8fda2ffc-7747-4397-bc35-f4e2cb96eb20', metadata={}, page_content="want to ask all of you what's your contribution going to be what problems are you going to solve what discoveries will you make what will a president who comes here in 20 or 50 or 100 years say about what all of you did for this country you know your families your teachers and I are doing everything we can to make sure you have the education you need to answer these questions I'm working hard to fix up your classrooms and get you the books and the equipment and the computers you need to learn but you've got to do your part too so I expect all of you to get serious this year I expect you to put your best effort into everything you do I expect great things from each of you so don't let us down don't let your family down or your country down most of all don't let yourself down make us all proud [Music]"),
 Document(id='25afaf92-e146-4741-aeb1-c46d71267778', metadata={}, page_content="or a Supreme Court Justice

### Step 3 - Augmentation

In [61]:
prompt = PromptTemplate(
    template="""
      You are a helpful assistant.
      Answer ONLY from the provided transcript context.
      If the context is insufficient, just say you don't know.

      {context}
      Question: {question}
    """,
    input_variables = ['context', 'question']
)

In [62]:
# Retrieved documents from the vector store
question          = "Is the topic of nuclear fusion discussed in this video? if yes then what was discussed?"
retrieved_docs    = retriever.invoke(question)
retrieved_docs

[Document(id='8fda2ffc-7747-4397-bc35-f4e2cb96eb20', metadata={}, page_content="want to ask all of you what's your contribution going to be what problems are you going to solve what discoveries will you make what will a president who comes here in 20 or 50 or 100 years say about what all of you did for this country you know your families your teachers and I are doing everything we can to make sure you have the education you need to answer these questions I'm working hard to fix up your classrooms and get you the books and the equipment and the computers you need to learn but you've got to do your part too so I expect all of you to get serious this year I expect you to put your best effort into everything you do I expect great things from each of you so don't let us down don't let your family down or your country down most of all don't let yourself down make us all proud [Music]"),
 Document(id='5826bbc2-4ee1-43b0-a8fc-b3e15010e15a', metadata={}, page_content="can meet our greatest chal

In [63]:
# Combine the retrieved documents into a single context string
context_text = "\n\n".join(doc.page_content for doc in retrieved_docs)
context_text

"want to ask all of you what's your contribution going to be what problems are you going to solve what discoveries will you make what will a president who comes here in 20 or 50 or 100 years say about what all of you did for this country you know your families your teachers and I are doing everything we can to make sure you have the education you need to answer these questions I'm working hard to fix up your classrooms and get you the books and the equipment and the computers you need to learn but you've got to do your part too so I expect all of you to get serious this year I expect you to put your best effort into everything you do I expect great things from each of you so don't let us down don't let your family down or your country down most of all don't let yourself down make us all proud [Music]\n\ncan meet our greatest challenges in the future you'll need the knowledge and problem solving skills you learn in science and math to cure diseases like cancer and AIDS and to develop ne

In [64]:
# Augmentation
# Final Prompt 
# Combine context and question into the final prompt
final_prompt = prompt.invoke({"context": context_text, "question": question})
final_prompt

StringPromptValue(text="\n      You are a helpful assistant.\n      Answer ONLY from the provided transcript context.\n      If the context is insufficient, just say you don't know.\n\n      want to ask all of you what's your contribution going to be what problems are you going to solve what discoveries will you make what will a president who comes here in 20 or 50 or 100 years say about what all of you did for this country you know your families your teachers and I are doing everything we can to make sure you have the education you need to answer these questions I'm working hard to fix up your classrooms and get you the books and the equipment and the computers you need to learn but you've got to do your part too so I expect all of you to get serious this year I expect you to put your best effort into everything you do I expect great things from each of you so don't let us down don't let your family down or your country down most of all don't let yourself down make us all proud [Music

### Step 4 - Generation

In [65]:
# Generation
llm = ChatGroq(model="llama-3.3-70b-versatile", temperature=0.2)


In [66]:
answer = llm.invoke(final_prompt)
print(answer.content)


No, the topic of nuclear fusion is not discussed in this transcript. The transcript mentions "new Energy Technologies" as one of the areas where the knowledge and problem-solving skills learned in science and math can be applied, but it does not specifically mention nuclear fusion.


### Building The Chain


In [67]:
from langchain_core.runnables import RunnableParallel, RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser

In [68]:
def format_docs(retrieved_docs):
  context_text = "\n\n".join(doc.page_content for doc in retrieved_docs)
  return context_text

In [69]:
parallel_chain = RunnableParallel({
    'context': retriever | RunnableLambda(format_docs),
    'question': RunnablePassthrough()
})

In [70]:
parser = StrOutputParser()
main_chain = parallel_chain | prompt | llm | parser

In [71]:
main_chain.invoke('Can you summarize the video?')

'The speaker is encouraging students to take their education seriously, work hard, and strive for greatness. They emphasize that success is not easy and that failures are a part of the learning process, but can be used as opportunities to learn and grow. The speaker also reminds students that their circumstances do not determine their destiny and that they have the power to create their own future. They urge students to set goals for their education and do everything they can to meet them, and to not make excuses for not trying. The overall message is one of motivation and empowerment, encouraging students to take control of their own education and future.'