## **1. Setup**

The project requires several packages that need to be installed into Workspace

In [None]:
#!pip install -r requirements.txt

In [2]:
#!pip install openai

In [3]:
#!pip install python-dotenv

In [4]:
#!pip install -U langchain-community

In [5]:
#!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu

In [None]:
#!pip install transformers soundfile langchain docarray

In [None]:
#!pip install librosa
#!pip install langchain langsmith openai

In [None]:
#!pip install tiktoken
#!pip install faiss-cpu

In [4]:
import yt_dlp as youtube_dl
from yt_dlp import DownloadError
import os
import glob
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from langchain.document_loaders import TextLoader
from langchain.vectorstores import DocArrayInMemorySearch
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.agents import Tool, initialize_agent
from langchain.memory import ConversationBufferMemory
from langchain_core.chat_history import InMemoryChatMessageHistory
from dotenv import load_dotenv
from langchain.agents import initialize_agent, AgentType, Tool
from pydub import AudioSegment
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from openai import OpenAI

### **YouTube Audio Extraction & MP3 Conversion**

In [10]:
# Configuration for yt_dlp
def download_audio(url: str) -> str:
    
    ydl_config = {
        "format": "bestaudio/best",
        # List of post-processing steps; each dict represents one processor
        "postprocessors": [
            {
                "key": "FFmpegExtractAudio",   # Extract audio from the downloaded file
                "preferredcodec": "mp3",       # Convert audio to MP3 format
                "preferredquality": "192"      # Set audio quality to 192 kbps
            }
        ],
        # Template for naming the output file: "<video title>.<extension>"
        "outtmpl": "output/%(title)s.%(ext)s",
        "verbose": True  # Enable detailed logging
    }

    try:
        # Initialize the downloader with the specified config and start download
        with youtube_dl.YoutubeDL(ydl_config) as ydl:
            info = ydl.extract_info(url, download=True)
    except DownloadError as e:
        # Print any download errors that occur
        print("DownloadError:", e)
    
    return f"Downloaded: output_audio/{info['id']}.mp3"

### **Batch Transcription of MP3 Files with Whisper**

In [None]:
# # 1. Figure out if I can use GPU
# device = "cuda:0" if torch.cuda.is_available() else "cpu"
# #    and choose torch dtype to save memory on GPU
# torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# # 2. Pick the Whisper model I want
# model_id = "openai/whisper-small-v3"

# # 3. Load the model weights (using safetensors to speed things up)
# model = AutoModelForSpeechSeq2Seq.from_pretrained(
#     model_id,
#     torch_dtype=torch_dtype,
#     low_cpu_mem_usage=True,
#     use_safetensors=True
# )
# # move the model to the right device
# model.to(device)

In [None]:
# # 4. Load the processor that has both tokenizer & feature extractor
# processor = AutoProcessor.from_pretrained(model_id)

In [13]:
# # 5. Build the ASR pipeline with my model and processor
# pipe = pipeline(
#     "automatic-speech-recognition",
#     model=model,
#     tokenizer=processor.tokenizer,
#     feature_extractor=processor.feature_extractor,
#     torch_dtype=torch_dtype,
#     device=device,
# )

In [5]:
print("Current working dir:", os.getcwd())
print("ffmpeg exists:", os.path.exists(r"C:\ffmpeg-7.1.1-full_build\bin\ffmpeg.exe"))
print("ffprobe exists:", os.path.exists(r"C:\ffmpeg-7.1.1-full_build\bin\ffprobe.exe"))

Current working dir: c:\Users\007T\OneDrive\Desktop\ironhack2025\ai-bootcamp-final-project\Multimodal-AI-ChatBot-for-YouTube-Video-QA
ffmpeg exists: True
ffprobe exists: True


In [None]:
# Load your OpenAI API key from .env
load_dotenv()
api_key=os.getenv("OPENAI_API_KEY")
client = OpenAI()

In [140]:
import re
import math
# Configure ffmpeg for pydub (Windows)
AudioSegment.ffmpeg = r"C:\ffmpeg-7.1.1-full_build\bin\ffmpeg.exe"
AudioSegment.converter = AudioSegment.ffmpeg

# Define directories
input_folder = "audio_files/"
chunk_folder = "chunks"
transcripts_dir = "transcripts/"

# Create directories if they don't exist
os.makedirs(chunk_folder, exist_ok=True)
os.makedirs(transcripts_dir, exist_ok=True)

# Set chunk length (10 minutes in milliseconds)
chunk_length = 10 * 60 * 1000

# Find all MP3 files in the input folder
audio_files = glob.glob(os.path.join(input_folder, "*.mp3"))
if not audio_files:
    raise ValueError("❌ No MP3 files found in the audio_files/ directory")

# Process each audio file
for audio_path in audio_files:
    base_name = os.path.splitext(os.path.basename(audio_path).replace(" ", "_"))[0]
    base_name = re.sub(r'[^A-Za-z0-9_]', '', base_name)
    print(f"\n🎧 Processing video: {base_name}")

    # Define the output transcript file for this video
    transcript_file = os.path.join(transcripts_dir, f"{base_name}.txt")
    # Open the transcript file in write mode to clear previous contents if any
    with open(transcript_file, "w", encoding="utf-8") as f:
        f.write(f"# Transcript for {base_name}\n\n")

    # Load the audio file and calculate the number of chunks
    audio = AudioSegment.from_file(audio_path)
    total_length = len(audio)
    num_chunks = math.ceil(total_length / chunk_length)

    # Split the audio into chunks and transcribe
    for i in range(num_chunks):
        start = i * chunk_length
        end = min(start + chunk_length, total_length)
        chunk = audio[start:end]

        # Export chunk to MP3
        chunk_filename = f"{base_name}_chunk_{i}.mp3"
        chunk_path = os.path.join(chunk_folder, chunk_filename)
        chunk.export(chunk_path, format="mp3")

        print(f"🔹 Transcribing chunk {i+1}/{num_chunks}...")

        # Transcribe the chunk using Whisper
        with open(chunk_path, "rb") as ff:
            resp = client.audio.transcriptions.create(
                file=ff,
                model="whisper-1"
            )

        # Append the transcription result to the transcript file
        with open(transcript_file, "a", encoding="utf-8") as out:
            out.write(f"## {chunk_filename}\n")
            out.write(resp.text + "\n\n")

    print(f"✅ Done. Transcript saved to: {transcript_file}")



🎧 Processing video: IELTS_Listening_Practice__What_You_NEED_to_Know
🔹 Transcribing chunk 1/2...
🔹 Transcribing chunk 2/2...
✅ Done. Transcript saved to: transcripts/IELTS_Listening_Practice__What_You_NEED_to_Know.txt

🎧 Processing video: IELTS_Vocabulary_Synonyms_for_the_40_Most_Commonly_Used_Wordsmp3_chunk_0
🔹 Transcribing chunk 1/2...
🔹 Transcribing chunk 2/2...
✅ Done. Transcript saved to: transcripts/IELTS_Vocabulary_Synonyms_for_the_40_Most_Commonly_Used_Wordsmp3_chunk_0.txt


## **Transcript Ingestion and Semantic Retrieval Setup**

#### **Document Loading (DirectoryLoader)**

We replace the single‐file TextLoader with a DirectoryLoader that reads all *.txt files under ./transcripts/.

Why this change: This makes the code automatically pick up every transcript file (i.e. every video) without hard-coding one filename.

In [6]:
# Create a new instance of the TextLoader class, specifying the directory containing the text files
#loader = TextLoader("./transcripts/transcript.txt")

# Load the documents from the specified directory using the TextLoader instance
#docs = loader.load()


loader = DirectoryLoader(
    "./transcripts",
    glob="*.txt",
    loader_cls=TextLoader
)
docs = loader.load()

In [8]:
docs[0]

Document(metadata={'source': 'transcripts\\IELTS_Listening_Practice__What_You_NEED_to_Know.txt'}, page_content="# Transcript for IELTS_Listening_Practice__What_You_NEED_to_Know\n\n## IELTS_Listening_Practice__What_You_NEED_to_Know_chunk_0.mp3\nHi everyone, Chris here from IELTSadvantage.com with another lesson and today what we're gonna focus on is how to practice IELTS listening. So what we're gonna do is look at why doing lots of practice tests is a terrible idea. In fact, this is the worst thing you could do. If you think that just doing lots and lots and lots of practice tests is gonna help you get a higher score, you are wrong. But what then I'm gonna show you is three better ways to practice that will actually improve your scores because at the end of the day, what are we doing? We're helping you improve your scores. So we're only gonna teach you the things that work and make you aware of the things that don't work. And these three ways are totally free and you can do them at hom

#### **Recursive Text Splitting**

We use RecursiveCharacterTextSplitter (with separators ["\n\n","\n"," ",""]) to break each loaded document into ~600-character chunks with 100-character overlap.

Why this change: The splitter will try to cut at paragraph or line boundaries first, preserving sentence integrity before falling back to a pure character count.

In [12]:
"""from langchain.text_splitter import CharacterTextSplitter
text_splitter = CharacterTextSplitter(chunk_size=600, chunk_overlap=100)
docs = text_splitter.split_documents(docs)

embeddings = OpenAIEmbeddings(openai_api_key=api_key)
from langchain.vectorstores import FAISS

vectorstore = FAISS.from_documents(docs, embeddings)
"""

splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    separators=["\n\n", "\n", " ", ""]
)
chunks = splitter.split_documents(docs)

In [14]:
chunks

[Document(metadata={'source': 'transcripts\\IELTS_Listening_Practice__What_You_NEED_to_Know.txt'}, page_content='# Transcript for IELTS_Listening_Practice__What_You_NEED_to_Know'),
 Document(metadata={'source': 'transcripts\\IELTS_Listening_Practice__What_You_NEED_to_Know.txt'}, page_content='## IELTS_Listening_Practice__What_You_NEED_to_Know_chunk_0.mp3'),
 Document(metadata={'source': 'transcripts\\IELTS_Listening_Practice__What_You_NEED_to_Know.txt'}, page_content="Hi everyone, Chris here from IELTSadvantage.com with another lesson and today what we're gonna focus on is how to practice IELTS listening. So what we're gonna do is look at why doing lots of practice tests is a terrible idea. In fact, this is the worst thing you could do. If you think that just doing lots and lots and lots of practice tests is gonna help you get a higher score, you are wrong. But what then I'm gonna show you is three better ways to practice that will actually improve your scores because at the end of the

In [13]:
len(chunks[2].page_content)

994

In [None]:
"""from langchain.text_splitter import CharacterTextSplitter
text_splitter = CharacterTextSplitter(chunk_size=600, chunk_overlap=100)
docs = text_splitter.split_documents(docs)
"""

#### **Assigning Source Metadata**

What happens here: We attach chunk.metadata["source"] = video_id to each chunk, where video_id is the base filename of the transcript.


In [15]:
for chunk in chunks:
    video_id = os.path.splitext(os.path.basename(chunk.metadata["source"]))[0]
    chunk.metadata["source"] = video_id

#### **Embedding Generation & FAISS Indexing**

In [None]:
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002") 
vectorstore = FAISS.from_documents(chunks, embeddings)

  embeddings = OpenAIEmbeddings() # model="text-embedding-ada-002"


#### **Retriever Initialization**

In [58]:
retriever = vectorstore.as_retriever()

Tracer

In [19]:
langsmith_key = os.getenv("LANGSMITH_API_KEY")

In [21]:
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.tracers import LangChainTracer


# 1. Create the tracer
tracer = LangChainTracer()

# 2. Build a CallbackManager with it
callback_manager = CallbackManager([tracer])

# 3. Inject into your LLM
llm = ChatOpenAI(
    model_name="gpt-4",
    temperature=0,
    callback_manager=callback_manager,
    verbose=True,
)

  llm = ChatOpenAI(
  llm = ChatOpenAI(


DEBUG:langsmith.client:Sending compressed multipart request with context: trace=58b5689a-e74a-458e-835a-3d3956bce223,id=58b5689a-e74a-458e-835a-3d3956bce223; trace=58b5689a-e74a-458e-835a-3d3956bce223,id=58b5689a-e74a-458e-835a-3d3956bce223
DEBUG:langsmith.client:Sending compressed multipart request with context: trace=c04c06ce-2e10-4b58-b985-887fff318356,id=c04c06ce-2e10-4b58-b985-887fff318356; trace=c04c06ce-2e10-4b58-b985-887fff318356,id=b5151bf9-e5eb-4388-879c-39b04d12960e; trace=c04c06ce-2e10-4b58-b985-887fff318356,id=9179575b-37d8-4c8f-8ef5-ac13097f8c97; trace=9179575b-37d8-4c8f-8ef5-ac13097f8c97,id=9179575b-37d8-4c8f-8ef5-ac13097f8c97
DEBUG:langsmith.client:Sending compressed multipart request with context: trace=c04c06ce-2e10-4b58-b985-887fff318356,id=9179575b-37d8-4c8f-8ef5-ac13097f8c97; trace=9179575b-37d8-4c8f-8ef5-ac13097f8c97,id=9179575b-37d8-4c8f-8ef5-ac13097f8c97; trace=c04c06ce-2e10-4b58-b985-887fff318356,id=b5151bf9-e5eb-4388-879c-39b04d12960e; trace=c04c06ce-2e10-4b58

In [35]:
from langchain.prompts import PromptTemplate

# Custom Prompt Template for Strict QA
STRICT_QA_PROMPT = PromptTemplate(
    template="""Answer ONLY from the video transcript. If not found, say "This information is not mentioned in the transcript of the video".
    
    Transcript: {context}
    
    Question: {question}
    
    Answer:""",
    input_variables=["context", "question"]
)


Builds a RetrievalQA chain (using the “stuff” strategy) that ties together the LLM and retriever, with verbose=True to print intermediate debug info.

In [59]:
# Create a RetrievalQA instance with the specified parameters
qa_stuff = RetrievalQA.from_chain_type(
llm=llm,
chain_type="stuff",
retriever=retriever,
return_source_documents=True,
chain_type_kwargs={"prompt": STRICT_QA_PROMPT},
verbose=True)


In [26]:
#qa_stuff?

In [66]:
chat_history = InMemoryChatMessageHistory()

memory = ConversationBufferMemory(
    memory_key="chat_history",
    chat_memory=chat_history,
    return_messages=True
)

In [67]:
chain = RetrievalQA.from_chain_type(
llm=llm,
chain_type="stuff",
retriever=retriever,
return_source_documents=True,
memory=memory,
chain_type_kwargs={"prompt": STRICT_QA_PROMPT},
verbose=True)

### **Multi-Tool Conversational Agent with Memory**

Sets up three custom tools (transcript Q&A, summarization), configures a conversation buffer to remember past messages, and initializes a LangChain agent that can use these tools interactively while preserving chat history.

In [72]:
#define a summarization function and wrap it as a tool
def summarize_transcript(text: str) -> str:
    output = chain({"query": f"Summarize this:\n\n{text}"})
    answer = output["result"]
    sources = "\n\n".join([doc.page_content for doc in output['source_documents']])
    return f"Answer:\n{answer}\n\nSources:\n{sources}"

def run_TranscriptQA(question: str) -> str:
    output = chain({"query": question})
    answer = output["result"]
    sources = "\n\n".join([doc.page_content for doc in output['source_documents']])
    if not sources.strip() or not answer.strip() or "not mentioned" in answer.lower():
        return "Sorry, this information is not in the video transcript."
    return f"Answer:\n{answer}\n\nSources:\n{sources}"


In [37]:
tools = [Tool(
    name="TranscriptQA",
    func=run_TranscriptQA,
    description="The user will ask about the video but you will Answer questions strictly based on the video transcript. If the information is not in the transcript, no answer will be provided.No general knowledge.",
    return_direct=True),
   
         
    Tool(
    name="TranscriptSummarizer",
    func=summarize_transcript,
    description="Generate a concise summary of a given transcript text.",
    return_direct=True
    
)]


In [106]:
session_store = {}

def get_session_history(session_id: str) -> InMemoryChatMessageHistory:
    """Ensures each session has its own message history"""
    if session_id not in session_store:
        session_store[session_id] = InMemoryChatMessageHistory()
    return session_store[session_id]

In [None]:
import uuid
session_id = str(uuid.uuid4())  # e.g. "4f9b8a2e-1c3d-4f5a-9e6b-7d8f0a1b2c3d"

#configure a memory buffer to keep the full conversation history
memory = get_session_history(session_id)

In [None]:
# I initialize the agent with my tools, the LLM, and the memory buffer
agent = initialize_agent(
    tools=tools,
    llm=llm,
    agent=AgentType.CHAT_CONVERSATIONAL_REACT_DESCRIPTION , #CHAT_CONVERSATIONAL_REACT_DESCRIPTION
    memory=memory,
    handle_parsing_errors=True,
    verbose=True,
    early_stopping_method="force",
    max_iterations=4,
    return_intermediate_steps=True
)
    

In [None]:
agent.agent.llm_chain.prompt.template = f"""
You are a video expert. Use this transcript to answer questions:

{{transcript_chunks}}

Always respond with 'Final Answer: <your answer>'.
Human: {{input}}
"""

### **Execute RetrievalQA Query & Display Answer**

In [29]:
import logging

# This will log all langsmith internals to stdout
langsmith_logger = logging.getLogger("langsmith")
langsmith_logger.setLevel(logging.DEBUG)
logging.basicConfig()  # ensure handlers are configured


In [73]:
from langchain_core.tracers.context import tracing_v2_enabled
with tracing_v2_enabled():
    # set the query
    query = "What is this the video content about?"

    #run the query 
    response = agent.invoke(query)

    #print response
    print(response)




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
    "action": "TranscriptSummarizer",
    "action_input": "What is this the video content about?"
}
```[0m

[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m

Observation: [33;1m[1;3mAnswer:
The video content is about the importance of consistent practice in improving listening skills for IELTS. It emphasizes that studying doesn't have to be boring or laborious, and encourages students to listen to things they enjoy, such as podcasts, TV, movies, radio, TED Talks, YouTube, and real people. The speaker highlights the convenience of having these resources available on a phone, providing an unlimited supply of listening material.

Sources:
# Transcript for IELTS_Listening_Practice__What_You_NEED_to_Know

## IELTS_Listening_Practice__What_You_NEED_to_Know_chunk_0.mp3

## IELTS_Listening_Practice__What_You_NEED_to_Know_chunk_1.mp3

about, you need to do something that you enjoy so that you will do



In [74]:
with tracing_v2_enabled():
    # set the query
    query = "2+2"

    #run the query 
    response = agent.invoke(query)

    #print response
    print(response)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
    "action": "Final Answer",
    "action_input": "4"
}
```[0m

[1m> Finished chain.[0m
{'input': '2+2', 'chat_history': [HumanMessage(content='What is this the video content about?', additional_kwargs={}, response_metadata={}), AIMessage(content='Sorry, this information is not in the video transcript.', additional_kwargs={}, response_metadata={}), HumanMessage(content='What is this the video content about?', additional_kwargs={}, response_metadata={}), AIMessage(content="Answer:\nThe video content is about the importance of consistent practice in improving listening skills for IELTS. It emphasizes that studying doesn't have to be boring or laborious, and encourages students to listen to things they enjoy, such as podcasts, TV, movies, radio, TED Talks, YouTube, and real people. The speaker highlights the convenience of having these resources available on a phone, providing an unlimited supply of listening materi



In [76]:
with tracing_v2_enabled():
    query = "what is the capital of ksa"

    #run the query 
    response = agent.invoke(query)

    #print response
    print(response)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
    "action": "Final Answer",
    "action_input": "The capital of the Kingdom of Saudi Arabia (KSA) is Riyadh."
}
```[0m

[1m> Finished chain.[0m
{'input': 'what is the capital of ksa', 'chat_history': [HumanMessage(content='What is this the video content about?', additional_kwargs={}, response_metadata={}), AIMessage(content='Sorry, this information is not in the video transcript.', additional_kwargs={}, response_metadata={}), HumanMessage(content='What is this the video content about?', additional_kwargs={}, response_metadata={}), AIMessage(content="Answer:\nThe video content is about the importance of consistent practice in improving listening skills for IELTS. It emphasizes that studying doesn't have to be boring or laborious, and encourages students to listen to things they enjoy, such as podcasts, TV, movies, radio, TED Talks, YouTube, and real people. The speaker highlights the convenience of having these 



In [64]:
memory.chat_memory.messages

[HumanMessage(content='What is this the video content about?', additional_kwargs={}, response_metadata={}),
 AIMessage(content='Answer:\nThis information is not mentioned in the transcript of the video.\n\nSources:\n# Transcript for IELTS_Listening_Practice__What_You_NEED_to_Know\n\n# Transcript for IELTS_Vocabulary_Synonyms_for_the_40_Most_Commonly_Used_Wordsmp3_chunk_0\n\n## IELTS_Listening_Practice__What_You_NEED_to_Know_chunk_0.mp3', additional_kwargs={}, response_metadata={}),
 HumanMessage(content='What is this the video content about?', additional_kwargs={}, response_metadata={}),
 AIMessage(content='Answer:\nThis information is not mentioned in the transcript of the video.\n\nSources:\n# Transcript for IELTS_Listening_Practice__What_You_NEED_to_Know\n\n# Transcript for IELTS_Vocabulary_Synonyms_for_the_40_Most_Commonly_Used_Wordsmp3_chunk_0\n\n## IELTS_Listening_Practice__What_You_NEED_to_Know_chunk_0.mp3\n\n## IELTS_Listening_Practice__What_You_NEED_to_Know_chunk_1.mp3', addit

In [70]:
# set the query
query = "did you remember the first qustion I asked?"

#run the query 
response = agent.invoke(query)

#print response
print(response)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI should use the TranscriptQA tool to find the answer to this question.
Action: TranscriptQA
Action Input: "did you remember the first question I asked?"[0m

[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m

Observation: [36;1m[1;3mSorry, this information is not in the video transcript.[0m
[32;1m[1;3m[0m

[1m> Finished chain.[0m
{'input': 'did you remember the first qustion I asked?', '{chat_history}': [HumanMessage(content='What is this the video about?', additional_kwargs={}, response_metadata={}), AIMessage(content="Answer:\nMany IELTS students use the same words repeatedly, which can limit their vocabulary and band score. The video teaches 40 synonyms to help improve vocabulary and score high on the IELTS exam. Synonyms include claim for say, discuss for talk about, advocate for recommend, clarify for explain about, offer for give, inquire for ask, presume for think, consider for think about

In [81]:
with tracing_v2_enabled():
    # Test 1: Remembering user info
    print(agent.invoke("Hi, I'm Layla"))  # Should store name
    print(agent.invoke("What's my name?"))  # Should return "Your name is Layla"
    # Test 2: Block math questions
    print(agent.invoke("What's 2+2?"))  
    # Returns: "I only answer questions based on IELTS video content."
    # Test 3: Follow-up questions
    print(agent.invoke("What was the main tip for listening?")) 
    # Returns answer from transcript with sources
    print(agent.invoke("which is the most importent one of them"))
    print(agent.invoke("Summarize the video"))




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m```json
{
    "action": "Final Answer",
    "action_input": "Hello Layla! How can I assist you today?"
}
```[0m

[1m> Finished chain.[0m
{'input': "Hi, I'm Layla", 'chat_history': [HumanMessage(content='What is this the video content about?', additional_kwargs={}, response_metadata={}), AIMessage(content='Sorry, this information is not in the video transcript.', additional_kwargs={}, response_metadata={}), HumanMessage(content='What is this the video content about?', additional_kwargs={}, response_metadata={}), AIMessage(content="Answer:\nThe video content is about the importance of consistent practice in improving listening skills for IELTS. It emphasizes that studying doesn't have to be boring or laborious, and encourages students to listen to things they enjoy, such as podcasts, TV, movies, radio, TED Talks, YouTube, and real people. The speaker highlights the convenience of having these resources available on a phone, 



[32;1m[1;3m```json
{
    "action": "Final Answer",
    "action_input": "Your name is Layla."
}
```[0m

[1m> Finished chain.[0m
{'input': "What's my name?", 'chat_history': [HumanMessage(content='What is this the video content about?', additional_kwargs={}, response_metadata={}), AIMessage(content='Sorry, this information is not in the video transcript.', additional_kwargs={}, response_metadata={}), HumanMessage(content='What is this the video content about?', additional_kwargs={}, response_metadata={}), AIMessage(content="Answer:\nThe video content is about the importance of consistent practice in improving listening skills for IELTS. It emphasizes that studying doesn't have to be boring or laborious, and encourages students to listen to things they enjoy, such as podcasts, TV, movies, radio, TED Talks, YouTube, and real people. The speaker highlights the convenience of having these resources available on a phone, providing an unlimited supply of listening material.\n\nSources:\n#



[32;1m[1;3m```json
{
    "action": "Final Answer",
    "action_input": "4"
}
```[0m

[1m> Finished chain.[0m
{'input': "What's 2+2?", 'chat_history': [HumanMessage(content='What is this the video content about?', additional_kwargs={}, response_metadata={}), AIMessage(content='Sorry, this information is not in the video transcript.', additional_kwargs={}, response_metadata={}), HumanMessage(content='What is this the video content about?', additional_kwargs={}, response_metadata={}), AIMessage(content="Answer:\nThe video content is about the importance of consistent practice in improving listening skills for IELTS. It emphasizes that studying doesn't have to be boring or laborious, and encourages students to listen to things they enjoy, such as podcasts, TV, movies, radio, TED Talks, YouTube, and real people. The speaker highlights the convenience of having these resources available on a phone, providing an unlimited supply of listening material.\n\nSources:\n# Transcript for IELTS_



[32;1m[1;3m```json
{
    "action": "TranscriptQA",
    "action_input": "What was the main tip for listening?"
}
```[0m

[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m

Observation: [36;1m[1;3mAnswer:
The main tip for listening is to actively focus on one or two things while listening to a podcast, TV show, movie, or YouTube video. This could be focusing on vocabulary, accent, intonation, or multiple speakers. The speaker suggests noting down new words, guessing their meaning from context, and noting down all the information needed to remember that word. This active listening is more beneficial than passive listening.

Sources:
So listening passively is just sitting on the bus, looking out the window, listening to a podcast, and not really thinking about it. Now, that's better than nothing, but it is better to actively focus on one, maximum two things while you're listening to that podcast, or that TV show, or that movie, or that YouTube video. Focus on one 



[32;1m[1;3m```json
{
    "action": "TranscriptQA",
    "action_input": "which is the most important one of them"
}
```[0m

[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m

Observation: [36;1m[1;3mSorry, this information is not in the video transcript.[0m
[32;1m[1;3m[0m

[1m> Finished chain.[0m
{'input': 'which is the most importent one of them', 'chat_history': [HumanMessage(content='What is this the video content about?', additional_kwargs={}, response_metadata={}), AIMessage(content='Sorry, this information is not in the video transcript.', additional_kwargs={}, response_metadata={}), HumanMessage(content='What is this the video content about?', additional_kwargs={}, response_metadata={}), AIMessage(content="Answer:\nThe video content is about the importance of consistent practice in improving listening skills for IELTS. It emphasizes that studying doesn't have to be boring or laborious, and encourages students to listen to things they enjoy, such as



[32;1m[1;3m```json
{
    "action": "TranscriptSummarizer",
    "action_input": "Transcript for IELTS_Listening_Practice__What_You_NEED_to_Know"
}
```[0m

[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m

Observation: [33;1m[1;3mAnswer:
This information is not mentioned in the transcript of the video.

Sources:
# Transcript for IELTS_Listening_Practice__What_You_NEED_to_Know

## IELTS_Listening_Practice__What_You_NEED_to_Know_chunk_1.mp3

## IELTS_Listening_Practice__What_You_NEED_to_Know_chunk_0.mp3

# Transcript for IELTS_Vocabulary_Synonyms_for_the_40_Most_Commonly_Used_Wordsmp3_chunk_0[0m
[32;1m[1;3m[0m

[1m> Finished chain.[0m
{'input': 'Summarize the video', 'chat_history': [HumanMessage(content='What is this the video content about?', additional_kwargs={}, response_metadata={}), AIMessage(content='Sorry, this information is not in the video transcript.', additional_kwargs={}, response_metadata={}), HumanMessage(content='What is this the video cont



### LLM Evaluation using Langsmith

#### Response <> Reference Answer:

This measures exact answer accuracy—did the QA chain produce the exact text (or an acceptable paraphrase) you expected?

using a QA evaluation chain (e.g. QAEvalChain) to rate correctness.

In [None]:
from langchain.evaluation.qa import QAEvalChain

In [118]:
eval_chain = QAEvalChain.from_llm(
    llm=llm,
    tracer=tracer,
    evaluation_name="rag-qa-evaluation"
)

In [119]:
examples = [
    {
        "query": "What is one benefit of using synonyms according to the video?",
        "result": "Using synonyms demonstrates a broad vocabulary, which can increase your IELTS band score.",
        "reference": "Using synonyms demonstrates a broad vocabulary, which can increase your IELTS band score."
    },
    {
        "query": "How many synonyms will the video teach you?",
        "result": "40",
        "reference": "40"
    },
    {
        "query": "What synonym does the video suggest for 'say'?",
        "result": "claim",
        "reference": "claim"
    },
    {
        "query": "Which synonym replaces 'talk about' when reporting general subjects?",
        "result": "discuss",
        "reference": "discuss"
    },
    {
        "query": "What advanced synonym for 'recommend' is introduced?",
        "result": "advocate",
        "reference": "advocate"
    },
    {
        "query": "What is the suggested synonym for 'explain about'?",
        "result": "clarified",
        "reference": "clarified"
    },
    {
        "query": "What polite synonym does the video recommend for 'give'?",
        "result": "offer",
        "reference": "offer"
    },
    {
        "query": "Which word is recommended as a formal synonym for 'ask'?",
        "result": "inquire",
        "reference": "inquire"
    },
    {
        "query": "What alternative word can replace 'easy'?",
        "result": "simple",
        "reference": "simple"
    },
    {
        "query": "What synonym does the video give for 'tip'?",
        "result": "recommendation",
        "reference": "recommendation"
    },
    {
        "query": "Which phrase can be used instead of 'for example'?",
        "result": "to illustrate",
        "reference": "to illustrate"
    },
    {
        "query": "What longer synonym is suggested for 'extra'?",
        "result": "additional",
        "reference": "additional"
    },
    {
        "query": "What paraphrase does the video offer for 'I think'?",
        "result": "presume",
        "reference": "presume"
    },
    {
        "query": "What verb replaces 'think about' when considering something?",
        "result": "consider",
        "reference": "consider"
    },
    {
        "query": "Which word can substitute 'maybe'?",
        "result": "possibly",
        "reference": "possibly"
    },
    {
        "query": "What formal synonym for 'people' is shown?",
        "result": "individuals",
        "reference": "individuals"
    },
    {
        "query": "What precise term replaces 'family' when speaking of those living together?",
        "result": "household",
        "reference": "household"
    },
    {
        "query": "What formal verb does the video propose instead of 'live'?",
        "result": "reside",
        "reference": "reside"
    },
    {
        "query": "Which term is used as a synonym for 'town'?",
        "result": "community",
        "reference": "community"
    },
    {
        "query": "What synonym does the video suggest for 'city'?",
        "result": "metropolis",
        "reference": "metropolis"
    }
]


In [None]:
predictions = [{"result": ex["result"]} for ex in examples]

In [123]:
with tracing_v2_enabled():
    results = eval_chain.evaluate(
        examples,
        predictions,
        question_key="query",
        answer_key="reference",
        prediction_key="result",
    )


In [None]:
#   examples  → List[dict] each dict have keys: "query", "result", "reference"
#   results   → List[dict] each dict have keys: "score" , (optional) "reasoning"

for example, eval_out in zip(examples, results):
    query = example["query"]
    prediction = example["result"]
    reference = example["reference"]
    score = eval_out["results"]
    reasoning = eval_out.get("reasoning")

    print(f"Query:     {query}")
    print(f"Result:    {prediction}")
    print(f"Reference: {reference}")
    print(f"Score:     {score}")
    if reasoning:
        print(f"Reasoning: {reasoning}")
    print("-" * 40)


Query:     What is one benefit of using synonyms according to the video?
Result:    Using synonyms demonstrates a broad vocabulary, which can increase your IELTS band score.
Reference: Using synonyms demonstrates a broad vocabulary, which can increase your IELTS band score.
Score:     CORRECT
----------------------------------------
Query:     How many synonyms will the video teach you?
Result:    40
Reference: 40
Score:     CORRECT
----------------------------------------
Query:     What synonym does the video suggest for 'say'?
Result:    claim
Reference: claim
Score:     CORRECT
----------------------------------------
Query:     Which synonym replaces 'talk about' when reporting general subjects?
Result:    discuss
Reference: discuss
Score:     CORRECT
----------------------------------------
Query:     What advanced synonym for 'recommend' is introduced?
Result:    advocate
Reference: advocate
Score:     CORRECT
----------------------------------------
Query:     What is the sugge

In [None]:
# Test 1: Remembering user info
print(agent.invoke("Hi, I'm Layla"))  # Should store name
print(agent.invoke("What's my name?"))  # Should return "Your name is Layla"
# Test 2: Block math questions
print(agent.invoke("What's 2+2?"))  
# Returns: "I only answer questions based on IELTS video content."
# Test 3: Follow-up questions
print(agent.invoke("What was the main tip for listening?")) 
# Returns answer from transcript with sources
print(agent.invoke("Summarize the video"))

In [89]:
from langchain.evaluation.qa import QAEvalChain
# Ground truth examples: list of {'query': ..., 'answer': ...} pairs
examples = [
{"query": "What is the main topic of the video?", "answer": "The video discusses..."},
{"query": "summarize the content in 5 bold points", "answer": "In summary..."},
]

output = chain.invoke({"query": examples[0]["query"]})
print(output)


# Get predictions
predictions = [chain.invoke(e["query"])["result"] for e in examples]

# Evaluate faithfulness
llm_eval = ChatOpenAI(temperature=0, model_name="gpt-4")
eval_chain = QAEvalChain.from_llm(llm=llm_eval)
graded_outputs = eval_chain.evaluate(examples, predictions)
for g in graded_outputs:
    print(g)



[1m> Entering new RetrievalQA chain...[0m


ValueError: Got multiple output keys: dict_keys(['result', 'source_documents']), cannot determine which to store in memory. Please set the 'output_key' explicitly.

In [None]:
from langchain.evaluation.criteria import CriteriaEvalChain

criteria = {
        "relevance": "Does the answer directly answer the question?",
        "consistency": "Is the answer internally consistent and non-contradictory?"
    }

criteria_eval_chain = CriteriaEvalChain.from_llm(llm=llm_eval, criteria=criteria)

for i, ex in enumerate(examples):
        evaluation = criteria_eval_chain.evaluate_strings(
            input=ex["query"],
            prediction=predictions[i],
            reference=ex["answer"]
        )
        print(evaluation)

#### **Transcript Retrieval Performance Evaluation**


#### Retrieved Documents <> Input:

measure how well your retriever fetches the correct transcript files for each query.

In other words, compare the set of documents returned by the retriever against your ground-truth set of relevant files for that query.

Typical metrics are:

- Precision@k – the fraction of the top-k retrieved files that are actually relevant.

- MRR@k (Mean Reciprocal Rank) – the reciprocal of the rank at which the first relevant file appears.

- NDCG@k – how well all relevant files are ordered within the top-k, giving higher weight to those appearing earlier.

**Ground Truth**
Defines, for each query, the exact set of transcript files that contain the correct answer. It’s “answer key” for retrieval, mapping each question to the filenames you expect to see among the top results.

In [149]:
ground_truths = {
    # ── IELTS Listening Practice transcript ──
    "What sections are included in the IELTS Listening test?": {
        "IELTS_Listening_Practice__What_You_NEED_to_Know.txt"
    },
    "How many times will you hear each recording?": {
        "IELTS_Listening_Practice__What_You_NEED_to_Know.txt"
    },
    "What should you do if you miss an answer during the test?": {
        "IELTS_Listening_Practice__What_You_NEED_to_Know.txt"
    },
    "When can you transfer your answers onto the answer sheet?": {
        "IELTS_Listening_Practice__What_You_NEED_to_Know.txt"
    },
    "How much time is given at the end to check your answers?": {
        "IELTS_Listening_Practice__What_You_NEED_to_Know.txt"
    },

    # ── Vocabulary Synonyms transcript ──
    "What synonym does the video suggest for 'say'?": {
        "IELTS_Vocabulary_Synonyms_for_the_40_Most_Commonly_Used_Wordsmp3_chunk_0.txt"
    },
    "Which word can replace 'easy'?": {
        "IELTS_Vocabulary_Synonyms_for_the_40_Most_Commonly_Used_Wordsmp3_chunk_0.txt"
    },
    "What fancy synonym is given for 'tip'?": {
        "IELTS_Vocabulary_Synonyms_for_the_40_Most_Commonly_Used_Wordsmp3_chunk_0.txt"
    },
    "What phrase does the video recommend instead of 'for example'?": {
        "IELTS_Vocabulary_Synonyms_for_the_40_Most_Commonly_Used_Wordsmp3_chunk_0.txt"
    },
    "How does the video suggest paraphrasing 'people'?": {
        "IELTS_Vocabulary_Synonyms_for_the_40_Most_Commonly_Used_Wordsmp3_chunk_0.txt"
    },
}


**get_topk_sources**

A helper function that Returns a list of the corresponding transcript filenames (from metadata["source_file"]).

In [159]:
def get_topk_sources(query: str, k: int = 2) -> list[str]:
    docs = retriever.get_relevant_documents(query)[:k]
    return [doc.metadata["source"] for doc in docs]


**Precision@k**
The fraction of the top k retrieved filenames that are actually in the ground truth set.

Precision@k =
#(
retrieved
𝑘
 
∩
 
ground_truth
)
/ 𝑘



In [None]:
#!pip install scikit-learn

In [153]:
from sklearn.metrics import ndcg_score

def precision_at_k(retrieved: list[str], relevant: set[str]) -> float:
    return sum(1 for src in retrieved if src in relevant) / len(retrieved)


**MRR@k (Mean Reciprocal Rank)**

For each query, find the rank r (position) of the first correct file within the top k. Its reciprocal (1/ r) is the score for that query. MRR@k is the average of those reciprocals over all queries.

In [154]:
def compute_mrr(retrieved: list[str], relevant: set[str]) -> float:
    for i, src in enumerate(retrieved, start=1):
        if src in relevant:
            return 1.0 / i
    return 0.0

**NDCG@k (Normalized Discounted Cumulative Gain)**
Measures how well the retriever orders all relevant files within the top k

In [155]:
def ndcg_at_k(retrieved: list[str], relevant: set[str], k: int) -> float:
    true_rel = [1 if src in relevant else 0 for src in retrieved]
    return ndcg_score([true_rel], [true_rel], k=k)

In [160]:
import numpy as np

def evaluate_retrieval(ground_truths: dict[str, set[str]], k: int = 5):
    precisions, mrrs, ndcgs = [], [], []
    for query, relevant in ground_truths.items():
        topk = get_topk_sources(query, k)
        precisions.append(precision_at_k(topk, relevant))
        mrrs.append(compute_mrr(topk, relevant))
        ndcgs.append(ndcg_at_k(topk, relevant, k))
    print(f"Precision@{k}: {np.mean(precisions):.3f}")
    print(f"MRR@{k}:       {np.mean(mrrs):.3f}")
    print(f"NDCG@{k}:      {np.mean(ndcgs):.3f}")

with tracing_v2_enabled():
    evaluate_retrieval(ground_truths)


Precision@5: 0.000
MRR@5:       0.000
NDCG@5:      0.000


In [162]:
q = "What synonym does the video suggest for 'say'?"
with tracing_v2_enabled():
    docs = retriever.get_relevant_documents(q)[:5]
    for i, doc in enumerate(docs, 1):
        print(f"{i}. {doc.metadata['source']}")
        print("   →", doc.page_content[:100].replace("\n"," "), "…\n")


1. IELTS_Vocabulary_Synonyms_for_the_40_Most_Commonly_Used_Wordsmp3_chunk_0
   → you get one point. Comment below how many synonyms you know. For example, 20, 35 or 40. Let's get st …

2. IELTS_Vocabulary_Synonyms_for_the_40_Most_Commonly_Used_Wordsmp3_chunk_0
   → prices would rise. Another common word for reporting what someone said is talk about. It's used for  …

3. IELTS_Vocabulary_Synonyms_for_the_40_Most_Commonly_Used_Wordsmp3_chunk_0
   → Many IELTS students use the same words over and over again. And this is a problem because one of the …

