# Build a RAG Application for YouTube Transcripts using LangChain



In [2]:
import os
from dotenv import load_dotenv

# Read key=value pairs from a local .env file into the process environment so
# the os.getenv(...) lookups in later cells can find the Azure OpenAI settings.
load_dotenv()

True

In [5]:
from langchain_openai import AzureChatOpenAI

# Azure OpenAI chat handles. Key, endpoint and deployment name are read from
# the environment (populated by load_dotenv() in the first cell).
# NOTE(review): `client` is not referenced by any later cell visible in this
# notebook and pins a different API version than `model` — confirm whether it
# is still needed or should share `model`'s settings.
client = AzureChatOpenAI(
    azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"),
    api_key=os.environ.get("AZURE_OPENAI_API_KEY"),
    api_version="2024-02-01",
)

# Chat model used by the chains built in the following cells. No key/endpoint
# is passed here — presumably the library falls back to the environment; verify.
model = AzureChatOpenAI(
    azure_deployment=os.environ.get("AZURE_OPENAI_MODEL_DEPLOYMENT_NAME"),
    openai_api_version="2023-05-15",
)

In [7]:
# Smoke test: send a plain string straight to the chat model. The result is an
# AIMessage object (text in .content plus response metadata), not a bare string.
model.invoke("What is MIT?")

AIMessage(content='MIT stands for the Massachusetts Institute of Technology, which is a private research university located in Cambridge, Massachusetts, USA. It is known for its strong emphasis on science, engineering, and technology, and is one of the most prestigious and influential institutions of higher education in the world. MIT is also known for its entrepreneurial spirit and its role in promoting innovation and technological advancement.', response_metadata={'token_usage': {'completion_tokens': 73, 'prompt_tokens': 11, 'total_tokens': 84}, 'model_name': 'gpt-35-turbo', 'system_fingerprint': 'fp_2f57f81c11', 'finish_reason': 'stop', 'logprobs': None, 'content_filter_results': {}}, id='run-27a7737c-228d-4800-b9f4-9abd167cfb2d-0')

In [8]:
from langchain_core.output_parsers import StrOutputParser

# StrOutputParser reduces the model's AIMessage to its text content, so the
# chain returns a plain string instead of a message object.
parser = StrOutputParser()

# `|` composes runnables left to right: the model's output feeds the parser.
chain = model | parser
chain.invoke("What is MIT?")

'MIT stands for the Massachusetts Institute of Technology, which is a prestigious private research university located in Cambridge, Massachusetts. It is known for its strong emphasis on science, engineering, and technology, and is one of the leading institutions for higher education and research in the world.'

In [9]:
from langchain.prompts import ChatPromptTemplate

# Prompt with two placeholders: {context} carries the supporting text and
# {question} the user query. The instruction tells the model to answer from the
# context and to reply "I don't know" when it cannot.
template = """
Answer the question based on the context below. If you can't 
answer the question, reply "I don't know".

Context: {context}

Question: {question}
"""

# Wrap the string as a chat prompt; format() renders it to plain text so the
# final prompt can be inspected (it shows up as a single "Human:" message).
prompt = ChatPromptTemplate.from_template(template)
prompt.format(context="Harvard is in Cambridge", question="Where is Harvard?")

'Human: \nAnswer the question based on the context below. If you can\'t \nanswer the question, reply "I don\'t know".\n\nContext: Harvard is in Cambridge\n\nQuestion: Where is Harvard?\n'

In [10]:
# Prepend the prompt template: the chain now takes a mapping with the two
# template variables and returns the model's answer as a string.
chain = prompt | model | parser
chain.invoke(dict(context="Harvard is in Cambridge", question="Where is Harvard?"))

'Harvard is in Cambridge.'

In [42]:
import pandas as pd

# Relative path to the JSON export of the transcript records, loaded into a
# DataFrame for inspection and for the document loader further down.
DATASET_NAME = "./prep/output/master_transcriptions.json"
transcripts_dataset = pd.read_json(path_or_buf=DATASET_NAME)

In [43]:
# Inspect the loaded transcripts; pandas abbreviates the display to the first
# and last rows for a frame this long.
transcripts_dataset

Unnamed: 0,speaker,title,videoId,description,start,seconds,text
0,,Map Azure DevOps Runtime Variables to Terrafor...,-ssTKjHVP_Q,"This is a recording of the March 29, 2023 virt...",00:00:02,2,Map Azure DevOps Runtime Variables to Terrafor...
1,,Map Azure DevOps Runtime Variables to Terrafor...,-ssTKjHVP_Q,"This is a recording of the March 29, 2023 virt...",00:05:04,304,from that now the only thing you know so I'm b...
2,,Map Azure DevOps Runtime Variables to Terrafor...,-ssTKjHVP_Q,"This is a recording of the March 29, 2023 virt...",00:10:07,607,I'm reading or I'm trying to follow somebody h...
3,,Map Azure DevOps Runtime Variables to Terrafor...,-ssTKjHVP_Q,"This is a recording of the March 29, 2023 virt...",00:15:10,910,and after that I'm using the stage called depl...
4,,Map Azure DevOps Runtime Variables to Terrafor...,-ssTKjHVP_Q,"This is a recording of the March 29, 2023 virt...",00:20:16,1216,this thing which I extreme you so the user inp...
...,...,...,...,...,...,...,...
595,,Gregor Suttie - Super Charge Your Learning of ...,_dM5AqWlga8,"This is a recording of the April 16, 2020 virt...",00:45:37,2737,cosmos DB that filters the results by cause do...
596,,Gregor Suttie - Super Charge Your Learning of ...,_dM5AqWlga8,"This is a recording of the April 16, 2020 virt...",00:50:40,3040,you secured divorce from eks so there's lots o...
597,,Gregor Suttie - Super Charge Your Learning of ...,_dM5AqWlga8,"This is a recording of the April 16, 2020 virt...",00:55:43,3343,like Ben Coleman has got around the template v...
598,,Gregor Suttie - Super Charge Your Learning of ...,_dM5AqWlga8,"This is a recording of the April 16, 2020 virt...",01:00:47,3647,the blanks and if you're done Wayne and you're...


In [44]:
from langchain_community.document_loaders import DataFrameLoader
# Turn each DataFrame row into a LangChain Document: the "text" column becomes
# the page content. NOTE(review): the remaining columns presumably end up in
# each Document's metadata — confirm against the DataFrameLoader docs.
loader = DataFrameLoader(transcripts_dataset, page_content_column="text")
transcripts = loader.load()

In [45]:
from langchain_openai.embeddings import AzureOpenAIEmbeddings

# Embedding client created with no explicit arguments.
# NOTE(review): presumably configured from environment variables like the chat
# model — confirm which embedding deployment it targets.
embeddings = AzureOpenAIEmbeddings()
# Embed a single query string; the result is a list of floats.
embedded_query = embeddings.embed_query("Where is MIT?")

# Show the vector size and its first ten components as a sanity check.
print(f"Embedding length: {len(embedded_query)}")
print(embedded_query[:10])

Embedding length: 1536
[-0.0018333289920419945, -0.011083231397127654, -0.02962633060017498, -0.019222479468945192, -0.036047146015586715, 0.025456797537283127, -0.02816099882530481, 0.012508599188675555, -0.005631533962546989, -0.0015494211236423516]


In [27]:
# Embed two candidate sentences to compare against the query vector
# (`embedded_query`, "Where is MIT?"): one that answers the question directly
# and one that is only loosely related.
sentence1 = embeddings.embed_query("MIT is in Cambridge")
sentence2 = embeddings.embed_query("Cambridge is across the river from Boston")

In [28]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between the query embedding and each candidate sentence.
# cosine_similarity works on 2-D inputs and returns a matrix, so every vector
# is wrapped in a one-row list and the single cell [0, 0] is read back.
query_sentence1_similarity, query_sentence2_similarity = (
    cosine_similarity([embedded_query], [candidate])[0, 0]
    for candidate in (sentence1, sentence2)
)

# Display both scores; the sentence that answers the query should score higher.
(query_sentence1_similarity, query_sentence2_similarity)

(0.9192585966676008, 0.8041802064416832)

In [33]:
from langchain_community.vectorstores import DocArrayInMemorySearch

# Toy corpus for demonstrating retrieval before moving on to real transcripts.
sample_facts = [
    "MIT is in Cambridge",
    "Harvard is in Cambridge",
    "Harvard is a university",
    "Cambridge is across the river from Boston",
    "Beacon Hill is in Boston",
    "Samuel Adams lived in Boston",
]

# Embed every sentence with `embeddings` and index the vectors in an
# in-memory store.
vectorstore1 = DocArrayInMemorySearch.from_texts(
    texts=sample_facts,
    embedding=embeddings,
)

In [35]:
# Fetch the k=3 nearest sentences together with their scores. The top score
# agrees with the cosine similarity computed by hand above to ~8 digits, which
# suggests the store ranks by cosine similarity.
vectorstore1.similarity_search_with_score(query="Where is MIT?", k=3)

[(Document(page_content='MIT is in Cambridge'), 0.9192586047417408),
 (Document(page_content='Harvard is in Cambridge'), 0.8438128759663384),
 (Document(page_content='Harvard is a university'), 0.8267941200342381)]

In [36]:
# Expose the vector store through the retriever interface so it can be used as
# a step in a chain. invoke() returns Documents only (no scores); four come
# back here — presumably the retriever's default k.
retriever1 = vectorstore1.as_retriever()
retriever1.invoke("Where is MIT?")

[Document(page_content='MIT is in Cambridge'),
 Document(page_content='Harvard is in Cambridge'),
 Document(page_content='Harvard is a university'),
 Document(page_content='Beacon Hill is in Boston')]

In [37]:
from langchain_core.runnables import RunnableParallel, RunnablePassthrough

# Fan the incoming question out to two branches and collect the results in a
# dict shaped like the prompt's inputs: `context` holds the documents returned
# by the retriever, `question` is the original string passed through unchanged.
setup = RunnableParallel(context=retriever1, question=RunnablePassthrough())
setup.invoke("Where did Sam Adams live?")

{'context': [Document(page_content='Samuel Adams lived in Boston'),
  Document(page_content='Harvard is in Cambridge'),
  Document(page_content='Cambridge is across the river from Boston'),
  Document(page_content='Beacon Hill is in Boston')],
 'question': 'Where did Sam Adams live?'}

In [38]:
# Full RAG pipeline over the toy corpus: retrieve context for the question,
# fill the prompt, call the model, and reduce the reply to a string.
chain = setup | prompt | model | parser
chain.invoke("Where did Sam Adams live?")

'Sam Adams lived in Boston.'

In [39]:
# Reuses the `chain` built in the previous cell (backed by retriever1) with a
# different question.
chain.invoke("Where is Cambridge?")

'Cambridge is across the river from Boston.'

In [46]:
# Index the transcript Documents with the same embedding client.
# NOTE(review): this embeds every transcript row (about 600 per the preview
# above) and the store appears to be in-memory only, so the work is presumably
# repeated on every run — consider caching if re-runs become slow or costly.
vectorstore2 = DocArrayInMemorySearch.from_documents(transcripts, embeddings)

In [48]:
# RAG chain over the transcripts: the incoming question is routed both to the
# transcript retriever (to fetch context documents) and, unchanged, to the
# prompt's {question} slot. RunnableParallel is the explicit form of the dict
# shorthand and mirrors how `setup` was built for the toy example.
chain = (
    RunnableParallel(
        context=vectorstore2.as_retriever(),
        question=RunnablePassthrough(),
    )
    | prompt
    | model
    | parser
)
chain.invoke("What is langchain?")

'Langchain is a popular orchestration library used in the open source world for language models and is available in multiple languages. It is used for turning user queries into keyword search queries for search indexes.'