# Import Libraries

In [50]:
from langchain_community.document_loaders import YoutubeLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface.embeddings import HuggingFaceEndpointEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import PromptTemplate

# Step-1: Indexing

## **1(a): Document Ingestion**

In [51]:
loader=YoutubeLoader.from_youtube_url(
    "https://www.youtube.com/watch?v=T-D1OfcDW1M", add_video_info=False
)
transcript= loader.load()

In [52]:
transcript

[Document(metadata={'source': 'T-D1OfcDW1M'}, page_content='Large language models. They are everywhere. They get some things amazingly right and other things very interestingly wrong. My name\xa0is Marina Danilevsky. I am a Senior Research Scientist here at IBM Research. And I want\xa0to tell you about a framework to help large language models be more accurate and more up to\xa0date: Retrieval-Augmented Generation, or RAG. Let\'s just talk about the "Generation" part for a\xa0minute. So forget the "Retrieval-Augmented". So the\xa0generation, this refers to large language models,\xa0or LLMs, that generate text in response to a user query, referred to as a prompt. These\xa0models can have some undesirable behavior. I want to tell you an anecdote to illustrate this. So my kids, they recently asked me this question: "In our solar system, what planet has the most\xa0moons?" And my response was, “Oh, that\'s really great that you\'re asking this question. I loved\xa0space when I was your age

In [53]:
text = transcript[0].page_content

# Basic cleanup
clean_text = (
    text.replace("\xa0", " ")    # remove non-breaking spaces
        .replace("\n", " ")      # remove newlines
        .replace("  ", " ")      # remove double spaces
        .strip()                 # remove leading/trailing spaces
)

print(clean_text)  # show preview

Large language models. They are everywhere. They get some things amazingly right and other things very interestingly wrong. My name is Marina Danilevsky. I am a Senior Research Scientist here at IBM Research. And I want to tell you about a framework to help large language models be more accurate and more up to date: Retrieval-Augmented Generation, or RAG. Let's just talk about the "Generation" part for a minute. So forget the "Retrieval-Augmented". So the generation, this refers to large language models, or LLMs, that generate text in response to a user query, referred to as a prompt. These models can have some undesirable behavior. I want to tell you an anecdote to illustrate this. So my kids, they recently asked me this question: "In our solar system, what planet has the most moons?" And my response was, “Oh, that's really great that you're asking this question. I loved space when I was your age.” Of course, that was like 30 years ago. But I know this! I read an article and the artic

## **1B- Text Splitting**

In [54]:
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
chunks = splitter.create_documents([clean_text])

In [55]:
len(chunks)

15

In [56]:
chunks

[Document(metadata={}, page_content='Large language models. They are everywhere. They get some things amazingly right and other things very interestingly wrong. My name is Marina Danilevsky. I am a Senior Research Scientist here at IBM Research. And I want to tell you about a framework to help large language models be more accurate and more up to date: Retrieval-Augmented Generation, or RAG. Let\'s just talk about the "Generation" part for a minute. So forget the "Retrieval-Augmented". So the generation, this refers to large'),
 Document(metadata={}, page_content='part for a minute. So forget the "Retrieval-Augmented". So the generation, this refers to large language models, or LLMs, that generate text in response to a user query, referred to as a prompt. These models can have some undesirable behavior. I want to tell you an anecdote to illustrate this. So my kids, they recently asked me this question: "In our solar system, what planet has the most moons?" And my response was, “Oh, tha

In [57]:
chunks[0].page_content

'Large language models. They are everywhere. They get some things amazingly right and other things very interestingly wrong. My name is Marina Danilevsky. I am a Senior Research Scientist here at IBM Research. And I want to tell you about a framework to help large language models be more accurate and more up to date: Retrieval-Augmented Generation, or RAG. Let\'s just talk about the "Generation" part for a minute. So forget the "Retrieval-Augmented". So the generation, this refers to large'

## **1C & 1D- Embedding Generation and Storing in Vector Store**

In [None]:
embeddings = HuggingFaceEndpointEmbeddings(model="sentence-transformers/all-MiniLM-L6-v2")
vector_store=FAISS.from_documents(chunks,embeddings)

In [None]:
vector_store.index_to_docstore_id