### RAG Part I: Indexing Your Data

In [7]:
from langchain_community.document_loaders import  TextLoader

## Load text file

In [11]:
# Load the plain-text source file through LangChain's TextLoader.
loader = TextLoader("../documents/bhagat_singh.txt")
# Bind the result to `documents` (previously misspelled `documets`) so this
# cell uses the same variable name as the other loader cells in the notebook.
documents = loader.load()

## Load webpage

In [16]:
from langchain_community.document_loaders import WebBaseLoader

# Page fetched by the web loader in this cell.
WIKI_URL = "https://en.wikipedia.org/wiki/Bhagat_Singh"

# Build the loader, fetch the page, and print the loaded documents.
loader = WebBaseLoader(WIKI_URL)
documents = loader.load()
print(documents)



In [31]:
from langchain_community.document_loaders import PyPDFLoader

# Path of the PDF loaded in this cell, relative to the notebook.
PDF_PATH = r"../documents/attention.pdf"

# Load the PDF and print the resulting documents; the printed output shows
# per-page metadata such as `page` and `total_pages`.
loader = PyPDFLoader(PDF_PATH)
documents = loader.load()
print(documents)

[Document(metadata={'producer': 'PDFium', 'creator': 'PDFium', 'creationdate': 'D:20251022163812', 'source': '../documents/attention.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1'}, page_content='Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗ †\nUniversity of Toronto\naidan@cs.toronto.edu\nŁukasz Kaiser∗\nGoogle Brain\nlukaszkaiser@google.com\nIllia Polosukhin∗ ‡\nillia.polosukhin@gmail.com\nAbstract\nThe dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks that include an encoder and a decoder. The best\nperforming m

## Splitting text into chunks

In [47]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunking parameters for the character-based splitter.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 300

splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

# Split whatever `documents` currently holds (presumably the PDF pages loaded
# in the previous cell — depends on execution order).
splits = splitter.split_documents(documents)

In [53]:
from langchain_text_splitters import RecursiveCharacterTextSplitter, Language


# Sample Python source used to demonstrate language-aware splitting.
# (The call at the end previously read `hello_worls()`, a typo for the
# function defined just above it.)
PYTHON_CODE = """
    def hello_world():
        print("Hello world!")

    # call the function
    hello_world()
    """

# Build a recursive splitter configured for Python source via from_language().
splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON,
    chunk_size=50,
    chunk_overlap=0,
)

# create_documents() takes a list of texts. Passing the bare string made it
# iterate over the string character by character instead of splitting the
# snippet as a single text, so the string is wrapped in a list here.
splits = splitter.create_documents([PYTHON_CODE])

In [62]:
# Sample text fed to the Markdown-aware splitter in this cell.
markdown_text = """
Middleware
Middleware is the defining feature of create_agent. It offers a highly customizable entry-point, raising the ceiling for what you can build.
Great agents require context engineering: getting the right information to the model at the right time. Middleware helps you control dynamic prompts, conversation summarization, selective tool access, state management, and guardrails through a composable abstraction.
​
Prebuilt middleware
LangChain provides a few prebuilt middlewares for common patterns, including:
PIIMiddleware: Redact sensitive information before sending to the model
SummarizationMiddleware: Condense conversation history when it gets too long
HumanInTheLoopMiddleware: Require approval for sensitive tool calls
"""

# Metadata paired with the sample text; one dict per input text.
source_metadata = {"source": "https://docs.langchain.com/oss/python/releases/langchain-v1"}

# Small chunks with a little overlap keep the demo output easy to inspect.
splitter = RecursiveCharacterTextSplitter.from_language(
    Language.MARKDOWN, chunk_size=50, chunk_overlap=10
)

splits = splitter.create_documents(
    texts=[markdown_text],
    metadatas=[source_metadata],
)

In [63]:
# Display the chunks currently bound to `splits` (the result of the most
# recently executed splitting cell).
splits

[Document(metadata={'source': 'https://docs.langchain.com/oss/python/releases/langchain-v1'}, page_content='Middleware'),
 Document(metadata={'source': 'https://docs.langchain.com/oss/python/releases/langchain-v1'}, page_content='Middleware is the defining feature of'),
 Document(metadata={'source': 'https://docs.langchain.com/oss/python/releases/langchain-v1'}, page_content='of create_agent. It offers a highly customizable'),
 Document(metadata={'source': 'https://docs.langchain.com/oss/python/releases/langchain-v1'}, page_content='entry-point, raising the ceiling for what you can'),
 Document(metadata={'source': 'https://docs.langchain.com/oss/python/releases/langchain-v1'}, page_content='you can build.'),
 Document(metadata={'source': 'https://docs.langchain.com/oss/python/releases/langchain-v1'}, page_content='Great agents require context engineering: getting'),
 Document(metadata={'source': 'https://docs.langchain.com/oss/python/releases/langchain-v1'}, page_content='getting the r

### Generating Text Embeddings

In [66]:
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

# embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")