### RAG Part I: Indexing Your Data

In [2]:
from langchain_community.document_loaders import  TextLoader
import tqdm as notebook_tqdm

## Load text file

In [3]:
loader = TextLoader("../documents/bhagat_singh.txt")
documets = loader.load()

## Load webpage

In [4]:
from langchain_community.document_loaders import WebBaseLoader

loader = WebBaseLoader("https://en.wikipedia.org/wiki/Bhagat_Singh")
documents = loader.load()

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [5]:
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader(r"../documents/attention.pdf")
documents = loader.load()


## Spliting text chunking

In [6]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=1000,
                                          chunk_overlap=300)

splits = splitter.split_documents(documents)

In [7]:
from langchain_text_splitters import RecursiveCharacterTextSplitter, Language


PYTHON_CODE = """
    def hello_world():
        print("Hello world!")

    # call the function
    hello_worls()
    """

splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON,
    chunk_size=50,
    chunk_overlap=0)

splits = splitter.create_documents(PYTHON_CODE)

In [8]:
markdown_text = """
Middleware
Middleware is the defining feature of create_agent. It offers a highly customizable entry-point, raising the ceiling for what you can build.
Great agents require context engineering: getting the right information to the model at the right time. Middleware helps you control dynamic prompts, conversation summarization, selective tool access, state management, and guardrails through a composable abstraction.
​
Prebuilt middleware
LangChain provides a few prebuilt middlewares for common patterns, including:
PIIMiddleware: Redact sensitive information before sending to the model
SummarizationMiddleware: Condense conversation history when it gets too long
HumanInTheLoopMiddleware: Require approval for sensitive tool calls
"""


splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.MARKDOWN,
    chunk_size=50,
    chunk_overlap=10
)

splits = splitter.create_documents([markdown_text], [{"source": "https://docs.langchain.com/oss/python/releases/langchain-v1"}])

In [9]:
splits

[Document(metadata={'source': 'https://docs.langchain.com/oss/python/releases/langchain-v1'}, page_content='Middleware'),
 Document(metadata={'source': 'https://docs.langchain.com/oss/python/releases/langchain-v1'}, page_content='Middleware is the defining feature of'),
 Document(metadata={'source': 'https://docs.langchain.com/oss/python/releases/langchain-v1'}, page_content='of create_agent. It offers a highly customizable'),
 Document(metadata={'source': 'https://docs.langchain.com/oss/python/releases/langchain-v1'}, page_content='entry-point, raising the ceiling for what you can'),
 Document(metadata={'source': 'https://docs.langchain.com/oss/python/releases/langchain-v1'}, page_content='you can build.'),
 Document(metadata={'source': 'https://docs.langchain.com/oss/python/releases/langchain-v1'}, page_content='Great agents require context engineering: getting'),
 Document(metadata={'source': 'https://docs.langchain.com/oss/python/releases/langchain-v1'}, page_content='getting the r

### Generating Text Embeddings

In [10]:
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [11]:
# embeddings.embed_documents([
#     "Hii there"
#     "Oh hello"
#     "What is you name?"
#     "My friend call me kamal"
#     "Hello kamal!"
# ])

### Storing embeddings in vectordb
Getting strarted with PGVector

In [12]:
from langchain_community.document_loaders import TextLoader
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_postgres import PGVector
from langchain_core.documents import Document
import uuid

In [13]:
loader = TextLoader("../documents/bhagat_singh.txt")
documents = loader.load()

In [14]:
splitter = RecursiveCharacterTextSplitter(chunk_size=1000,
                                          chunk_overlap=300)

splits = splitter.split_documents(documents)

In [15]:
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [16]:
connection = "postgresql+psycopg://langchain:langchain@localhost:6024/langchain"

vector_store = PGVector(
    embeddings=embeddings,
    collection_name="my_docs",
    connection=connection,
    use_jsonb=True
)

In [30]:
vector_store.add_documents(splits)

['1dd51dde-7652-4528-bede-7c62dc747713',
 'a88258c8-0037-459f-ab79-df33d8827f02',
 '72c0f82a-c93b-42b1-8415-9dbb2bea547a',
 'f9988e63-95fc-405b-ac47-99b0816ca29d']

In [31]:
docs = vector_store.similarity_search("when did bhagat singh got born?")
print(docs)

[Document(id='1dd51dde-7652-4528-bede-7c62dc747713', metadata={'source': '../documents/bhagat_singh.txt'}, page_content="Bhagat Singh (27 September 1907[2][a] – 23 March 1931) was an Indian anti-colonial revolutionary[3] who participated in the mistaken murder of a junior British police officer in December 1928 in what was intended to be retaliation for the death of an Indian nationalist.[4][5] He later took part in a largely symbolic bombing of the Central Legislative Assembly in Delhi and a hunger strike in jail, which—on the back of sympathetic coverage in Indian-owned newspapers—turned him into a household name in the Punjab region, and, after his execution at age 23, a martyr and folk hero in Northern India.[6] Borrowing ideas from Bolshevism and anarchism,[7] the charismatic Bhagat Singh[8] electrified a growing militancy in India in the 1930s and prompted urgent introspection within the Indian National Congress's nonviolent, and eventually successful, campaign for India's indepe

In [33]:
print(docs[0].page_content)

Bhagat Singh (27 September 1907[2][a] – 23 March 1931) was an Indian anti-colonial revolutionary[3] who participated in the mistaken murder of a junior British police officer in December 1928 in what was intended to be retaliation for the death of an Indian nationalist.[4][5] He later took part in a largely symbolic bombing of the Central Legislative Assembly in Delhi and a hunger strike in jail, which—on the back of sympathetic coverage in Indian-owned newspapers—turned him into a household name in the Punjab region, and, after his execution at age 23, a martyr and folk hero in Northern India.[6] Borrowing ideas from Bolshevism and anarchism,[7] the charismatic Bhagat Singh[8] electrified a growing militancy in India in the 1930s and prompted urgent introspection within the Indian National Congress's nonviolent, and eventually successful, campaign for India's independence.[9]


### Add docuemts in pgsql

In [34]:
ids = [str(uuid.uuid4()), str(uuid.uuid4())]

In [27]:
# Document??

In [35]:
new_docs = [
    Document(page_content="""Hello my  name is kamal and i am 26 years old.
    "I am greduated with science with physics honors
    "from veer kunwar singh university""",
    metadata={"source": "kamal's information"}
    ),
    Document(page_content="There is a cat sat under the table", 
             metadata={"source": "cat story"})
]


In [40]:
vector_store.add_documents(documents=new_docs, id=ids)
print("New docs added successfully!")

New docs added successfully!


In [46]:
docs = vector_store.similarity_search("what is the name of the collage of kamal?")
response = docs[0].page_content
print(response)

Hello my  name is kamal and i am 26 years old.
    "I am greduated with science with physics honors
    "from veer kunwar singh university


In [47]:
docs = vector_store.similarity_search("where did the can sit?")
response = docs[0].page_content
print(response)

There is a cat sat under the table
