# load the data from website

In [86]:
from langchain_community.document_loaders import WebBaseLoader

loader = WebBaseLoader("https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/i5V3ACEyz6hnYpVq6MTSvg/state-of-the-union.txt")

data = loader.load()

In [87]:
data[0].page_content[:100]
text = data[0].page_content

In [88]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=200, chunk_overlap=30,
    length_function=len )

text_chunks = splitter.split_text(text)
text_chunks[0]

'Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.'

# Lets Use PreTrained_Embedding model by HuggingFace library

In [89]:
from langchain_community.embeddings import HuggingFaceEmbeddings
model_name = 'sentence-transformers/all-mpnet-base-v2'
embedding_model = HuggingFaceEmbeddings(model_name = model_name)
embedding_model

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-mpnet-base-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

# Now lets Create the embeddings for each chunks

In [90]:
chunk_embeddings = embedding_model.embed_documents(text_chunks)

In [91]:
len(chunk_embeddings)

271

# Now store these chunks_embeddings into Vectordatabase using ChromaDB and FAISS

In [93]:
idx = []
for i in range(len(text_chunks)):
    idx.append(str(i))
len(idx)

271

In [94]:
from langchain.vectorstores import Chroma
chroma_db = Chroma.from_texts(text_chunks, embedding_model, ids = idx)

In [95]:
# print some text_chunks from database
print(chroma_db._collection.get('2')['documents'])

['With a duty to one another to the American people to the Constitution. \n\nAnd with an unwavering resolve that freedom will always triumph over tyranny.']


In [96]:
# print the length of database
chroma_db._collection.count()

271

In [98]:
query = "what is your company policies"
similar_query = chroma_db.similarity_search(query, k=2)
similar_query[0].page_content

'child care, to be able to get back to work.'

In [99]:
# Add data to existing database
Q1 = "we want to add some data into chrome database"
from langchain_core.documents import Document
new_chunk = Document(
    page_content = Q1,
    metadata = {
        'source':'ibm.com',
        'page' : 1
    }
)
new_chunk = [new_chunk]
print(chroma_db._collection.get('542'))

{'ids': [], 'embeddings': None, 'documents': [], 'uris': None, 'included': ['metadatas', 'documents'], 'data': None, 'metadatas': []}


In [100]:
chroma_db.add_documents(
    documents = new_chunk,
    ids = ['542']
)

['542']

In [101]:
print(chroma_db._collection.get('542'))

{'ids': ['542'], 'embeddings': None, 'documents': ['we want to add some data into chrome database'], 'uris': None, 'included': ['metadatas', 'documents'], 'data': None, 'metadatas': [{'page': 1, 'source': 'ibm.com'}]}


In [102]:
update_chunk = Document(
    page_content = 'We just updated the 542th index content here',
    metadata={
        'source':'ibm.com',
        'page':1
    }
)

In [103]:
chroma_db.update_document(
    '542',
    update_chunk
)

In [109]:
print(chroma_db._collection.get('542'))

{'ids': [], 'embeddings': None, 'documents': [], 'uris': None, 'included': ['metadatas', 'documents'], 'data': None, 'metadatas': []}


In [107]:
# if you want to delete any document or chunk(text)
chroma_db._collection.delete('542')

In [108]:
print(chroma_db._collection.get('542'))

{'ids': [], 'embeddings': None, 'documents': [], 'uris': None, 'included': ['metadatas', 'documents'], 'data': None, 'metadatas': []}


# Now use a different DataBase like FAISS

In [110]:
from langchain.vectorstores import FAISS
faiss_db = FAISS.from_texts(text_chunks, embedding_model, ids = idx)

In [111]:
#print some stored data from FAISS_DB

In [112]:
query = "what is your company policies"
faiss_db.similarity_search(query, k= 2)

[Document(page_content='child care, to be able to get back to work.'),
 Document(page_content='Tonight, I’m announcing a crackdown on these companies overcharging American businesses and consumers.')]

In [113]:
# faiss_db?

In [114]:
faiss_db.similarity_search(query, k=2) # simple search the closest content from data 

[Document(page_content='child care, to be able to get back to work.'),
 Document(page_content='Tonight, I’m announcing a crackdown on these companies overcharging American businesses and consumers.')]

In [115]:
embedded_query = embedding_model.embed_query(query)
faiss_db.similarity_search_by_vector(embedded_query, k =2)   # it will use only embedded_vector of your query then search

[Document(page_content='child care, to be able to get back to work.'),
 Document(page_content='Tonight, I’m announcing a crackdown on these companies overcharging American businesses and consumers.')]

In [116]:
faiss_db.similarity_search_with_score(query, k =2)   # search the content and score too

[(Document(page_content='child care, to be able to get back to work.'),
  1.3609143),
 (Document(page_content='Tonight, I’m announcing a crackdown on these companies overcharging American businesses and consumers.'),
  1.3922057)]

In [117]:
# Now count the total vector stored in database
faiss_db.index.ntotal

271

# Add data into FAISS_DATABASE

In [118]:
faiss_db.add_texts(["new doc"], metadatas=[{"id": "new"}])

['35fbcbea-468f-4f8e-aa26-c8303ff02468']

In [119]:
faiss_db.index.ntotal   # after adding one text it becomes 272

272

In [120]:
# Save your database in directory
faiss_db.save_local("FAISS_DB")

In [121]:
# load your database for embedding new query for Gen_AI applications
faiss_db = faiss_db.load_local("FAISS_DB", embedding_model, allow_dangerous_deserialization=True)

In [122]:
faiss_db

<langchain_community.vectorstores.faiss.FAISS at 0x1a4fa620dd0>

In [123]:
retriever = faiss_db.as_retriever(search_type = 'mmr')  # Maximum Marginal Relevance Retrieval
docs = retriever.invoke(Q1)
docs

[Document(page_content='And on testing, we have made hundreds of millions of tests available for you to order for free.'),
 Document(page_content='But cancer from prolonged exposure to burn pits ravaged Heath’s lungs and body. \n\nDanielle says Heath was a fighter to the very end. \n\nHe didn’t know how to stop fighting, and neither did she.'),
 Document(page_content='Intel’s CEO, Pat Gelsinger, who is here tonight, told me they are ready to increase their investment from  \n$20 billion to $100 billion.'),
 Document(page_content='We’ve set up joint patrols with Mexico and Guatemala to catch more human traffickers.')]

In [124]:
faiss_db.as_retriever(search_type = 'mmr').invoke("email policy")

[Document(page_content='And I’m taking robust action to make sure the pain of our sanctions  is targeted at Russia’s economy. And I will use every tool at our disposal to protect American businesses and consumers.'),
 Document(page_content='It’s time to strengthen privacy protections, ban targeted advertising to children, demand tech companies stop collecting personal data on our children.'),
 Document(page_content='We got more than 130 countries to agree on a global minimum tax rate so companies can’t get out of paying their taxes at home by shipping jobs and factories overseas.'),
 Document(page_content='And I ask Congress to pass proven measures to reduce gun violence. Pass universal background checks. Why should anyone on a terrorist list be able to purchase a weapon?')]

In [125]:
faiss_db.similarity_search("email policy")

[Document(page_content='And I’m taking robust action to make sure the pain of our sanctions  is targeted at Russia’s economy. And I will use every tool at our disposal to protect American businesses and consumers.'),
 Document(page_content='It’s time to strengthen privacy protections, ban targeted advertising to children, demand tech companies stop collecting personal data on our children.'),
 Document(page_content='As I have made crystal clear the United States and our Allies will defend every inch of territory of NATO countries with the full force of our collective power.'),
 Document(page_content='And I will keep doing everything in my power to crack down on gun trafficking and ghost guns you can buy online and make at home—they have no serial numbers and can’t be traced.')]

In [126]:
# So we can see that MMR is more effective and efficient than normal similar_search for searching the most similar or relevant content for the same query 

# Lets integrate Ollama LLM into the System for Advanced Retriever Processes by langchain

In [127]:
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_community.llms import Ollama

# Setup Both llm and retriever for Multi Query Retriever
llm = Ollama(model = 'llama3.2')
faiss_retriever = faiss_db.as_retriever(search_type = 'similarity', search_kwargs = {'k': 3})


# Now make LANGCHAIN Retrievers

# Multi Query Retrievers

In [128]:
multi_query_retriever = MultiQueryRetriever.from_llm(
    retriever =  faiss_db.as_retriever(search_kwargs = {'k':2}),
    llm = llm
)

In [129]:
Query = "how email policy works in this company for their employees"

In [130]:
similarity_results = faiss_retriever.invoke(Query)
multiquery_results = multi_query_retriever.invoke(Query)

In [131]:
multiquery_results

[Document(page_content='I’ve worked on these issues a long time.'),
 Document(page_content='Second – we must prepare for new variants. Over the past year, we’ve gotten much better at detecting new variants.'),
 Document(page_content='Tonight, I’m announcing a crackdown on these companies overcharging American businesses and consumers.'),
 Document(page_content='We’ll build a national network of 500,000 electric vehicle charging stations, begin to replace poisonous lead pipes—so every child—and every American—has clean water to drink at home and at school,'),
 Document(page_content='Here are four common sense steps as we move forward safely.'),
 Document(page_content='Third – we can end the shutdown of schools and businesses. We have the tools we need.')]

In [132]:
similarity_results

[Document(page_content='There’s been a law on the books for almost a century \nto make sure taxpayers’ dollars support American jobs and businesses. \n\nEvery Administration says they’ll do it, but we are actually doing it.'),
 Document(page_content='We’re doing that here in the federal government. The vast majority of federal workers will once again work in person. \n\nOur schools are open. Let’s keep it that way. Our kids need to be in school.'),
 Document(page_content='–on their economy. The Ruble has lost 30% of its value.')]

In [133]:
similarity_results[0].page_content

'There’s been a law on the books for almost a century \nto make sure taxpayers’ dollars support American jobs and businesses. \n\nEvery Administration says they’ll do it, but we are actually doing it.'

In [134]:
for i , docs in enumerate(similarity_results):
    print(f"Retrieved result would be : {i+1} ---------------->")
    print(docs.page_content)

Retrieved result would be : 1 ---------------->
There’s been a law on the books for almost a century 
to make sure taxpayers’ dollars support American jobs and businesses. 

Every Administration says they’ll do it, but we are actually doing it.
Retrieved result would be : 2 ---------------->
We’re doing that here in the federal government. The vast majority of federal workers will once again work in person. 

Our schools are open. Let’s keep it that way. Our kids need to be in school.
Retrieved result would be : 3 ---------------->
–on their economy. The Ruble has lost 30% of its value.


In [135]:
for i , docs in enumerate(multiquery_results):
    print(f"Retrieved result would be : {i+1} ---------------->")
    print(docs.page_content)
    print("-"*45,">")

Retrieved result would be : 1 ---------------->
I’ve worked on these issues a long time.
--------------------------------------------- >
Retrieved result would be : 2 ---------------->
Second – we must prepare for new variants. Over the past year, we’ve gotten much better at detecting new variants.
--------------------------------------------- >
Retrieved result would be : 3 ---------------->
Tonight, I’m announcing a crackdown on these companies overcharging American businesses and consumers.
--------------------------------------------- >
Retrieved result would be : 4 ---------------->
We’ll build a national network of 500,000 electric vehicle charging stations, begin to replace poisonous lead pipes—so every child—and every American—has clean water to drink at home and at school,
--------------------------------------------- >
Retrieved result would be : 5 ---------------->
Here are four common sense steps as we move forward safely.
--------------------------------------------- >
Ret

# SINGLE QUERY RETRIEVER USING PDF DATA

In [136]:
url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/ioch1wsxkfqgfLLgmd-6Rw/langchain-paper.pdf"
from langchain_community.document_loaders import PyPDFLoader
pdf_loader = PyPDFLoader(url)

pdf_data = pdf_loader.load()

In [137]:
pdf_chunks = splitter.split_documents(pdf_data)

In [138]:
pdf_chunks[0]

Document(metadata={'source': 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/ioch1wsxkfqgfLLgmd-6Rw/langchain-paper.pdf', 'page': 0}, page_content='* corresponding author - jkim72@kent.edu \nRevolutionizing Mental Health Care through \nLangChain: A Journey with a Large Language \nModel\nAditi Singh \n Computer Science  \n Cleveland State University')

In [140]:
pdf_embeddings = embedding_model.embed_documents(pdf_chunks[0].page_content)

In [141]:
len(pdf_embeddings[0])

768

In [142]:
index = []
for i in range(len(pdf_chunks)):
    index.append(str(i))
len(index)
idx = []
for i in range(len(pdf_chunks)):
    idx.append(i)

In [143]:
pdf_chroma_db = Chroma.from_documents(pdf_chunks, embedding_model, ids = index)
pdf_faiss_db = FAISS.from_documents(pdf_chunks, embedding_model , ids = idx)

In [144]:
# pdf_chunks used for Single Query Retriever

In [145]:
for i, doc in enumerate(pdf_chunks):
    doc.metadata["source"] = f"page_{i+1}.pdf"

In [146]:
from langchain_core.documents import Document
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from lark import lark
import json

metadata_field_info = [
    AttributeInfo(
        name="source",
        description="from where data has been fetched",
        type="string",
    ),
    AttributeInfo(
        name = 'page',
        description = "On which page this data is written in the pdf",
        type = 'integer'
    ),
]

In [147]:
document_content_description = "Give me the brief introduction of Langchain ?"

self_query_retriever = SelfQueryRetriever.from_llm(
    llm, 
    pdf_chroma_db,
    document_content_description,
    metadata_field_info,
)

In [148]:
self_qeury_results = self_query_retriever.invoke("i want to know langchain implementation")

OutputParserException: Parsing text
Here are the structured requests for each example:

### Example 1
Data Source:
```json
{
    "content": "Lyrics of a song",
    "attributes": {
        "artist": {
            "type": "string",
            "description": "Name of the song artist"
        },
        "length": {
            "type": "integer",
            "description": "Length of the song in seconds"
        },
        "genre": {
            "type": "string",
            "description": "The song genre, one of "pop", "rock" or "rap""
        }
    }
}
```

User Query:
What are songs by Taylor Swift or Katy Perry about teenage romance under 3 minutes long in the dance pop genre

Structured Request:
```json
{
    "query": "teenager love",
    "filter": "and(or(eq(\"artist\", \"Taylor Swift\"), eq(\"artist\", \"Katy Perry\")), lt(\"length\", 180), eq(\"genre\", \"pop\"))"
}
```

### Example 2
Data Source:
```json
{
    "content": "Lyrics of a song",
    "attributes": {
        "artist": {
            "type": "string",
            "description": "Name of the song artist"
        },
        "length": {
            "type": "integer",
            "description": "Length of the song in seconds"
        },
        "genre": {
            "type": "string",
            "description": "The song genre, one of "pop", "rock" or "rap""
        }
    }
}
```

User Query:
What are songs that were not published on Spotify

Structured Request:
```json
{
    "query": "",
    "filter": "NO_FILTER"
}
```

### Example 3
Data Source:
```json
{
    "content": "Give me the brief introduction of Langchain ?",
    "attributes": {
        "source": {
            "description": "from where data has been fetched",
            "type": "string"
        },
        "page": {
            "description": "On which page this data is written in the pdf",
            "type": "integer"
        }
    }
}
```

User Query:
i want to know langchain implementation

Structured Request:
```json
{
    "query": "langchain impl",
    "filter": ""
}
```
 raised following error:
Got invalid JSON object. Error: Expecting ',' delimiter: line 14 column 53 (char 393)

In [None]:
self_qeury_results

# Parent Document Retriever 

In [72]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore
store = InMemoryStore()

In [83]:
child_splitter = RecursiveCharacterTextSplitter(chunk_size = 2000, chunk_overlap = 20)
parent_splitter = RecursiveCharacterTextSplitter(chunk_size = 400, chunk_overlap = 30)

parent_document_retriever = ParentDocumentRetriever(
    vectorstore  = pdf_faiss_db,
    docstore = store,
    child_splitter = child_splitter,
    parent_splitter = parent_splitter,)

In [84]:
parent_document_result = parent_document_retriever.invoke("i want to know langchain implementation")

In [85]:
parent_document_result

[]