In [9]:
import os 
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.llms import HuggingFaceHub
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_pinecone import PineconeVectorStore
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from urllib.request import urlretrieve
from pinecone import Pinecone, ServerlessSpec
from uuid import uuid4
import time


<h2>Loading data</h2>

In [2]:
files = [
    "https://www.census.gov/content/dam/Census/library/publications/2022/demo/p70-178.pdf",
    "https://www.census.gov/content/dam/Census/library/publications/2023/acs/acsbr-017.pdf",
    "https://www.census.gov/content/dam/Census/library/publications/2023/acs/acsbr-016.pdf",
    "https://www.census.gov/content/dam/Census/library/publications/2023/acs/acsbr-015.pdf",
]

os.makedirs('census-pdfs',exist_ok=True)

In [3]:
for url in files:
    file_path = os.path.join("census-pdfs", url.rpartition("/")[2])
    urlretrieve(url, file_path) # retrieves pdfs from the web

In [10]:
loader = PyPDFDirectoryLoader("census-pdfs") #it loads the directory that contains the pdfs

<h2>CHUNKING</h2>

In [12]:
#Chunk the pdfs to a list of langchain_core.documents.base.Document where each one contains tuples of
#(meta data {source_file,page}, page_content)

listed_docs = loader.load() 

In [6]:
listed_docs[0]

Document(metadata={'source': 'census-pdfs\\acsbr-015.pdf', 'page': 0}, page_content='Health Insurance Coverage Status and Type \nby Geography: 2021 and 2022\nAmerican Community Survey Briefs\nACSBR-015\nIssued September 2023\nDouglas Conway and Breauna Branch\nINTRODUCTION\nDemographic shifts as well as economic and govern-\nment policy changes can affect people’s access to \nhealth coverage. For example, between 2021 and 2022, \nthe labor market continued to improve, which may \nhave affected private coverage in the United States \nduring that time.1 Public policy changes included \nthe renewal of the Public Health Emergency, which \nallowed Medicaid enrollees to remain covered under \nthe Continuous Enrollment Provision.2 The American \nRescue Plan (ARP) enhanced Marketplace premium \nsubsidies for those with incomes above 400 percent \nof the poverty level as well as for unemployed people.3\nIn addition to national policies, individual states and \nthe District of Columbia can affec

In [7]:
len(listed_docs[0].page_content)

4258

the page_content is big, so we need to chunk it and then save the chunks into the vector database

In [53]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 400,
    chunk_overlap = 50
)

chunked_docs = text_splitter.split_documents(listed_docs)

In [9]:
chunked_docs[0]

Document(metadata={'source': 'census-pdfs\\acsbr-015.pdf', 'page': 0}, page_content='Health Insurance Coverage Status and Type \nby Geography: 2021 and 2022\nAmerican Community Survey Briefs\nACSBR-015\nIssued September 2023\nDouglas Conway and Breauna Branch\nINTRODUCTION\nDemographic shifts as well as economic and govern-\nment policy changes can affect people’s access to \nhealth coverage. For example, between 2021 and 2022, \nthe labor market continued to improve, which may \nhave affected private coverage in the United States \nduring that time.1 Public policy changes included \nthe renewal of the Public Health Emergency, which \nallowed Medicaid enrollees to remain covered under \nthe Continuous Enrollment Provision.2 The American \nRescue Plan (ARP) enhanced Marketplace premium')

In [10]:
len(chunked_docs[0].page_content)

694

In [39]:
avg_doc_length = lambda docs: sum([len(doc.page_content) for doc in docs])//len(docs)

In [40]:
avg_char_listed_docs = avg_doc_length(listed_docs)
avg_char_chunked_docs = avg_doc_length(chunked_docs)

avg_char_listed_docs,avg_char_chunked_docs

(3840, 212)

<h2>Embedding</h2>

In [14]:
huggingface_embeddings = HuggingFaceBgeEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",  # one of the most popular embedding models in hugging face
    model_kwargs={"device": "cpu"},
    encode_kwargs={"noramlize_embeddings": True},  # normalize embeddings to unit length
)

  huggingface_embeddings = HuggingFaceBgeEmbeddings(


In [15]:
huggingface_embeddings.embed_query("Hello world!")

[0.004552455618977547,
 0.17278991639614105,
 0.034776367247104645,
 0.005749491974711418,
 -0.026328902691602707,
 -0.04085709899663925,
 0.02265791967511177,
 -0.04465889185667038,
 -0.018803782761096954,
 0.008832556195557117,
 0.04025248810648918,
 -0.034709054976701736,
 0.015151104889810085,
 -0.014657402411103249,
 0.07468858361244202,
 -0.04393263906240463,
 -0.05603843927383423,
 0.020306618884205818,
 -0.05812907591462135,
 -0.04611072689294815,
 0.08254793286323547,
 0.10951974242925644,
 0.01444613840430975,
 0.025379350408911705,
 -0.08089375495910645,
 0.01504394318908453,
 -0.0035153122153133154,
 0.013029968366026878,
 0.09713462740182877,
 -0.06146962195634842,
 -0.027697736397385597,
 0.0014512579655274749,
 0.08353219926357269,
 0.01702173240482807,
 -0.01003107987344265,
 0.08817806094884872,
 0.053562190383672714,
 -0.039075590670108795,
 0.02845209278166294,
 -0.05871634930372238,
 0.024071132764220238,
 -0.03864150121808052,
 -0.04744642600417137,
 0.003006333950

<h2>Vector DataBase</h2>

In [2]:
pinecone_api = "***"

In [None]:

pc = Pinecone(api_key=pinecone_api)

In [38]:
#an index is a data structure that enables efficient similarity search and retrieval of high-dimensional vectors
index_name = "langchain-test-index"  
existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)

index = pc.Index(index_name)

In [39]:
pc.describe_index(index_name)

{
    "name": "langchain-test-index",
    "dimension": 384,
    "metric": "cosine",
    "host": "langchain-test-index-y95f95m.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "deletion_protection": "disabled"
}

In [40]:

pine_vectorstore = PineconeVectorStore(index=index, embedding=huggingface_embeddings)

In [54]:
uuids = [str(uuid4()) for _ in range(len(chunked_docs))]

pine_vectorstore.add_documents(documents=chunked_docs, ids=uuids)

['62fff530-94fe-4248-bc8b-afda8b872e1e',
 '4cd423d9-1544-4754-9f43-755db2e2c10d',
 '819cdb8e-1b70-49cb-a331-5e21713e5051',
 'd393adff-ad16-4a17-86eb-3f0d23fc92f6',
 '7e3490ae-24f2-4335-b3ec-72a46b3a2cec',
 '4e56c27f-1720-42ea-a6d8-decfb245e377',
 '8a2eb18b-c3c9-4120-a2c5-074d1b167691',
 '28cb64a0-be25-46f5-a147-a2f1c2ea08a4',
 '3396aa74-b0c0-4d8a-87b0-02e81c2ab9a9',
 '34038802-b557-47f8-b001-8b4a5adf4f79',
 '09f1f6fb-f741-48ef-a167-39e49a99b3cc',
 '022bb3f6-462f-4ceb-9ad6-e492483c2838',
 'efef01aa-012c-482f-b286-0d6f67b95a88',
 '27af587e-7f15-4d0c-9489-c0542dbd44d8',
 'bbc453b3-e8ed-4c86-afcf-02730e934374',
 '51f92cf6-d7a7-42d7-bc0f-17228753b162',
 'f57a0d76-6b9b-4be3-afa7-182faaa9ec21',
 'cd93ec5d-2d1d-42ca-be1b-cb7fd0e117f2',
 'b19d4137-46d1-414f-8517-e105df2024b9',
 '0b951808-13fe-464b-a904-c4e3a67230df',
 'b6c8e017-c433-41e0-95df-cf636defebc2',
 '3f190532-5598-4f75-a83f-95be68f2de75',
 '63cf0f39-876e-4f7e-b7c6-7e62e23e910e',
 '6db31a4f-7de8-46b2-b8e6-2365bf8585d5',
 'd7af3cc3-fa04-

In [42]:
#FAISS.from_documents(documents,embedding fn to use) --> vector store
vectorstore = FAISS.from_documents(chunked_docs,huggingface_embeddings)

In [43]:
query = "What are the trends in median household income across different states in the united states between 2021 and 2022?"

<h4>use the similarity score on the vector_db</h2>

In [56]:
results = pine_vectorstore.similarity_search(query,k=5)
for res in results:
	print(f"{res.page_content} [{res.metadata}]")
	print("*****")

in the 21st Century, Steven H. Murdock 
and David A. Swanson (eds.), Springer, 
Netherlands, 2008.
topics, visit <www.census.gov/
programs-surveys/acs>.
HIGHLIGHTS
• Median household income in 
the United States was $74,755 
in 2022, a decline of 0.8 percent 
from last year, after adjusting 
for inflation.6
• Real median household income 
increased in five states and [{'page': 1.0, 'source': 'census-pdfs\\acsbr-017.pdf'}]
*****
and Utah—showed a statistically 
significant increase in real median 
household income; 17 states 
showed a decrease. Real median 
household income in 2022 was not 
statistically different from that in 
2021 for 28 states, the District of 
Columbia, and Puerto Rico  
(Table 1).
Median Household Income in the Past 12 Months
for the United States and Puerto Rico: 2022
DC
DE
TX
CA
MT
AZ
ID
NV
NM
CO [{'page': 3.0, 'source': 'census-pdfs\\acsbr-017.pdf'}]
*****
hold income. Seventeen states 
showed a decrease. Real median 
household income in 2022 was not 
statistica

In [44]:
#apply similarity search to the query using vectorstore.similarity_search()
relevant_docs = vectorstore.similarity_search(query)
print(relevant_docs[0].page_content)

Figure 1.
Median Household Income in the Past 12 Months in the United States: 2005–2022
 
Note: Estimates for 2020 experimental data not shown. For more information on the 2020 experimental data products,


<h4>build a retriever to pass the similar context to LLM</h4>

In [57]:
pinecone_retriever = pine_vectorstore.as_retriever(search_type="similarity_score_threshold",search_kwargs={"k":3,"score_threshold" : 0.6})

In [45]:
#retrieves similar text from the vectorstore

retriever = vectorstore.as_retriever(search_type="similarity" , search_kwargs={"k" : 3})

In [20]:
#put your hugging face access token here
access_token = "***"

In [21]:
os.environ["HUGGINGFACEHUB_API_TOKEN"] = access_token

In [58]:
hf = HuggingFaceHub(
    repo_id='mistralai/Mistral-Nemo-Instruct-2407',#'mistralai/Mistral-7B-v0.1'
    model_kwargs = {"temperature" : 0.1 , "max_length" : 900}
)

  hf = HuggingFaceHub(


In [None]:
output = hf.invoke(query)
print(output)

<h4>Prompt Template</h4>

In [64]:
prompt_template = """Use the following pieces of context to answer the question at the end. Please follow the following rules:
1. If you don't know the answer, don't try to make up an answer. Just say "I can't find the final answer but you may want to check the following links".
2. If you find the answer, write the answer in a concise way with five sentences maximum.
3. Only use the provided context to answer the question. Do not use any external sources. 
{context}

Question: {question}

Helpful Answer:
"""

PROMPT = PromptTemplate(
 template=prompt_template, input_variables=["context", "question"]
)

<h4>Combine Every Thing</h4>

In [65]:
pinecone_retievalQA = RetrievalQA.from_chain_type(
    llm=hf,
    chain_type="stuff",
    retriever = pinecone_retriever,
    return_source_documents = True,
    chain_type_kwargs={"prompt":PROMPT}
)

In [48]:
#create the chain
retrievalQA = RetrievalQA.from_chain_type(
    llm=hf, 
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs = {"prompt" : PROMPT}
)

### USE THE RAG

In [67]:
pinecone_result = pinecone_retievalQA.invoke({"query":query})
print(pinecone_result['result'])

Use the following pieces of context to answer the question at the end. Please follow the following rules:
1. If you don't know the answer, don't try to make up an answer. Just say "I can't find the final answer but you may want to check the following links".
2. If you find the answer, write the answer in a concise way with five sentences maximum.
3. Only use the provided context to answer the question. Do not use any external sources. 
in the 21st Century, Steven H. Murdock 
and David A. Swanson (eds.), Springer, 
Netherlands, 2008.
topics, visit <www.census.gov/
programs-surveys/acs>.
HIGHLIGHTS
• Median household income in 
the United States was $74,755 
in 2022, a decline of 0.8 percent 
from last year, after adjusting 
for inflation.6
• Real median household income 
increased in five states and

and Utah—showed a statistically 
significant increase in real median 
household income; 17 states 
showed a decrease. Real median 
household income in 2022 was not 
statistically different 

In [49]:
result = retrievalQA.invoke({"query":query})
print(result)

{'query': 'What are the trends in median household income across different states in the united states between 2021 and 2022?', 'result': 'Use the following pieces of context to answer the question at the end. Please follow the following rules:\n1. If you don\'t know the answer, don\'t try to make up an answer. Just say "I can\'t find the final answer but you may want to check the following links".\n2. If you find the answer, write the answer in a concise way with five sentences maximum.\n\nFigure 1.\nMedian Household Income in the Past 12 Months in the United States: 2005–2022\n \nNote: Estimates for 2020 experimental data not shown. For more information on the 2020 experimental data products,\n\nprograms-surveys/acs>.\nHIGHLIGHTS\n• Median household income in \nthe United States was $74,755 \nin 2022, a decline of 0.8 percent \nfrom last year, after adjusting \nfor inflation.6\n• Real median household income \nincreased in five states and\n\nhold income. Seventeen states \nshowed a d

In [51]:
print(result['result'])

Use the following pieces of context to answer the question at the end. Please follow the following rules:
1. If you don't know the answer, don't try to make up an answer. Just say "I can't find the final answer but you may want to check the following links".
2. If you find the answer, write the answer in a concise way with five sentences maximum.

Figure 1.
Median Household Income in the Past 12 Months in the United States: 2005–2022
 
Note: Estimates for 2020 experimental data not shown. For more information on the 2020 experimental data products,

programs-surveys/acs>.
HIGHLIGHTS
• Median household income in 
the United States was $74,755 
in 2022, a decline of 0.8 percent 
from last year, after adjusting 
for inflation.6
• Real median household income 
increased in five states and

hold income. Seventeen states 
showed a decrease. Real median 
household income in 2022 was not 
statistically different from that in 
2021 for 28 states, the District of 
Columbia, and Puerto Rico. This

In [37]:
relevant_docs = result['source_documents']
print(f'There are {len(relevant_docs)} documents retrieved which are relevant to the query.')
print("*" * 100)
for i, doc in enumerate(relevant_docs):
    print(f"Relevant Document #{i+1}:\nSource file: {doc.metadata['source']}, Page: {doc.metadata['page']}\nContent: {doc.page_content}")
    print("-"*100)
    print(f'There are {len(relevant_docs)} documents retrieved which are relevant to the query.')

There are 3 documents retrieved which are relevant to the query.
****************************************************************************************************
Relevant Document #1:
Source file: census-pdfs\acsbr-017.pdf, Page: 3
Content: hold income in 2022 was $24,112 
(Table 1 and Figure 2). Median 
household income was lower than 
the U.S. median in 30 states and 
Puerto Rico. It was higher than the 
U.S. median in 17 states and the 
District of Columbia. The medians 
for Arizona, Oregon, and Vermont 
were not statistically different from 
the U.S. median.
From 2021 to 2022, five states—
Alabama, Alaska, Delaware, Florida, 
and Utah—showed a statistically 
significant increase in real median 
household income; 17 states 
showed a decrease. Real median 
household income in 2022 was not 
statistically different from that in 
2021 for 28 states, the District of 
Columbia, and Puerto Rico  
(Table 1).
-------------------------------------------------------------------------------