In [1]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_huggingface import HuggingFaceEndpoint
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
import numpy as np

In [2]:
# read the PDF's in the 'us-census-data' directory

loader = PyPDFDirectoryLoader('End-to-End-Gen-AI-Powered-App/us-census-data')
documents = loader.load()
documents[0]

Document(page_content='Health Insurance Coverage Status and Type \nby Geography: 2021 and 2022\nAmerican Community Survey Briefs\nACSBR-015Issued September 2023Douglas Conway and Breauna Branch\nINTRODUCTION\nDemographic shifts as well as economic and govern-\nment policy changes can affect people’s access to health coverage. For example, between 2021 and 2022, the labor market continued to improve, which may have affected private coverage in the United States \nduring that time.\n1 Public policy changes included \nthe renewal of the Public Health Emergency, which \nallowed Medicaid enrollees to remain covered under the Continuous Enrollment Provision.\n2 The American \nRescue Plan (ARP) enhanced Marketplace premium subsidies for those with incomes above 400 percent of the poverty level as well as for unemployed people.\n3\nIn addition to national policies, individual states and the District of Columbia can affect health insurance coverage by making Marketplace or Medicaid more accessi

In [3]:
# transform the data

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
final_documents = text_splitter.split_documents(documents)
final_documents[0]

Document(page_content='Health Insurance Coverage Status and Type \nby Geography: 2021 and 2022\nAmerican Community Survey Briefs\nACSBR-015Issued September 2023Douglas Conway and Breauna Branch\nINTRODUCTION\nDemographic shifts as well as economic and govern-\nment policy changes can affect people’s access to health coverage. For example, between 2021 and 2022, the labor market continued to improve, which may have affected private coverage in the United States \nduring that time.\n1 Public policy changes included \nthe renewal of the Public Health Emergency, which \nallowed Medicaid enrollees to remain covered under the Continuous Enrollment Provision.\n2 The American \nRescue Plan (ARP) enhanced Marketplace premium subsidies for those with incomes above 400 percent of the poverty level as well as for unemployed people.\n3', metadata={'source': 'End-to-End-Gen-AI-Powered-App/us-census-data/acsbr-015.pdf', 'page': 0})

In [4]:
len(final_documents)

316

In [5]:
# embedding using Hugging Face
embeddings = HuggingFaceBgeEmbeddings(
    model_name='BAAI/bge-small-en-v1.5',  #sentence-transformers/all-MiniLM-l6-v2
    model_kwargs={'device': 'cpu'},
    encode_kwargs={'normalize_embeddings': True}
)



In [6]:
# let's check wheher our embedding is working or not

print(np.array(embeddings.embed_query(final_documents[0].page_content)))
print(np.array(embeddings.embed_query(final_documents[0].page_content)).shape)

[-8.46568346e-02 -1.19099440e-02 -3.37892473e-02  2.94559393e-02
  5.19160032e-02  5.73839583e-02 -4.10018004e-02  2.74267644e-02
 -1.05128251e-01 -1.58055834e-02  7.94858560e-02  5.64318486e-02
 -1.31765241e-02 -3.41544449e-02  5.81598282e-03  4.72548008e-02
 -1.30747156e-02  3.12990649e-03 -3.44225839e-02  3.08406390e-02
 -4.09086198e-02  3.52737792e-02 -2.43761521e-02 -4.35831212e-02
  2.41503362e-02  1.31986821e-02 -4.84453235e-03  1.92346908e-02
 -5.43912873e-02 -1.42735034e-01  5.15530724e-03  2.93115638e-02
 -5.60810231e-02 -8.53530224e-03  3.14141512e-02  2.76736263e-02
 -2.06188541e-02  8.24231580e-02  4.15425263e-02  5.79654723e-02
 -3.71586867e-02  6.26163650e-03 -2.41389778e-02 -5.61797665e-03
 -2.51715649e-02  5.04975207e-03 -2.52801683e-02 -2.91944738e-03
 -8.24044272e-03 -5.69604300e-02  2.30822954e-02 -5.54214371e-03
  5.11555411e-02  6.09937795e-02  6.49766624e-02 -5.38514368e-02
  2.19109636e-02 -2.54194327e-02 -4.49223295e-02  4.22459282e-02
  4.75252606e-02  7.23216

In [7]:
# vector store creation using FAISS
vectorstore = FAISS.from_documents(final_documents[:120], embedding=embeddings)

In [8]:
# query using similarity search
query = 'WHAT IS HEALTH INSURANCE COVERAGE?'
relevant_documents = vectorstore.similarity_search(query)
relevant_documents[0].page_content

'2 U.S. Census Bureau\nWHAT IS HEALTH INSURANCE COVERAGE?\nThis brief presents state-level estimates of health insurance coverage \nusing data from the American Community Survey (ACS). The  \nU.S. Census Bureau conducts the ACS throughout the year; the \nsurvey asks respondents to report their coverage at the time of \ninterview. The resulting measure of health insurance coverage, \ntherefore, reflects an annual average of current comprehensive \nhealth insurance coverage status.* This uninsured rate measures a \ndifferent concept than the measure based on the Current Population \nSurvey Annual Social and Economic Supplement (CPS ASEC). \nFor reporting purposes, the ACS broadly classifies health insurance \ncoverage as private insurance or public insurance. The ACS defines \nprivate health insurance as a plan provided through an employer \nor a union, coverage purchased directly by an individual from an \ninsurance company or through an exchange (such as healthcare.'

In [9]:
# setup retriever
retriever = vectorstore.as_retriever(search_type='similarity', search_kwargs={'k': 3})
retriever

VectorStoreRetriever(tags=['FAISS', 'HuggingFaceBgeEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x7feaca05e950>, search_kwargs={'k': 3})

In [10]:
# setup hugging face API token
# get a token: https://huggingface.co/docs/api-inference/quicktour#get-your-api-token

from getpass import getpass
import os

HUGGINGFACEHUB_API_TOKEN = getpass()
os.environ["HUGGINGFACEHUB_API_TOKEN"] = HUGGINGFACEHUB_API_TOKEN


# setup LLM from Hugging Face
repo_id = "mistralai/Mistral-7B-Instruct-v0.2"

llm = HuggingFaceEndpoint(
    repo_id=repo_id,
    model_kwargs = {'max_length': 128},
    temperature=0.5,
    huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
)

query = 'What is health insurance coverage?'
llm.invoke(query)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /home/zeus/.cache/huggingface/token
Login successful


"\n\nHealth insurance coverage is a contract between an individual or group and an insurance company that provides financial protection against the cost of medical care. The insurance company agrees to pay for certain medical expenses, while the insured pays a premium and may also be responsible for copayments, deductibles, and coinsurance.\n\nThere are different types of health insurance plans, including:\n\n1. Fee-for-service plans: These plans allow insured individuals to choose their own healthcare providers and pay for services on a per-visit basis.\n2. Health maintenance organizations (HMOs): HMOs require insured individuals to choose a primary care physician who manages their care and refers them to specialists as needed.\n3. Preferred provider organizations (PPOs): PPOs allow insured individuals to choose their healthcare providers from a network of preferred providers, and may offer more flexibility in terms of out-of-network care.\n4. Point-of-service (POS) plans: POS plans a

In [11]:
#Hugging Face models can be run locally through the HuggingFacePipeline class.
# from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
# from transformers import pipeline

# hf_model = pipeline(
#     "text-generation", model="cerebras/Cerebras-GPT-590M", max_new_tokens=200
# )

# original_model = HuggingFacePipeline(pipeline=hf_model)

# llm = hf 
# llm.invoke(query)

In [12]:
prompt_template="""
Use the following piece of context to answer the question asked.
Please try to provide the answer only based on the context

{context}
Question:{question}

Helpful Answers:
 """

In [13]:
# setup prompt
prompt = PromptTemplate(template=prompt_template, input_variables=['context', 'question'])

In [14]:
# retrieval QA
retrievalQA = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff',
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={'prompt': prompt}
)

In [15]:
query = 'DIFFERENCES IN THE UNINSURED RATE BY STATE IN 2022'
result  = retrievalQA.invoke({'query': query})
result['result']

'1. The uninsured rate in 2022 ranged from 2.4% in Massachusetts to 16.6% in Texas, with a national rate of 8.0%.\n 2. Ten of the 15 states with the highest uninsured rates were above the national average.\n 3. Medicaid coverage accounted for a larger percentage of coverage in states that expanded Medicaid eligibility (22.7%) compared to nonexpansion states (18.0%).\n 4. Uninsured rates decreased in 27 states from 2021 to 2022, while only Maine had an increase.\n 5. The uninsured rate in Maine increased from 5.7% to 6.6%, but it remained below the national average.\n 6. The private coverage rates were not statistically different in North Dakota and Utah.\n 7. The uninsured rates for the most populous metropolitan areas are provided in the context.\n\nDetailed Answer:\nThe context provides information on the differences in the uninsured rate by state in 2022, as well as changes in the uninsured rate from 2021 to 2022. According to the context, the uninsured rate in 2022 ranged from a lo