In [None]:
from langchain_community.document_loaders import PyPDFLoader, PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFaceHub

from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from huggingface_hub import login

import os

# Read the Hugging Face token from the environment exactly once.
access_token = os.getenv("HUGGING_FACE_TOKEN")

# os.getenv returns None for an unset variable, and assigning None into
# os.environ raises an opaque "str expected, not NoneType" TypeError.
# Fail early with a message that names the missing variable instead.
if not access_token:
    raise RuntimeError(
        "HUGGING_FACE_TOKEN is not set; export it before running this notebook."
    )

# Re-export the token under the HUGGINGFACEHUB_API_TOKEN name
# (presumably the variable the LangChain Hugging Face integrations look up; verify).
os.environ["HUGGINGFACEHUB_API_TOKEN"] = access_token

# Authenticate this session with the Hugging Face Hub.
login(access_token)

In [None]:
## Read the PDFs from the folder

# Load every PDF found under ./us_census.
loader = PyPDFDirectoryLoader("./us_census")
documents = loader.load()

# Split the loaded documents into chunks of up to 1000 characters that
# overlap by 200 characters.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
final_documents = text_splitter.split_documents(documents)

# Without this check, an empty result only surfaces as a bare IndexError on the
# lookup below (and later cells index final_documents[0] as well).
if not final_documents:
    raise ValueError(
        "No text chunks were produced from ./us_census; "
        "check that the folder exists and contains readable PDF files."
    )

# Display the first chunk as a quick sanity check.
final_documents[0]

In [None]:
# Number of chunks produced by the text splitter.
len(final_documents)

In [None]:
## Embedding using HuggingFace

# Embedding model loaded on the CPU; normalize_embeddings=True is forwarded
# to the encode step.
# Alternative noted by the author: sentence-transformers/all-MiniLM-L6-v2
# (the original comment read "16" instead of "L6"; presumably a typo, verify).
hf_embeddings = HuggingFaceEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",
    model_kwargs=dict(device="cpu"),
    encode_kwargs=dict(normalize_embeddings=True),
)

In [None]:
import numpy as np

# Embed the text of the first chunk and display the vector as a NumPy array.
first_chunk_text = final_documents[0].page_content
np.array(hf_embeddings.embed_query(first_chunk_text))

In [None]:
# Embed the first chunk once and reuse the vector for both prints; the original
# ran the embedding model twice on the same text just to read the shape.
sample_embedding = np.array(hf_embeddings.embed_query(final_documents[0].page_content))
print(sample_embedding)
print(sample_embedding.shape)

In [None]:
## VectorStore Creation

# Build a FAISS vector store from all chunks, embedded with hf_embeddings.
vectorstore = FAISS.from_documents(
    documents=final_documents,
    embedding=hf_embeddings,
)

In [None]:
# Sanity-check the vector store: run a similarity search for a sample question
# and print the text of the first chunk returned.
query = "What is health insurance coverage?"
relevant_docs = vectorstore.similarity_search(query)

first_hit = relevant_docs[0]
print(first_hit.page_content)

In [18]:
# Expose the vector store as a retriever using similarity search with k=3.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs=dict(k=3),
)
print(retriever)

tags=['FAISS', 'HuggingFaceEmbeddings'] vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x000001F415494A40> search_kwargs={'k': 3}


In [None]:
# LLM accessed through the Hugging Face Hub wrapper, configured with a
# temperature of 0.1 and max_length of 500.
hf = HuggingFaceHub(
    repo_id="mistralai/Mistral-7B-v0.1",
    model_kwargs=dict(temperature=0.1, max_length=500),
)

# Ask the model directly, without any retrieved context.
query = "What is the health insurance coverage?"
hf.invoke(query)

In [None]:
# Hugging Face models can be run locally through the HuggingFacePipeline class.
# Import it from langchain_huggingface (the package this notebook already uses
# for the embeddings); the langchain_community copy is deprecated in its favour.
from langchain_huggingface import HuggingFacePipeline

# Note: this rebinds `hf`, replacing the Hub-backed LLM assigned earlier.
hf = HuggingFacePipeline.from_model_id(
    model_id="mistralai/Mistral-7B-v0.1",
    task="text-generation",
    # NOTE(review): temperature=0 presumably intends deterministic (greedy)
    # decoding; confirm whether `"do_sample": False` should be passed instead.
    pipeline_kwargs={"temperature": 0, "max_new_tokens": 300},
)

# Alias for the direct `llm.invoke(...)` call; the QA chain is built with `hf`.
llm = hf

In [None]:
# Send the most recently assigned `query` string directly to the LLM (no retriever involved).
llm.invoke(query)

In [30]:
# Prompt text for the QA chain. `{context}` and `{question}` are the two
# placeholders declared as input variables when the PromptTemplate is built.
prompt_template="""
Use the following piece of context to answer the question asked.
Please try to provide the answer only based on the context

{context}
Question:{question}

Helpful Answers:
 """

In [31]:
# Wrap the template text in a PromptTemplate with its two input variables.
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

In [32]:
# Assemble the retrieval QA chain from the LLM, the retriever and the custom
# prompt; source documents are requested alongside the answer.
retrievalQA = RetrievalQA.from_chain_type(
    llm=hf,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs=dict(prompt=prompt),
    return_source_documents=True,
)

In [33]:
# Question to run through the retrieval QA chain.
query = "Differences in the uninsured rate by state in 2022"

In [34]:
# Run the question through the QA chain and print only the answer text
# stored under the "result" key of the chain output.
qa_inputs = {"query": query}
result = retrievalQA.invoke(qa_inputs)
print(result["result"])



KeyboardInterrupt: 