In [8]:
from langchain.document_loaders import PyPDFLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS

from langchain_huggingface import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate

from langchain.chains import retrieval_qa

In [None]:
loader = PyPDFDirectoryLoader('./us_census')
documents = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

final_documents = text_splitter.split_documents(documents)
final_documents[0]

Document(metadata={'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 18.2 (Windows)', 'creationdate': '2023-09-09T07:52:17-04:00', 'author': 'U.S. Census Bureau', 'keywords': 'acsbr-015', 'moddate': '2023-09-12T14:44:47+01:00', 'title': 'Health Insurance Coverage Status and Type by Geography: 2021 and 2022', 'trapped': '/false', 'source': 'us_sensus\\acsbr-015.pdf', 'total_pages': 18, 'page': 0, 'page_label': '1'}, page_content='Health Insurance Coverage Status and Type \nby Geography: 2021 and 2022\nAmerican Community Survey Briefs\nACSBR-015\nIssued September 2023\nDouglas Conway and Breauna Branch\nINTRODUCTION\nDemographic shifts as well as economic and govern-\nment policy changes can affect people’s access to \nhealth coverage. For example, between 2021 and 2022, \nthe labor market continued to improve, which may \nhave affected private coverage in the United States \nduring that time.1 Public policy changes included \nthe renewal of the Public Health Emergency, wh

In [5]:
len(final_documents)

316

In [None]:
embeddings = HuggingFaceEmbeddings(
    model_name="ibm-granite/granite-embedding-278m-multilingual",
    model_kwargs={'device': 'cpu'},
    encode_kwargs={'normalize_embeddings': True})

In [10]:
vectors = FAISS.from_documents(documents, embeddings)
vectors.similarity_search("what is census")

[Document(id='c3be583c-c51c-4d3e-bfda-96c9d63669dc', metadata={'producer': 'Adobe PDF Library 17.0', 'creator': 'Adobe InDesign 18.5 (Windows)', 'creationdate': '2023-10-19T11:35:38-04:00', 'author': 'U.S. Census Bureau', 'keywords': 'household income in states and metropolitan areas 2022', 'moddate': '2023-11-30T12:35:09+00:00', 'title': 'Household Income in States and Metropolitan Areas: 2022', 'trapped': '/false', 'source': 'us_sensus\\acsbr-017.pdf', 'total_pages': 9, 'page': 6, 'page_label': '7'}, page_content='U.S. Census Bureau  7\nhouseholds to $51,374 for Black \nhouseholds.10 Black households \nexperienced a statistically sig-\nnificant increase in real median \nhousehold income between 2021 \nand 2022. Asian households \n10 Federal surveys give respondents the \noption of reporting more than one race. \nTherefore, two basic ways of defining a race \ngroup are possible. A group, such as Asian, \nmay be defined as those who reported Asian \nand no other race (the race-alone or