## Install needed libraries

In [1]:
# %pip install --upgrade --quiet pypdf
# %pip install langchain
# %pip install langchain-community
#%pip install langchain-huggingface
#%pip uninstall transformers huggingface_hub sentence-transformers -y
#%pip install transformers==4.28.1 huggingface_hub==0.14.1 sentence-transformers==2.2.0
#%pip install -U sentence-transformers
#%pip install chromadb

## Load Hugging Face Token

In [2]:
import os
from getpass import getpass

# Reuse a token that is already exported so "Restart & Run All" does not block
# on an interactive prompt; otherwise ask for it without echoing the input.
HF_TOKEN = os.environ.get("HF_TOKEN") or getpass("Huggingface Token : ")

# 'Token' is kept for backward compatibility with any cell that reads it.
os.environ['Token'] = HF_TOKEN
# Recent huggingface_hub releases look the access token up under HF_TOKEN;
# the custom 'Token' name alone is not picked up by the Hugging Face libraries.
os.environ['HF_TOKEN'] = HF_TOKEN

## Load PDF Document 

In [3]:
from langchain import *
from langchain_community.document_loaders import PyPDFLoader

In [4]:
# Relative path: the PDF is looked up from the notebook's working directory.
file_path = "EACC-NATIONAL-SURVEY-REPORT-2023.pdf"
# Read the PDF into a list of Document objects; `pages` is what the
# chunking step consumes later.
loader = PyPDFLoader(file_path)
pages = loader.load()
# Display the first Document as a quick sanity check of the extracted text.
pages[0]

Document(metadata={'source': 'EACC-NATIONAL-SURVEY-REPORT-2023.pdf', 'page': 0}, page_content='National Ethics and Corruption Survey (NECS) 2023\nEACC Research Report No. 15 of December 2023iNATIONAL ETHICS AND \nCORRUPTION SURVEY \n(NECS), 2023\nEVIDENCE FROM \nHOUSEHOLDS IN KENYA\nTuangamize Uﬁsadi, Tuijenge Kenya\nETHICS AND ANTI-CORRUPTION COMMISSION\nEACC Research Report No. 15 of December 2023   ')

## Chunking

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [6]:
# Splitter settings: chunk_size=512 with no overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=0)

# Apply the splitter to the loaded PDF pages; each resulting chunk is a Document.
chunks = text_splitter.split_documents(pages)

In [7]:
# Display one chunk (index 3) to eyeball what the splitter produced.
chunks[3]

Document(metadata={'source': 'EACC-NATIONAL-SURVEY-REPORT-2023.pdf', 'page': 3}, page_content='iv\nNational Ethics and Corruption Survey (NECS) 2023\nEACC Research Report No. 15 of December 2023EACC ORGANIZATIONAL STATEMENTS \nMission\nVision\nCore Values\nIntegrity\nInnovationTeam Work Fidelity to the\nLaw\nProfessionalismAn Integrity and \nValues-Driven Kenyan SocietyTo promote integrity and \ncombat corruption through \nlaw enforcement, prevention \nand educationOur Mandate\nTo combat and prevent \ncorruption, economic crime and \nunethical conduct in Kenya \nthrough law enforcement,')

## Embedding

In [8]:
from langchain_huggingface import HuggingFaceEmbeddings

In [9]:
# Commented-out alternative model name (not used by the code below):
# model = "deepset/roberta-base-squad2"
# The default sentence-transformer model is "sentence-transformers/all-mpnet-base-v2";
# it is passed explicitly here so the choice is visible.
# NOTE(review): `sentence_transformers` is imported but not referenced directly in
# this cell — presumably an availability check for the embeddings backend; confirm.
import sentence_transformers
embeddings_model = HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2')

  from tqdm.autonotebook import tqdm, trange





## Vector Database

In [10]:
from langchain.vectorstores import Chroma

In [11]:
# Initialize the vector store from the chunks and save it to disk.
#
# Every chunk gets a deterministic id built from its source file, page number
# and position in `chunks`. Without explicit ids each run stores the chunks
# under freshly generated ids, so re-running this cell against the same
# persist_directory appends a duplicate copy of the whole document. With stable
# ids the LangChain Chroma wrapper writes the same records again instead.
# NOTE: a ./chroma_db folder created before this change still contains the
# randomly-keyed copies; delete it once to start from a clean collection.
chunk_ids = [
    f"{chunk.metadata.get('source')}:{chunk.metadata.get('page')}:{position}"
    for position, chunk in enumerate(chunks)
]
db = Chroma.from_documents(
    chunks,
    embeddings_model,
    ids=chunk_ids,
    persist_directory="./chroma_db",
)

In [30]:
# Free-text query string used for the similarity search.
query = "Last word"

In [31]:
# Re-open the persisted collection from disk and run the query against it.
db2 = Chroma(
    embedding_function=embeddings_model,
    persist_directory="./chroma_db",
)
retrieved_docs = db2.similarity_search(query)

# Print the text of the first returned document.
best_match = retrieved_docs[0]
print(best_match.page_content)

Teso 0.8
Boran 0.5
Rendille 0.5
Kuria 0.4
Arab 0.3
Others 2.1
