## Install needed libraries

In [40]:
# %pip install --upgrade --quiet pypdf
# %pip install langchain
# %pip install 
#%pip install langchain-huggingface
#%pip uninstall transformers huggingface_hub sentence-transformers -y
#%pip install transformers==4.28.1 huggingface_hub==0.14.1 sentence-transformers==2.2.0
#%pip install -U sentence-transformers
#%pip install chromadb

## Load Hugging Face Token

In [41]:
import os
from getpass import getpass

# Prompt for the token interactively so it is never written into the notebook.
# Whitespace picked up by copy-paste is stripped before the token is used.
HF_TOKEN = getpass("Huggingface Token : ").strip()
if not HF_TOKEN:
    raise ValueError("A Hugging Face token is required to call the hosted LLM.")

# The LLM cell reads HUGGINGFACEHUB_API_TOKEN from the environment, so the
# token has to be exported under that name. The original 'Token' key is kept
# so anything that still reads it continues to work.
os.environ['HUGGINGFACEHUB_API_TOKEN'] = HF_TOKEN
os.environ['Token'] = HF_TOKEN

## Load PDF Document 

In [42]:
from langchain import *
from langchain_community.document_loaders import PyPDFLoader

In [43]:
# Survey report PDF, given as a path relative to the working directory.
file_path = "EACC-NATIONAL-SURVEY-REPORT-2023.pdf"

# Build the loader, read the PDF into `pages`, then display the first entry.
loader = PyPDFLoader(file_path=file_path)
pages = loader.load()
pages[0]

Document(metadata={'source': 'EACC-NATIONAL-SURVEY-REPORT-2023.pdf', 'page': 0}, page_content='National Ethics and Corruption Survey (NECS) 2023\nEACC Research Report No. 15 of December 2023iNATIONAL ETHICS AND \nCORRUPTION SURVEY \n(NECS), 2023\nEVIDENCE FROM \nHOUSEHOLDS IN KENYA\nTuangamize Uﬁsadi, Tuijenge Kenya\nETHICS AND ANTI-CORRUPTION COMMISSION\nEACC Research Report No. 15 of December 2023   ')

## Chunking

In [44]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [45]:
# Splitter settings: chunk size of 512 and no overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=0, chunk_size=512)

# Split the loaded pages into the chunks that will be embedded.
chunks = text_splitter.split_documents(pages)

In [46]:
chunks[3]

Document(metadata={'source': 'EACC-NATIONAL-SURVEY-REPORT-2023.pdf', 'page': 3}, page_content='iv\nNational Ethics and Corruption Survey (NECS) 2023\nEACC Research Report No. 15 of December 2023EACC ORGANIZATIONAL STATEMENTS \nMission\nVision\nCore Values\nIntegrity\nInnovationTeam Work Fidelity to the\nLaw\nProfessionalismAn Integrity and \nValues-Driven Kenyan SocietyTo promote integrity and \ncombat corruption through \nlaw enforcement, prevention \nand educationOur Mandate\nTo combat and prevent \ncorruption, economic crime and \nunethical conduct in Kenya \nthrough law enforcement,')

## Embedding

In [47]:
from langchain_huggingface import HuggingFaceEmbeddings

In [48]:
# Alternative model left here for reference:
# model = "deepset/roberta-base-squad2"
# Default sentence transformer is "sentence-transformers/all-mpnet-base-v2"
import sentence_transformers
embeddings_model = HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2')

## Vector Database

In [49]:
from langchain.vectorstores import Chroma

In [50]:
# Embed the chunks and build the Chroma vector store, persisted under ./chroma_db
db = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings_model,
    persist_directory="./chroma_db",
)

In [51]:
query="Last word"

In [52]:
# Re-open the store persisted on disk, using the same embedding model for queries
db2 = Chroma(embedding_function=embeddings_model, persist_directory="./chroma_db")

# Run a similarity search for the query and print the text of the first hit
retrieved_docs = db2.similarity_search(query)
first_hit = retrieved_docs[0]
print(first_hit.page_content)

Teso 0.8
Boran 0.5
Rendille 0.5
Kuria 0.4
Arab 0.3
Others 2.1


In [53]:
# Expose the reloaded vector store as a retriever.
# search_type is "mmr" here; "similarity" is the alternative noted by the author.
retriever = db2.as_retriever(search_type="mmr", search_kwargs=dict(k=4))

In [77]:
import os

# The token cell of this notebook stores the token under the 'Token' key, so
# fall back to it when HUGGINGFACEHUB_API_TOKEN is not set; otherwise
# os.getenv would return None and None would be handed to the LLM client.
huggingfacehub_api_token = os.getenv("HUGGINGFACEHUB_API_TOKEN") or os.getenv("Token")


In [80]:
# Hugging Face Hub LLM used as the generator of the question-answering chain
from langchain.llms import HuggingFaceHub

# Generation settings handed to the model through model_kwargs
generation_kwargs = dict(
    temperature=0.1,
    max_new_tokens=512,
    return_full_text=False,
    repetition_penalty=1.1,
    top_p=0.9,
)

# Define the LLM, authenticating with the token read from the environment
llm = HuggingFaceHub(
    repo_id="microsoft/Phi-3-mini-128k-instruct",
    model_kwargs=generation_kwargs,
    huggingfacehub_api_token=huggingfacehub_api_token,
)


In [81]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

In [82]:
# Prompt using the <|system|>/<|user|>/<|assistant|> markers already used in this notebook.
# {context} and {query} match the keys supplied by the chain, so the retrieved
# passages and the user's question are actually inserted into the prompt.
template = """
<|system|>
You are a helpful assistant. Answer the question using only the provided context.<|end|>
<|user|>
Context:
{context}

Question: {query}<|end|>
<|assistant|>
"""

In [83]:
prompt = ChatPromptTemplate.from_template(template)

In [84]:
output_parser = StrOutputParser()

In [85]:
def _format_docs(docs):
    """Join the text of the retrieved documents into a single context string."""
    return "\n\n".join(doc.page_content for doc in docs)


# RAG pipeline: the incoming question is sent to the retriever (whose documents
# are flattened to plain text rather than passed as a raw list of Document
# objects) and is also forwarded unchanged as "query"; the resulting mapping
# goes through the prompt, the LLM, and finally the string output parser.
chain = (
    {"context": retriever | _format_docs, "query": RunnablePassthrough()}
    | prompt
    | llm
    | output_parser
)

In [86]:
response = chain.invoke(query)
response

BadRequestError:  (Request ID: fHyOM2uSXA5y78pJQo-eO)

Bad request:
Authorization header is correct, but the token seems invalid