## Install needed libraries

In [1]:
# %pip install --upgrade --quiet pypdf
# %pip install langchain
# %pip install 
#%pip install langchain-huggingface
#%pip uninstall transformers huggingface_hub sentence-transformers -y
#%pip install transformers==4.28.1 huggingface_hub==0.14.1 sentence-transformers==2.2.0
#%pip install -U sentence-transformers
#%pip install chromadb

## Load PDF Document 

In [2]:
from langchain import *
from langchain_community.document_loaders import PyPDFLoader

In [3]:
file_path = "EACC-NATIONAL-SURVEY-REPORT-2023.pdf"
loader = PyPDFLoader(file_path)
pages = loader.load()
pages[0]

Document(metadata={'source': 'EACC-NATIONAL-SURVEY-REPORT-2023.pdf', 'page': 0}, page_content='National Ethics and Corruption Survey (NECS) 2023\nEACC Research Report No. 15 of December 2023iNATIONAL ETHICS AND \nCORRUPTION SURVEY \n(NECS), 2023\nEVIDENCE FROM \nHOUSEHOLDS IN KENYA\nTuangamize Uﬁsadi, Tuijenge Kenya\nETHICS AND ANTI-CORRUPTION COMMISSION\nEACC Research Report No. 15 of December 2023   ')

## Chunking

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [5]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=512,
    chunk_overlap=0
)

chunks = text_splitter.split_documents(pages)

In [6]:
chunks[3]

Document(metadata={'source': 'EACC-NATIONAL-SURVEY-REPORT-2023.pdf', 'page': 3}, page_content='iv\nNational Ethics and Corruption Survey (NECS) 2023\nEACC Research Report No. 15 of December 2023EACC ORGANIZATIONAL STATEMENTS \nMission\nVision\nCore Values\nIntegrity\nInnovationTeam Work Fidelity to the\nLaw\nProfessionalismAn Integrity and \nValues-Driven Kenyan SocietyTo promote integrity and \ncombat corruption through \nlaw enforcement, prevention \nand educationOur Mandate\nTo combat and prevent \ncorruption, economic crime and \nunethical conduct in Kenya \nthrough law enforcement,')

## Embedding

In [7]:
from langchain_huggingface import HuggingFaceEmbeddings

In [8]:
# model = "deepset/roberta-base-squad2"
# Default sentence transfromer is "sentence-transformers/all-mpnet-base-v2"
import sentence_transformers
embeddings_model = HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2')

  from tqdm.autonotebook import tqdm, trange





## Vector Database

In [9]:
from langchain.vectorstores import Chroma
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import AutoTokenizer, pipeline
from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA

In [10]:
# initialize the vector store (save to disk)
db = Chroma.from_documents(chunks, embeddings_model, persist_directory="./chroma_db")

In [11]:
query="What is the most corrupt ministry?"

In [12]:
# retrieve from vector db (load from disk) with query
db2 = Chroma(persist_directory="./chroma_db", embedding_function=embeddings_model)
retrieved_docs = db2.similarity_search(query)
print(retrieved_docs[0].page_content)

Transport, Infrastructure, Housing, Urban Development and Public Works (5.8%), Ministry of Education 
(5.5%) and Ministry of Defense (5.4%). This is as presented in Figure 3.35.
Figure 3.35: Ministries perceived to be most prone to Corruption and Unethical Conduct
3.6.8.2 Government Departments and Agencies Perceived to be most prone to Corruption and 
Unethical Practices
Government Departments and Agencies perceived to be most prone to corruption are the Police


In [13]:
# initialize the retriever
retriever = db2.as_retriever(
    search_type="mmr", #similarity
    search_kwargs={'k': 4}
)

In [23]:
# Create a tokenizer object by loading the pretrained "Intel/dynamic_tinybert" tokenizer.
tokenizer = AutoTokenizer.from_pretrained("Intel/dynamic_tinybert")

# Create a question-answering model object by loading the pretrained "Intel/dynamic_tinybert" model.
model = AutoModelForQuestionAnswering.from_pretrained("Intel/dynamic_tinybert")

In [24]:
# Specify the model name you want to use
model_name = "Intel/dynamic_tinybert"

# Load the tokenizer associated with the specified model
tokenizer = AutoTokenizer.from_pretrained(model_name, padding=True, truncation=True, max_length=512)

# Define a question-answering pipeline using the model and tokenizer
question_answer = pipeline(
    "question-answering", 
    model=model_name, 
    tokenizer=tokenizer,
)

# Create an instance of the HuggingFacePipeline, which wraps the question-answering pipeline
# with additional model-specific arguments (temperature and max_length)
llm = HuggingFacePipeline(
    pipeline=question_answer,
    model_kwargs={"temperature": 0.7, "max_length": 512},
)

In [27]:
question = "Percentages of corruption in various ministries?"

def err_remove(er):
    lin = "------------"
    er = str(er)
    start_index = er.find(lin) + len(lin)
    end_index = er.rfind(lin)
    answer = er[start_index:end_index].strip()
    return answer

try:
    # Use the pipeline directly for question answering
    context = " ".join([doc.page_content for doc in retrieved_docs])
    result = question_answer(question=question, context=context)
    answer = result['answer']
    print("Answer:", answer)  # Print the answer if successful
except Exception as error:
    answer = err_remove(error)
    print("Error Answer:", answer)  # Print the error answer if an exception occurs


Answer: 5.8%
