# Building a RAG Application for Tracking Kenya's Corruption Scandals (2010–2024)

## Install Required Libraries

In [2]:
# One-time setup: uncomment the lines below and run this cell to install the
# notebook's dependencies into the active kernel environment, then re-comment
# them so a full re-run does not reinstall every time.
#%pip install google-generativeai
#%pip install langchain-google-genai
#%pip install chromadb
#%pip install langchain-community
#%pip install langchain
# NOTE(review): the import cell also uses `dotenv` (python-dotenv) and
# PyPDFLoader, which presumably needs `pypdf` -- confirm and install if missing:
#%pip install python-dotenv
#%pip install pypdf


## Import Libraries

In [3]:
import os
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import PromptTemplate
from langchain_community.document_loaders import PyPDFLoader
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains.retrieval import create_retrieval_chain
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from dotenv import load_dotenv

  from .autonotebook import tqdm as notebook_tqdm


## Initialize Environment

In [4]:
# Load variables defined in a local .env file into the process environment
load_dotenv()

# Retrieve the Google API key; os.getenv returns None when the variable is unset
google_api_key = os.getenv("GOOGLE_API_KEY")

# Fail fast with an actionable message instead of handing None to the
# Google clients created in the following cells.
if not google_api_key:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set. Add it to your .env file or export it "
        "in the environment before running this notebook."
    )

In [5]:
# Chat model used to generate the final answers
llm = ChatGoogleGenerativeAI(
    model="gemini-pro",
    api_key=google_api_key,
)

In [6]:
# Embedding model used to vectorize the document chunks and incoming queries
embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    api_key=google_api_key,
)

## Load Document

In [7]:
# Path to the source PDF, relative to the notebook's working directory
file_path = "EACC-NATIONAL-SURVEY-REPORT-2023.pdf"

# Read the PDF into a list of Document objects
pdf_loader = PyPDFLoader(file_path=file_path)
docs = pdf_loader.load()

## Chunking or Text Splitting

In [8]:
# Split on natural boundaries into pieces of at most 512 characters, no overlap
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=0)

# Break every loaded page into retrieval-sized chunks
chunks = text_splitter.split_documents(documents=docs)

In [9]:
# Preview only the first few chunks; displaying the whole list floods the
# notebook output and bloats the saved file.
chunks[:3]

[Document(metadata={'source': 'EACC-NATIONAL-SURVEY-REPORT-2023.pdf', 'page': 0}, page_content='National Ethics and Corruption Survey (NECS) 2023\nEACC Research Report No. 15 of December 2023iNATIONAL ETHICS AND \nCORRUPTION SURVEY \n(NECS), 2023\nEVIDENCE FROM \nHOUSEHOLDS IN KENYA\nTuangamize Uﬁsadi, Tuijenge Kenya\nETHICS AND ANTI-CORRUPTION COMMISSION\nEACC Research Report No. 15 of December 2023'),
 Document(metadata={'source': 'EACC-NATIONAL-SURVEY-REPORT-2023.pdf', 'page': 1}, page_content='ii\nNational Ethics and Corruption Survey (NECS) 2023\nEACC Research Report No. 15 of December 2023Ethics and Anti-Corruption Commission (EACC)\nIntegrity Centre\nJakaya Kikwete/Valley Road Junction\nTel: (020) 4997000\nMobile: 0709 781 000; 0730 997 000\nToll Free : 1551\neacc@integrity.go.ke\nDesign and layout by Sharafa\nEmail: nsharafa@gmail.com/nsharafa@yahoo.com'),
 Document(metadata={'source': 'EACC-NATIONAL-SURVEY-REPORT-2023.pdf', 'page': 2}, page_content='National Ethics and Corrupt

## Initialize VectorDB and Retriever

In [10]:
# Embed every chunk and index the vectors in a Chroma store
vectordb = Chroma.from_documents(
    documents=chunks,
    embedding=embeddings,
)

In [11]:
# Expose the Chroma store as a retriever that returns the 5 best-matching chunks
retriever = vectordb.as_retriever(
    search_kwargs=dict(k=5),
)

## Define Retrieval Chain

In [12]:
# Prompt for the answer-generation step. {context} receives the retrieved
# chunks and {input} receives the user's question.
template = (
    "\n"
    "You are a helpful AI assistant.\n"
    "Answer based on the context provided. \n"
    "context: {context}\n"
    "input: {input}\n"
    "answer:\n"
)

# Wrap the raw text so LangChain can fill in the placeholders at run time
prompt = PromptTemplate.from_template(template)

In [13]:
# Chain that stuffs the retrieved documents into the prompt and calls the LLM
combine_docs_chain = create_stuff_documents_chain(
    llm=llm,
    prompt=prompt,
)

# Show the composed runnable for inspection
combine_docs_chain

RunnableBinding(bound=RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableLambda(format_docs)
}), config={'run_name': 'format_inputs'})
| PromptTemplate(input_variables=['context', 'input'], template='\nYou are a helpful AI assistant.\nAnswer based on the context provided. \ncontext: {context}\ninput: {input}\nanswer:\n')
| ChatGoogleGenerativeAI(model='models/gemini-pro', client=<google.ai.generativelanguage_v1beta.services.generative_service.client.GenerativeServiceClient object at 0x000001EA68627C50>, async_client=<google.ai.generativelanguage_v1beta.services.generative_service.async_client.GenerativeServiceAsyncClient object at 0x000001EA6FC56890>, default_metadata=())
| StrOutputParser(), config={'run_name': 'stuff_documents_chain'})

In [14]:
# Full RAG pipeline: fetch relevant chunks, then generate an answer from them
retrieval_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=combine_docs_chain,
)

# Show the composed runnable for inspection
retrieval_chain

RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableBinding(bound=RunnableLambda(lambda x: x['input'])
           | VectorStoreRetriever(tags=['Chroma', 'GoogleGenerativeAIEmbeddings'], vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x000001EA708E46D0>, search_kwargs={'k': 5}), config={'run_name': 'retrieve_documents'})
})
| RunnableAssign(mapper={
    answer: RunnableBinding(bound=RunnableBinding(bound=RunnableAssign(mapper={
              context: RunnableLambda(format_docs)
            }), config={'run_name': 'format_inputs'})
            | PromptTemplate(input_variables=['context', 'input'], template='\nYou are a helpful AI assistant.\nAnswer based on the context provided. \ncontext: {context}\ninput: {input}\nanswer:\n')
            | ChatGoogleGenerativeAI(model='models/gemini-pro', client=<google.ai.generativelanguage_v1beta.services.generative_service.client.GenerativeServiceClient object at 0x000001EA68627C50>, async_client=<google.ai.genera

In [21]:
# Run the question through the retrieval chain
response = retrieval_chain.invoke(dict(input="Report summary"))

# The generated text is stored under the "answer" key of the result
print(response["answer"])

The report is organized into four chapters. Chapter One presents the background to the survey, Chapter Two gives the methodology and Chapter Three presents the findings themed on the objectives of the Survey. Chapter Four contains conclusions and recommendations. The demographic, social and economic characteristics of Survey respondents are provided in the appendices.
