In [2]:
import os
os.environ['GEMINI_API_KEY'] = 'AIzaSyCGsj7XUUDktYTIqS3ITCOIk54oN7OD9dw'

In [7]:
from pypdf import PdfReader
import regex as re

def load_pdf(file_path):

    reader = PdfReader(file_path)

    # Loop over each page and store it in a variable
    text = ""
    for page in reader.pages:
        text += page.extract_text()

    return text

pdf_text = load_pdf(file_path="D:\\subject projects\\RAG\\app\\ai.pdf")

In [8]:
def split_text(text: str):

    split_text = re.split('\n \n', text)
    return [i for i in split_text if i != ""]

chunked_text = split_text(text=pdf_text)

In [13]:
import google.generativeai as genai
from chromadb import Documents, EmbeddingFunction, Embeddings
import os

class GeminiEmbeddingFunction(EmbeddingFunction):

    def __call__(self, input: Documents) -> Embeddings:
        gemini_api_key = os.getenv("GEMINI_API_KEY")
        if not gemini_api_key:
            raise ValueError("Gemini API Key not provided. Please provide GEMINI_API_KEY as an environment variable")
        genai.configure(api_key=gemini_api_key)
        model = "models/embedding-001"
        title = "Custom query"
        return genai.embed_content(model=model,
                                   content=input,
                                   task_type="retrieval_document",
                                   title=title)["embedding"]

In [14]:
import chromadb
from typing import List
def create_chroma_db(documents:List, path:str, name:str):

    chroma_client = chromadb.PersistentClient(path=path)
    db = chroma_client.create_collection(name=name, embedding_function=GeminiEmbeddingFunction())

    for i, d in enumerate(documents):
        db.add(documents=d, ids=str(i))

    return db, name

db,name =create_chroma_db(documents=chunked_text,
                          path="/content",
                          name="rag_experiment")



In [15]:
def load_chroma_collection(path, name):

    chroma_client = chromadb.PersistentClient(path=path)
    db = chroma_client.get_collection(name=name, embedding_function=GeminiEmbeddingFunction())

    return db

db=load_chroma_collection(path="/content", name="rag_experiment")

In [16]:
def get_relevant_passage(query, db, n_results):
  passage = db.query(query_texts=[query], n_results=n_results)['documents'][0]
  return passage

#Example usage
relevant_text = get_relevant_passage(query="Sanctions on Russia",db=db,n_results=3)

Number of requested results 3 is greater than number of elements in index 1, updating n_results = 1


In [32]:
def make_rag_prompt(query, relevant_passage):
  escaped = relevant_passage.replace("'", "").replace('"', "").replace("\n", " ")
  prompt = ("""You are a helpful and informative bot that answers questions using text from the reference passage included below. \
  Be sure to respond in a complete sentence, being comprehensive, including all relevant background information. \
  However, you are talking to a non-technical audience, so be sure to break down complicated concepts and \
  strike a friendly and converstional tone. \
  If the passage is irrelevant to the answer, you may ignore it.\
  If there is no suitable passage for the given question repond with suitable respond, aknowladging that you are unable to help
  QUESTION: '{query}'
  PASSAGE: '{relevant_passage}'

  ANSWER:
  """).format(query=query, relevant_passage=escaped)

  return prompt

In [33]:
import google.generativeai as genai
import os

def generate_answer(prompt):
    gemini_api_key = 'AIzaSyCGsj7XUUDktYTIqS3ITCOIk54oN7OD9dw'
    if not gemini_api_key:
        raise ValueError("Gemini API Key not provided. Please provide GEMINI_API_KEY as an environment variable")
    genai.configure(api_key=gemini_api_key)
    model = genai.GenerativeModel('gemini-1.5-flash')
    answer = model.generate_content(prompt)
    return answer.text

In [34]:
def generate_llm_answer(db,query):
    #retrieve top 3 relevant text chunks
    relevant_text = get_relevant_passage(query,db,n_results=3)
    prompt = make_rag_prompt(query,
                             relevant_passage="".join(relevant_text)) # joining the relevant chunks to create a single passage
    answer = generate_answer(prompt)

    return answer

In [35]:
db=load_chroma_collection(path="/content", #replace with path of your persistent directory
                          name="rag_experiment") #replace with the collection name

answer = generate_llm_answer(db,query="what is e Royal Society")
print(answer)

Number of requested results 3 is greater than number of elements in index 1, updating n_results = 1


The Royal Society is the UK’s national academy of sciences. Its main goal, as stated in its founding charters from the 1660s, is to acknowledge, promote, and support excellence in science and to encourage the development and use of science for the benefit of humanity. 



In [36]:
answer = generate_llm_answer(db,query="what is Buddhism")
print(answer)

Number of requested results 3 is greater than number of elements in index 1, updating n_results = 1


I am sorry, but I am unable to answer your question as the provided passage does not contain any information about Buddhism. 

