In [8]:
! pip install langchain-community
! pip install chromadb
! pip install pypdf
! pip install sentence-transformers



## Embedder

In [9]:
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [10]:
import os

In [11]:
class Embedder:
  """Builds a persistent Chroma vector store from the PDF files in a directory.

  Each PDF is read with ``PyPDFLoader``, split by a
  ``RecursiveCharacterTextSplitter`` (chunk_size=1000, chunk_overlap=100),
  embedded with ``sentence-transformers/all-MiniLM-L6-v2`` on CPU and stored
  through ``Chroma.from_documents`` under ``db_name``.

  Args:
    files_path: Directory that contains the PDF files to index.
    db_name: Directory handed to Chroma as ``persist_directory``.
    refresh_db: When true, the files are loaded and indexed during construction.

  Attributes:
    docs: Documents returned by the PDF loader during the last ``_load_files`` run.
    vectorstore: Store returned by ``Chroma.from_documents``; ``None`` until
      ``_load_files`` has run.
  """
  def __init__(self, files_path, db_name, refresh_db=False):
    self.files_path = files_path
    self.db_name    = db_name
    self.docs       = []
    self.vectorstore = None
    self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    self.embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", model_kwargs={'device' : 'cpu'})
    if refresh_db: self._load_files()
  def _load_files(self):
    """Load every PDF in ``files_path``, chunk it and write it to the Chroma store.

    Raises:
      ValueError: If no PDF content could be loaded from ``files_path``.
    """
    # Only regular files with a .pdf extension are handed to PyPDFLoader;
    # sorting makes the processing order independent of os.listdir().
    pdf_names = sorted(
      name for name in os.listdir(self.files_path)
      if name.lower().endswith(".pdf")
      and os.path.isfile(os.path.join(self.files_path, name))
    )
    print(pdf_names)
    # Start from an empty list so a second call does not keep the previous run's documents.
    self.docs = []
    for name in pdf_names:
      # os.path.join works whether or not files_path ends with a separator.
      self.docs.extend(PyPDFLoader(os.path.join(self.files_path, name)).load())
    if not self.docs:
      raise ValueError(f"No PDF content found in {self.files_path!r}; nothing to index.")
    chunks = self.text_splitter.split_documents(self.docs)
    # NOTE(review): if db_name already holds a collection, this presumably adds to it
    # rather than replacing it, so repeated refreshes may duplicate chunks - confirm.
    self.vectorstore = Chroma.from_documents(chunks, self.embedding_function, persist_directory=self.db_name)

In [12]:
# Build the embedder and, because refresh_db is true, index the files
# under ./input_files/ into ./chroma_db_nccn right away.
embedder = Embedder(
  files_path="./input_files/",
  db_name='./chroma_db_nccn',
  refresh_db=True,
)

['Resume.pdf', 'srs_doc.pdf']


## RAG

In [13]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
import google.generativeai as genai

In [14]:
# Never store the API key in the notebook. Read it from the GEMINI_API_KEY
# environment variable and fall back to an interactive prompt that does not
# echo the value. Any key that was ever committed in this cell must be
# treated as compromised and revoked.
import os
from getpass import getpass

GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY") or getpass("Gemini API key: ")
genai.configure(api_key=GEMINI_API_KEY)
model = genai.GenerativeModel(model_name="gemini-1.5-flash")

In [16]:
class RAGHandler:
  """Answers questions with retrieval-augmented generation over a Chroma store.

  For each question the most similar chunks are fetched from the persisted
  Chroma database, inserted into a prompt together with the question, and the
  prompt is sent to ``model.generate_content``.

  Args:
    model: Object exposing ``generate_content(prompt)`` whose result has a ``text`` attribute.
    db_path: Chroma ``persist_directory`` to search.
    k: Number of chunks retrieved per question.
  """
  def __init__(self, model, db_path='./chroma_db_nccn', k=6):
    self.model = model
    self.db_path = db_path
    self.k = k
    # Created on the first query and then reused, so the embedding model
    # is not reloaded for every question.
    self._vector_db = None
  def _generate_rag_prompt(self, query, context):
    """Return the prompt text combining the user question and the retrieved context.

    Quotes are removed from the context and newlines are turned into spaces so
    the context stays inside the single-quoted CONTEXT field of the prompt.
    """
    escaped = context.replace("'", "").replace('"', '').replace("\n", " ")
    prompt = (f"""
    You are a helpful and informative bot that answers questions using text from reference context included below. \
    Be sure to respond in a complete sentence, being comprehensive, including all relevant background information. \
    However, you are talking to a non-technical audience, so be sure to break down complicated concepts and \
    strike friendly and conversational tone. \
    If the context is irrelevant to the answer, you may ignore it.

    USER QUESTION: '{query}'
    CONTEXT: '{escaped}'

    ANSWER:
    """)
    return prompt
  def _get_vector_db(self):
    """Return the Chroma store for ``db_path``, opening it on first use."""
    if self._vector_db is None:
      embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", model_kwargs={'device' : 'cpu'})
      self._vector_db = Chroma(persist_directory=self.db_path, embedding_function=embedding_function)
    return self._vector_db
  def _get_relevant_context_from_db(self, query):
    """Return the text of the ``k`` most similar chunks, one chunk per line."""
    search_results = self._get_vector_db().similarity_search(query, k=self.k)
    return "".join(result.page_content + "\n" for result in search_results)
  def _generate_answer(self, prompt):
    """Send the prompt to the model and return the ``text`` of its response."""
    answer = self.model.generate_content(prompt)
    return answer.text
  def query(self, query):
    """Retrieve context for ``query``, build the prompt and return the model's answer text."""
    context = self._get_relevant_context_from_db(query)
    prompt  = self._generate_rag_prompt(query, context)
    answer  = self._generate_answer(prompt)
    return answer

In [17]:
# Wrap the configured Gemini model in the retrieval-augmented handler.
rag_handler = RAGHandler(model=model)

In [18]:
# NOTE: despite its name, `query` receives the generated answer text.
question = "Whats the cost of the mail service?"
query = rag_handler.query(question)

  vector_db = Chroma(persist_directory='./chroma_db_nccn', embedding_function=embedding_function)


In [19]:
# `query` holds the answer string returned by rag_handler.query(...), not the question.
print(query)

The cost of the mail service, also known as email services, is estimated to be around $15 USD. 

