In [None]:
! pip install langchain-community
! pip install chromadb
! pip install pypdf
! pip install sentence-transformers

## Embedder

In [3]:
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

In [41]:
! rm -r chroma_db_nccn

In [27]:
! ls -a input_files
! rm -r input_files/.ipynb_checkpoints/
! ls -a input_files

.  ..  Resume.pdf
rm: cannot remove 'input_files/.ipynb_checkpoints/': No such file or directory
.  ..  Resume.pdf


In [4]:
class Embedder:
  """Builds a persisted Chroma vector index from the PDF files in a directory.

  Args:
    files_path: Directory that contains the PDF files to index. A trailing
      path separator is optional.
    db_name: Directory passed to Chroma as ``persist_directory``.
    refresh_db: When True, the PDFs are loaded, split and indexed immediately
      during construction. When False, only the splitter and embedding
      function are created.

  Attributes:
    docs: Page-level documents returned by ``PyPDFLoader`` (before splitting).
    vectorstore: The Chroma store created by ``_load_files``, or None if the
      index has not been built by this instance.
  """
  def __init__(self, files_path, db_name, refresh_db=False):
    self.files_path = files_path
    self.db_name    = db_name
    self.docs       = []
    self.vectorstore = None
    # 1000-character chunks with a 100-character overlap between neighbours.
    self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    self.embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", model_kwargs={'device' : 'cpu'})
    if refresh_db: self._load_files()

  def _load_files(self):
    """Load every PDF in ``files_path``, split it into chunks and index the chunks."""
    # sorted() makes the indexing order deterministic; os.listdir order is arbitrary.
    for file_name in sorted(os.listdir(self.files_path)):
      # os.path.join works whether or not files_path ends with a separator.
      file_path = os.path.join(self.files_path, file_name)
      # Skip sub-directories (e.g. .ipynb_checkpoints) and non-PDF files, which
      # PyPDFLoader cannot read.
      if not os.path.isfile(file_path) or not file_name.lower().endswith(".pdf"):
        continue
      self.docs.extend(PyPDFLoader(file_path).load())
    chunks = self.text_splitter.split_documents(self.docs)
    # Keep a handle on the store instead of discarding it in a local variable.
    self.vectorstore = Chroma.from_documents(chunks, self.embedding_function, persist_directory=self.db_name)

In [5]:
# Build (refresh) the index from ./input_files/ into ./chroma_db_nccn.
embedder = Embedder(
    files_path="./input_files/",
    db_name='./chroma_db_nccn',
    refresh_db=True,
)

  self.embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", model_kwargs={'device' : 'cpu'})
  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


## RAG

In [6]:
import os

import google.generativeai as genai
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

In [7]:
# Read the API key from the environment so it is never committed with the
# notebook; falls back to an empty string when GEMINI_API_KEY is not set.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
genai.configure(api_key=GEMINI_API_KEY)
model = genai.GenerativeModel(model_name="gemini-1.5-flash")

In [14]:
class RAGHandler:
  """Answers questions with a generative model, grounded in passages retrieved from a Chroma index.

  Args:
    model: Object exposing ``generate_content(prompt)`` whose result has a
      ``.text`` attribute (the notebook passes a ``genai.GenerativeModel``).
    db_path: Directory of the persisted Chroma index to query.
    k: Number of passages to retrieve for each question.
  """
  def __init__(self, model, db_path='./chroma_db_nccn', k=6):
    self.model = model
    self.k = k
    # NOTE(review): this should be the same embedding model the index was built
    # with, so that query vectors and stored vectors are comparable.
    embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", model_kwargs={'device' : 'cpu'})
    self.vector_db = Chroma(persist_directory=db_path, embedding_function=embedding_function)

  def _generate_rag_prompt(self, query, context):
    """Return the full prompt text for ``query`` with the retrieved ``context`` embedded.

    Quotes are stripped from the context and newlines are flattened to spaces so
    the context cannot terminate the single-quoted CONTEXT field early.
    """
    escaped = context.replace("'", "").replace('"', '').replace("\n", " ")
    # The sanitized text is what gets interpolated; previously `escaped` was
    # computed but the raw context was inserted instead.
    prompt = (f"""
    You are a helpful and informative bot that answers questions using text from reference context included below. \
    Be sure to respond in a complete sentence, being comprenhensive, including all relevant background information. \
    However, you are talinkg to a non-technical audience, so be sure to break down complicated concepts and \
    strike friendly and conversational tone. \
    If the context is irrelevant to the answer, you may ignore it.

    Each context information will have some metadata at the end of the object, \
    please add the references of where can the user find this information, based \
    on the metadata.

    USER QUESTION: '{query}'
    CONTEXT: '{escaped}'

    ANSWER:
    """)
    return prompt

  def _get_relevant_context_from_db(self, query):
    """Retrieve the ``k`` most similar passages and return them as one string.

    Each passage contributes its text on one line followed by its metadata on
    the next line. The metadata of every hit is also printed.
    """
    parts = []
    for result in self.vector_db.similarity_search(query, k=self.k):
      # Echo where each hit came from so the notebook shows the sources used.
      print(f"{result.metadata}")
      parts.append(f"{result.page_content}\n{result.metadata}\n")
    return "".join(parts)

  def _generate_answer(self, prompt):
    """Send ``prompt`` to the model and return the text of its response."""
    answer = self.model.generate_content(prompt)
    return answer.text

  def query(self, query):
    """Answer ``query``: retrieve context, build the prompt, and return the model's text."""
    context = self._get_relevant_context_from_db(query)
    prompt  = self._generate_rag_prompt(query, context)
    answer  = self._generate_answer(prompt)
    return answer

In [15]:
# Wire the configured Gemini model into the retrieval-augmented handler.
rag_handler = RAGHandler(model=model)

In [16]:
# Ask a question; the returned answer text is stored in `query`.
query = rag_handler.query(
    query="What's the price to be charged for the email service?"
)

{'page': 34, 'source': './input_files/srs_doc.pdf'}
{'page': 35, 'source': './input_files/srs_doc.pdf'}
{'page': 29, 'source': './input_files/srs_doc.pdf'}
{'page': 35, 'source': './input_files/srs_doc.pdf'}
{'page': 14, 'source': './input_files/srs_doc.pdf'}
{'page': 24, 'source': './input_files/srs_doc.pdf'}


In [17]:
# `query` holds the answer text returned by rag_handler.query(...), not the
# question; print() is used so newlines in the answer are rendered.
print(query)

The cost of sending emails through a provider like SendGrid or Mailgun is estimated to be around $15 USD. This is mentioned in the section about "Email Services" in the document. 
 
   [Source: './input_files/srs_doc.pdf', page 35] 

