In [None]:
! pip install langchain-community
! pip install chromadb
! pip install pypdf
! pip install sentence-transformers

## Embedder

In [1]:
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
import subprocess

In [2]:
def clone_repo(repo_url, destination=None):
  """Clone a git repository by running ``git clone`` as a subprocess.

  Args:
    repo_url: URL (or path) of the repository to clone.
    destination: Optional target directory; when falsy, git picks its default.

  Raises:
    subprocess.CalledProcessError: if ``git clone`` exits with a non-zero status.
  """
  target_args = [destination] if destination else []
  subprocess.run(["git", "clone", repo_url, *target_args], check=True)
def get_all_file_paths(directory):
  """Return the path of every file found below ``directory``.

  The tree is traversed with ``os.walk``; each returned path is the walked
  folder joined with the file name, so it keeps ``directory`` as its prefix.
  """
  return [
    os.path.join(parent, name)
    for parent, _subdirs, names in os.walk(directory)
    for name in names
  ]

In [3]:
def filter_files(directories, files):
  """Drop every path that lies inside one of the excluded directories.

  Args:
    directories: Directory names (e.g. ".git") whose contents are excluded.
      Leading/trailing slashes are ignored; empty names are skipped.
    files: Iterable of file paths to filter.

  Returns:
    A list with the paths from ``files`` that are kept, in their original order.
  """
  # Compare on "/"-separated text so paths built with os.path.join also match
  # on platforms whose native separator is not "/".
  excluded = [name.replace(os.sep, "/").strip("/") for name in directories]
  excluded = [name for name in excluded if name]

  def should_keep(file_path):
    normalized = file_path.replace(os.sep, "/")
    # A leading "/" lets an excluded directory that starts a relative path
    # (".git/config") match the "/<name>/" pattern as well.
    if not normalized.startswith("/"):
      normalized = "/" + normalized
    return not any(f"/{name}/" in normalized for name in excluded)

  return list(filter(should_keep, files))

In [None]:
# Clone only when the checkout is missing: `git clone` fails when ./fitter
# already exists, which would break re-running the notebook from the top.
if not os.path.isdir("./fitter"):
  clone_repo("https://github.com/cokelaer/fitter")

In [9]:
class Embedder:
  """Loads files from directories, chunks them and stores them in a Chroma DB.

  Files are read with PyPDFLoader (pdf) or TextLoader (everything else that is
  not skipped), tagged with ``metadata``, split into 1000-character chunks with
  100 characters of overlap and embedded with all-MiniLM-L6-v2 on the CPU.

  Args:
    directories: Directories whose files are loaded.
    directories_filter: Directory names passed to ``filter_files`` to exclude.
    db_name: Directory in which the Chroma database is persisted.
    refresh_db: When true, ``load_files`` runs immediately during construction.
    metadata: Optional dict merged into the metadata of every loaded document.
  """

  # File extensions (lower case, without dot) that are never read.
  SKIPPED_EXTENSIONS = ("png", "jpg", "jpeg", "exe", "bat")

  def __init__(self, directories, directories_filter, db_name, refresh_db=False, metadata=None):
    self.directories = directories
    self.directories_filter = directories_filter
    self.db_name = db_name
    self.docs = []
    self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    self.embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", model_kwargs={'device' : 'cpu'})
    # Copy instead of sharing a mutable default / the caller's dict.
    self.metadata = dict(metadata) if metadata else {}
    if refresh_db: self.load_files()
    print("Done Setup")

  def load_files(self):
    """Load all configured directories, split the documents and persist their embeddings."""
    # NOTE(review): self.docs is never cleared, so calling load_files() twice on
    # the same instance embeds the earlier documents again.
    for directory in self.directories: self.load_directory(directory)
    chunks = self.text_splitter.split_documents(self.docs)
    Chroma.from_documents(chunks, self.embedding_function, persist_directory=self.db_name)
    print("Done file load")

  def load_directory(self, directory_path):
    """Read every non-filtered file below ``directory_path`` into ``self.docs``."""
    file_list = get_all_file_paths(directory_path)
    file_list = filter_files(self.directories_filter, file_list)
    for path in file_list:
      # splitext + lower() so "IMG.PNG" is skipped too and an extension-less
      # file named e.g. "pdf" is not mistaken for a PDF.
      extension = os.path.splitext(path)[1].lstrip(".").lower()
      if extension in self.SKIPPED_EXTENSIONS: continue
      loader = PyPDFLoader(path) if extension == "pdf" else TextLoader(path)
      documents = loader.load()
      if not documents: continue
      # A loader can return several documents for one file (PyPDFLoader yields
      # one per page); tag all of them so metadata filters find every part.
      for document in documents:
        document.metadata = document.metadata | self.metadata
      print(documents[0].metadata)
      self.docs.extend(documents)

In [10]:
# Index the cloned repository and the local input files, tagging every
# document with the owning user's id; refresh_db=True builds the DB right away.
directories = ["./fitter", "./input_files"]
directories_filter = [".git", ".github"]
metadata = {"user_id" : "asdgw2dsag"}
emb = Embedder(
  directories=directories,
  directories_filter=directories_filter,
  db_name="./new_db",
  refresh_db=True,
  metadata=metadata,
)

  self.embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", model_kwargs={'device' : 'cpu'})
  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


{'source': './fitter/poetry.lock', 'user_id': 'asdgw2dsag'}
{'source': './fitter/.pre-commit-config.yaml', 'user_id': 'asdgw2dsag'}
{'source': './fitter/.readthedocs.yml', 'user_id': 'asdgw2dsag'}
{'source': './fitter/pyproject.toml', 'user_id': 'asdgw2dsag'}
{'source': './fitter/README.rst', 'user_id': 'asdgw2dsag'}
{'source': './fitter/LICENSE', 'user_id': 'asdgw2dsag'}
{'source': './fitter/doc/faqs.rst', 'user_id': 'asdgw2dsag'}
{'source': './fitter/doc/tuto.rst', 'user_id': 'asdgw2dsag'}
{'source': './fitter/doc/Makefile', 'user_id': 'asdgw2dsag'}
{'source': './fitter/doc/contrib.rst', 'user_id': 'asdgw2dsag'}
{'source': './fitter/doc/index.rst', 'user_id': 'asdgw2dsag'}
{'source': './fitter/doc/conf.py', 'user_id': 'asdgw2dsag'}
{'source': './fitter/doc/requirements.txt', 'user_id': 'asdgw2dsag'}
{'source': './fitter/doc/data.csv', 'user_id': 'asdgw2dsag'}
{'source': './fitter/doc/references.rst', 'user_id': 'asdgw2dsag'}
{'source': './fitter/doc/source/conf.py', 'user_id': 'asdgw

## RAG

In [11]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
import google.generativeai as genai
import json

In [12]:
# Read the API key from the environment so it is never stored in the notebook.
# NOTE: the key that was previously hardcoded in this cell has been exposed and
# should be revoked and replaced.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
if not GEMINI_API_KEY:
  raise RuntimeError("Set the GEMINI_API_KEY environment variable before running this cell.")
genai.configure(api_key=GEMINI_API_KEY)
model = genai.GenerativeModel(model_name="gemini-1.5-flash")

In [13]:
class RAGHandler:
  """Answers questions with a generative model using context retrieved from a Chroma DB.

  Args:
    model: Object exposing ``generate_content(prompt)`` whose result has a ``text`` attribute.
    db_name: Directory of the persisted Chroma database to query.
  """

  def __init__(self, model, db_name):
    self.model = model
    embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", model_kwargs={'device' : 'cpu'})
    self.vector_db = Chroma(persist_directory=db_name, embedding_function=embedding_function)

  def _generate_rag_prompt(self, query, context):
    """Build the prompt sent to the model from the user question and the retrieved context."""
    # Quotes and newlines are removed so the context cannot break out of the
    # single-quoted CONTEXT field below.
    escaped = context.replace("'", "").replace('"', '').replace("\n", " ")
    open_br = "{"
    close_br = "}"
    prompt = (f"""
    You are a helpful and informative bot that answers questions using text from reference context included below. \
    Be sure to respond in a complete sentence, being comprenhensive, including all relevant background information. \
    However, you are talinkg to a non-technical audience, so be sure to break down complicated concepts and \
    strike friendly and conversational tone. \
    If the context is irrelevant to the answer, you may ignore it.

    USER QUESTION: '{query}'
    CONTEXT: '{escaped}'

    ANSWER:
    Each context information will have some metadata at the end of the object. \
    If the information from given file is relevant, use the metadata to refence \
    by adding the source to a list in the sources list. Use the following format, \
    as a json object:

    {open_br}
      answer : <your response> // string
      sources : [
        {open_br}
          file_name: <Name of file>, // string
          path: <path to source> // string
        {close_br}
      ] // list of path sources
    {close_br}
    """)
    return prompt

  def _get_relevant_context_from_db(self, query, metadata):
    """Return the 6 most similar chunks, each followed by its metadata, as one string."""
    # An empty/None filter is passed as None, i.e. "no filter".
    search_results = self.vector_db.similarity_search(query, k=6, filter=metadata or None)
    return "".join(f"{result.page_content}\n{result.metadata}\n" for result in search_results)

  def _generate_answer(self, prompt):
    """Send the prompt to the model and return the text of its response."""
    answer = self.model.generate_content(prompt)
    return answer.text

  @staticmethod
  def _parse_json_answer(raw):
    """Parse the model output as JSON, tolerating text around the JSON object.

    Raises:
      json.JSONDecodeError: if no parsable JSON is found.
    """
    try:
      return json.loads(raw)
    except json.JSONDecodeError:
      # The model may wrap the object in markdown code fences or extra prose;
      # retry on the outermost {...} span before giving up.
      start, end = raw.find("{"), raw.rfind("}")
      if start == -1 or end <= start:
        raise
      return json.loads(raw[start:end + 1])

  def query(self, query, metadata=None):
    """Answer ``query`` from the documents matching ``metadata``.

    Args:
      query: The user question.
      metadata: Optional metadata filter for the vector search (e.g. a user id).

    Returns:
      The model's answer decoded from JSON.

    Raises:
      json.JSONDecodeError: if the model output contains no parsable JSON.
    """
    context = self._get_relevant_context_from_db(query, metadata)
    prompt = self._generate_rag_prompt(query, context)
    answer = self._generate_answer(prompt)
    return self._parse_json_answer(answer)

In [14]:
# Open the database persisted by the Embedder and pair it with the Gemini model.
rag_handler = RAGHandler(model=model, db_name="./new_db")

  self.vector_db = Chroma(persist_directory=db_name, embedding_function=embedding_function)


In [15]:
# Restrict retrieval to the chunks tagged with this user's id.
metadata = {"user_id" : "asdgw2dsag"}
query = rag_handler.query(
  "According to the documentation, what's the main purpose of the standard model?",
  metadata=metadata,
)

In [16]:
# `query` holds the value returned by rag_handler.query(...) in the previous cell.
print(query)

{'answer': 'The Calypso Standard Reference Model (CSRM) is a pre-configured and extensible data model that helps standardize how financial instruments are represented. This makes it easier to integrate with other systems and reduces the need for custom development. It also includes tools and frameworks for managing trades, valuations, and risks across various asset classes.', 'sources': [{'file_name': 'Calypso_104.md', 'path': './input_files/Calypso_104.md'}]}
