In [None]:
! pip install langchain-community
! pip install chromadb
! pip install pypdf
! pip install sentence-transformers

## Embedder

In [2]:
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
import subprocess

In [3]:
def clone_repo(repo_url, destination=None):
  """Clone a git repository, skipping the clone when the checkout already exists.

  Args:
    repo_url: URL (or path) of the repository handed to ``git clone``.
    destination: Optional target directory. When omitted, the directory name
      is derived the way git does it: the last component of the URL without
      a trailing ``.git``.

  Raises:
    subprocess.CalledProcessError: If ``git clone`` exits with a non-zero status.
  """
  # Work out where the checkout lands so that a re-run can detect it.
  target = destination
  if not target:
    tail = repo_url.rstrip("/").replace(":", "/").rsplit("/", 1)[-1]
    target = tail[:-len(".git")] if tail.endswith(".git") else tail
  # git refuses to clone into an existing non-empty directory; treat that as
  # "already cloned" so the cell stays re-runnable instead of raising.
  if target and os.path.isdir(target) and os.listdir(target):
    print(f"Skipping clone: '{target}' already exists")
    return
  command = ["git", "clone", repo_url]
  if destination: command.append(destination)
  subprocess.run(command, check=True)
def get_all_file_paths(directory):
  """Return the path of every file found under `directory`, recursively.

  Each entry is the walked folder joined with the file name, in os.walk order.
  """
  return [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk(directory)
    for name in names
  ]

In [4]:
def filter_files(directories, files):
  """Drop every path that lies inside one of the excluded directories.

  Args:
    directories: Directory names to exclude (e.g. ".git").
    files: Iterable of file paths to screen.

  Returns:
    A list with the paths from `files`, in their original order, that do not
    contain any excluded name as a directory component.
  """
  markers = [f"/{name}/" for name in directories]
  def should_keep(file_path):
    # Normalise OS-native separators to "/" and add a leading "/" so that an
    # excluded directory is also recognised as the first path component.
    probe = "/" + file_path.replace(os.sep, "/")
    return not any(marker in probe for marker in markers)
  return [path for path in files if should_keep(path)]

In [7]:
# Fetch the repository whose files are indexed from "./fitter" further down.
FITTER_REPO_URL = "https://github.com/cokelaer/fitter"
clone_repo(FITTER_REPO_URL)

In [8]:
class Embedder:
  """Loads files from directories, splits them into chunks and embeds them into a Chroma store.

  Args:
    directories: Directories whose files are loaded.
    directories_filter: Directory names to exclude; passed to filter_files.
    db_name: Value passed to Chroma as persist_directory.
    refresh_db: When true, load and embed all files during construction.
  """
  # Extensions that are skipped outright instead of being read as text.
  SKIPPED_EXTENSIONS = frozenset({"png", "jpg", "jpeg", "exe", "bat"})

  def __init__(self, directories, directories_filter, db_name, refresh_db=False):
    self.directories = directories
    self.directories_filter = directories_filter
    self.db_name    = db_name
    self.docs       = []
    self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    self.embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", model_kwargs={'device' : 'cpu'})
    if refresh_db: self.load_files()
    print("Done Setup")

  def load_files(self):
    """Load every configured directory, split the documents and embed the chunks into Chroma."""
    for directory in self.directories: self.load_directory(directory)
    chunks = self.text_splitter.split_documents(self.docs)
    # Only the side effect of building the store (persist_directory=self.db_name)
    # is wanted here; the returned store object is not used.
    Chroma.from_documents(chunks, self.embedding_function, persist_directory=self.db_name)
    print("Done file load")

  def load_directory(self, directory_path):
    """Append the documents of every non-filtered file under `directory_path` to `self.docs`."""
    file_list = get_all_file_paths(directory_path)
    file_list = filter_files(self.directories_filter, file_list)
    for f in file_list:
      extension = self._file_extension(f)
      print(f)
      if extension == "pdf": self.docs.extend(PyPDFLoader(f).load())
      elif extension in self.SKIPPED_EXTENSIONS: continue
      else: self.docs.extend(TextLoader(f).load())

  @staticmethod
  def _file_extension(path):
    """Return the lower-cased text after the last dot of the file name.

    A name without a dot is returned whole (lower-cased). Lower-casing makes
    the extension checks case-insensitive, e.g. "Logo.PNG" is treated as "png".
    """
    return os.path.basename(path).rsplit(".", 1)[-1].lower()

In [9]:
# Index the cloned repository and the local ./input_files folder, leaving out
# the git / GitHub metadata directories.
directories = ["./fitter", "./input_files"]
directories_filter = [".git", ".github"]
emb = Embedder(
  directories=directories,
  directories_filter=directories_filter,
  db_name="./new_db",
  refresh_db=True,  # load and embed the files during construction
)



./fitter/.readthedocs.yml
./fitter/.pre-commit-config.yaml
./fitter/pyproject.toml
./fitter/LICENSE
./fitter/README.rst
./fitter/poetry.lock
./fitter/test/__init__.py
./fitter/test/test_histfit.py
./fitter/test/test_fitter.py
./fitter/test/test_main.py
./fitter/doc/Makefile
./fitter/doc/tuto.rst
./fitter/doc/references.rst
./fitter/doc/index.rst
./fitter/doc/contrib.rst
./fitter/doc/data.csv
./fitter/doc/faqs.rst
./fitter/doc/conf.py
./fitter/doc/requirements.txt
./fitter/doc/_static/fitter_256x256.png
./fitter/doc/_static/fitter_680x680.png
./fitter/doc/_static/fitter_64x64.png
./fitter/doc/source/conf.py
./fitter/doc/source/_static/copybutton.js
./fitter/src/fitter/main.py
./fitter/src/fitter/__init__.py
./fitter/src/fitter/fitter.py
./fitter/src/fitter/histfit.py
./input_files/Calypso_101.md
./input_files/Calypso_102.md
./input_files/Calypso_104.md
./input_files/Calypso_103.md
Done file load
Done Setup


## RAG

In [28]:
import json
import os
from getpass import getpass

import google.generativeai as genai
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma

In [15]:
# Never store API keys in the notebook itself: read the key from the
# environment and fall back to an interactive prompt that does not echo it.
# Any key that was ever saved in a shared notebook should be revoked and replaced.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY") or getpass("Gemini API key: ")
genai.configure(api_key=GEMINI_API_KEY)
model = genai.GenerativeModel(model_name="gemini-1.5-flash")

In [29]:
class RAGHandler:
  """Answers questions with a generative model, grounded in documents retrieved from a Chroma store.

  Args:
    model: Object exposing ``generate_content(prompt)`` whose result has a ``text`` attribute.
    db_name: Value passed to Chroma as persist_directory (the store to query).
  """
  def __init__(self, model, db_name):
    self.model = model
    embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", model_kwargs={'device' : 'cpu'})
    self.vector_db = Chroma(persist_directory=db_name, embedding_function=embedding_function)

  def _generate_rag_prompt(self, query, context):
    """Build the instruction prompt that embeds the user question and the retrieved context."""
    # The context is inserted verbatim so retrieved snippets keep their quotes
    # and line breaks. Doubled braces render as literal { and } in the f-string.
    prompt = (f"""
    You are a helpful and informative bot that answers questions using text from reference context included below. \
    Be sure to respond in a complete sentence, being comprenhensive, including all relevant background information. \
    However, you are talinkg to a non-technical audience, so be sure to break down complicated concepts and \
    strike friendly and conversational tone. \
    If the context is irrelevant to the answer, you may ignore it.

    USER QUESTION: '{query}'
    CONTEXT: '{context}'

    ANSWER:
    Each context information will have some metadata at the end of the object. \
    If the information from given file is relevant, use the metadata to refence \
    by adding the source to a list in the sources list. Use the following format, \
    as a json object:

    {{
      answer : <your response> // string
      sources : [
        {{
          file_name: <Name of file>, // string
          path: <path to source> // string
        }}
      ] // list of path sources
    }}
    """)
    return prompt

  def _get_relevant_context_from_db(self, query, k=6):
    """Return the `k` most similar documents as text: each page content followed by its metadata, one per line."""
    search_results = self.vector_db.similarity_search(query, k=k)
    pieces = []
    for result in search_results:
      pieces.append(result.page_content + "\n")
      pieces.append(f"{result.metadata}" + "\n")
    return "".join(pieces)

  def _generate_answer(self, prompt):
    """Send the prompt to the model and return the text of its response."""
    answer = self.model.generate_content(prompt)
    return answer.text

  @staticmethod
  def _parse_json_answer(text):
    """Parse the model reply as JSON, tolerating text wrapped around the JSON object.

    The reply is first parsed as-is. If that fails (for example because the
    JSON is wrapped in a Markdown code fence), the span from the first "{" to
    the last "}" is parsed instead.

    Raises:
      json.JSONDecodeError: If neither attempt yields valid JSON.
    """
    try:
      return json.loads(text)
    except json.JSONDecodeError:
      start, end = text.find("{"), text.rfind("}")
      if start == -1 or end <= start:
        raise
      return json.loads(text[start:end + 1])

  def query(self, query):
    """Retrieve context for `query`, ask the model and return its parsed JSON answer.

    Raises:
      json.JSONDecodeError: If the model reply contains no valid JSON.
    """
    context = self._get_relevant_context_from_db(query)
    prompt  = self._generate_rag_prompt(query, context)
    answer  = self._generate_answer(prompt)
    return self._parse_json_answer(answer)

In [30]:
# Point the handler at the same store directory the Embedder was given ("./new_db").
rag_handler = RAGHandler(model=model, db_name="./new_db")



In [31]:
question = "According to the documentation, what's the main purpose of the standard model?"
query = rag_handler.query(question)

In [32]:
# Show the whole parsed response returned by rag_handler.query.
print(query)

{'answer': "The Calypso Standard Reference Model (CSRM) is a pre-configured, modular, and extensible data model used by Calypso's platform. It's designed to support trading and risk management for various financial instruments. The CSRM provides a standard way to represent financial instruments, making it easier to integrate with other systems and reducing the need for custom development. It also includes pre-built workflows, business rules, and interfaces for managing trades, valuations, and risk across different asset classes like fixed income, equities, and commodities.", 'sources': [{'file_name': 'Calypso_104.md', 'path': './input_files/Calypso_104.md'}]}


In [33]:
# Show only the "answer" entry of the parsed response.
query["answer"]

"The Calypso Standard Reference Model (CSRM) is a pre-configured, modular, and extensible data model used by Calypso's platform. It's designed to support trading and risk management for various financial instruments. The CSRM provides a standard way to represent financial instruments, making it easier to integrate with other systems and reducing the need for custom development. It also includes pre-built workflows, business rules, and interfaces for managing trades, valuations, and risk across different asset classes like fixed income, equities, and commodities."