In [11]:
! pip install langchain-community
! pip install chromadb
! pip install pypdf
! pip install sentence-transformers

Collecting langchain-community
  Downloading langchain_community-0.2.15-py3-none-any.whl.metadata (2.7 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting langchain<0.3.0,>=0.2.15 (from langchain-community)
  Downloading langchain-0.2.15-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.3.0,>=0.2.37 (from langchain-community)
  Downloading langchain_core-0.2.37-py3-none-any.whl.metadata (6.2 kB)
Collecting langsmith<0.2.0,>=0.1.0 (from langchain-community)
  Downloading langsmith-0.1.108-py3-none-any.whl.metadata (13 kB)
Collecting tenacity!=8.4.0,<9.0.0,>=8.1.0 (from langchain-community)
  Downloading tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.22.0-py3-none-any.whl.metadata (7.2 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,

## Embedder

In [53]:
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
import subprocess

In [54]:
def clone_repo(repo_url, destination=None):
  """Run `git clone` for `repo_url`.

  Args:
    repo_url: URL (or path) handed to `git clone`.
    destination: Optional target directory; when falsy (None or ""), no
      destination argument is passed to git.

  Raises:
    subprocess.CalledProcessError: if git exits with a non-zero status
      (because the command is run with check=True).
  """
  # Only append the destination when one was actually supplied.
  target_args = [destination] if destination else []
  subprocess.run(["git", "clone", repo_url, *target_args], check=True)
def get_all_file_paths(directory):
  """Return the paths of every file found under `directory`, recursively.

  Each path is the walked folder joined with the file name, so results are
  relative or absolute exactly as `directory` is. Directories themselves are
  not included in the result.
  """
  return [
    os.path.join(parent, name)
    for parent, _, names in os.walk(directory)
    for name in names
  ]

In [55]:
def filter_files(directories, files):
  """Drop every path that lies inside one of the excluded directories.

  Args:
    directories: Directory names (e.g. ".git") whose contents must be excluded.
    files: Iterable of file paths to filter.

  Returns:
    A list with the paths from `files` (original order, original strings) that
    are not located inside any directory named in `directories`.
  """
  def should_keep(file_path):
    # Compare on a forward-slash form so paths built with os.path.join on
    # Windows are matched too, and anchor with a leading "/" so a relative
    # path that starts directly with an excluded directory (".git/config")
    # is also recognised.
    normalized = "/" + file_path.replace(os.sep, "/")
    return not any(f"/{name}/" in normalized for name in directories)
  return [path for path in files if should_keep(path)]

In [56]:
class Embedder:
  """Load files from a set of directories, split them into chunks and store
  their embeddings in a Chroma database persisted at `db_name`.

  Args:
    directories: Directories that are walked recursively for files to load.
    directories_filter: Directory names whose contents are skipped
      (passed to `filter_files`).
    db_name: Path used as Chroma's `persist_directory`.
    refresh_db: When True, `load_files()` is run from the constructor.
  """

  # Lower-case extensions (without the dot) that are skipped instead of being
  # read as text.
  _SKIPPED_EXTENSIONS = ("png", "jpg", "jpeg", "exe", "bat")

  def __init__(self, directories, directories_filter, db_name, refresh_db=False):
    self.directories = directories
    self.directories_filter = directories_filter
    self.db_name    = db_name
    self.docs       = []
    # Populated by load_files(); stays None until the store has been built.
    self.vectorstore = None
    self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    self.embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", model_kwargs={'device' : 'cpu'})
    if refresh_db: self.load_files()
    print("Done Setup")

  def load_files(self):
    """Load every configured directory, chunk the documents and embed them.

    The resulting vector store is kept on `self.vectorstore`.
    """
    # Start from an empty list so calling load_files() again does not embed
    # the documents collected by a previous call a second time.
    self.docs = []
    for directory in self.directories: self.load_directory(directory)
    chunks = self.text_splitter.split_documents(self.docs)
    # NOTE(review): from_documents appears to add to an existing store at
    # persist_directory rather than replace it - confirm, and delete db_name
    # beforehand if a true rebuild is required.
    self.vectorstore = Chroma.from_documents(chunks, self.embedding_function, persist_directory=self.db_name)
    print("Done file load")

  def load_directory(self, directory_path):
    """Append the documents of every non-filtered file under `directory_path`
    to `self.docs`.

    PDFs go through PyPDFLoader, extensions in `_SKIPPED_EXTENSIONS` are
    ignored, and everything else is read with TextLoader.
    """
    file_list = get_all_file_paths(directory_path)
    file_list = filter_files(self.directories_filter, file_list)
    for f in file_list:
      extension = self._file_extension(f)
      print(f)
      if extension == "pdf": self.docs.extend(PyPDFLoader(f).load())
      elif extension in self._SKIPPED_EXTENSIONS: continue
      else: self.docs.extend(TextLoader(f).load())

  @staticmethod
  def _file_extension(file_path):
    """Return the lower-cased extension of `file_path` without the dot ("" if none)."""
    # os.path.splitext is separator-agnostic and does not mistake an
    # extension-less file called "pdf" or "exe" for a file of that type;
    # lower-casing makes "IMG.PNG" match the skip list.
    return os.path.splitext(file_path)[1].lstrip(".").lower()

In [57]:
# Source directories to index, and the directory names to leave out of the index.
directories = ["./fitter", "./input_files"]
directories_filter = [".git", ".github"]
# refresh_db=True makes the constructor load, chunk and embed the files right away.
emb = Embedder(
  directories=directories,
  directories_filter=directories_filter,
  db_name="./new_db",
  refresh_db=True,
)

./fitter/pyproject.toml
./fitter/poetry.lock
./fitter/.pre-commit-config.yaml
./fitter/.readthedocs.yml
./fitter/README.rst
./fitter/LICENSE
./fitter/src/fitter/main.py
./fitter/src/fitter/fitter.py
./fitter/src/fitter/histfit.py
./fitter/src/fitter/__init__.py
./fitter/test/test_main.py
./fitter/test/test_histfit.py
./fitter/test/test_fitter.py
./fitter/test/__init__.py
./fitter/doc/index.rst
./fitter/doc/data.csv
./fitter/doc/faqs.rst
./fitter/doc/contrib.rst
./fitter/doc/Makefile
./fitter/doc/references.rst
./fitter/doc/conf.py
./fitter/doc/requirements.txt
./fitter/doc/tuto.rst
./fitter/doc/_static/fitter_680x680.png
./fitter/doc/_static/fitter_64x64.png
./fitter/doc/_static/fitter_256x256.png
./fitter/doc/source/conf.py
./fitter/doc/source/_static/copybutton.js
./input_files/srs_doc.pdf
Done file load
Done Setup


## RAG

In [58]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
import google.generativeai as genai

In [60]:
# Read the Gemini API key from the environment instead of hardcoding it in the
# notebook. Raises KeyError if GEMINI_API_KEY is not set.
# NOTE: any key that was ever saved in this notebook should be treated as leaked
# and revoked/rotated.
GEMINI_API_KEY = os.environ["GEMINI_API_KEY"]
genai.configure(api_key=GEMINI_API_KEY)
model = genai.GenerativeModel(model_name="gemini-1.5-flash")

In [67]:
class RAGHandler:
  """Answer questions with a generative model, using context retrieved from a
  persisted Chroma vector store.

  Args:
    model: Object exposing `generate_content(prompt)` whose result has a `.text`
      attribute.
    db_name: Path of the Chroma `persist_directory` to query.
  """
  def __init__(self, model, db_name):
    self.model = model
    embedding_function = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2", model_kwargs={'device' : 'cpu'})
    self.vector_db = Chroma(persist_directory=db_name, embedding_function=embedding_function)
  def _generate_rag_prompt(self, query, context):
    """Build the prompt sent to the model from the user question and the
    retrieved context.

    Quotes are stripped from the context and newlines are replaced by spaces
    because the context is embedded between single quotes in the prompt.
    """
    escaped = context.replace("'", "").replace('"', '').replace("\n", " ")
    # The sanitised text (not the raw context) is what gets interpolated below.
    # NOTE(review): the prompt wording contains typos ("comprenhensive",
    # "talinkg"); it is left untouched here to keep the prompt text stable.
    prompt = (f"""
    You are a helpful and informative bot that answers questions using text from reference context included below. \
    Be sure to respond in a complete sentence, being comprenhensive, including all relevant background information. \
    However, you are talinkg to a non-technical audience, so be sure to break down complicated concepts and \
    strike friendly and conversational tone. \
    If the context is irrelevant to the answer, you may ignore it.

    Each context information will have some metadata at the end of the object, \
    please add the references of where can the user find this information, based \
    on the metadata.

    USER QUESTION: '{query}'
    CONTEXT: '{escaped}'

    ANSWER:
    """)
    return prompt
  def _get_relevant_context_from_db(self, query):
    """Return the text of the 6 most similar chunks, each followed by its
    metadata on its own line. The metadata of every hit is also printed."""
    context = ""
    search_results = self.vector_db.similarity_search(query, k=6)
    for result in search_results:
      context += result.page_content + "\n"
      # Metadata is appended so the model can cite where the text came from.
      context += f"{result.metadata}" + "\n"
      print(f"{result.metadata}")
    return context
  def _generate_answer(self, prompt):
    """Send `prompt` to the model and return the text of its response."""
    answer = self.model.generate_content(prompt)
    return answer.text
  def query(self, query):
    """Retrieve context for `query`, build the prompt and return the model's answer text."""
    context = self._get_relevant_context_from_db(query)
    prompt  = self._generate_rag_prompt(query, context)
    answer  = self._generate_answer(prompt)
    return answer

In [68]:
# Open the vector store persisted at ./new_db and bind it to the Gemini model.
rag_handler = RAGHandler(model=model, db_name="./new_db")

In [71]:
# NOTE: despite its name, `query` receives the generated answer text returned by
# RAGHandler.query(), not the question that was asked.
query = rag_handler.query("Does the _update_data_pdf function defines any paramemter?")

{'source': './fitter/src/fitter/fitter.py'}
{'source': './fitter/src/fitter/histfit.py'}
{'source': './fitter/src/fitter/fitter.py'}
{'source': './fitter/src/fitter/fitter.py'}
{'source': './fitter/src/fitter/fitter.py'}
{'source': './fitter/src/fitter/fitter.py'}


In [72]:
# `query` holds the answer text produced by rag_handler.query() in the cell above.
print(query)

The `_update_data_pdf` function does not explicitly define any parameters. It's a method within a class, and its behavior is defined by the code within its body. This function is responsible for updating the probability density function (PDF) of the data. It does this by using the `np.histogram` function to calculate the histogram of the data, and then it adjusts the X-values to represent the centers of each bin.

