HGR Env: Python 3.9

IRPEnv: Python 3.11

- Chroma DB


Includes vector search, full-text search, document storage, metadata filtering, and multi-modal retrieval.

Native integration with embedding models from HuggingFace, OpenAI, Google, and more.

Default embeddings are created using the `all-MiniLM-L6-v2` model.

In [None]:
! pip install chromadb

Collecting chromadb
  Using cached chromadb-0.6.3-py3-none-any.whl.metadata (6.8 kB)
Collecting build>=1.0.3 (from chromadb)
  Using cached build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting pydantic>=1.9 (from chromadb)
  Using cached pydantic-2.10.6-py3-none-any.whl.metadata (30 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp311-cp311-win_amd64.whl.metadata (262 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Using cached uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting numpy>=1.22.5 (from chromadb)
  Downloading numpy-2.2.4-cp311-cp311-win_amd64.whl.metadata (60 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Using cached posthog-3.21.0-py2.py3-none-any.whl.metadata (2.9 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.21.0-cp311-cp311-win_amd64.whl.

# Setup Vector Store

In [1]:
import chromadb
from chromadb.config import Settings

# Create a Chroma client that actually persists to disk.
# On Chroma >= 0.4, `chromadb.Client(Settings(persist_directory=...))` builds an
# in-memory client unless persistence is explicitly enabled, so nothing was being
# written to "db/". `PersistentClient` is the documented way to store the
# database files under a folder.
client = chromadb.PersistentClient(path="db/")  # Database files will be saved in this folder

In [2]:
# Fetch the "mmvqa" collection from the Chroma client (get-or-create, so this
# cell can be re-run safely).
collection = client.get_or_create_collection(
    name="mmvqa",
)

In [3]:
# Preview the collection's contents; the output below shows it is still empty.
collection.peek()

{'ids': [],
 'embeddings': array([], dtype=float64),
 'documents': [],
 'uris': None,
 'data': None,
 'metadatas': [],
 'included': [<IncludeEnum.embeddings: 'embeddings'>,
  <IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}

# Setup embedder and LLM

In [11]:
from openai import OpenAI
# Handle for the local OpenAI-compatible server (presumably LM Studio, given the
# placeholder API key and port).
embedding_client = OpenAI(base_url="http://localhost:1234/v1", api_key="lm-studio")

# NOTE(review): rebinding `client` here replaces the Chroma client created in the
# vector-store setup cell, so re-running `client.get_or_create_collection(...)`
# after this cell fails. The alias is kept only so existing references to
# `client` keep working. TODO: migrate them to `embedding_client` and drop it.
client = embedding_client


def get_embedding(text, model="text-embedding-nomic-embed-text-v1.5-embedding"):
    """Return the embedding vector for ``text`` from the local embedding server.

    Args:
        text: String to embed. Newlines are replaced with spaces before the
            request is sent.
        model: Name of the embedding model to request from the server.

    Returns:
        The ``embedding`` field of the first item in the server response.
    """
    text = text.replace("\n", " ")
    # Use the dedicated handle rather than the notebook-global `client`, which is
    # also the name of the Chroma client and can be rebound by re-running cells.
    response = embedding_client.embeddings.create(input=[text], model=model)
    return response.data[0].embedding

In [12]:
# Separate OpenAI-compatible client bound to the same local endpoint.
# Presumably intended for text generation — TODO confirm against later cells.
llm = OpenAI(
    base_url="http://localhost:1234/v1",
    api_key="lm-studio",
)

In [None]:
# Disabled example: a chat-completion request sent through `client`.
# Kept for reference only; nothing in this cell executes.
# completion = client.chat.completions.create(
#   model="mistral-7b-anthropic:2",
#   messages=[
#     {"role": "system", "content": "Always answer in rhymes."},
#     {"role": "user", "content": "Introduce yourself."}
#   ],
#   temperature=0.9,
# )

In [14]:
from langchain_core.prompts import PromptTemplate

In [15]:
# Prompt for retrieval-augmented answering. It has two placeholders:
# {document} (the retrieved context) and {question} (the user query).
# NOTE(review): the template text contains typos ("assitant", "retrived",
# "documenets"). It is left untouched here because it is runtime text sent to
# the model.
rag_template = """system
You are a useful chat assitant that helps with answering questions based on the retrived documents. If the retrived documenets can not be used to answer the question then say'not enough content'.
    Here is the retrieved documents: \n\n {document} \n\n
    Here is the user question: {question} \n assistant"""

prompt = PromptTemplate(
    input_variables=["question", "document"],
    template=rag_template,
)

# Document Processors

In [4]:
# # Data\pdfs_for_Vanilla_RAG\sci_bk\science G-6 E.pdf
# from langchain_community.document_loaders import PyMuPDFLoader
# # Load the document using PyMuPDFLoader
# loader = PyMuPDFLoader()
from langchain_community.document_loaders import FileSystemBlobLoader
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers import PyMuPDFParser

# Collect every "*.pdf" blob from the science-book folder...
pdf_blobs = FileSystemBlobLoader(
    # path="../../../Data/MMVQA/pdfs/",
    path="../../../Data/pdfs_for_Vanilla_RAG/sci_bk/",
    glob="*.pdf",
)

# ...and hand them to a loader that parses each blob with PyMuPDFParser.
loader = GenericLoader(blob_loader=pdf_blobs, blob_parser=PyMuPDFParser())

In [5]:
# Run the loader, then print the text of the first loaded document as a quick
# sanity check.
docs = loader.load()
first_doc = docs[0]
print(first_doc.page_content)
# pprint.pp(docs[0].metadata)

SCIENCE
Grade 6
Educational Publications Department


In [None]:

# Reuse the documents already loaded into `docs` by the previous cell instead of
# parsing every PDF a second time. A shallow copy keeps `documents` an
# independent list, so appending to or removing from one does not affect the other.
documents = list(docs)

In [None]:
# Add the three sample texts, each tagged with a "source" metadata field.
# Targets `collection`: `collection2` is never defined in this notebook, and the
# query cells that follow read from `collection` and show these ids in their
# output.
# NOTE(review): `student_info`, `club_info` and `university_info` are not defined
# in any visible cell. They must be assigned before this cell is run.
collection.add(
    documents=[student_info, club_info, university_info],
    metadatas=[{"source": "student info"}, {"source": "club info"}, {'source': 'university info'}],
    ids=["id1", "id2", "id3"],
)

In [None]:
# Ask the collection for the two closest matches to one natural-language question.
results = collection.query(
    n_results=2,
    query_texts=["What is the student name?"],
)

results

{'ids': [['id1', 'id2']],
 'embeddings': None,
 'documents': [['\nAlexandra Thompson, a 19-year-old computer science sophomore with a 3.7 GPA,\nis a member of the programming and chess clubs who enjoys pizza, swimming, and hiking\nin her free time in hopes of working at a tech company after graduating from the University of Washington.\n',
   "\nThe university chess club provides an outlet for students to come together and enjoy playing\nthe classic strategy game of chess. Members of all skill levels are welcome, from beginners learning\nthe rules to experienced tournament players. The club typically meets a few times per week to play casual games,\nparticipate in tournaments, analyze famous chess matches, and improve members' skills.\n"]],
 'uris': None,
 'data': None,
 'metadatas': [[{'source': 'student info'}, {'source': 'club info'}]],
 'distances': [[1.2946667671203613, 1.3954033851623535]],
 'included': [<IncludeEnum.distances: 'distances'>,
  <IncludeEnum.documents: 'documents'>

In [None]:
# Same question, now restricted by a metadata filter. None of the sources used
# when adding documents is 'just info', and the output below is empty.
metadata_filter = {'source' : 'just info'}
results = collection.query(
    query_texts=["What is the student name?"],
    n_results=2,
    where=metadata_filter,
)

results

{'ids': [[]],
 'embeddings': None,
 'documents': [[]],
 'uris': None,
 'data': None,
 'metadatas': [[]],
 'distances': [[]],
 'included': [<IncludeEnum.distances: 'distances'>,
  <IncludeEnum.documents: 'documents'>,
  <IncludeEnum.metadatas: 'metadatas'>]}