# ChromaDB testing


test notebook


In [1]:
%pip install python-dotenv

from dotenv import load_dotenv
load_dotenv()


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


True

### Connection


In [36]:
import chromadb


class ChromaDBConnection:
    _instance = None
    _conn = None

    def __new__(cls, *args, **kwargs):
        if not cls._instance:
            cls._instance = super().__new__(cls)
            cls._conn = chromadb.HttpClient(host="localhost", port=8000)
        return cls._instance

    def get_conn(self):
        return self._conn

    def close(self):
        self._conn = None
        ChromaDBConnection._instance = None

In [37]:
conn1 = ChromaDBConnection().get_conn()
conn2 = ChromaDBConnection().get_conn()
conn1 == conn2

True

In [38]:
col = conn1.get_or_create_collection("test-collection")
col

Collection(name=test-collection)

In [None]:
col.add(
    ids=["doc1"],
    embeddings=[[0.1, 0.2, 0.3]],
    documents=["This is a test document."],
    metadatas=[{"source": "unit_test"}],
)

In [11]:
col.get(ids=["doc1"])

{'ids': ['doc1'],
 'embeddings': None,
 'metadatas': [{'source': 'unit_test'}],
 'documents': ['This is a test document.'],
 'data': None,
 'uris': None,
 'included': ['metadatas', 'documents']}

### Using Langchain


In [12]:
%pip install -qU openai langchain-openai langchain-chroma


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [39]:
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

In [None]:
col.add(
    ids=["doc2"],
    embeddings=embeddings.embed_documents(["This is another test document."]),
)

In [None]:
from langchain_chroma import Chroma

chroma_vs = Chroma(
    collection_name="embedding-collection", embedding_function=embeddings, client=conn1
)

In [9]:
from uuid import uuid4

from langchain_core.documents import Document

document_1 = Document(
    page_content="I had chocolate chip pancakes and scrambled eggs for breakfast this morning.",
    metadata={"source": "tweet"},
    id=1,
)

document_2 = Document(
    page_content="The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.",
    metadata={"source": "news"},
    id=2,
)

document_3 = Document(
    page_content="Building an exciting new project with LangChain - come check it out!",
    metadata={"source": "tweet"},
    id=3,
)

document_4 = Document(
    page_content="Robbers broke into the city bank and stole $1 million in cash.",
    metadata={"source": "news"},
    id=4,
)

document_5 = Document(
    page_content="Wow! That was an amazing movie. I can't wait to see it again.",
    metadata={"source": "tweet"},
    id=5,
)

document_6 = Document(
    page_content="Is the new iPhone worth the price? Read this review to find out.",
    metadata={"source": "website"},
    id=6,
)

document_7 = Document(
    page_content="The top 10 soccer players in the world right now.",
    metadata={"source": "website"},
    id=7,
)

document_8 = Document(
    page_content="LangGraph is the best framework for building stateful, agentic applications!",
    metadata={"source": "tweet"},
    id=8,
)

document_9 = Document(
    page_content="The stock market is down 500 points today due to fears of a recession.",
    metadata={"source": "news"},
    id=9,
)

document_10 = Document(
    page_content="I have a bad feeling I am going to get deleted :(",
    metadata={"source": "tweet"},
    id=10,
)

documents = [
    document_1,
    document_2,
    document_3,
    document_4,
    document_5,
    document_6,
    document_7,
    document_8,
    document_9,
    document_10,
]
uuids = [str(uuid4()) for _ in range(len(documents))]

chroma_vs.add_documents(documents=documents, ids=uuids)

['1e703663-126b-48b7-ab63-0f06f4a66b94',
 '309b2dfa-d01b-4cbb-9e0f-d85d96584546',
 'dc70b30e-de2b-4cc2-8e26-107d76712e09',
 '53652966-7fe1-4a61-a7dc-0abfe4f95cc4',
 '09806321-ae9e-44f4-8ffc-0f28985dac30',
 '9de53d1c-8313-4b0f-91db-77853a36be24',
 'bc8cd952-d3c6-4cec-a2df-cf83abb39acd',
 '84d463b7-a5e7-48e4-9b0f-083b31f5d1ac',
 '8c2bbe84-5d83-4787-a62d-ff14efff575c',
 '94e3fe1a-b823-4893-9c17-c2280472ee39']

In [10]:
# chroma_vs.delete(ids=uuids[-1])

In [11]:
results = chroma_vs.similarity_search(
    "LangChain provides abstractions to make working with LLMs easy",
    k=2,
    filter={"source": "tweet"},
)
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")

* Building an exciting new project with LangChain - come check it out! [{'source': 'tweet'}]
* LangGraph is the best framework for building stateful, agentic applications! [{'source': 'tweet'}]


In [12]:
results = chroma_vs.similarity_search_with_score(
    "Will it be hot tomorrow?", k=1, filter={"source": "news"}
)
for res, score in results:
    print(f"* [SIM={score:3f}] {res.page_content} [{res.metadata}]")

* [SIM=0.893770] The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees. [{'source': 'news'}]


In [13]:
results = chroma_vs.similarity_search_by_vector(
    embedding=embeddings.embed_query("I love green eggs and ham!"), k=1
)
for doc in results:
    print(f"* {doc.page_content} [{doc.metadata}]")

* I had chocolate chip pancakes and scrambled eggs for breakfast this morning. [{'source': 'tweet'}]


como retriever


In [None]:
retriever = chroma_vs.as_retriever(
    search_type="mmr", search_kwargs={"k": 1, "fetch_k": 5}
)
retriever.invoke("Stealing from the bank is a crime", filter={"source": "news"})
retriever.

[Document(id='53652966-7fe1-4a61-a7dc-0abfe4f95cc4', metadata={'source': 'news'}, page_content='Robbers broke into the city bank and stole $1 million in cash.')]

### ULIMA testing


In [13]:
%pip install -qU langchain-community pypdf


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [40]:
from langchain_chroma import Chroma

ULIMA_COLLECTION_NAME = "ulima-info-collection"
chroma_vs2 = Chroma(
    collection_name=ULIMA_COLLECTION_NAME, embedding_function=embeddings, client=conn1
)

In [41]:
from langchain_community.document_loaders import PyPDFLoader

file_path = "./guia_matricula.pdf"
loader = PyPDFLoader(file_path)

In [42]:
import pprint

docs = loader.load()

pprint.pp(docs[0].metadata)
[d.page_content for d in docs[:3]]

{'producer': 'Microsoft® Word 2019',
 'creator': 'Microsoft® Word 2019',
 'creationdate': '2025-07-21T17:12:32-05:00',
 'title': 'UNIVERSIDAD DE LIMA',
 'author': 'Ggonzal',
 'moddate': '2025-07-21T17:12:32-05:00',
 'source': './guia_matricula.pdf',
 'total_pages': 4,
 'page': 0,
 'page_label': '1'}


['1 \n \nGUÍA DE INFORMACIÓN DE MATRÍCULA \nPARA INGRESANTES 2025-2 \n \nCALENDARIO \n \nENVÍO ELECTRÓNICO DE MATERIAL DE \nMATRÍCULA Lunes 21 de julio de 2025, 19.00 horas \nFECHA DE VENCIMIENTO DE LA BOLETA \nDE PAGO \nDesde el lunes 21 hasta el viernes 25 de \njulio de 2025, 18.00 horas \nCARGA DE DOCUMENTOS DE MATRÍCULA \nA TRAVÉS DEL PORTAL Mi Ulima \nDesde el martes 22 de julio hasta el \nviernes 1 de agosto de 2025, 18.00 horas \nEVALUACIÓN PSICOLÓGICA EN LÍNEA Jueves 24 y viernes 25 de julio de 2025 \nEVALUACIÓN DE ANTECEDENTES DE \nSALUD \nDesde el miércoles 30 de julio hasta el \nviernes 29 de agosto de 2025 \nMATRÍCULA DE INGRESANTES \nEsta actividad es realizada por la Universidad y \nse te confirmará por correo electrónico \nMartes 5 de agosto de 2025, 19.00 horas \nPUBLICACIÓN DE HORARIOS Martes 12 de agosto de 2025, 11.00 horas \nCEREMONIA DE BIENVENIDA Viernes 15 de agosto de 2025, 11.00 horas \nAuditorio ZUM \nEVALUACIÓN DE INGLÉS(1) \nSolo interesados en estudiar en e

In [44]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=30)
docs = text_splitter.split_documents(docs)
docs

[Document(metadata={'producer': 'Microsoft® Word 2019', 'creator': 'Microsoft® Word 2019', 'creationdate': '2025-07-21T17:12:32-05:00', 'title': 'UNIVERSIDAD DE LIMA', 'author': 'Ggonzal', 'moddate': '2025-07-21T17:12:32-05:00', 'source': './guia_matricula.pdf', 'total_pages': 4, 'page': 0, 'page_label': '1'}, page_content='1 \n \nGUÍA DE INFORMACIÓN DE MATRÍCULA \nPARA INGRESANTES 2025-2 \n \nCALENDARIO \n \nENVÍO ELECTRÓNICO DE MATERIAL DE \nMATRÍCULA Lunes 21 de julio de 2025, 19.00 horas \nFECHA DE VENCIMIENTO DE LA BOLETA \nDE PAGO \nDesde el lunes 21 hasta el viernes 25 de \njulio de 2025, 18.00 horas'),
 Document(metadata={'producer': 'Microsoft® Word 2019', 'creator': 'Microsoft® Word 2019', 'creationdate': '2025-07-21T17:12:32-05:00', 'title': 'UNIVERSIDAD DE LIMA', 'author': 'Ggonzal', 'moddate': '2025-07-21T17:12:32-05:00', 'source': './guia_matricula.pdf', 'total_pages': 4, 'page': 0, 'page_label': '1'}, page_content='CARGA DE DOCUMENTOS DE MATRÍCULA \nA TRAVÉS DEL PORTAL M

In [45]:
chroma_vs2.add_documents(documents=docs)

['6dcadc65-e274-4ae8-b0f7-5fac6429e6ce',
 '2d1b553a-ec0c-4a25-90a8-976f1ab61ce9',
 'f62c955b-3e7e-49a2-82a3-c68ef0b25968',
 'bc365e45-017d-485b-a172-8161444459fb',
 'ebf8ac71-e2d5-4eff-bae0-4c16b5cdc849',
 '5d0f3dec-b7fd-45f0-9bba-2742b01f4ea6',
 '17ca82f0-150a-4417-8331-580575255fbd',
 'f156d63a-53b2-4ee3-8ebc-8c383e00f3f5',
 '6549a3db-c0e0-43c2-b8fd-1628a498f2f7',
 '0a808360-14d9-4deb-992a-cc21687da2bb',
 'b678bc78-819a-4b61-afef-10640272c9b2',
 '4b4684c2-f69c-42fe-8e77-c487c9780aee',
 'bc2a7ca7-1f20-4836-ab2b-79caba799424',
 '0630c716-140c-4790-8e39-f8f8a03ad3be',
 'fb93f1f5-b6ce-411f-98de-6450572dbb9b',
 'b3e60475-c5df-4c1d-a689-3341ffa49f58',
 '6feca0cc-600a-4d60-a591-e0c1e1c290d4',
 '2b9269ab-5606-4e2e-9af4-59b9d0477eb5',
 '428fdfab-3422-4d73-9ba7-6020401346e1',
 'ba50f64b-a3e9-4715-b0c7-f7df0dc1fa37',
 'ee06a9c8-232d-4350-a45c-259f57c27a4d',
 'c8fae7ac-f316-41cd-80de-d74b142609ff']

In [48]:
results = chroma_vs2.similarity_search(
    "¿Cuáles son los requisitos para la matrícula?", k=5
)
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")

* 2 
 
PROCESO DE MATRÍCULA 
La matrícula es un acto formal y voluntario que implica el compromiso de cumplir con la Ley 
Universitaria, el Estatuto y los reglamentos de la Universidad. 
 
Para comenzar con tu proceso de matrícula, debes seguir estos pasos: [{'page': 1, 'source': './guia_matricula.pdf', 'moddate': '2025-07-21T17:12:32-05:00', 'producer': 'Microsoft® Word 2019', 'page_label': '2', 'total_pages': 4, 'creationdate': '2025-07-21T17:12:32-05:00', 'author': 'Ggonzal', 'creator': 'Microsoft® Word 2019', 'title': 'UNIVERSIDAD DE LIMA'}]
* ✓ Si no entregas los documentos solicitados en las fechas establecidas, perderás el derecho 
de vacante. 
 
✓ Al matricularte, asumes los compromisos académicos y económicos que estás obligado a 
cumplir. No existe anulación parcial ni total de la matrícula. [{'creator': 'Microsoft® Word 2019', 'source': './guia_matricula.pdf', 'page': 1, 'creationdate': '2025-07-21T17:12:32-05:00', 'moddate': '2025-07-21T17:12:32-05:00', 'title': 'UNIVERSIDA

In [28]:
retriever = chroma_vs2.as_retriever()
tool = retriever.as_tool()
tool.invoke("¿Cuáles son los requisitos para la matrícula?")

[Document(id='e7cd1352-9668-472b-9357-2ca13c7b797a', metadata={'creationdate': '2025-07-21T17:12:32-05:00', 'moddate': '2025-07-21T17:12:32-05:00', 'producer': 'Microsoft® Word 2019', 'author': 'Ggonzal', 'total_pages': 4, 'source': './guia_matricula.pdf', 'page': 1, 'page_label': '2', 'creator': 'Microsoft® Word 2019', 'title': 'UNIVERSIDAD DE LIMA'}, page_content='2 \n \nPROCESO DE MATRÍCULA \nLa matrícula es un acto formal y voluntario que implica el compromiso de cumplir con la Ley \nUniversitaria, el Estatuto y los reglamentos de la Universidad. \n \nPara comenzar con tu proceso de matrícula, debes seguir estos pasos: \n \n1. Ingresa a la página de la Universidad de Lima ( www.ulima.edu.pe) y luego al portal \nuniversitario Mi Ulima  (intranet) con el usuario que se te ha indicado en el correo \nelectrónico y la contraseña que has generado. \n \n2. Sigue las instrucciones indicadas en el manual de usuario y completa los formularios en \ncompañía de tus padres o tutores.  Para el c