# ChromaDB testing

test notebook

In [1]:
%pip install python-dotenv

from dotenv import load_dotenv
load_dotenv()


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


True

### Connection

In [2]:
import chromadb

class ChromaDBConnection:
    _instance = None
    _conn = None

    def __new__(cls, *args, **kwargs):
        if not cls._instance:
            cls._instance = super().__new__(cls)
            cls._conn = chromadb.HttpClient(host="localhost", port=8000)
        return cls._instance

    def get_conn(self):
        return self._conn
    
    def close(self):
        self._conn = None
        ChromaDBConnection._instance = None


In [3]:
conn1 = ChromaDBConnection().get_conn()
conn2 = ChromaDBConnection().get_conn()
conn1 == conn2

True

In [4]:
col = conn1.get_or_create_collection("test-collection")
col.add(
    ids=["doc1"],
    embeddings=[[0.1, 0.2, 0.3]],
    documents=["This is a test document."],
    metadatas=[{"source": "unit_test"}]
)

In [5]:
col.get(ids=["doc1"])

{'ids': ['doc1'],
 'embeddings': None,
 'metadatas': [{'source': 'unit_test'}],
 'documents': ['This is a test document.'],
 'data': None,
 'uris': None,
 'included': ['metadatas', 'documents']}

### Using Langchain

In [6]:
%pip install -qU openai langchain-openai langchain-chroma


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [7]:
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
from langchain_chroma import Chroma

chroma_vs = Chroma(
    collection_name="embedding-collection",
    embedding_function=embeddings,
    client=conn1
)

In [9]:
from uuid import uuid4

from langchain_core.documents import Document

document_1 = Document(
    page_content="I had chocolate chip pancakes and scrambled eggs for breakfast this morning.",
    metadata={"source": "tweet"},
    id=1,
)

document_2 = Document(
    page_content="The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.",
    metadata={"source": "news"},
    id=2,
)

document_3 = Document(
    page_content="Building an exciting new project with LangChain - come check it out!",
    metadata={"source": "tweet"},
    id=3,
)

document_4 = Document(
    page_content="Robbers broke into the city bank and stole $1 million in cash.",
    metadata={"source": "news"},
    id=4,
)

document_5 = Document(
    page_content="Wow! That was an amazing movie. I can't wait to see it again.",
    metadata={"source": "tweet"},
    id=5,
)

document_6 = Document(
    page_content="Is the new iPhone worth the price? Read this review to find out.",
    metadata={"source": "website"},
    id=6,
)

document_7 = Document(
    page_content="The top 10 soccer players in the world right now.",
    metadata={"source": "website"},
    id=7,
)

document_8 = Document(
    page_content="LangGraph is the best framework for building stateful, agentic applications!",
    metadata={"source": "tweet"},
    id=8,
)

document_9 = Document(
    page_content="The stock market is down 500 points today due to fears of a recession.",
    metadata={"source": "news"},
    id=9,
)

document_10 = Document(
    page_content="I have a bad feeling I am going to get deleted :(",
    metadata={"source": "tweet"},
    id=10,
)

documents = [
    document_1,
    document_2,
    document_3,
    document_4,
    document_5,
    document_6,
    document_7,
    document_8,
    document_9,
    document_10,
]
uuids = [str(uuid4()) for _ in range(len(documents))]

chroma_vs.add_documents(documents=documents, ids=uuids)

['1620b42c-2b9c-49a5-b58e-17909d2a3f99',
 '4c132711-7a96-4a91-b4fe-c6c885087097',
 'c0aa2517-c981-43bc-8372-b42a66b5aae3',
 '5246eab2-3112-4f1e-b507-a8928958b816',
 'd9085170-c12d-4610-8032-8ff7f5700ab5',
 'd1b3233f-8b11-4f75-9c1b-d54579cf1723',
 '0c5e6c1c-d4d6-4c62-b7c3-6f4b7292a4f7',
 'ccad1e9b-6dc1-4f14-969d-41902ef4bbc4',
 '700c6228-39fd-4581-b665-d33b966a3b84',
 '4b47388b-155b-4211-adff-f75062e9f38d']

In [10]:
# chroma_vs.delete(ids=uuids[-1])

In [11]:
results = chroma_vs.similarity_search(
    "LangChain provides abstractions to make working with LLMs easy",
    k=2,
    filter={"source": "tweet"},
)
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")

* Building an exciting new project with LangChain - come check it out! [{'source': 'tweet'}]
* Building an exciting new project with LangChain - come check it out! [{'source': 'tweet'}]


In [12]:
results = chroma_vs.similarity_search_with_score(
    "Will it be hot tomorrow?", k=1, filter={"source": "news"}
)
for res, score in results:
    print(f"* [SIM={score:3f}] {res.page_content} [{res.metadata}]")

* [SIM=0.893731] The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees. [{'source': 'news'}]


In [13]:
results = chroma_vs.similarity_search_by_vector(
    embedding=embeddings.embed_query("I love green eggs and ham!"), k=1
)
for doc in results:
    print(f"* {doc.page_content} [{doc.metadata}]")

* I had chocolate chip pancakes and scrambled eggs for breakfast this morning. [{'source': 'tweet'}]


como retriever

In [14]:
retriever = chroma_vs.as_retriever(
    search_type="mmr", search_kwargs={"k": 1, "fetch_k": 5}
)
retriever.invoke("Stealing from the bank is a crime", filter={"source": "news"})

[Document(id='5246eab2-3112-4f1e-b507-a8928958b816', metadata={'source': 'news'}, page_content='Robbers broke into the city bank and stole $1 million in cash.')]

### ULIMA testing

In [15]:
%pip install -qU langchain-community pypdf


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [16]:
from langchain_community.document_loaders import PyPDFLoader

file_path = "./guia_matricula.pdf"
loader = PyPDFLoader(file_path)

In [17]:
import pprint
docs = loader.load()

pprint.pp(docs[0].metadata)
[d.page_content for d in docs[:3]]

{'producer': 'Microsoft® Word 2019',
 'creator': 'Microsoft® Word 2019',
 'creationdate': '2025-07-21T17:12:32-05:00',
 'title': 'UNIVERSIDAD DE LIMA',
 'author': 'Ggonzal',
 'moddate': '2025-07-21T17:12:32-05:00',
 'source': './guia_matricula.pdf',
 'total_pages': 4,
 'page': 0,
 'page_label': '1'}


['1 \n \nGUÍA DE INFORMACIÓN DE MATRÍCULA \nPARA INGRESANTES 2025-2 \n \nCALENDARIO \n \nENVÍO ELECTRÓNICO DE MATERIAL DE \nMATRÍCULA Lunes 21 de julio de 2025, 19.00 horas \nFECHA DE VENCIMIENTO DE LA BOLETA \nDE PAGO \nDesde el lunes 21 hasta el viernes 25 de \njulio de 2025, 18.00 horas \nCARGA DE DOCUMENTOS DE MATRÍCULA \nA TRAVÉS DEL PORTAL Mi Ulima \nDesde el martes 22 de julio hasta el \nviernes 1 de agosto de 2025, 18.00 horas \nEVALUACIÓN PSICOLÓGICA EN LÍNEA Jueves 24 y viernes 25 de julio de 2025 \nEVALUACIÓN DE ANTECEDENTES DE \nSALUD \nDesde el miércoles 30 de julio hasta el \nviernes 29 de agosto de 2025 \nMATRÍCULA DE INGRESANTES \nEsta actividad es realizada por la Universidad y \nse te confirmará por correo electrónico \nMartes 5 de agosto de 2025, 19.00 horas \nPUBLICACIÓN DE HORARIOS Martes 12 de agosto de 2025, 11.00 horas \nCEREMONIA DE BIENVENIDA Viernes 15 de agosto de 2025, 11.00 horas \nAuditorio ZUM \nEVALUACIÓN DE INGLÉS(1) \nSolo interesados en estudiar en e

In [18]:
chroma_vs.add_documents(documents=docs)

['9436c88f-de75-4d3c-8db2-87356222b666',
 '1595beaa-d363-4418-bfd9-ea8b6620d32c',
 '4f708715-2c27-4d1a-a0c4-bfa355807be3',
 '2b43a369-f661-4958-9403-cfe2c3906d17']

In [19]:
results = chroma_vs.similarity_search("¿Cuáles son los requisitos para la matrícula?", k=2)
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")

* 2 
 
PROCESO DE MATRÍCULA 
La matrícula es un acto formal y voluntario que implica el compromiso de cumplir con la Ley 
Universitaria, el Estatuto y los reglamentos de la Universidad. 
 
Para comenzar con tu proceso de matrícula, debes seguir estos pasos: 
 
1. Ingresa a la página de la Universidad de Lima ( www.ulima.edu.pe) y luego al portal 
universitario Mi Ulima  (intranet) con el usuario que se te ha indicado en el correo 
electrónico y la contraseña que has generado. 
 
2. Sigue las instrucciones indicadas en el manual de usuario y completa los formularios en 
compañía de tus padres o tutores.  Para el caso de hermanos que han ingresado 
simultáneamente, el registro en el portal debe efectuarse por cada hermano en forma 
independiente. 
 
 
 
 
Es importante que tengas en consideración lo siguiente: 
 
✓ Para que la Universidad efectúe tu matrícula el martes 5 de agosto, es indispensable que 
pagues la primera boleta de pago y present es los documentos solicitados según la 
mo