In [1]:
import chromadb
# Raw chromadb client talking HTTP to localhost:8000
# (assumes a Chroma server is already running there — TODO confirm setup steps).
chroma_client = chromadb.HttpClient(host="localhost", port="8000")

# Last expression of the cell: shows the installed chromadb version.
chromadb.__version__

'0.4.6'

In [2]:
from dotenv import load_dotenv
# Load variables from a .env file before the embedding object is constructed
# (presumably this is where the OpenAI credentials come from — verify).
load_dotenv()

from langchain.embeddings import OpenAIEmbeddings
# Embedding object that is later handed to the LangChain Chroma wrapper
# as its embedding_function.
embedding = OpenAIEmbeddings()

In [8]:
from langchain.vectorstores import Chroma

# Name of the collection used throughout this notebook.
collection_name = "test_langchain"

# LangChain vector-store wrapper built on the existing chromadb HTTP client;
# it is given the OpenAI embedding object as its embedding_function.
lc_client = Chroma(client=chroma_client,
                   collection_name=collection_name,
                   embedding_function=embedding)


In [4]:
from langchain.docstore.document import Document

# Sample record: the text goes into page_content, and the originating URI is
# kept in the 'source' metadata field so the entry can be looked up by it later.
doc = Document(
    page_content='A collective term for plutonic rocks in which the quartz content is more than 90% of the felsic minerals. Now defined modally in QAPF (Streckeisen, 1973) field 1a (Le Maitre et al., 2005).',
    metadata={'source': 'http://resource.geolba.ac.at/lithology/251'},
)
doc

Document(page_content='A collective term for plutonic rocks in which the quartz content is more than 90% of the felsic minerals. Now defined modally in QAPF (Streckeisen, 1973) field 1a (Le Maitre et al., 2005).', metadata={'source': 'http://resource.geolba.ac.at/lithology/251'})

In [9]:
# List the collections through the chromadb client held by the wrapper;
# the recorded output shows the test_langchain collection.
lc_client._client.list_collections()

[Collection(name=test_langchain)]

In [10]:
# Store the sample document through the LangChain wrapper;
# the recorded output is the list of ids assigned to the stored document.
lc_client.add_documents([doc])

['43a33904-3b80-11ee-896b-010101010000']

In [16]:
# The underlying chromadb HttpClient is reachable as lc_client._client for granular operations.
# ⚠️ Do not confuse the two: lc_client is the LangChain wrapper,
# lc_client._client is the chromadb client it wraps.

collection = lc_client._client.get_collection(collection_name)

lc_client._client._peek(collection.id) # _peek needs UUID instead of collection_name

{'ids': ['43a33904-3b80-11ee-896b-010101010000'],
 'embeddings': [[0.0012121215157071537,
   0.006519694522980599,
   0.0007121626669836645,
   -0.03387334792346866,
   -0.037678154374335665,
   0.029804317819339884,
   -0.043385362187991106,
   -0.02399141979932171,
   -0.005869046058563694,
   -0.027373469150027908,
   0.013950957296953191,
   0.01577409473025971,
   0.0016868967046398077,
   -0.02574849945666772,
   -0.02096606884149328,
   0.015615560352038139,
   0.0216662594737274,
   -0.0033737934092796153,
   0.0013244161958143933,
   -0.004917844445846943,
   -0.005357114825844712,
   0.01885228892862444,
   0.02091322560095698,
   -0.02251177181140395,
   -0.015655192782440364,
   0.015324915078508217,
   0.004366279851403201,
   -0.019116510719241146,
   -0.022749571981752505,
   0.02082074713605085,
   0.01208818743324445,
   0.004320040618950138,
   -0.0034811336863406267,
   0.01479646963462974,
   -0.026712911879518544,
   -0.005740237632958227,
   0.0016637772048285926,

In [17]:
# Same peek, but called on the collection object fetched in the previous cell
# instead of going through the client's _peek with the collection UUID.
collection.peek() # does the same thing as above

{'ids': ['43a33904-3b80-11ee-896b-010101010000'],
 'embeddings': [[0.0012121215157071537,
   0.006519694522980599,
   0.0007121626669836645,
   -0.03387334792346866,
   -0.037678154374335665,
   0.029804317819339884,
   -0.043385362187991106,
   -0.02399141979932171,
   -0.005869046058563694,
   -0.027373469150027908,
   0.013950957296953191,
   0.01577409473025971,
   0.0016868967046398077,
   -0.02574849945666772,
   -0.02096606884149328,
   0.015615560352038139,
   0.0216662594737274,
   -0.0033737934092796153,
   0.0013244161958143933,
   -0.004917844445846943,
   -0.005357114825844712,
   0.01885228892862444,
   0.02091322560095698,
   -0.02251177181140395,
   -0.015655192782440364,
   0.015324915078508217,
   0.004366279851403201,
   -0.019116510719241146,
   -0.022749571981752505,
   0.02082074713605085,
   0.01208818743324445,
   0.004320040618950138,
   -0.0034811336863406267,
   0.01479646963462974,
   -0.026712911879518544,
   -0.005740237632958227,
   0.0016637772048285926,

In [29]:
# Updating a stored document (e.g. when something about a node changes):
# find its id via the 'source' metadata, then overwrite it in place.
from langchain.docstore.document import Document

source_uri = 'http://resource.geolba.ac.at/lithology/251'

get_res = lc_client.get(where={'source': source_uri}, include=['metadatas'])
display(get_res)

# First (and here only) match for that source URI.
doc_id = get_res['ids'][0]
display(doc_id)

doc_new = Document(
    page_content='SOME NEW CONTENT YEAH BABY!! old: A collective term for plutonic rocks in which the quartz content is more than 90% of the felsic minerals. Now defined modally in QAPF (Streckeisen, 1973) field 1a (Le Maitre et al., 2005).',
    metadata={'source': source_uri},
)
display(doc_new)

lc_client.update_document(doc_id, doc_new)

{'ids': ['43a33904-3b80-11ee-896b-010101010000'],
 'embeddings': None,
 'metadatas': [{'source': 'http://resource.geolba.ac.at/lithology/251'}],
 'documents': None}

'43a33904-3b80-11ee-896b-010101010000'

Document(page_content='SOME NEW CONTENT YEAH BABY!! old: A collective term for plutonic rocks in which the quartz content is more than 90% of the felsic minerals. Now defined modally in QAPF (Streckeisen, 1973) field 1a (Le Maitre et al., 2005).', metadata={'source': 'http://resource.geolba.ac.at/lithology/251'})

In [32]:
# Add-or-update a document when unsure whether its source URI is already stored.
#
# The document is built once and used in both branches. Previously the update
# branch wrote the lithology/251 text and source over the entry that had been
# looked up for lithology/61, which would have corrupted that entry.
from langchain.docstore.document import Document

source_uri = 'http://resource.geolba.ac.at/lithology/61'

doc_add = Document(page_content='Ultramafic rock composed almost entirely of one or more pyroxenes and occasionally biotite, hornblende, or olivine (Le Maitre et al., 2005).', metadata={'source': source_uri})

# Look up an existing entry by its source URI; only the ids are needed here,
# so just the metadatas are requested alongside them.
get_res = lc_client.get(where={'source': source_uri}, include=['metadatas'])
display(get_res)

if not get_res['ids']:
    # URI not stored yet -> insert it as a new document.
    display(doc_add)

    lc_client.add_documents([doc_add])
else:
    # URI already stored -> overwrite the first matching entry with the
    # document for the SAME URI.
    display(get_res['ids'][0])
    display(doc_add)

    lc_client.update_document(get_res['ids'][0], doc_add)

{'ids': [], 'embeddings': None, 'metadatas': [], 'documents': None}

Document(page_content='Ultramafic rock composed almost entirely of one or more pyroxenes and occasionally biotite, hornblende, or olivine (Le Maitre et al., 2005).', metadata={'source': 'http://resource.geolba.ac.at/lithology/61'})

In [33]:
# Re-fetch the collection and peek again; the recorded output below now lists two ids.
collection = lc_client._client.get_collection(collection_name)

# _peek is addressed by the collection's UUID, not by its name.
lc_client._client._peek(collection.id)

{'ids': ['43a33904-3b80-11ee-896b-010101010000',
  '2f52ab1f-3b83-11ee-b8e8-010101010000'],
 'embeddings': [[0.00838271926354785,
   0.0025369639809760466,
   0.0072014874161932425,
   -0.029450259130467886,
   -0.03707457445890043,
   0.021369558473772962,
   -0.03761149684056012,
   -0.028698564443382948,
   -0.01794667037445031,
   -0.023570944336397146,
   0.01002033643916484,
   0.024846139183790625,
   0.0005637697347451242,
   -0.02265817554251759,
   -0.021275597336379296,
   0.015866091954567292,
   0.02539648564944667,
   0.005245071994389185,
   -0.0005432156777325973,
   -0.0031359692586591208,
   -0.0005075605739620988,
   0.020872904618811926,
   0.016054014229354624,
   -0.02314140531348227,
   -0.009107565782640078,
   0.009946509565120502,
   0.006855842864658669,
   -0.013678128313294258,
   -0.00481553343713561,
   0.02010778771037584,
   0.011134452523150684,
   0.01275864654609392,
   -0.0032198634972087727,
   0.007295449019248209,
   -0.01681913113779071,
   -0.0

### why not to use chromadb directly

---

In [21]:
# Counter-example: use the raw chromadb client together with the LangChain embedding object.
# We can't do this: the embedding function is managed by the langchain Chroma() wrapper and,
# as far as I can tell, is incompatible with chromadb's own collection API,
# so the code below produces an error whenever an embedding is needed, e.g. in add/upsert.
# NOTE: this cell rebinds chroma_client, collection_name, embedding and collection.

import chromadb
chroma_client = chromadb.HttpClient(host="localhost", port="8000")

# display(chromadb.__version__)

collection_name = "test_langchain"

from dotenv import load_dotenv
load_dotenv()

from langchain.embeddings import OpenAIEmbeddings
embedding = OpenAIEmbeddings()

# Fetching the collection succeeds even with the LangChain object passed as
# embedding_function (the recorded output is the collection UUID).
collection = chroma_client.get_collection(name=collection_name, embedding_function=embedding)
collection.id

UUID('0541eb0e-d3bf-4243-9251-fef01052a631')

In [22]:
# Reading through the raw chromadb collection works: the recorded output shows
# the stored id and its embedding.
collection.peek()

{'ids': ['43a33904-3b80-11ee-896b-010101010000'],
 'embeddings': [[0.0012121215157071537,
   0.006519694522980599,
   0.0007121626669836645,
   -0.03387334792346866,
   -0.037678154374335665,
   0.029804317819339884,
   -0.043385362187991106,
   -0.02399141979932171,
   -0.005869046058563694,
   -0.027373469150027908,
   0.013950957296953191,
   0.01577409473025971,
   0.0016868967046398077,
   -0.02574849945666772,
   -0.02096606884149328,
   0.015615560352038139,
   0.0216662594737274,
   -0.0033737934092796153,
   0.0013244161958143933,
   -0.004917844445846943,
   -0.005357114825844712,
   0.01885228892862444,
   0.02091322560095698,
   -0.02251177181140395,
   -0.015655192782440364,
   0.015324915078508217,
   0.004366279851403201,
   -0.019116510719241146,
   -0.022749571981752505,
   0.02082074713605085,
   0.01208818743324445,
   0.004320040618950138,
   -0.0034811336863406267,
   0.01479646963462974,
   -0.026712911879518544,
   -0.005740237632958227,
   0.0016637772048285926,

In [24]:
# Demonstration of the failure: adding documents through the raw chromadb collection.
# No embeddings are passed, and the call fails with
# "TypeError: 'OpenAIEmbeddings' object is not callable" (see the recorded output).
# NOTE(review): passing precomputed vectors, e.g.
# embeddings=embedding.embed_documents(documents), might avoid this — TODO confirm.
documents=["lorem ipsum...", "doc2", "doc3"]

collection.add(
    documents=documents,
    # Calling the embedding object directly is not an option either: it is not callable.
    # embeddings=embedding(texts=documents),
    metadatas=[{"chapter": "3", "verse": "16"}, {"chapter": "3", "verse": "5"}, {"chapter": "29", "verse": "11"}],
    ids=["id1", "id2", "id3"],
)

# Not reached while the add above raises.
collection.peek()

TypeError: 'OpenAIEmbeddings' object is not callable