# Geo QA 

### setup

In [1]:
from dotenv import load_dotenv

# create a `.env` file in the same directory as `.env-example` and fill in your key
load_dotenv()

True

## Document Creation - from Webpages

### get URIs from .rdf files

In [None]:
def get_URIs_from_file(fp, baseURL):
    try:
        from rdflib import Graph
        from tqdm import tqdm

        uris = []
        uris_not_relevant = []
        for node in tqdm(Graph().parse(fp)): # Create a Graph # Parse in an RDF file:
            if str(node[0]).startswith(baseURL) and not str(node[0]).endswith('adms/0.1'):
                uris += [str(node[0])]
            else:
                uris_not_relevant += [str(node[0])]
                pass
        # cast to set to remove duplicates since one URI has many associated triples ("nodes")
        return list(set(uris)), list(set(uris_not_relevant)) 
    except:
        print('Error: couldn\'t parse file. Check the file path, and if rdflib is installed.')
        return []

In [None]:
# check URI list
relevant,not_relevant = get_URIs_from_file(fp='data/rdf/lithology.rdf', baseURL='http://resource.geolba.ac.at/lithology/') 
print(len(relevant), relevant[0:2])
print(len(not_relevant), not_relevant[0:2])

In [None]:
import os

data_dir = 'data/rdf/'
thesaurus_baseURL = 'http://resource.geolba.ac.at/' # {graphname} + '/'
uri_dict = {}

for entry in os.scandir(data_dir):
    if entry.path.endswith((".rdf")):
        fp = entry.path
        graph_name = fp.split('/')[-1].split('.')[0]
        baseURL = f"{thesaurus_baseURL}{graph_name}/"
        print(f'Processing file {fp} with baseURL {baseURL}')
        file_uris, _ = get_URIs_from_file(fp=fp, baseURL=baseURL)
        uri_dict[graph_name] = file_uris

print('\nEntities found:')
for key in uri_dict.keys():
    print(f'{key}: {len(uri_dict[key])} nodes')

### customize URLLoader

In [None]:
"""
Overriding the load() function of SeleniumURLLoader
"""
from langchain.document_loaders import SeleniumURLLoader

import logging
logger = logging.getLogger(SeleniumURLLoader.__name__)

from typing import TYPE_CHECKING, List, Literal, Optional, Union
from langchain.docstore.document import Document

from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.remote.webelement import WebElement
from selenium.common.exceptions import TimeoutException

from tqdm import tqdm

class SeleniumURLLoaderCustom(SeleniumURLLoader):

    def _build_doc_text(self, elements: List[WebElement]) -> str:
        # TODO remove old function
        doc_text = ''
        
        for elem in elements:
            print(elem.text)

            doc_text += elem.text

        return doc_text
    
    def _format_lnbreak(self, text:str, relation: str) -> str:
        return text.replace(f'{relation}\n', f'{relation}: ')
    
    def load(self) -> List[Document]:
        """Load the specified URLs using Selenium and create Document instances.

        Returns:
            List[Document]: A list of Document instances with loaded content.
        """
        from unstructured.partition.html import partition_html

        docs: List[Document] = list()
        driver = self._get_driver()

        html_temp = []

        for url in (pbar := tqdm(self.urls)):
            pbar.set_description(f'Processing URL: {"".join(url.split("/")[-2:-1])}')
            try:
                try: 
                    driver.get(url)
                    elements = WebDriverWait(driver, 5).until(
                        EC.all_of(
                            # EC.presence_of_element_located((By.CSS_SELECTOR, ".abstract")),
                            EC.presence_of_element_located((By.CSS_SELECTOR, "h3")),
                        )
                    )
                except TimeoutException:
                    if 'no results for <br>URI' in driver.page_source:
                        continue # skip this iteration if URI is not "real"

                # page is now loaded, get source HTML
                html = driver.page_source

                # build document structure here
                # extract all innerHTML from the page source
                elements = partition_html(text=html)
                text = "\n".join([str(el) for el in elements])

                # use only the text in the main section
                text = text.split('RDF download')[1].split('skos:prefLabel')[0]

                # remove line breaks from name
                text = text.replace('  de\n', '  (de) / ')
                text = text.replace('  en\n', '  (en)\n')

                # reformat concept relations
                relations = [
                    'broader',
                    'broadMatch',
                    'narrower',
                    'narrowMatch',
                    'exactMatch',
                    'related',
                    'closeMatch'
                ]
                for relation in relations:
                    text = self._format_lnbreak(text, relation)

                # html_temp += [html]
                docs.append(Document(page_content=text, metadata={"source": url}))
            except Exception as e:
                if self.continue_on_failure:
                    logger.error(f"Error fetching or processing {url}, {type(e).__name__}: {e}")
                else:
                    raise e

        driver.quit()
        return docs

In [None]:
# check the output of the custom URLLoader for an example URL
SeleniumURLLoaderCustom(urls=['http://resource.geolba.ac.at/tectonicunit/194'], browser='chrome').load()

### use URIs and URLLoader to create documents

In [None]:
# create a ⚠️testing⚠️ dataset

test_uris = {}

for k in uri_dict.keys():
    test_uris[k] = uri_dict[k][10:30]

display(test_uris)

In [None]:
# fetch data for each URI

docs_dict = {}

for k in uri_dict.keys():
    loader = SeleniumURLLoaderCustom(urls=test_uris[k], browser='chrome')
    docs = loader.load()
    docs_dict[k] = docs

display(docs_dict)

## Document Creation - .tsv generated by SPARQL

In [2]:
import csv
from tqdm import tqdm

parsed = {}

with open('data/query-result-thesaurus.tsv', encoding='utf8') as tsvfile:
    tsvreader = csv.reader(tsvfile,  delimiter='\t')
    next(tsvreader) # skip header row
    for row in tqdm(tsvreader):
        # structure: [uri, name, relation, value]
        uri, name, relation, value = row
        if '|[de]:' in value:
            # remove german description parts
            value = value.split('|[de]:')[0]
        t = relation+value
        if uri not in parsed:
            # format strings here
            uri_clean = uri.translate({ord(i): None for i in '<>'})
            parsed[uri] = {'link': uri_clean, 'name': name.split('@')[0], 'data': [t]}
        else:
            new = parsed[uri]['data']
            new = list(set(new + [t])) # remove duplicates since some lines are identical
            parsed[uri]['data'] = new
        
print(len(parsed))

# example output:
parsed.get("<http://resource.geolba.ac.at/GeologicUnit/225>")

19986it [00:00, 201152.19it/s]

4378





{'link': 'http://resource.geolba.ac.at/GeologicUnit/225',
 'name': 'Alticola Formation',
 'data': ['is further described as [en]: Depositional environment: Moderately deep shelf. Fossil content: Acritarchs, bivalves, brachiopods, bryozoans, cephalopods, chitinozoans, conodonts, corals, echinoderms, enigmatic phosphatic plates, foraminifers, gastropods, graptolites, ostracodes, scolecodonts, scyphocrinitids, trace fossils, trilobites. Boundaries: Underlying units – Cardiola Formation (conformable, sharp contact); Overlying units – Rauchkofel Formation (conformable, gradual contact), Seekopf Formation (conformable, gradual contact); Lateral units – Nölbling Formation. (Ferretti et al., 2015c)',
  'is part of [Carnic Alps]',
  'also known as Megaerella Formation',
  'has a bibliographic reference: Ferretti, A., et al. (2015c)',
  'is described with Main outcrop areas: The Alticola Formation is well developed in the Carnic Alps, mainly at the Lake Wolayer next to Mt. Rauchkofel, Mt. Cellon

In [3]:
import csv
from tqdm import tqdm

with open('data/query-result-gk50.tsv', encoding='utf8') as tsvfile:
    tsvreader = csv.reader(tsvfile,  delimiter='\t')
    next(tsvreader) # skip header row
    a = False
    for row in tqdm(tsvreader):
        # structure: [uri, text, map]
        uri = row[0]
        t = row[1]+row[2]
        if uri in parsed: # assume we already got all the entities, if URI not present: skip it
            t = t.replace(f'{parsed[uri]["name"]} ', '')
            new = parsed[uri]['data']
            new = list(set(new + [t])) # remove duplicates since some lines are identical
            parsed[uri]['data'] = new
        
print(len(parsed))

# example output:
parsed.get("<http://resource.geolba.ac.at/GeologicUnit/340>")

6281it [00:00, 215775.18it/s]

4378





{'link': 'http://resource.geolba.ac.at/GeologicUnit/340',
 'name': 'Loess loam',
 'data': ['a lithogenetic unit, mainly consiting of Silt formed during Pleistocene under weathering and earth surface setting, on map sheet Wels Bl. 49, 1996',
  'is part of Aeolean deposit',
  'is described with A layer of weathered loess, largely free of carbonate minerals and frequently overlying unweathered loess (Hinze et al., 1989).',
  'a lithogenetic unit, mainly consiting of Silt formed during Pleistocene under mechanical deposition and aeolian process setting, on map sheet Krems a. d. Donau Bl. 38, 1984',
  'a lithologic unit, mainly consiting of Loam formed during Pleistocene under weathering and earth surface setting, on map sheet Eisenstadt Bl. 77, 1994',
  'a lithogenetic unit, mainly consiting of Loam formed during Pleistocene under deposition and earth surface setting, on map sheet Grünau i. Almtal Bl. 67, 2007',
  'is related to Loess|Red loam|Weathered loam',
  'has a bibliographic refere

In [4]:
from langchain.docstore.document import Document

docs = []

for uri in parsed.keys():
    if uri.startswith('<http://resource.geolba.ac.at/ref/'):
        continue # drop ref URIs
    val = parsed[uri]
    uri_link = uri.translate({ord(i): None for i in '<>'}) # remove brackets
    text =  'Name: ' + val['name']
    text += '\n'
    text += '-has URL ' + uri_link
    text += '\n'
    text += '-'
    text += '\n-'.join(val['data'])
    text += '\n'
    docs += [Document(page_content=text, metadata={"source": uri_link})]

display(len(docs))

docs[10:20]

3065

[Document(page_content='Name: Lower Devonian\n-has URL http://resource.geolba.ac.at/GeologicTimeScale/52\n-is derived from http://www.stratigraphy.org/ICSchart/ChronostratChart2013-01.pdf\n-is described with Series/Epoch; 419.2±3.2–393.3±1.2 Ma (ICS  2013).|Series/Epoch; 419.2±3.2–393.3±1.2 Ma (Cohen et al., 2022/10)\n-is displayed on maps with webcolor #E5AC4D\n-has a bibliographic reference: Cohen, K.M., Finney, S. & Gibbard, P.L. (International Commission on Stratigraphy [Ed.]) (2013/01)|Cohen, K.M., Finney, S.C., Gibbard, P.L. & Fan, J.-X. (International Commission on Stratigraphy [Ed.]) (2022/10)\n-includes Lochkovian|Pragian|Emsian\n-also known as Early Devonian\n-is part of Devonian\n-see also http://resource.geosciml.org/classifier/ics/ischart/LowerDevonian|http://inspire.ec.europa.eu/codelist/GeochronologicEraValue/lowerDevonian\n', metadata={'source': 'http://resource.geolba.ac.at/GeologicTimeScale/52'}),
 Document(page_content='Name: Lower Ordovician\n-has URL http://resourc

## Document Embedding

### evaluate token usage

In [5]:
import tiktoken
from langchain.docstore.document import Document

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

def num_tokens_from_document(doc: Document, encoding_name: str) -> int:
    """Returns the number of tokens in a Document object."""
    return num_tokens_from_string(doc.page_content, encoding_name)

In [6]:
docs_ext = {}

for doc in docs:
    docs_ext[doc.metadata['source']] = {'doc': doc,
                                        'len': len(doc.page_content),
                                        'tokens': num_tokens_from_document(doc, "cl100k_base")}

In [7]:
# str length
total = 0
max, min = 0, 9999
max_id, min_id = '', ''

for doc in docs:
    l = docs_ext[doc.metadata['source']]['len']
    total += l
    if l > max:
        max = l
        max_id = doc.metadata['source']
    if l < min:
        min = l
        min_id = doc.metadata['source']

print(f'average document len: {total/len(docs)}')
print(f'min document len: {min}, meta: {min_id}')
print(f'max document len: {max}, meta: {max_id}')

average document len: 951.0078303425774
min document len: 110, meta: http://resource.geolba.ac.at/tectonicunit/180
max document len: 26387, meta: http://resource.geolba.ac.at/tectonicunit/112


In [9]:
# tokens
tokens_total = 0
tokens_max, tokens_min = 0, 9999
tokens_max_id, tokens_min_id = '', ''

for doc in docs:
    l = docs_ext[doc.metadata['source']]['tokens']
    tokens_total += l
    if l > tokens_max:
        tokens_max = l
        tokens_max_id = doc.metadata['source']
    if l < tokens_min:
        tokens_min = l
        tokens_min_id = doc.metadata['source']

# Ada v2 pricing: 	$0.0001 / 1K tokens
print(f'embedding {tokens_total} tokens would cost ${tokens_total/1000*0.0001}')
print()
print(f'average document len: {tokens_total/len(docs)}')
print(f'min document len: {tokens_min}, meta: {tokens_min_id}')
print(f'max document len: {tokens_max}, meta: {tokens_max_id}')

embedding 898765 tokens would cost $0.0898765

average document len: 293.2349102773246
min document len: 35, meta: http://resource.geolba.ac.at/minres/33
max document len: 10817, meta: http://resource.geolba.ac.at/tectonicunit/112


In [10]:
print(docs_ext[tokens_min_id]['doc'].page_content)

Name: [Kalzit]
-has URL http://resource.geolba.ac.at/minres/33
-is part of Industrial minerals
-is in database coded with Cal



In [11]:
print(docs_ext[tokens_max_id]['doc'].page_content)

Name: Upper Pliocene to Quaternary Sediments
-has URL http://resource.geolba.ac.at/tectonicunit/112
-includes Lacustrine deposit, on map sheet Rust Bl. 78, 1993 or Muhr Bl. 156, 1995 or Geologisches Modell 1:200.000 Österreich(Salzburganteil) or Geologisches Modell 1:200.000 Österreich(Oberösterreichanteil) or Geologisches Modell 1:200.000 Österreich(Niederösterreich)
-includes Basal till, on map sheet Ried i. Innkreis Bl. 47, 2008 or Eisenerz Bl. 101, 2010 or Geologisches Modell 1:200.000 Österreich(Salzburganteil) or Radenthein-Ost BL.310602, 2019 or Geologisches Modell 1:200.000 Österreich(Tirolanteil)
-includes Calcareous sinter, on map sheet Ober-Grafendorf Bl. 55, 2012 or Wels Bl. 49, 1996 or Straßwalchen Bl. 64, 2003 or Mondsee Bl. 65, 1989 or Grünau i. Almtal Bl. 67, 2007 or Wiener Neustadt Bl. 76, 1982 or St. Wolfgang i. S. Bl. 95, 1982 or Bad Ischl Bl. 96, 1982 or Eisenerz Bl. 101, 2010 or Galtür Bl. 170, 1990 or Obertilliach Bl. 196, 1997 or St. Pölten Bl. 56, 2016
-includes

In [12]:
# clear the file
if docs_ext:
    open('data/docs_humanreadable.txt', 'w').close() 

    # write document data to file for browsing at a glance
    with open('data/docs_humanreadable.txt', 'a', encoding='utf8') as txtf:
        for uri in docs_ext.keys():
            txtf.write(docs_ext[uri]['doc'].page_content)

In [13]:
# writing to file for persistence
import pickle

with open('data/docs_formatted.pkl', 'wb') as outf:
    pickle.dump(docs_ext, outf, pickle.HIGHEST_PROTOCOL)

In [14]:
# parsing from file
import pickle
with open('data/docs_formatted.pkl', 'rb') as inf:
    docs_ext_r = pickle.load(inf)
docs_ext_r

{'http://resource.geolba.ac.at/GeologicTimeScale/216': {'doc': Document(page_content='Name: Riss\n-has URL http://resource.geolba.ac.at/GeologicTimeScale/216\n-is related to Ionian\n-is displayed on maps with webcolor #FFFFE5\n-is described with Glacial stage which corresponds to the Marine Isotope Stage 6 (van Husen & Reitner, 2011).\n-has a bibliographic reference: van Husen, D. & Reitner, J. (2011)\n-is part of Alpine glacial cycles\n-is related to Middle Pleistocene\n-also known as Riß Glacial\n', metadata={'source': 'http://resource.geolba.ac.at/GeologicTimeScale/216'}),
  'len': 409,
  'tokens': 118},
 'http://resource.geolba.ac.at/GeologicTimeScale/42': {'doc': Document(page_content='Name: Series 3\n-has URL http://resource.geolba.ac.at/GeologicTimeScale/42\n-is described with Series/Epoch; ~509–~497 Ma (ICS  2013).|Series/Epoch; ~509–~497 Ma (Cohen et al., 2022/10)\n-is derived from http://www.stratigraphy.org/ICSchart/ChronostratChart2013-01.pdf\n-is displayed on maps with web

### document splitting

probably required to handle larger documents

In [15]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

chunk_size = 2000

text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size = chunk_size,
    chunk_overlap  = 20,
    is_separator_regex = False,
    separators=['\n-']
)

In [16]:
# split documents into more if too big, keep header for each one
all_docs = []

for uri in docs_ext_r.keys():
    content = str(docs_ext_r[uri]['doc'].page_content).splitlines(True)
    meta = docs_ext_r[uri]['doc'].metadata

    header = ''.join(content[0:2])
    body = ''.join(content[2:])
    # print(header)
    # print(body)
    split_list = text_splitter.create_documents([body]) # can return 1...n documents, splitted

    for idx, doc in enumerate(split_list):
        split_list[idx].page_content = header + doc.page_content
        split_list[idx].metadata = meta
        # print(f'---------{num_tokens_from_string(new_doc, "cl100k_base")}---------')
        # print(new_doc)
    
    docs_ext_r[uri]['split_list'] = split_list
    all_docs += split_list

# example:
docs_ext_r['http://resource.geolba.ac.at/GeologicTimeScale/216']

{'doc': Document(page_content='Name: Riss\n-has URL http://resource.geolba.ac.at/GeologicTimeScale/216\n-is related to Ionian\n-is displayed on maps with webcolor #FFFFE5\n-is described with Glacial stage which corresponds to the Marine Isotope Stage 6 (van Husen & Reitner, 2011).\n-has a bibliographic reference: van Husen, D. & Reitner, J. (2011)\n-is part of Alpine glacial cycles\n-is related to Middle Pleistocene\n-also known as Riß Glacial\n', metadata={'source': 'http://resource.geolba.ac.at/GeologicTimeScale/216'}),
 'len': 409,
 'tokens': 118,
 'split_list': [Document(page_content='Name: Riss\n-has URL http://resource.geolba.ac.at/GeologicTimeScale/216\n-is related to Ionian\n-is displayed on maps with webcolor #FFFFE5\n-is described with Glacial stage which corresponds to the Marine Isotope Stage 6 (van Husen & Reitner, 2011).\n-has a bibliographic reference: van Husen, D. & Reitner, J. (2011)\n-is part of Alpine glacial cycles\n-is related to Middle Pleistocene\n-also known as

In [17]:
print(f'unique URIs (documents before splitting):\t{len(docs_ext_r)}')
print(f'documents after splitting:\t\t\t{len(all_docs)}')

unique URIs (documents before splitting):	3065
documents after splitting:			3399


In [18]:
print(f"number of splits for longest document with URI {tokens_max_id}: {len(docs_ext_r[tokens_max_id]['split_list'])}")
for doc in docs_ext_r[tokens_max_id]['split_list']:
    print(doc.page_content)

number of splits for longest document with URI http://resource.geolba.ac.at/tectonicunit/112: 15
Name: Upper Pliocene to Quaternary Sediments
-has URL http://resource.geolba.ac.at/tectonicunit/112
-includes Lacustrine deposit, on map sheet Rust Bl. 78, 1993 or Muhr Bl. 156, 1995 or Geologisches Modell 1:200.000 Österreich(Salzburganteil) or Geologisches Modell 1:200.000 Österreich(Oberösterreichanteil) or Geologisches Modell 1:200.000 Österreich(Niederösterreich)
-includes Basal till, on map sheet Ried i. Innkreis Bl. 47, 2008 or Eisenerz Bl. 101, 2010 or Geologisches Modell 1:200.000 Österreich(Salzburganteil) or Radenthein-Ost BL.310602, 2019 or Geologisches Modell 1:200.000 Österreich(Tirolanteil)
-includes Calcareous sinter, on map sheet Ober-Grafendorf Bl. 55, 2012 or Wels Bl. 49, 1996 or Straßwalchen Bl. 64, 2003 or Mondsee Bl. 65, 1989 or Grünau i. Almtal Bl. 67, 2007 or Wiener Neustadt Bl. 76, 1982 or St. Wolfgang i. S. Bl. 95, 1982 or Bad Ischl Bl. 96, 1982 or Eisenerz Bl. 101

In [19]:
# writing to file for persistence
import pickle
with open('data/docs_formatted_split.pkl', 'wb') as outf:
    pickle.dump(docs_ext_r, outf, pickle.HIGHEST_PROTOCOL)

In [20]:
# parsing from file
import pickle
with open('data/docs_formatted_split.pkl', 'rb') as inf:
    docs_split = pickle.load(inf)
docs_split

{'http://resource.geolba.ac.at/GeologicTimeScale/216': {'doc': Document(page_content='Name: Riss\n-has URL http://resource.geolba.ac.at/GeologicTimeScale/216\n-is related to Ionian\n-is displayed on maps with webcolor #FFFFE5\n-is described with Glacial stage which corresponds to the Marine Isotope Stage 6 (van Husen & Reitner, 2011).\n-has a bibliographic reference: van Husen, D. & Reitner, J. (2011)\n-is part of Alpine glacial cycles\n-is related to Middle Pleistocene\n-also known as Riß Glacial\n', metadata={'source': 'http://resource.geolba.ac.at/GeologicTimeScale/216'}),
  'len': 409,
  'tokens': 118,
  'split_list': [Document(page_content='Name: Riss\n-has URL http://resource.geolba.ac.at/GeologicTimeScale/216\n-is related to Ionian\n-is displayed on maps with webcolor #FFFFE5\n-is described with Glacial stage which corresponds to the Marine Isotope Stage 6 (van Husen & Reitner, 2011).\n-has a bibliographic reference: van Husen, D. & Reitner, J. (2011)\n-is part of Alpine glacial

In [21]:
# writing to file for persistence
import pickle
with open('data/all_docs.pkl', 'wb') as outf:
    pickle.dump(all_docs, outf, pickle.HIGHEST_PROTOCOL)

In [22]:
import pickle
with open('data/all_docs.pkl', 'rb') as inf:
    all_docs = pickle.load(inf)

In [23]:
# clear the file
open('./outfile.txt', 'w').close() 

for uri in docs_split.keys():
    if len(docs_split[uri]['split_list']) > 1:
        for doc in docs_split[uri]['split_list']: 
            # write document data to file for browsing at a glance
            with open('./outfile.txt', 'a', encoding='utf8') as txtf:
                txtf.write(f'{doc.page_content}\n')

### ChromaDB setup

In [24]:
# import the chromaDB driver and open a connection
import chromadb
chroma_client = chromadb.HttpClient(host="localhost", port="8000")


print(f'installed chromadb local client version: {chromadb.__version__}')
print(f'remote chromadb server version: {chroma_client.get_version()}')

installed chromadb local client version: 0.4.14
remote chromadb server version: 0.4.14


In [25]:
# 
# ⚠️ DESTRUCTIVE ⚠️
# will cause errors with collection UUIDs, only for local testing here
#

## lc_client.delete_collection()

In [26]:
# LangChain wrapper around chromaDB - don't mix these up!
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings

collection_name = "geo_demo"

lc_client = Chroma(client=chroma_client,
                   collection_name=collection_name,
                   embedding_function=OpenAIEmbeddings())

In [27]:
lc_client._client.list_collections()

[Collection(name=geo_demo)]

In [28]:
collection = lc_client._client.get_collection(collection_name)

lc_client._client._peek(collection.id) # _peek needs UUID instead of collection_name

{'ids': [], 'embeddings': [], 'metadatas': [], 'documents': []}

In [29]:
print(collection)

name='geo_demo' id=UUID('258d7841-0189-44a8-8098-c70c8e29ad4a') metadata=None


### embedding and storing documents in chromaDB

In [31]:
# subset for dev
# tokens_total = 0

# # could randomize, but won't
# # focus one a few continuous blocks of data instead of one-off docs for the demo
# subset = all_docs[0:100] 

# # clear the file
# open('./demo-kb-humanreadable.txt', 'w').close() 

# for doc in subset:
#     l = num_tokens_from_document(doc, 'cl100k_base')
#     tokens_total += l
#     print(doc.page_content)
#     with open('./demo-kb-humanreadable.txt', 'a', encoding='utf8') as txtf:
#         txtf.write(f'{doc.page_content}\n')

In [30]:
print(f'embedding {tokens_total} tokens would cost ${tokens_total*0.0001}')

embedding 898765 tokens would cost $89.87650000000001


In [38]:
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)  # for exponential backoff

@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def add_with_backoff(client, docs):
    return client.add_documents(docs)

@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def update_with_backoff(client, document_id, doc):
    return client.update_document(document_id, doc)

In [39]:
from tqdm import tqdm

for uri in tqdm(docs_split.keys()):
    doc = docs_split[uri]['doc']

    # adding documents when unsure if URI exists
    get_res = lc_client.get(where=doc.metadata, include=['metadatas'])
    
    if not get_res['ids']:
        # document is entirely new, no results found
        add_with_backoff(lc_client, [doc])
    else:
        # do the stuff above to update...
        update_with_backoff(lc_client, get_res['ids'][0], doc)

100%|██████████| 3065/3065 [14:58<00:00,  3.41it/s]  


### check chromaDB collections

In [40]:
# we can access the underlying chromadb HttpClient using lc_client._client for granular operations
# ⚠️ important to not get these confused!

collection = lc_client._client.get_collection(collection_name)

lc_client._client._peek(collection.id) # _peek needs UUID instead of collection_name

{'ids': ['faedbe58-80ed-11ee-8920-edfbead5900d',
  'fb25d47a-80ed-11ee-9142-edfbead5900d',
  'fb4eb702-80ed-11ee-b097-edfbead5900d',
  'fb81e0d3-80ed-11ee-ae8c-edfbead5900d',
  'fbb3432c-80ed-11ee-b1d6-edfbead5900d',
  'fbe51927-80ed-11ee-a732-edfbead5900d',
  'fc0d2bcd-80ed-11ee-a5fc-edfbead5900d',
  'fc46c696-80ed-11ee-800d-edfbead5900d',
  'fc6b7b6d-80ed-11ee-81e2-edfbead5900d',
  'fc9c2a0e-80ed-11ee-9be7-edfbead5900d'],
 'embeddings': [[-0.0017232076497748494,
   -0.032113298773765564,
   0.00916034635156393,
   -0.031585484743118286,
   -0.01090352050960064,
   0.016890039667487144,
   -0.04119725152850151,
   -0.0030470825731754303,
   -0.0025071152485907078,
   -0.037391435354948044,
   0.007535235490649939,
   0.016487235203385353,
   -0.024057194590568542,
   -0.009799279272556305,
   -0.00669837323948741,
   0.015764962881803513,
   0.025154491886496544,
   0.013542589731514454,
   0.024946143850684166,
   -0.011153537780046463,
   0.02026527002453804,
   0.008924218825995922

In [43]:
len(lc_client.get(include=[])['ids'])

3065