# Geo QA 

### setup

In [1]:
from dotenv import load_dotenv

# create a `.env` file in the same directory as `.env-example` and fill in your key
load_dotenv()

True

## Document Creation - from Webpages

### get URIs from .rdf files

In [17]:
def get_URIs_from_file(fp, baseURL):
    try:
        from rdflib import Graph
        from tqdm import tqdm

        uris = []
        uris_not_relevant = []
        for node in tqdm(Graph().parse(fp)): # Create a Graph # Parse in an RDF file:
            if str(node[0]).startswith(baseURL) and not str(node[0]).endswith('adms/0.1'):
                uris += [str(node[0])]
            else:
                uris_not_relevant += [str(node[0])]
                pass
        # cast to set to remove duplicates since one URI has many associated triples ("nodes")
        return list(set(uris)), list(set(uris_not_relevant)) 
    except:
        print('Error: couldn\'t parse file. Check the file path, and if rdflib is installed.')
        return []

In [27]:
# check URI list
relevant,not_relevant = get_URIs_from_file(fp='data/rdf/lithology.rdf', baseURL='http://resource.geolba.ac.at/lithology/') 
print(len(relevant), relevant[0:2])
print(len(not_relevant), not_relevant[0:2])

100%|██████████| 5642/5642 [00:00<00:00, 351482.51it/s]

257 ['http://resource.geolba.ac.at/lithology/78', 'http://resource.geolba.ac.at/lithology/250']
68 ['http://resource.geosciml.org/classifier/cgi/lithology/carbonate_sedimentary_rock', 'http://inspire.ec.europa.eu/codelist/LithologyValue/ashTuffLapillistoneAndLapilliTuff']





In [34]:
import os

data_dir = 'data/rdf/'
thesaurus_baseURL = 'http://resource.geolba.ac.at/' # {graphname} + '/'
uri_dict = {}

for entry in os.scandir(data_dir):
    if entry.path.endswith((".rdf")):
        fp = entry.path
        graph_name = fp.split('/')[-1].split('.')[0]
        baseURL = f"{thesaurus_baseURL}{graph_name}/"
        print(f'Processing file {fp} with baseURL {baseURL}')
        file_uris, _ = get_URIs_from_file(fp=fp, baseURL=baseURL)
        uri_dict[graph_name] = file_uris

print('\nEntities found:')
for key in uri_dict.keys():
    print(f'{key}: {len(uri_dict[key])} nodes')

Processing file data/rdf/GeologicTimeScale.rdf with baseURL http://resource.geolba.ac.at/GeologicTimeScale/


100%|██████████| 5532/5532 [00:00<00:00, 355839.80it/s]


Processing file data/rdf/GeologicUnit.rdf with baseURL http://resource.geolba.ac.at/GeologicUnit/


100%|██████████| 19446/19446 [00:00<00:00, 365496.79it/s]


Processing file data/rdf/lithology.rdf with baseURL http://resource.geolba.ac.at/lithology/


100%|██████████| 5642/5642 [00:00<00:00, 294878.11it/s]


Processing file data/rdf/mineral.rdf with baseURL http://resource.geolba.ac.at/mineral/


100%|██████████| 16118/16118 [00:00<00:00, 359437.86it/s]


Processing file data/rdf/minres.rdf with baseURL http://resource.geolba.ac.at/minres/


100%|██████████| 1761/1761 [00:00<00:00, 319775.28it/s]


Processing file data/rdf/structure.rdf with baseURL http://resource.geolba.ac.at/structure/


100%|██████████| 4984/4984 [00:00<00:00, 366975.83it/s]


Processing file data/rdf/tectonicunit.rdf with baseURL http://resource.geolba.ac.at/tectonicunit/


100%|██████████| 6915/6915 [00:00<00:00, 353435.36it/s]


Entities found:
GeologicTimeScale: 221 nodes
GeologicUnit: 970 nodes
lithology: 257 nodes
mineral: 825 nodes
minres: 103 nodes
structure: 285 nodes
tectonicunit: 393 nodes





### customize URLLoader

In [44]:
"""
Overriding the load() function of SeleniumURLLoader
"""
from langchain.document_loaders import SeleniumURLLoader

import logging
logger = logging.getLogger(SeleniumURLLoader.__name__)

from typing import TYPE_CHECKING, List, Literal, Optional, Union
from langchain.docstore.document import Document

from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.remote.webelement import WebElement
from selenium.common.exceptions import TimeoutException

from tqdm import tqdm

class SeleniumURLLoaderCustom(SeleniumURLLoader):

    def _build_doc_text(self, elements: List[WebElement]) -> str:
        # TODO remove old function
        doc_text = ''
        
        for elem in elements:
            print(elem.text)

            doc_text += elem.text

        return doc_text
    
    def _format_lnbreak(self, text:str, relation: str) -> str:
        return text.replace(f'{relation}\n', f'{relation}: ')
    
    def load(self) -> List[Document]:
        """Load the specified URLs using Selenium and create Document instances.

        Returns:
            List[Document]: A list of Document instances with loaded content.
        """
        from unstructured.partition.html import partition_html

        docs: List[Document] = list()
        driver = self._get_driver()

        html_temp = []

        for url in (pbar := tqdm(self.urls)):
            pbar.set_description(f'Processing URL: {"".join(url.split("/")[-2:-1])}')
            try:
                try: 
                    driver.get(url)
                    elements = WebDriverWait(driver, 5).until(
                        EC.all_of(
                            # EC.presence_of_element_located((By.CSS_SELECTOR, ".abstract")),
                            EC.presence_of_element_located((By.CSS_SELECTOR, "h3")),
                        )
                    )
                except TimeoutException:
                    if 'no results for <br>URI' in driver.page_source:
                        continue # skip this iteration if URI is not "real"

                # page is now loaded, get source HTML
                html = driver.page_source

                # build document structure here
                # extract all innerHTML from the page source
                elements = partition_html(text=html)
                text = "\n".join([str(el) for el in elements])

                # use only the text in the main section
                text = text.split('RDF download')[1].split('skos:prefLabel')[0]

                # remove line breaks from name
                text = text.replace('  de\n', '  (de) / ')
                text = text.replace('  en\n', '  (en)\n')

                # reformat concept relations
                relations = [
                    'broader',
                    'broadMatch',
                    'narrower',
                    'narrowMatch',
                    'exactMatch',
                    'related',
                    'closeMatch'
                ]
                for relation in relations:
                    text = self._format_lnbreak(text, relation)

                # html_temp += [html]
                docs.append(Document(page_content=text, metadata={"source": url}))
            except Exception as e:
                if self.continue_on_failure:
                    logger.error(f"Error fetching or processing {url}, {type(e).__name__}: {e}")
                else:
                    raise e

        driver.quit()
        return docs

In [43]:
# check the output of the custom URLLoader for an example URL
SeleniumURLLoaderCustom(urls=['http://resource.geolba.ac.at/tectonicunit/194'], browser='chrome').load()

Processing URL: tectonicunit: 100%|██████████| 1/1 [00:01<00:00,  1.94s/it]


[Document(page_content='\nTroiseck-Floning-Teildecke  (de) / Troiseck-Floning Subnappe  (en)\nConcept relations\nbroader: Bösenstein-Pletzen Nappe\n', metadata={'source': 'http://resource.geolba.ac.at/tectonicunit/194'})]

### use URIs and URLLoader to create documents

In [7]:
# create a ⚠️testing⚠️ dataset

test_uris = {}

for k in uri_dict.keys():
    test_uris[k] = uri_dict[k][10:30]

display(test_uris)

{'GeologicTimeScale': ['http://resource.geolba.ac.at/GeologicTimeScale/20',
  'http://resource.geolba.ac.at/GeologicTimeScale/183',
  'http://resource.geolba.ac.at/GeologicTimeScale/182',
  'http://resource.geolba.ac.at/GeologicTimeScale/114',
  'http://resource.geolba.ac.at/GeologicTimeScale/32',
  'http://resource.geolba.ac.at/GeologicTimeScale/40',
  'http://resource.geolba.ac.at/GeologicTimeScale/156',
  'http://resource.geolba.ac.at/GeologicTimeScale/205',
  'http://resource.geolba.ac.at/GeologicTimeScale/152',
  'http://resource.geolba.ac.at/GeologicTimeScale/53',
  'http://resource.geolba.ac.at/GeologicTimeScale/96',
  'http://resource.geolba.ac.at/GeologicTimeScale/119',
  'http://resource.geolba.ac.at/GeologicTimeScale/213',
  'http://resource.geolba.ac.at/GeologicTimeScale/48',
  'http://resource.geolba.ac.at/GeologicTimeScale/94',
  'http://resource.geolba.ac.at/GeologicTimeScale/197',
  'http://resource.geolba.ac.at/GeologicTimeScale/184',
  'http://resource.geolba.ac.at/Ge

In [None]:
# fetch data for each URI

docs_dict = {}

for k in uri_dict.keys():
    loader = SeleniumURLLoaderCustom(urls=test_uris[k], browser='chrome')
    docs = loader.load()
    docs_dict[k] = docs

display(docs_dict)

## Document Creation - .tsv generated by SPARQL

In [2]:
import csv
from tqdm import tqdm

parsed = {}

with open('data/query-result-thesaurus.tsv', encoding='utf8') as tsvfile:
    tsvreader = csv.reader(tsvfile,  delimiter='\t')
    next(tsvreader) # skip header row
    for row in tqdm(tsvreader):
        # structure: [uri, name, relation, value]
        uri, name, relation, value = row
        if '|[de]:' in value:
            # remove german description parts
            value = value.split('|[de]:')[0]
        t = relation+value
        if uri not in parsed:
            # format strings here
            uri_clean = uri.translate({ord(i): None for i in '<>'})
            parsed[uri] = {'link': uri_clean, 'name': name.split('@')[0], 'data': [t]}
        else:
            new = parsed[uri]['data']
            new = list(set(new + [t])) # remove duplicates since some lines are identical
            parsed[uri]['data'] = new
        
print(len(parsed))

# example output:
parsed.get("<http://resource.geolba.ac.at/GeologicUnit/225>")

0it [00:00, ?it/s]

19986it [00:00, 185133.55it/s]

4378





{'link': 'http://resource.geolba.ac.at/GeologicUnit/225',
 'name': 'Alticola Formation',
 'data': ['is part of [Carnic Alps]',
  'is further described as [en]: Depositional environment: Moderately deep shelf. Fossil content: Acritarchs, bivalves, brachiopods, bryozoans, cephalopods, chitinozoans, conodonts, corals, echinoderms, enigmatic phosphatic plates, foraminifers, gastropods, graptolites, ostracodes, scolecodonts, scyphocrinitids, trace fossils, trilobites. Boundaries: Underlying units – Cardiola Formation (conformable, sharp contact); Overlying units – Rauchkofel Formation (conformable, gradual contact), Seekopf Formation (conformable, gradual contact); Lateral units – Nölbling Formation. (Ferretti et al., 2015c)',
  'has a bibliographic reference: Ferretti, A., et al. (2015c)',
  'is described with Main outcrop areas: The Alticola Formation is well developed in the Carnic Alps, mainly at the Lake Wolayer next to Mt. Rauchkofel, Mt. Cellon (2,238 m), and Mt. Hoher Trieb (2,199 m

In [3]:
import csv
from tqdm import tqdm

with open('data/query-result-gk50.tsv', encoding='utf8') as tsvfile:
    tsvreader = csv.reader(tsvfile,  delimiter='\t')
    next(tsvreader) # skip header row
    a = False
    for row in tqdm(tsvreader):
        # structure: [uri, text, map]
        uri = row[0]
        t = row[1]+row[2]
        if uri in parsed: # assume we already got all the entities, if URI not present: skip it
            t = t.replace(f'{parsed[uri]["name"]} ', '')
            new = parsed[uri]['data']
            new = list(set(new + [t])) # remove duplicates since some lines are identical
            parsed[uri]['data'] = new
        
print(len(parsed))

# example output:
parsed.get("<http://resource.geolba.ac.at/GeologicUnit/340>")

6281it [00:00, 212472.16it/s]

4378





{'link': 'http://resource.geolba.ac.at/GeologicUnit/340',
 'name': 'Loess loam',
 'data': ['a lithogenetic unit, mainly consiting of Loam formed during Quaternary under mechanical deposition and earth surface setting, on map sheet Ober-Grafendorf Bl. 55, 2012',
  'a lithogenetic unit, mainly consiting of Loam formed during Pleistocene under deposition and earth surface setting, on map sheet Grünau i. Almtal Bl. 67, 2007',
  'a lithogenetic unit, mainly consiting of Loam formed during Holocene under traction saltation or suspension deposition and aeolian process setting, on map sheet Baden Bl. 58, 1997',
  'is related to Loess|Red loam|Weathered loam',
  'is described with A layer of weathered loess, largely free of carbonate minerals and frequently overlying unweathered loess (Hinze et al., 1989).',
  'is displayed on maps with webcolor #FFEFCC',
  'a lithogenetic unit, mainly consiting of Loam formed during Pleistocene under weathering and earth surface setting, on map sheet Wels Bl. 

In [5]:
from langchain.docstore.document import Document

docs = []

for uri in parsed.keys():
    if uri.startswith('<http://resource.geolba.ac.at/ref/'):
        continue # drop ref URIs
    val = parsed[uri]
    uri_link = uri.translate({ord(i): None for i in '<>'}) # remove brackets
    text =  'Name: ' + val['name']
    text += '\n'
    text += '-has URL ' + uri_link
    text += '\n'
    text += '-'
    text += '\n-'.join(val['data'])
    text += '\n'
    docs += [Document(page_content=text, metadata={"source": uri_link})]

display(len(docs))

docs[10:20]

3065

[Document(page_content='Name: Lower Devonian\n-has URL http://resource.geolba.ac.at/GeologicTimeScale/52\n-also known as Early Devonian\n-see also http://resource.geosciml.org/classifier/ics/ischart/LowerDevonian|http://inspire.ec.europa.eu/codelist/GeochronologicEraValue/lowerDevonian\n-is described with Series/Epoch; 419.2±3.2–393.3±1.2 Ma (ICS  2013).|Series/Epoch; 419.2±3.2–393.3±1.2 Ma (Cohen et al., 2022/10)\n-is derived from http://www.stratigraphy.org/ICSchart/ChronostratChart2013-01.pdf\n-is displayed on maps with webcolor #E5AC4D\n-has a bibliographic reference: Cohen, K.M., Finney, S. & Gibbard, P.L. (International Commission on Stratigraphy [Ed.]) (2013/01)|Cohen, K.M., Finney, S.C., Gibbard, P.L. & Fan, J.-X. (International Commission on Stratigraphy [Ed.]) (2022/10)\n-is part of Devonian\n-includes Lochkovian|Pragian|Emsian\n', metadata={'source': 'http://resource.geolba.ac.at/GeologicTimeScale/52'}),
 Document(page_content='Name: Lower Ordovician\n-has URL http://resourc

## Document Embedding

### evaluate token usage

In [7]:
import tiktoken
from langchain.docstore.document import Document

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

def num_tokens_from_document(doc: Document, encoding_name: str) -> int:
    """Returns the number of tokens in a Document object."""
    return num_tokens_from_string(doc.page_content, encoding_name)

In [8]:
docs_ext = {}

for doc in docs:
    docs_ext[doc.metadata['source']] = {'doc': doc,
                                        'len': len(doc.page_content),
                                        'tokens': num_tokens_from_document(doc, "cl100k_base")}

In [9]:
# str length
total = 0
max, min = 0, 9999
max_id, min_id = '', ''

for doc in docs:
    l = docs_ext[doc.metadata['source']]['len']
    total += l
    if l > max:
        max = l
        max_id = doc.metadata['source']
    if l < min:
        min = l
        min_id = doc.metadata['source']

print(f'average document len: {total/len(docs)}')
print(f'min document len: {min}, meta: {min_id}')
print(f'max document len: {max}, meta: {max_id}')

average document len: 951.0078303425774
min document len: 110, meta: http://resource.geolba.ac.at/tectonicunit/180
max document len: 26387, meta: http://resource.geolba.ac.at/tectonicunit/112


In [10]:
# tokens
tokens_total = 0
tokens_max, tokens_min = 0, 9999
tokens_max_id, tokens_min_id = '', ''

for doc in docs:
    l = docs_ext[doc.metadata['source']]['tokens']
    tokens_total += l
    if l > tokens_max:
        tokens_max = l
        tokens_max_id = doc.metadata['source']
    if l < tokens_min:
        tokens_min = l
        tokens_min_id = doc.metadata['source']

# Ada v2 pricing: 	$0.0001 / 1K tokens
print(f'embedding {tokens_total} tokens would cost ${tokens_total*0.0001}')
print()
print(f'average document len: {tokens_total/len(docs)}')
print(f'min document len: {tokens_min}, meta: {tokens_min_id}')
print(f'max document len: {tokens_max}, meta: {tokens_max_id}')

embedding 898765 tokens would cost $89.87650000000001

average document len: 293.2349102773246
min document len: 35, meta: http://resource.geolba.ac.at/minres/33
max document len: 10817, meta: http://resource.geolba.ac.at/tectonicunit/112


In [11]:
print(docs_ext[tokens_min_id]['doc'].page_content)

Name: [Kalzit]
-has URL http://resource.geolba.ac.at/minres/33
-is in database coded with Cal
-is part of Industrial minerals



In [12]:
print(docs_ext[tokens_max_id]['doc'].page_content)

Name: Upper Pliocene to Quaternary Sediments
-has URL http://resource.geolba.ac.at/tectonicunit/112
-includes Proglacial gravel, on map sheet Ried i. Innkreis Bl. 47, 2008 or Eisenerz Bl. 101, 2010 or Bad Ischl Bl. 96, 1982 or Spittal a. d. Drau Bl. 182, 2006 or Geologisches Modell 1:200.000 Österreich(Salzburganteil) or Hallein Bl. 94, 1987 or St. Wolfgang i. S. Bl. 95, 1982 or Grünau i. Almtal Bl. 67, 2007 or Mondsee Bl. 65, 1989 or Straßwalchen Bl. 64, 2003 or Geologisches Modell 1:200.000 Österreich(Oberösterreichanteil) or Geologisches Modell 1:200.000 Österreich(Tirolanteil)
-includes Torrent deposit, on map sheet Wiener Neustadt Bl. 76, 1982
-includes Debris flow deposits, on map sheet Eisenerz Bl. 101, 2010
-includes Hochterrasse, on map sheet Ober-Grafendorf Bl. 55, 2012 or Ried i. Innkreis Bl. 47, 2008 or Wels Bl. 49, 1996 or Wien Bl. 59, 1985 or Eisenstadt Bl. 77, 1994 or Krems a. d. Donau Bl. 38, 1984 or Passau Bl. 12, 1994 or Freistadt Bl. 16, 2010 or Steyregg Bl. 33, 2002

In [13]:
# clear the file
if docs_ext:
    open('data/docs_humanreadable.txt', 'w').close() 

    # write document data to file for browsing at a glance
    with open('data/docs_humanreadable.txt', 'a', encoding='utf8') as txtf:
        for uri in docs_ext.keys():
            txtf.write(docs_ext[uri]['doc'].page_content)

In [14]:
# writing to file for persistence
import pickle

with open('data/docs_formatted.pkl', 'wb') as outf:
    pickle.dump(docs_ext, outf, pickle.HIGHEST_PROTOCOL)

In [15]:
# parsing from file

import pickle

with open('data/docs_formatted.pkl', 'rb') as inf:
    docs_ext_r = pickle.load(inf)

docs_ext_r

{'http://resource.geolba.ac.at/GeologicTimeScale/216': {'doc': Document(page_content='Name: Riss\n-has URL http://resource.geolba.ac.at/GeologicTimeScale/216\n-has a bibliographic reference: van Husen, D. & Reitner, J. (2011)\n-also known as Riß Glacial\n-is related to Middle Pleistocene\n-is related to Ionian\n-is described with Glacial stage which corresponds to the Marine Isotope Stage 6 (van Husen & Reitner, 2011).\n-is displayed on maps with webcolor #FFFFE5\n-is part of Alpine glacial cycles\n', metadata={'source': 'http://resource.geolba.ac.at/GeologicTimeScale/216'}),
  'len': 409,
  'tokens': 118},
 'http://resource.geolba.ac.at/GeologicTimeScale/42': {'doc': Document(page_content='Name: Series 3\n-has URL http://resource.geolba.ac.at/GeologicTimeScale/42\n-is part of Cambrian\n-is derived from http://www.stratigraphy.org/ICSchart/ChronostratChart2013-01.pdf\n-is displayed on maps with webcolor #A6CF86\n-also known as Series 3\n-has a bibliographic reference: Cohen, K.M., Finn

### document splitting

probably required to handle larger documents

In [33]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

chunk_size = 2000

text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size = chunk_size,
    chunk_overlap  = 20,
    is_separator_regex = False,
    separators=['\n-']
)

In [34]:
# split documents into more if too big, keep header for each one
all_docs = []

for uri in docs_ext_r.keys():
    content = str(docs_ext_r[uri]['doc'].page_content).splitlines(True)
    meta = docs_ext_r[uri]['doc'].metadata

    header = ''.join(content[0:2])
    body = ''.join(content[2:])
    # print(header)
    # print(body)
    split_list = text_splitter.create_documents([body]) # can return 1...n documents, splitted

    for idx, doc in enumerate(split_list):
        split_list[idx].page_content = header + doc.page_content
        split_list[idx].metadata = meta
        # print(f'---------{num_tokens_from_string(new_doc, "cl100k_base")}---------')
        # print(new_doc)
    
    docs_ext_r[uri]['split_list'] = split_list
    all_docs += split_list

# example:
docs_ext_r['http://resource.geolba.ac.at/GeologicTimeScale/216']

{'doc': Document(page_content='Name: Riss\n-has URL http://resource.geolba.ac.at/GeologicTimeScale/216\n-has a bibliographic reference: van Husen, D. & Reitner, J. (2011)\n-also known as Riß Glacial\n-is related to Middle Pleistocene\n-is related to Ionian\n-is described with Glacial stage which corresponds to the Marine Isotope Stage 6 (van Husen & Reitner, 2011).\n-is displayed on maps with webcolor #FFFFE5\n-is part of Alpine glacial cycles\n', metadata={'source': 'http://resource.geolba.ac.at/GeologicTimeScale/216'}),
 'len': 409,
 'tokens': 118,
 'split_list': [Document(page_content='Name: Riss\n-has URL http://resource.geolba.ac.at/GeologicTimeScale/216\n-has a bibliographic reference: van Husen, D. & Reitner, J. (2011)\n-also known as Riß Glacial\n-is related to Middle Pleistocene\n-is related to Ionian\n-is described with Glacial stage which corresponds to the Marine Isotope Stage 6 (van Husen & Reitner, 2011).\n-is displayed on maps with webcolor #FFFFE5\n-is part of Alpine gl

In [35]:
print(f'unique URIs (documents before splitting):\t{len(docs_ext_r)}')
print(f'documents after splitting:\t\t\t{len(all_docs)}')

unique URIs (documents before splitting):	3065
documents after splitting:			3410


In [36]:
print(f"number of splits for longest document with URI {tokens_max_id}: {len(docs_ext_r[tokens_max_id]['split_list'])}")
for doc in docs_ext_r[tokens_max_id]['split_list']:
    print(doc.page_content)

number of splits for longest document with URI http://resource.geolba.ac.at/tectonicunit/112: 17
Name: Upper Pliocene to Quaternary Sediments
-has URL http://resource.geolba.ac.at/tectonicunit/112
-includes Proglacial gravel, on map sheet Ried i. Innkreis Bl. 47, 2008 or Eisenerz Bl. 101, 2010 or Bad Ischl Bl. 96, 1982 or Spittal a. d. Drau Bl. 182, 2006 or Geologisches Modell 1:200.000 Österreich(Salzburganteil) or Hallein Bl. 94, 1987 or St. Wolfgang i. S. Bl. 95, 1982 or Grünau i. Almtal Bl. 67, 2007 or Mondsee Bl. 65, 1989 or Straßwalchen Bl. 64, 2003 or Geologisches Modell 1:200.000 Österreich(Oberösterreichanteil) or Geologisches Modell 1:200.000 Österreich(Tirolanteil)
-includes Torrent deposit, on map sheet Wiener Neustadt Bl. 76, 1982
-includes Debris flow deposits, on map sheet Eisenerz Bl. 101, 2010
-includes Hochterrasse, on map sheet Ober-Grafendorf Bl. 55, 2012 or Ried i. Innkreis Bl. 47, 2008 or Wels Bl. 49, 1996 or Wien Bl. 59, 1985 or Eisenstadt Bl. 77, 1994 or Krems a

In [37]:
# writing to file for persistence
import pickle

with open('data/docs_formatted_split.pkl', 'wb') as outf:
    pickle.dump(docs_ext_r, outf, pickle.HIGHEST_PROTOCOL)

In [38]:
# parsing from file

import pickle

with open('data/docs_formatted_split.pkl', 'rb') as inf:
    docs_split = pickle.load(inf)

docs_split

{'http://resource.geolba.ac.at/GeologicTimeScale/216': {'doc': Document(page_content='Name: Riss\n-has URL http://resource.geolba.ac.at/GeologicTimeScale/216\n-has a bibliographic reference: van Husen, D. & Reitner, J. (2011)\n-also known as Riß Glacial\n-is related to Middle Pleistocene\n-is related to Ionian\n-is described with Glacial stage which corresponds to the Marine Isotope Stage 6 (van Husen & Reitner, 2011).\n-is displayed on maps with webcolor #FFFFE5\n-is part of Alpine glacial cycles\n', metadata={'source': 'http://resource.geolba.ac.at/GeologicTimeScale/216'}),
  'len': 409,
  'tokens': 118,
  'split_list': [Document(page_content='Name: Riss\n-has URL http://resource.geolba.ac.at/GeologicTimeScale/216\n-has a bibliographic reference: van Husen, D. & Reitner, J. (2011)\n-also known as Riß Glacial\n-is related to Middle Pleistocene\n-is related to Ionian\n-is described with Glacial stage which corresponds to the Marine Isotope Stage 6 (van Husen & Reitner, 2011).\n-is disp

In [40]:
# clear the file
open('./outfile.txt', 'w').close() 

for uri in docs_split.keys():
    if len(docs_split[uri]['split_list']) > 1:
        for doc in docs_split[uri]['split_list']: 
            # write document data to file for browsing at a glance
            with open('./outfile.txt', 'a', encoding='utf8') as txtf:
                txtf.write(f'{doc.page_content}\n')

### ChromaDB setup

In [45]:
# import the chromaDB driver and open a connection
import chromadb
chroma_client = chromadb.HttpClient(host="localhost", port="8000")


print(f'installed chromadb local client version: {chromadb.__version__}')
print(f'remote chromadb server version: {chroma_client.get_version()}')

installed chromadb local client version: 0.4.14
remote chromadb server version: 0.4.14


In [3]:
# LangChain wrapper around chromaDB - don't mix these up!
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings

collection_name = "geo_demo"

lc_client = Chroma(client=chroma_client,
                   collection_name=collection_name,
                   embedding_function=OpenAIEmbeddings())

In [4]:
lc_client._client.list_collections()

[Collection(name=test_langchain), Collection(name=geo_test)]

In [28]:
collection = lc_client._client.get_collection(collection_name)

lc_client._client._peek(collection.id) # _peek needs UUID instead of collection_name

{'ids': ['c8f4d2ff-41cb-11ee-a421-c98b5e6f8eef',
  'cc64144b-41cb-11ee-81eb-c98b5e6f8eef',
  'cce06edf-41cb-11ee-9969-c98b5e6f8eef',
  '95e0155f-41cc-11ee-9312-c98b5e6f8eef',
  'a52c9369-41cc-11ee-a5cf-c98b5e6f8eef',
  'b46fdeef-41cc-11ee-a3ed-c98b5e6f8eef',
  'c385116b-41cc-11ee-be15-c98b5e6f8eef',
  'd2d6b071-41cc-11ee-a4e8-c98b5e6f8eef',
  'e2433c2c-41cc-11ee-9e73-c98b5e6f8eef',
  'f18a3308-41cc-11ee-99d3-c98b5e6f8eef'],
 'embeddings': [[-0.01761186681687832,
   -0.02491205371916294,
   0.008221112191677094,
   -0.009417643770575523,
   -0.013914719223976135,
   0.03041072003543377,
   -0.018821842968463898,
   -0.016643887385725975,
   0.006543951574712992,
   -0.024145735427737236,
   0.020583029836416245,
   0.027936993166804314,
   -0.014371821656823158,
   -0.004026529844850302,
   -0.012462749145925045,
   0.013565171509981155,
   0.039687201380729675,
   0.017181653529405594,
   0.03570772334933281,
   -0.00474579306319356,
   -0.004009724594652653,
   0.01841851696372032,
  

### embedding and storing documents in chromaDB

TODO: add proper rate limiting like in https://cookbook.openai.com/examples/how_to_handle_rate_limits

In [62]:
import time
from tqdm import tqdm

for k in tqdm(docs_dict.keys()):
    for doc in tqdm(docs_dict[k]):
        # adding documents when unsure if URI exists
        get_res = lc_client.get(where=doc.metadata, include=['metadatas'])

        if not get_res['ids']:
            lc_client.add_documents([doc])
        else:
            # do the stuff above to update...
            lc_client.update_document(get_res['ids'][0], doc)
        
        time.sleep(25) # rate limit for embedding: 3/min



100%|██████████| 19/19 [08:07<00:00, 25.66s/it]
100%|██████████| 20/20 [08:30<00:00, 25.55s/it]
100%|██████████| 20/20 [08:32<00:00, 25.61s/it]
100%|██████████| 19/19 [08:03<00:00, 25.46s/it]
100%|██████████| 20/20 [08:28<00:00, 25.44s/it]
100%|██████████| 20/20 [08:30<00:00, 25.50s/it]
100%|██████████| 17/17 [07:12<00:00, 25.46s/it]
100%|██████████| 7/7 [57:25<00:00, 492.27s/it]


### check chromaDB collections

In [63]:
# we can access the underlying chromadb HttpClient using lc_client._client for granular operations
# ⚠️ important to not get these confused!

collection = lc_client._client.get_collection(collection_name)

lc_client._client._peek(collection.id) # _peek needs UUID instead of collection_name

{'ids': ['c8f4d2ff-41cb-11ee-a421-c98b5e6f8eef',
  'cc64144b-41cb-11ee-81eb-c98b5e6f8eef',
  'cce06edf-41cb-11ee-9969-c98b5e6f8eef',
  '95e0155f-41cc-11ee-9312-c98b5e6f8eef',
  'a52c9369-41cc-11ee-a5cf-c98b5e6f8eef',
  'b46fdeef-41cc-11ee-a3ed-c98b5e6f8eef',
  'c385116b-41cc-11ee-be15-c98b5e6f8eef',
  'd2d6b071-41cc-11ee-a4e8-c98b5e6f8eef',
  'e2433c2c-41cc-11ee-9e73-c98b5e6f8eef',
  'f18a3308-41cc-11ee-99d3-c98b5e6f8eef'],
 'embeddings': [[-0.01761186681687832,
   -0.02491205371916294,
   0.008221112191677094,
   -0.009417643770575523,
   -0.013914719223976135,
   0.03041072003543377,
   -0.018821842968463898,
   -0.016643887385725975,
   0.006543951574712992,
   -0.024145735427737236,
   0.020583029836416245,
   0.027936993166804314,
   -0.014371821656823158,
   -0.004026529844850302,
   -0.012462749145925045,
   0.013565171509981155,
   0.039687201380729675,
   0.017181653529405594,
   0.03570772334933281,
   -0.00474579306319356,
   -0.004009724594652653,
   0.01841851696372032,
  

In [65]:
lc_client.get(include=['documents', 'metadatas'])

{'ids': ['c8f4d2ff-41cb-11ee-a421-c98b5e6f8eef',
  'cc64144b-41cb-11ee-81eb-c98b5e6f8eef',
  'cce06edf-41cb-11ee-9969-c98b5e6f8eef',
  '95e0155f-41cc-11ee-9312-c98b5e6f8eef',
  'a52c9369-41cc-11ee-a5cf-c98b5e6f8eef',
  'b46fdeef-41cc-11ee-a3ed-c98b5e6f8eef',
  'c385116b-41cc-11ee-be15-c98b5e6f8eef',
  'd2d6b071-41cc-11ee-a4e8-c98b5e6f8eef',
  'e2433c2c-41cc-11ee-9e73-c98b5e6f8eef',
  'f18a3308-41cc-11ee-99d3-c98b5e6f8eef',
  '00cb23cc-41cd-11ee-be28-c98b5e6f8eef',
  '0fea54bc-41cd-11ee-9f22-c98b5e6f8eef',
  '2006c8e3-41cd-11ee-8454-c98b5e6f8eef',
  '2f24c760-41cd-11ee-a244-c98b5e6f8eef',
  '3e3e7b37-41cd-11ee-b7a9-c98b5e6f8eef',
  '4d654af9-41cd-11ee-8df1-c98b5e6f8eef',
  '5c770505-41cd-11ee-a28f-c98b5e6f8eef',
  '6bb12d11-41cd-11ee-9fd5-c98b5e6f8eef',
  '7ad39531-41cd-11ee-bf95-c98b5e6f8eef',
  '8a2f4120-41cd-11ee-ad16-c98b5e6f8eef',
  '995b5f5d-41cd-11ee-b61b-c98b5e6f8eef',
  'a87199da-41cd-11ee-baa3-c98b5e6f8eef',
  'b78473aa-41cd-11ee-b7be-c98b5e6f8eef',
  'c6a7bc0d-41cd-11ee-bbfd-