# Geo QA 

### setup

In [35]:
from dotenv import load_dotenv

# Prerequisite: create a file here: `/src/.env` and populate it like shown in `src/.env-example`.
# load_dotenv() reads that file into the process environment
# (presumably the API credentials used by the embedding step — confirm against the example file).
load_dotenv()

## Document Creation 

### get URIs from .rdf files

In [17]:
def get_URIs_from_file(fp, baseURL):
    """Collect the distinct subject URIs of an RDF file, split by relevance.

    Args:
        fp: Path of the RDF file to parse.
        baseURL: Prefix a subject URI must start with to count as relevant.
            Subjects ending in 'adms/0.1' are classed as not relevant even
            when they carry the prefix.

    Returns:
        A tuple ``(relevant, not_relevant)`` of de-duplicated URI lists
        (order is arbitrary). If rdflib/tqdm cannot be imported or the file
        cannot be parsed, an error is printed and ``([], [])`` is returned so
        callers that unpack two values keep working.
    """
    try:
        # imported lazily so a missing dependency is reported like a parse error
        from rdflib import Graph
        from tqdm import tqdm

        graph = Graph().parse(fp)
    except Exception as err:  # not a bare except: Ctrl-C must still interrupt
        print('Error: couldn\'t parse file. Check the file path, and if rdflib is installed.')
        print(f'{type(err).__name__}: {err}')
        return [], []

    # sets de-duplicate on insert: one URI is the subject of many triples
    uris = set()
    uris_not_relevant = set()
    for triple in tqdm(graph):
        subject = str(triple[0])
        if subject.startswith(baseURL) and not subject.endswith('adms/0.1'):
            uris.add(subject)
        else:
            uris_not_relevant.add(subject)

    return list(uris), list(uris_not_relevant)

In [27]:
# sanity check: extract the lithology URIs and preview both result lists
relevant, not_relevant = get_URIs_from_file(
    fp='data/rdf/lithology.rdf',
    baseURL='http://resource.geolba.ac.at/lithology/',
)
# one line per list: its size followed by the first two entries
for uri_list in (relevant, not_relevant):
    print(len(uri_list), uri_list[0:2])

100%|██████████| 5642/5642 [00:00<00:00, 351482.51it/s]

257 ['http://resource.geolba.ac.at/lithology/78', 'http://resource.geolba.ac.at/lithology/250']
68 ['http://resource.geosciml.org/classifier/cgi/lithology/carbonate_sedimentary_rock', 'http://inspire.ec.europa.eu/codelist/LithologyValue/ashTuffLapillistoneAndLapilliTuff']





In [34]:
import os

data_dir = 'data/rdf/'
thesaurus_baseURL = 'http://resource.geolba.ac.at/' # {graphname} + '/'
uri_dict = {}

# os.scandir yields entries in arbitrary order and holds an OS handle:
# close it via the context manager and sort by file name so the processing
# order (and therefore the key order of uri_dict) is reproducible.
with os.scandir(data_dir) as entries:
    rdf_entries = sorted(
        (entry for entry in entries if entry.is_file() and entry.name.endswith('.rdf')),
        key=lambda entry: entry.name,
    )

for entry in rdf_entries:
    fp = entry.path
    # derive the graph name from the file name itself, not by splitting the
    # path on '/', which breaks with other path separators
    graph_name = entry.name.split('.')[0]
    baseURL = f"{thesaurus_baseURL}{graph_name}/"
    print(f'Processing file {fp} with baseURL {baseURL}')
    # only the relevant URIs are kept; the second list is discarded
    file_uris, _ = get_URIs_from_file(fp=fp, baseURL=baseURL)
    uri_dict[graph_name] = file_uris

print('\nEntities found:')
for key, graph_uris in uri_dict.items():
    print(f'{key}: {len(graph_uris)} nodes')

Processing file data/rdf/GeologicTimeScale.rdf with baseURL http://resource.geolba.ac.at/GeologicTimeScale/


100%|██████████| 5532/5532 [00:00<00:00, 355839.80it/s]


Processing file data/rdf/GeologicUnit.rdf with baseURL http://resource.geolba.ac.at/GeologicUnit/


100%|██████████| 19446/19446 [00:00<00:00, 365496.79it/s]


Processing file data/rdf/lithology.rdf with baseURL http://resource.geolba.ac.at/lithology/


100%|██████████| 5642/5642 [00:00<00:00, 294878.11it/s]


Processing file data/rdf/mineral.rdf with baseURL http://resource.geolba.ac.at/mineral/


100%|██████████| 16118/16118 [00:00<00:00, 359437.86it/s]


Processing file data/rdf/minres.rdf with baseURL http://resource.geolba.ac.at/minres/


100%|██████████| 1761/1761 [00:00<00:00, 319775.28it/s]


Processing file data/rdf/structure.rdf with baseURL http://resource.geolba.ac.at/structure/


100%|██████████| 4984/4984 [00:00<00:00, 366975.83it/s]


Processing file data/rdf/tectonicunit.rdf with baseURL http://resource.geolba.ac.at/tectonicunit/


100%|██████████| 6915/6915 [00:00<00:00, 353435.36it/s]


Entities found:
GeologicTimeScale: 221 nodes
GeologicUnit: 970 nodes
lithology: 257 nodes
mineral: 825 nodes
minres: 103 nodes
structure: 285 nodes
tectonicunit: 393 nodes





### customize URLLoader

In [44]:
"""
Overriding the load() function of SeleniumURLLoader
"""
from langchain.document_loaders import SeleniumURLLoader

import logging
logger = logging.getLogger(SeleniumURLLoader.__name__)

from typing import TYPE_CHECKING, List, Literal, Optional, Union
from langchain.docstore.document import Document

from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.remote.webelement import WebElement
from selenium.common.exceptions import TimeoutException

from tqdm import tqdm

class SeleniumURLLoaderCustom(SeleniumURLLoader):
    """SeleniumURLLoader whose load() keeps only the main section of each page.

    load() renders every URL with Selenium, converts the page source to text
    and keeps the part between the 'RDF download' and 'skos:prefLabel'
    markers, reformatted so that language tags and concept relations share a
    line with their values.
    """

    # relation labels that appear on a line of their own in the page text
    _RELATIONS = (
        'broader',
        'broadMatch',
        'narrower',
        'narrowMatch',
        'exactMatch',
        'related',
        'closeMatch',
    )

    def _build_doc_text(self, elements: List[WebElement]) -> str:
        """Concatenate the text of the given elements, printing each one.

        Not called by load(); kept only until the TODO below is resolved.
        """
        # TODO remove old function
        doc_text = ''

        for elem in elements:
            print(elem.text)

            doc_text += elem.text

        return doc_text

    def _format_lnbreak(self, text: str, relation: str) -> str:
        """Turn '<relation>' followed by a line break into '<relation>: '."""
        return text.replace(f'{relation}\n', f'{relation}: ')

    def load(self) -> List[Document]:
        """Load the specified URLs using Selenium and create Document instances.

        URLs whose page reports 'no results for <br>URI' after the wait timed
        out are skipped. Any other failure is logged when
        ``continue_on_failure`` is set and re-raised otherwise. The driver is
        shut down in every case.

        Returns:
            List[Document]: A list of Document instances with loaded content.
        """
        from unstructured.partition.html import partition_html

        docs: List[Document] = list()
        driver = self._get_driver()

        try:
            for url in (pbar := tqdm(self.urls)):
                pbar.set_description(f'Processing URL: {"".join(url.split("/")[-2:-1])}')
                try:
                    try:
                        driver.get(url)
                        # wait (up to 5 s) until an <h3> element is present
                        WebDriverWait(driver, 5).until(
                            EC.all_of(
                                EC.presence_of_element_located((By.CSS_SELECTOR, "h3")),
                            )
                        )
                    except TimeoutException:
                        if 'no results for <br>URI' in driver.page_source:
                            continue # skip this iteration if URI is not "real"
                        # any other timeout: carry on with whatever has loaded

                    # get the source HTML and flatten it to newline-separated text
                    html = driver.page_source
                    elements = partition_html(text=html)
                    text = "\n".join([str(el) for el in elements])

                    # use only the text in the main section
                    # (IndexError if the 'RDF download' marker is missing; handled below)
                    text = text.split('RDF download')[1].split('skos:prefLabel')[0]

                    # remove line breaks from name
                    text = text.replace('  de\n', '  (de) / ')
                    text = text.replace('  en\n', '  (en)\n')

                    # reformat concept relations
                    for relation in self._RELATIONS:
                        text = self._format_lnbreak(text, relation)

                    docs.append(Document(page_content=text, metadata={"source": url}))
                except Exception as e:
                    if self.continue_on_failure:
                        logger.error(f"Error fetching or processing {url}, {type(e).__name__}: {e}")
                    else:
                        raise
        finally:
            # always release the browser, also when an error is re-raised
            driver.quit()

        return docs

In [43]:
# check the output of the custom URLLoader for an example URL:
# loads a single tectonicunit page in Chrome; the returned list of Documents is the cell output
SeleniumURLLoaderCustom(urls=['http://resource.geolba.ac.at/tectonicunit/194'], browser='chrome').load()

Processing URL: tectonicunit: 100%|██████████| 1/1 [00:01<00:00,  1.94s/it]


[Document(page_content='\nTroiseck-Floning-Teildecke  (de) / Troiseck-Floning Subnappe  (en)\nConcept relations\nbroader: Bösenstein-Pletzen Nappe\n', metadata={'source': 'http://resource.geolba.ac.at/tectonicunit/194'})]

### use URIs and URLLoader to create documents

In [7]:
# create a ⚠️testing⚠️ dataset: for every graph keep only the URIs at positions 10..29

test_uris = {graph_name: graph_uris[10:30] for graph_name, graph_uris in uri_dict.items()}

display(test_uris)

{'GeologicTimeScale': ['http://resource.geolba.ac.at/GeologicTimeScale/20',
  'http://resource.geolba.ac.at/GeologicTimeScale/183',
  'http://resource.geolba.ac.at/GeologicTimeScale/182',
  'http://resource.geolba.ac.at/GeologicTimeScale/114',
  'http://resource.geolba.ac.at/GeologicTimeScale/32',
  'http://resource.geolba.ac.at/GeologicTimeScale/40',
  'http://resource.geolba.ac.at/GeologicTimeScale/156',
  'http://resource.geolba.ac.at/GeologicTimeScale/205',
  'http://resource.geolba.ac.at/GeologicTimeScale/152',
  'http://resource.geolba.ac.at/GeologicTimeScale/53',
  'http://resource.geolba.ac.at/GeologicTimeScale/96',
  'http://resource.geolba.ac.at/GeologicTimeScale/119',
  'http://resource.geolba.ac.at/GeologicTimeScale/213',
  'http://resource.geolba.ac.at/GeologicTimeScale/48',
  'http://resource.geolba.ac.at/GeologicTimeScale/94',
  'http://resource.geolba.ac.at/GeologicTimeScale/197',
  'http://resource.geolba.ac.at/GeologicTimeScale/184',
  'http://resource.geolba.ac.at/Ge

In [None]:
# fetch data for each URI of the testing dataset

docs_dict = {}

# iterate over test_uris itself (not uri_dict): these are the URIs actually
# loaded, so a key missing from test_uris can no longer raise a KeyError
for k, uris in test_uris.items():
    loader = SeleniumURLLoaderCustom(urls=uris, browser='chrome')
    docs = loader.load()
    docs_dict[k] = docs

display(docs_dict)

## Document Embedding

### ChromaDB setup

In [1]:
# import the chromaDB driver and open a connection
# (HTTP client pointed at localhost:8000 — assumes a Chroma server is already running there)
import chromadb
chroma_client = chromadb.HttpClient(host="localhost", port="8000")

# last expression: show the installed chromadb version as the cell output
chromadb.__version__

'0.4.6'

In [42]:
# LangChain wrapper around chromaDB - don't mix these up!
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings

# name of the chromaDB collection this notebook writes to and reads from
collection_name = "geo_test"

# LangChain vector store that reuses the chromadb connection opened above
# and embeds documents with OpenAIEmbeddings
lc_client = Chroma(client=chroma_client,
                collection_name=collection_name,
                embedding_function=OpenAIEmbeddings())

In [43]:
# list the collections known to the chromadb client wrapped by the LangChain store
lc_client._client.list_collections()

[Collection(name=test_langchain), Collection(name=geo_test)]

### embedding and storing documents in chromaDB

In [62]:
import time
from tqdm import tqdm

# outer bar: one step per graph; inner bar: one step per document of that graph
for k in tqdm(docs_dict.keys()):
    for doc in tqdm(docs_dict[k]):
        # look for an entry stored with exactly this metadata (the source URI)
        get_res = lc_client.get(where=doc.metadata, include=['metadatas'])

        if get_res['ids']:
            # already stored: overwrite the first matching entry
            lc_client.update_document(get_res['ids'][0], doc)
        else:
            # unknown URI: insert as a new entry
            lc_client.add_documents([doc])

        time.sleep(25) # rate limit for embedding: 3/min



100%|██████████| 19/19 [08:07<00:00, 25.66s/it]
100%|██████████| 20/20 [08:30<00:00, 25.55s/it]
100%|██████████| 20/20 [08:32<00:00, 25.61s/it]
100%|██████████| 19/19 [08:03<00:00, 25.46s/it]
100%|██████████| 20/20 [08:28<00:00, 25.44s/it]
100%|██████████| 20/20 [08:30<00:00, 25.50s/it]
100%|██████████| 17/17 [07:12<00:00, 25.46s/it]
100%|██████████| 7/7 [57:25<00:00, 492.27s/it]


### check chromaDB collections

In [63]:
# we can access the underlying chromadb HttpClient using lc_client._client for granular operations
# ⚠️ important to not get these confused!

collection = lc_client._client.get_collection(collection_name)

# use the collection's public peek() instead of the client's private
# _peek(collection.id); it shows the first entries of the collection
collection.peek()

{'ids': ['c8f4d2ff-41cb-11ee-a421-c98b5e6f8eef',
  'cc64144b-41cb-11ee-81eb-c98b5e6f8eef',
  'cce06edf-41cb-11ee-9969-c98b5e6f8eef',
  '95e0155f-41cc-11ee-9312-c98b5e6f8eef',
  'a52c9369-41cc-11ee-a5cf-c98b5e6f8eef',
  'b46fdeef-41cc-11ee-a3ed-c98b5e6f8eef',
  'c385116b-41cc-11ee-be15-c98b5e6f8eef',
  'd2d6b071-41cc-11ee-a4e8-c98b5e6f8eef',
  'e2433c2c-41cc-11ee-9e73-c98b5e6f8eef',
  'f18a3308-41cc-11ee-99d3-c98b5e6f8eef'],
 'embeddings': [[-0.01761186681687832,
   -0.02491205371916294,
   0.008221112191677094,
   -0.009417643770575523,
   -0.013914719223976135,
   0.03041072003543377,
   -0.018821842968463898,
   -0.016643887385725975,
   0.006543951574712992,
   -0.024145735427737236,
   0.020583029836416245,
   0.027936993166804314,
   -0.014371821656823158,
   -0.004026529844850302,
   -0.012462749145925045,
   0.013565171509981155,
   0.039687201380729675,
   0.017181653529405594,
   0.03570772334933281,
   -0.00474579306319356,
   -0.004009724594652653,
   0.01841851696372032,
  

In [65]:
# no filter is passed: fetch the stored entries with their documents and metadata
lc_client.get(include=['documents', 'metadatas'])

{'ids': ['c8f4d2ff-41cb-11ee-a421-c98b5e6f8eef',
  'cc64144b-41cb-11ee-81eb-c98b5e6f8eef',
  'cce06edf-41cb-11ee-9969-c98b5e6f8eef',
  '95e0155f-41cc-11ee-9312-c98b5e6f8eef',
  'a52c9369-41cc-11ee-a5cf-c98b5e6f8eef',
  'b46fdeef-41cc-11ee-a3ed-c98b5e6f8eef',
  'c385116b-41cc-11ee-be15-c98b5e6f8eef',
  'd2d6b071-41cc-11ee-a4e8-c98b5e6f8eef',
  'e2433c2c-41cc-11ee-9e73-c98b5e6f8eef',
  'f18a3308-41cc-11ee-99d3-c98b5e6f8eef',
  '00cb23cc-41cd-11ee-be28-c98b5e6f8eef',
  '0fea54bc-41cd-11ee-9f22-c98b5e6f8eef',
  '2006c8e3-41cd-11ee-8454-c98b5e6f8eef',
  '2f24c760-41cd-11ee-a244-c98b5e6f8eef',
  '3e3e7b37-41cd-11ee-b7a9-c98b5e6f8eef',
  '4d654af9-41cd-11ee-8df1-c98b5e6f8eef',
  '5c770505-41cd-11ee-a28f-c98b5e6f8eef',
  '6bb12d11-41cd-11ee-9fd5-c98b5e6f8eef',
  '7ad39531-41cd-11ee-bf95-c98b5e6f8eef',
  '8a2f4120-41cd-11ee-ad16-c98b5e6f8eef',
  '995b5f5d-41cd-11ee-b61b-c98b5e6f8eef',
  'a87199da-41cd-11ee-baa3-c98b5e6f8eef',
  'b78473aa-41cd-11ee-b7be-c98b5e6f8eef',
  'c6a7bc0d-41cd-11ee-bbfd-