Example ODV

In [1]:
from src.odv import *
import json
import os
from pathlib import Path

from SPARQLWrapper import SPARQLWrapper, JSON
from langchain import PromptTemplate, OpenAI

Set environment variables

In [2]:
# Paste a key below only for a local run (never commit it). Leaving it blank
# keeps any OPENAI_API_KEY that is already exported in the environment instead
# of overwriting it with an empty string.
OPENAI_API_KEY_OVERRIDE = ""  # enter your OpenAI API key here
if OPENAI_API_KEY_OVERRIDE:
    os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY_OVERRIDE
else:
    # Still guarantee the variable exists, as the original cell did.
    os.environ.setdefault("OPENAI_API_KEY", "")
# NOTE(review): presumably read by src.odv to switch on a test code path — confirm.
os.environ["TEST_MODE"] = "true"
# SPARQL endpoint of the knowledge base queried later in this notebook.
os.environ["SPARQL_ENDPOINT"] = "https://bodc.cluster-clc4weqlymzs.ap-southeast-2.neptune.amazonaws.com:8182/sparql"

Get the 'head' of an ODV file

In [3]:
# Location of the sample ODV file, relative to the notebook's working directory.
odv_path = Path("data/000545_ODV_77AR2009_00095_H09_V0.txt")
# read_file comes from src.odv; its result is printed so the reader can inspect it.
odv_text = read_file(odv_path)
print(odv_text)

﻿//<sdn_reference xlink:href="https://cdi.seadatanet.org/report/edmo/545/77AR2009_00095_H09" xlink:role="isDescribedBy" xlink:type="SDN:L23::CDI"/>
//
//SDN_parameter_mapping
//<subject>SDN:LOCAL:DEPH</subject><object>SDN:P01::ADEPZZ01</object><units>SDN:P06::ULAA</units>
//<subject>SDN:LOCAL:TEMP</subject><object>SDN:P01::TEMPPR01</object><units>SDN:P06::UPAA</units>
//<subject>SDN:LOCAL:PSAL</subject><object>SDN:P01::PSLTZZ01</object><units>SDN:P06::UUUU</units>
//<subject>SDN:LOCAL:PHPH</subject><object>SDN:P01::PHXXZZXX</object><units>SDN:P06::UUPH</units>
//<subject>SDN:LOCAL:DOX1</subject><object>SDN:P01::DOXYZZXX</object><units>SDN:P06::UMLL</units>
//<subject>SDN:LOCAL:HSUL</subject><object>SDN:P01::H2SXZZXX</object><units>SDN:P06::UPOX</units>
//<subject>SDN:LOCAL:PHOS</subject><object>SDN:P01::PHOSZZXX</object><units>SDN:P06::UPOX</units>
//<subject>SDN:LOCAL:TPHS</subject><object>SDN:P01::TPHSZZXX</object><units>SDN:P06::UPOX</units>
//<subject>SDN:LOCAL:NTRI</subject><objec

Populate a Large Language Model prompt with the head of the file:

In [4]:
# Build the LLM prompt from the ODV text loaded in the previous cell and print it
# for inspection. create_odv_prompt is imported from src.odv; judging by the printed
# result it wraps the file text in extraction instructions — confirm against src.odv.
prompt = create_odv_prompt(odv_text)
print(prompt)

    The following data is the first 10000 characters from an Ocean Data View file.
    There may be comment lines at the beginning of the file, which start with //.
    I am interested in, for "value" columns:
        1. Vocabularies/concepts used for the columns, these may be specified in columns with a URN, or they may not be specified at all.
        2. Units/concepts for the same columns. These may also be specified with a URN, or not at all, or in the column heading itself or both.
    I am not interested in "Local" URNs. These are of the form "SDN:LOCAL:ABCD". These are only used to map from the comments to the column labels in the data itself.
    I am interested in non "Local" URNs. These are of the form "SDN:P01::ABCDEFG" These refer to external vocabularies.
    I am also interested in Instrument and Observation Platform information if available.
    If a column name is repeated multiple times, it's probably not actually a column - please ignore it.
    Please extract this in

Run the populated prompt against a large language model - this is akin to sending a question to ChatGPT

In [5]:
# temperature=0 asks for the least random completion, so re-running the notebook
# yields (near-)identical structured output for the parsing step that follows.
llm = OpenAI(model_name="gpt-3.5-turbo-0613", temperature=0)
output = llm(prompt)
print(output)
# NOTE(review): presumably a sample of this response is stored at
# tests/data/odv_response.json — confirm.



{
    "columns": [
        {
            "column_name": {
                "column_unit_text": "[meter]",
                "column_vocabulary_urn": "SDN:P01::ADEPZZ01",
                "column_vocabulary_text": "Bot. Depth"
            }
        },
        {
            "column_name": {
                "column_vocabulary_urn": "SDN:P01::TEMPPR01",
                "column_unit_text": "[Celsius degree]",
                "column_vocabulary_text": "TEMP"
            }
        },
        {
            "column_name": {
                "column_vocabulary_text": "PSAL",
                "column_vocabulary_urn": "SDN:P01::PSLTZZ01",
                "column_unit_text": "[P.S.U.]"
            }
        },
        {
            "column_name": {
                "column_unit_text": "[pH unit]",
                "column_vocabulary_text": "PHPH",
                "column_vocabulary_urn": "SDN:P01::PHXXZZXX"
            }
        },
        {
            "column_name": {
                "column_unit_text": 

Parse URNs, URIs and text from the structured output, then build SPARQL queries that look up the collections or concept schemes containing those URNs

In [7]:
# get_urns_from_odv (src.odv) returns two URN collections from the LLM response:
# one for vocabularies and one for units.
vocab_urns, unit_urns = get_urns_from_odv(output)
# Build one SPARQL query per URN collection, vocabularies first, then units.
vocab_query, unit_query = (
    find_vocabs_sparql(urn_list) for urn_list in (vocab_urns, unit_urns)
)
print(vocab_query)


PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX dcterms: <http://purl.org/dc/terms/>

SELECT ?collection_or_scheme {
  ?concept dcterms:identifier ?urn .
  {
    ?collection_or_scheme skos:member ?concept .
  } UNION {
    ?concept skos:inScheme ?collection_or_scheme .
  }
FILTER(?urn IN (
    "SDN:P01::ADEPZZ01", "SDN:P01::TEMPPR01", "SDN:P01::PSLTZZ01", "SDN:P01::PHXXZZXX", "SDN:P01::DOXYZZXX", "SDN:P01::H2SXZZXX", "SDN:P01::PHOSZZXX", "SDN:P01::TPHSZZXX", "SDN:P01::NTRIZZXX", "SDN:P01::NTRAZZXX", "SDN:P01::AMONZZXX", "SDN:P01::NTOTZZXX", "SDN:P01::ALKYZZXX", "SDN:P01::SLCAZZXX", "SDN:P01::NTOTZZZZ", "SDN:P01::MDMAP010", "SDN:P01::CORGZZTL", "SDN:P01::CPHLZZXX" 
  ))
} 
GROUP BY ?collection_or_scheme
    


Execute the queries against the knowledge base (the SPARQL endpoint configured above)

In [8]:
# Run the vocabulary query first and the unit query second; unpacking consumes
# the map in that order, so both requests are issued here.
vocabs_collections_uris, unit_vocabs_collections_uris = map(
    get_vocabs_from_sparql_endpoint, (vocab_query, unit_query)
)
print(vocabs_collections_uris, unit_vocabs_collections_uris)

['http://vocab.nerc.ac.uk/collection/P01/current/'] []
