In [1]:
import re
import json

from pandas import DataFrame

from rdflib.graph import Graph
from rdflib.plugins.sparql.processor import SPARQLResult

In [2]:
def sparql_results_to_df(results: SPARQLResult) -> DataFrame:
    """
    Export results from an rdflib SPARQL query into a `pandas.DataFrame`,
    using Python types. See https://github.com/RDFLib/rdflib/issues/1179.
    """
    return DataFrame(
        data=([None if x is None else x.toPython() for x in row] for row in results),
        columns=[str(x) for x in results.vars],
    )

In [21]:
OUTPUT_FILE = "indexing_data.json"
ontology_files = [
    {"variable": "ARCHIVES", "in":["archive.ttl"]},
    {"variable": "INTERPRETATION", "in":["interpretation.ttl"]},    
    {"variable": "PROXIES", "in":["paleo_proxy.ttl", "chron_proxy.ttl"]},
    {"variable": "UNITS", "in":["paleo_units.ttl", "chron_units.ttl"]},
    {"variable": "VARIABLES", "in":["paleo_variables.ttl", "chron_variables.ttl"]}
]

data = []

for ontfile in ontology_files:
    ttl_files = ontfile["in"]
    var_name = ontfile["variable"]

    indexing_data = {}
    for ttl_file in ttl_files:
        graph = Graph()
        graph.parse(ttl_file)

        result = graph.query("""
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX le: <http://linked.earth/ontology#>

        SELECT ?id ?type ?label ?description ?noaa (GROUP_CONCAT(?syn;separator=" | ") as ?synonyms) WHERE {
            ?id rdfs:label ?label .
            ?id a ?type .
            ?id rdfs:comment ?description .
            FILTER( STRSTARTS(STR(?type),str(:))) .
            OPTIONAL { ?id le:hasNoaaPastName ?noaa }
            OPTIONAL { ?id le:hasSynonym ?syn }
        }
        GROUP BY ?id
        """)

        df = sparql_results_to_df(result)
        # display(df)

        tmp = df.to_dict(orient="records")
        for line in tmp:
            label = line["label"]
            noaa = line["noaa"]
            description = line["description"]
            id = line["id"]

            # Get the type
            type = re.sub("^.*#", "", line["type"])
            type_text = re.sub(r"([a-z])([A-Z])", "\\1 \\2", type)
            
            # Get the localname
            localname = re.sub("^.*#", "", id)

            # Add synonyms  
            line["synonyms"] = re.split("\s*\|\s*", line["synonyms"])
            line["synonyms"].insert(0, label.lower()) # Label is also a synonym
            line["synonyms"].insert(1, localname.lower()) # Localname is also a synonym
            if noaa:
                line["synonyms"].insert(2, noaa.lower()) # NOAA past name is also a synonym
            
            if '' in line["synonyms"]:
                line["synonyms"].remove('')
            line["synonyms"] = list(set(line["synonyms"]))

            synonyms_text = ", ".join([f"'{syn}'" for syn in line["synonyms"]])
            full_text = f"{label} {type_text}: {description}. Synonyms: {synonyms_text}"
            indexing_data = {
                "id": id,
                "type": line["type"],
                "name": label,
                "description": description,
                "synonyms": line["synonyms"],
                "full_text": full_text
            }
            data.append(indexing_data)

with open(OUTPUT_FILE, "w") as ofile:
    json.dump(data, ofile, indent=3)

In [20]:
data

[{'id': 'http://linked.earth/ontology/archive#Borehole',
  'type': 'http://linked.earth/ontology/archive#ArchiveType',
  'name': 'Borehole',
  'description': 'a data type that consists of direct measurements of subsurface temperature from boreholes drilled into materials such as rock and glacier ice.',
  'synonyms': ['borehole'],
  'full_text': "Borehole Archive Type: a data type that consists of direct measurements of subsurface temperature from boreholes drilled into materials such as rock and glacier ice.. Synonyms: 'borehole'"},
 {'id': 'http://linked.earth/ontology/archive#Coral',
  'type': 'http://linked.earth/ontology/archive#ArchiveType',
  'name': 'Coral',
  'description': 'an identifiable organism that belongs to the kingdom Animalia, phylum Cnindaria.',
  'synonyms': ['coral'],
  'full_text': "Coral Archive Type: an identifiable organism that belongs to the kingdom Animalia, phylum Cnindaria.. Synonyms: 'coral'"},
 {'id': 'http://linked.earth/ontology/archive#GlacierIce',
  