In [1]:
import json
import re # For regular expressions, useful for cleaning blank node IDs
from rdflib import Graph, Literal, URIRef, BNode # Import Graph, Literal, URIRef, BNode from rdflib
from rdflib.namespace import RDF, DCAT, DCTERMS, FOAF, PROV, SKOS, XSD, Namespace # Import necessary namespaces

try:
    from shapely import wkt
    SHAPELY_AVAILABLE = True
except ImportError:
    print("Shapely not found.")
    SHAPELY_AVAILABLE = False
    

In [4]:
def dcat_turtle_to_ogc_records_feature(turtlefile: str) -> dict:
    """
    Converts a DCAT Dataset RDF Turtle file to an OGC API Records compatible
    GeoJSON Feature structure by directly parsing the Turtle.

    The first subject typed ``dcat:Dataset`` is expanded recursively into a
    nested JSON structure; selected properties are then mapped onto the
    ``properties``, ``geometry`` and ``links`` members of the record. Properties
    without an explicit mapping are copied into ``properties`` under their
    prefixed name.

    Language-tagged literals of one property are combined into a language map
    ``{lang: text}``; when a language occurs more than once the entry becomes a
    list ``{lang: [text, ...]}`` so that no value is dropped.

    Args:
        turtlefile (str): Location of the DCAT Turtle document, passed to
            ``rdflib.Graph.parse``.

    Returns:
        dict: A GeoJSON Feature dictionary representing the OGC API Record.
              Returns an empty dictionary if parsing fails or no
              ``dcat:Dataset`` is present.
    """
    g = Graph()
    try:
        # Parse the Turtle file into an RDF graph
        g.parse(turtlefile, format="turtle")
    except Exception as e:
        print(f"Error parsing Turtle file '{turtlefile}': {e}")
        return {}

    # Define prefixes for common namespaces to use in the output
    prefixes = {
        str(DCAT): "dcat",
        str(DCTERMS): "dcterms",
        str(FOAF): "foaf",
        str(PROV): "prov",
        str(SKOS): "skos",
        str(XSD): "xsd",
        str(Namespace("http://www.w3.org/2006/vcard/ns#")): "vcard",
        str(Namespace("http://www.w3.org/2011/content#")): "cnt",
        str(Namespace("http://www.w3.org/ns/dqv#")): "dqv",
        str(Namespace("http://www.opengis.net/ont/geosparql#")): "geo",
        str(Namespace("http://www.w3.org/ns/locn#")): "locn",
        str(Namespace("http://www.w3.org/2002/07/owl#")): "owl",
        str(Namespace("http://schema.org/")): "schema1",
        str(Namespace("http://www.w3.org/ns/odrl/2/")): "odrl"
    }

    geojson_literal = URIRef("http://www.opengis.net/ont/geosparql#geoJSONLiteral")
    wkt_literal = URIRef("http://www.opengis.net/ont/geosparql#wktLiteral")
    integer_datatypes = {XSD.integer, XSD.int, XSD.long,
                         XSD.nonNegativeInteger, XSD.positiveInteger}
    float_datatypes = {XSD.decimal, XSD.double, XSD.float}

    # str(node) -> node for every URI / blank node that was emitted as a bare
    # string. Lets later code re-expand a reference without guessing whether a
    # string was a URI, a blank node id or a plain literal.
    known_nodes = {}

    # ------------------------------------------------------------------
    # Small JSON-value helpers
    # ------------------------------------------------------------------
    def as_list(value):
        """Wraps a single value in a list; None becomes an empty list."""
        if value is None:
            return []
        return value if isinstance(value, list) else [value]

    def pick_label(value, preferred_languages, default):
        """
        Returns one display string from a plain string, a language map or a
        list of those. Preferred languages are tried first, then any other
        language; `default` is returned when nothing usable is found.
        """
        for item in as_list(value):
            if isinstance(item, str) and item:
                return item
            if isinstance(item, dict):
                for lang in preferred_languages:
                    texts = as_list(item.get(lang))
                    if texts and isinstance(texts[0], str):
                        return texts[0]
                for lang, text in item.items():
                    # Keys containing ':' or starting with '@' belong to nested
                    # objects ("@id", "dcterms:title"), not to language maps.
                    if ":" in lang or lang.startswith("@"):
                        continue
                    texts = as_list(text)
                    if texts and isinstance(texts[0], str):
                        return texts[0]
        return default

    def first_text(value):
        """Returns the first non-empty value as a string, or None."""
        for item in as_list(value):
            if isinstance(item, dict):
                label = pick_label(item, (), None)
                if label:
                    return label
            elif item is not None and item != "":
                return str(item)
        return None

    def flatten_texts(value):
        """Flattens strings, lists and language maps into one flat list."""
        texts = []
        for item in as_list(value):
            if isinstance(item, dict):
                for entry in item.values():
                    texts.extend(as_list(entry))
            elif item is not None and item != "":
                texts.append(item)
        return texts

    def first_reference(value):
        """Returns the first string (or '@id' of a nested object) in value."""
        for item in as_list(value):
            if isinstance(item, dict):
                item = item.get("@id")
            if isinstance(item, str) and item:
                return item
        return None

    def first_geometry(value):
        """Returns the first GeoJSON-like dict (has a 'type' key), or None."""
        for item in as_list(value):
            if isinstance(item, dict) and "type" in item:
                return item
        return None

    # ------------------------------------------------------------------
    # RDF -> JSON conversion
    # ------------------------------------------------------------------
    def get_prefixed_name(uri_ref: URIRef) -> str:
        """Converts a full URI into a prefixed name (e.g., http://www.w3.org/ns/dcat#Dataset -> dcat:Dataset)."""
        uri_str = str(uri_ref)
        for prefix_uri, prefix_name in prefixes.items():
            if uri_str.startswith(prefix_uri):
                return f"{prefix_name}:{uri_str[len(prefix_uri):]}"
        return uri_str # Return full URI if no prefix found

    def parse_wkt_to_geojson(wkt_string):
        """Attempts to parse a WKT string to GeoJSON using shapely; returns the input string on failure."""
        if SHAPELY_AVAILABLE:
            try:
                geom = wkt.loads(wkt_string)
                # Round-trip through JSON so coordinate tuples become lists
                return json.loads(json.dumps(geom.__geo_interface__))
            except Exception as e:
                print(f"Warning: Could not parse WKT '{wkt_string}' to GeoJSON: {e}")
                return wkt_string # Fallback to string
        return wkt_string # Return as string if shapely not available

    def process_rdf_node(node, graph, visited_complex_nodes: set):
        """
        Recursively processes an RDF node (URI, Literal, or Blank Node) into a JSON value.
        `visited_complex_nodes` is used to prevent infinite recursion for cyclic graphs:
        a node is expanded into a dict only the first time it is met, afterwards it is
        returned as its string id.
        """
        if isinstance(node, Literal):
            text = str(node)
            if node.language:
                return {node.language: text}
            datatype = node.datatype
            if datatype is None:
                return text
            if datatype == geojson_literal:
                try:
                    return json.loads(text)
                except json.JSONDecodeError:
                    return text
            if datatype == wkt_literal:
                return parse_wkt_to_geojson(text)
            # Ill-formed numeric literals fall back to their lexical form
            # instead of aborting the whole conversion.
            if datatype in integer_datatypes:
                try:
                    return int(text)
                except ValueError:
                    return text
            if datatype in float_datatypes:
                try:
                    return float(text)
                except ValueError:
                    return text
            if datatype == XSD.boolean:
                # xsd:boolean allows both "true"/"false" and "1"/"0"
                return text.strip().lower() in ("true", "1")
            return text # Dates and all other datatypes stay strings

        if not isinstance(node, (URIRef, BNode)):
            return str(node) # Fallback for unexpected node types

        has_properties = any(True for _ in graph.predicate_objects(node))
        if not has_properties or node in visited_complex_nodes:
            # Simple reference, or already expanded elsewhere: return the id only
            known_nodes[str(node)] = node
            return str(node)

        visited_complex_nodes.add(node)

        # Initialize object with its own ID if it's a URIRef
        obj_data = {"@id": str(node)} if isinstance(node, URIRef) else {}

        # Group objects per property first, so multi-valued properties are
        # assembled in one step instead of being patched value by value.
        grouped = {}
        for p, o in graph.predicate_objects(node):
            grouped.setdefault(get_prefixed_name(p), []).append(o)

        for prop_key, objects in grouped.items():
            if all(isinstance(o, Literal) and o.language for o in objects):
                # Pure language-tagged property -> language map. A repeated
                # language keeps all its values in a list.
                language_map = {}
                for o in objects:
                    text = str(o)
                    if o.language not in language_map:
                        language_map[o.language] = text
                    elif isinstance(language_map[o.language], list):
                        language_map[o.language].append(text)
                    else:
                        language_map[o.language] = [language_map[o.language], text]
                obj_data[prop_key] = language_map
            else:
                values = [process_rdf_node(o, graph, visited_complex_nodes) for o in objects]
                obj_data[prop_key] = values[0] if len(values) == 1 else values
        return obj_data

    def resolve_object(value):
        """
        Returns `value` as a nested object dict. Dicts are returned unchanged;
        a string that is the id of a known URI / blank node with properties is
        expanded with a fresh visited set. Anything else yields None.
        """
        if isinstance(value, dict):
            return value
        if isinstance(value, str):
            node = known_nodes.get(value)
            if node is not None:
                expanded = process_rdf_node(node, g, set())
                if isinstance(expanded, dict):
                    return expanded
        return None

    def first_object(value):
        """Returns the first entry of value that resolves to an object dict, or None."""
        for item in as_list(value):
            resolved = resolve_object(item)
            if resolved is not None:
                return resolved
        return None

    # ------------------------------------------------------------------
    # Record building helpers
    # ------------------------------------------------------------------
    def agent_summary(agent):
        """Reduces a foaf agent object to its name and mailbox."""
        return {
            "name": agent.get("foaf:name"),
            "email": agent.get("foaf:mbox")
        }

    def summarize_agents(value):
        """
        Maps one agent or a list of agents to name/email summaries. Entries that
        cannot be resolved to an object (e.g. a bare URI) are kept as they are.
        """
        summaries = []
        for item in as_list(value):
            agent = resolve_object(item)
            summaries.append(agent_summary(agent) if agent is not None else item)
        if isinstance(value, list):
            return summaries
        return summaries[0] if summaries else value

    def theme_summary(item):
        """Builds the theme entry for one skos:Concept (nested object or bare URI)."""
        theme = resolve_object(item)
        if theme is None:
            # Concept is only referenced, its description is not in the graph
            return {"prefLabel": None, "id": item}
        summary = {"prefLabel": theme.get("skos:prefLabel")}
        if "@id" in theme:
            summary["id"] = theme["@id"]
        if "skos:inScheme" in theme:
            scheme_value = theme["skos:inScheme"]
            scheme = first_object(scheme_value)
            if scheme is not None:
                summary["inScheme"] = {"title": scheme.get("dcterms:title")}
            else:
                summary["inScheme"] = {"title": scheme_value}
        return summary

    def add_links(rel, href_value, media_type, title):
        """Appends one link per usable href to the record's top-level 'links'."""
        for href in as_list(href_value):
            if isinstance(href, dict):
                href = href.get("@id")
            if not isinstance(href, str) or not href:
                continue
            link = {"rel": rel, "href": href}
            if media_type is not None:
                link["type"] = media_type
            link["title"] = title
            ogc_record.setdefault("links", []).append(link)

    # ------------------------------------------------------------------
    # Locate and expand the dataset
    # ------------------------------------------------------------------
    # It can be a blank node or a URIRef. Only the first dataset found is converted.
    main_dataset_subject = next(g.subjects(RDF.type, DCAT.Dataset), None)
    if main_dataset_subject is None:
        print("Error: Could not find the main dcat:Dataset in the Turtle content.")
        return {}

    # A set tracks visited nodes to prevent infinite loops
    processed_dataset_data = process_rdf_node(main_dataset_subject, g, set())

    # Record id: dcterms:identifier if available, otherwise the dataset URI.
    # rdflib's str(BNode) has no "_:" prefix, so blank nodes are detected by type.
    record_id = first_text(processed_dataset_data.get("dcterms:identifier"))
    if record_id is None:
        if isinstance(main_dataset_subject, BNode):
            record_id = "urn:ogc:record:generated-id"
        else:
            record_id = str(main_dataset_subject)

    # Geometry: a bbox directly on the dataset wins; otherwise look inside the
    # dcterms:spatial location(s), where DCAT places dcat:bbox / locn:geometry.
    geometry = first_geometry(processed_dataset_data.get("dcat:bbox"))
    if geometry is None:
        for location_value in as_list(processed_dataset_data.get("dcterms:spatial")):
            location = resolve_object(location_value)
            if location is None:
                continue
            geometry = (first_geometry(location.get("dcat:bbox"))
                        or first_geometry(location.get("locn:geometry")))
            if geometry is not None:
                break

    # Initialize the OGC API Records Feature structure
    ogc_record = {
        "type": "Feature",
        "conformsTo": [
        "http://www.opengis.net/spec/ogcapi-records-1/1.0/req/record-core",
        "http://modellen.geostandaarden.nl/dcat-ap-nl/"
        ],
        "id": record_id,
        "geometry": geometry,
        "properties": {}
    }

    properties = ogc_record["properties"]
    properties["type"] = "dcat:Dataset" # Set the type explicitly for OGC API Records

    # Taken from the processed data (not from `properties`) so link titles do
    # not depend on the order in which the graph yields the triples.
    dataset_title = processed_dataset_data.get("dcterms:title")

    for key, value in processed_dataset_data.items():
        # Skip RDF type and the node id
        if key == str(RDF.type) or key == "@id":
            continue

        if key == "dcat:bbox":
            # Already mapped to geometry; keep an unparseable value visible
            if first_geometry(value) is None:
                properties["dcat:bbox"] = value
            continue

        if key == "dcterms:title":
            properties["title"] = value
        elif key == "dcterms:description":
            properties["description"] = value
        elif key == "dcterms:issued":
            properties["issued"] = value
        elif key == "dcterms:language":
            properties["language"] = value
        elif key == "dcat:keyword":
            # Keywords may be a language map, a list or a single string
            properties["keywords"] = flatten_texts(value)
        elif key == "dcat:landingPage":
            add_links("about", value, "text/html",
                      pick_label(dataset_title, ("en", "nl"), "Landing Page"))
        elif key == "dcat:spatialResolutionInMeters":
            properties["spatialResolutionInMeters"] = value
        elif key == "dcterms:provenance":
            properties["provenance"] = value
        elif key == "dcterms:temporal":
            period = first_object(value)
            if period is None:
                properties["dcterms:temporal"] = value # Not a describable period; keep raw
            else:
                properties["temporalExtent"] = {
                    "startDate": (period.get("schema1:startDate")
                                  or period.get("dcterms:startDate")
                                  or period.get("dcat:startDate")),
                    "endDate": (period.get("schema1:endDate")
                                or period.get("dcterms:endDate")
                                or period.get("dcat:endDate"))
                }
        elif key == "dcat:contactPoint":
            contact = first_object(value)
            if contact is None:
                properties["contactPoint"] = value
            else:
                properties["contactPoint"] = {
                    "organizationName": contact.get("vcard:fn"),
                    "email": contact.get("vcard:hasEmail")
                }
        elif key == "dcat:theme":
            properties["themes"] = [theme_summary(item) for item in as_list(value)]
        elif key == "prov:qualifiedAttribution":
            for attribution_value in as_list(value):
                attribution = resolve_object(attribution_value)
                agent = first_object(attribution.get("prov:agent")) if attribution else None
                if agent is not None:
                    # setdefault: an explicit dcterms:publisher always wins,
                    # whichever of the two properties is visited first.
                    properties.setdefault("publisher", agent_summary(agent))
                    break
        elif key == "dcterms:creator":
            properties["creator"] = summarize_agents(value)
        elif key == "dcterms:publisher":
            properties["publisher"] = summarize_agents(value)
        elif key == "dcat:distribution":
            distributions = properties.setdefault("distributions", [])
            for item in as_list(value):
                dist_node = resolve_object(item)
                if dist_node is None:
                    continue # Reference without description in the graph
                distributions.append({
                    "title": dist_node.get("dcterms:title"),
                    "description": dist_node.get("dcterms:description"),
                    "accessURL": dist_node.get("dcat:accessURL"),
                    "mediaType": dist_node.get("dcat:mediaType"),
                    "format": dist_node.get("dcterms:format"),
                    "license": dist_node.get("dcterms:license"),
                    "accessService": dist_node.get("dcat:accessService"), # This might need further resolution
                    "conformsTo": dist_node.get("dcterms:conformsTo"),
                    "odrl:hasPolicy": dist_node.get("odrl:hasPolicy")
                })
                # Also expose the access URL as a top-level link
                add_links(
                    "access",
                    dist_node.get("dcat:accessURL"),
                    first_reference(dist_node.get("dcat:mediaType"))
                    or first_reference(dist_node.get("dcterms:format")),
                    pick_label(dist_node.get("dcterms:title"), ("nl", "en"), "Access URL")
                )
        elif key == "dcat:accessService":
            services = properties.setdefault("accessServices", [])
            for item in as_list(value):
                service_node = resolve_object(item)
                if service_node is None:
                    continue
                services.append({
                    "title": service_node.get("dcterms:title"),
                    "description": service_node.get("dcterms:description"),
                    "endpointURL": service_node.get("dcat:endpointURL"),
                    "mediaType": service_node.get("dcat:mediaType"),
                    "conformsTo": service_node.get("dcterms:conformsTo"),
                    "identifier": service_node.get("dcterms:identifier"),
                    "publisher": service_node.get("dcterms:publisher") # This might need further resolution
                })
                add_links(
                    "service",
                    service_node.get("dcat:endpointURL"),
                    first_reference(service_node.get("dcat:mediaType")) or "application/json", # Default to json
                    pick_label(service_node.get("dcterms:title"), ("nl", "en"), "Access Service")
                )
        elif key == "dcterms:license":
            if isinstance(value, dict): # Nested object (e.g., LicenseDocument)
                properties["license"] = {
                    "href": value.get("@id"),
                    "title": value.get("dcterms:title")
                }
            else: # Direct URI
                properties["license"] = value
        elif key == "dcterms:conformsTo":
            properties["conformsTo"] = value
        elif key == "odrl:hasPolicy":
            properties["odrlPolicy"] = value
        elif key == "dcat:servesDataset": # This property is on DataService, not Dataset
            pass
        else:
            # Any other property is copied under its (already prefixed) name
            if key not in properties:
                properties[key] = value

    return ogc_record

In [6]:
# Input DCAT Turtle document (alternative input kept for reference):
# filepath = '.\\..\\dcat\\goudappel_dcat.ttl'
filepath = '.\\..\\dcat\\waarneemstations~5~metadata.ttl'
ogc_records_feature = dcat_turtle_to_ogc_records_feature(filepath)

# ensure_ascii=False keeps non-ASCII characters readable in the output
json_output = json.dumps(ogc_records_feature, indent=2, ensure_ascii=False)

# Write with "w" rather than "a": appending on every run of this cell would
# concatenate several JSON documents into one invalid file. The encoding is
# explicit because the JSON may contain non-ASCII characters, and the `with`
# block closes the file even if writing fails.
# Alternative output name: "goudappel_dcat.geojson"
with open("waarneemstations~5~metadata.geojson", "w", encoding="utf-8") as f:
    f.write(json_output)