# GOE Example dataset


The GDI project has its own specific metadata profile based on HealthDCAT-AP.


Now we are ready to define the dataset. We can do that as a Python dictionary.


In [6]:
from getpass import getpass

from rdflib import URIRef
from rdflib.namespace import DCTERMS, RDF

from sempyro.adms import Identifier
from sempyro.dcat import AccessRights
from sempyro.healthdcatap import (
    HEALTHDCATAPAgent,
    HEALTHDCATAPDataset,
    HEALTHDCATAPDistribution,
    HEALTHDCATAPKind,
    HEALTHDCATAPPublisher,
)

# IRIs later passed to `to_graph` as the subjects of the dataset and its distribution.
dataset_subject = URIRef(
    "https://gdi-fdp-prd.healthdata.nl/dataset/69d0d4a4-d70d-4131-ac1f-e74ad6e5fe5f"
)
distribution_subject = URIRef(
    "https://gdi-fdp-prd.healthdata.nl/distribution/ed723757-e030-4ec0-a905-996f77f68c36"
)

# Erasmus MC details shared by the contact point, the creator and the publisher.
erasmus_mailbox = "mailto:j.vanrooij@erasmusmc.nl"
erasmus_agent_fields = {
    "name": ["Erasmus MC"],
    "identifier": "https://ror.org/018906e22",
    "mbox": [erasmus_mailbox],
    "homepage": "https://erasmusmc.nl/",
}

contact_point_erasmus = HEALTHDCATAPKind(
    hasEmail=erasmus_mailbox,
    formatted_name=["Jeroen van Rooij"],
)

# Creator and publisher describe the same organisation; the publisher
# additionally carries the contact point.
creator_erasmus = HEALTHDCATAPAgent(**erasmus_agent_fields)
publisher_erasmus = HEALTHDCATAPPublisher(
    **erasmus_agent_fields,
    contact_point=contact_point_erasmus,
)

# Kept as a plain string: it is used in the dataset definition and again when
# the serialized graph is post-processed.
conforms_to_uri = "http://data.gdi.eu/core/p2/ExternallyGoverned"

# Secondary identifier: the study's page at EBI EVA.
other_identifier_eva = Identifier(
    notation="https://www.ebi.ac.uk/eva/?eva-study=PRJEB20726",
    schema_agency="EBI EVA",
)

# Literals that appear in both the dataset and the distribution definition.
variants_description = (
    "All variants detected by whole exome sequencing of 2628 Dutch healthy elderly individuals"
)
legislation_uri = "http://data.europa.eu/eli/reg/2025/327/oj"

# Keyword arguments for HEALTHDCATAPDataset (unpacked with ** when the model
# is instantiated).
dataset_definition = {
    "access_rights": AccessRights.public,
    "applicable_legislation": [legislation_uri],
    "conforms_to": [conforms_to_uri],
    "contact_point": [contact_point_erasmus],
    "creator": [creator_erasmus],
    "description": [variants_description],
    # The distribution is referenced by its IRI, passed as a plain string.
    "distribution": [str(distribution_subject)],
    "identifier": ["GOE-NL-ERS-0001"],
    "other_identifier": [other_identifier_eva],
    "keyword": ["Cohort Study"],
    "language": ["http://publications.europa.eu/resource/authority/language/ENG"],
    "license": "http://purl.org/NET/rdflicense/cc-zero1.0",
    "publisher": [publisher_erasmus],
    "theme": ["http://publications.europa.eu/resource/authority/data-theme/HEAL"],
    "health_category": ["http://data.gdi.eu/core/p2/HealthCategoryHumanGenetic"],
    "title": ["Exome Sequencing"],
}

# Note: HEALTHDCATAPDistribution requires http/https URLs, so the FTP URLs are
# normalized to https for validation.
distribution_access_url = (
    "https://ftp.sra.ebi.ac.uk/vol1/analysis/ERZ407/ERZ407/ERZ407864/170419_RSX2_EVA.EVA.vcf.gz"
    if False
    else "https://ftp.sra.ebi.ac.uk/vol1/analysis/ERZ407/ERZ407864/170419_RSX2_EVA.EVA.vcf.gz"
)

# Keyword arguments for HEALTHDCATAPDistribution; the same URL serves as both
# the access URL and the download URL.
distribution_definition = {
    "title": ["Allele Frequencies of 2628 Exome Sequenced Samples."],
    "description": [variants_description],
    "access_url": [distribution_access_url],
    "download_url": [distribution_access_url],
    "byte_size": 6000000,
    "format": "https://edamontology.org/format_3016",
    "rights": ["https://www.ebi.ac.uk/eva/?eva-study=PRJEB20726"],
    "license": "https://definities.geostandaarden.nl/DCAT-AP-NL/id/waarde/licentieValue/niet-open",
    "applicable_legislation": [legislation_uri],
}


Finally, we instantiate the dataset class and print the serialization.


In [None]:
# Validate the dataset definition and turn it into an RDF graph whose subject
# is `dataset_subject`.
example_dataset = HEALTHDCATAPDataset(**dataset_definition)
example_dataset_graph = example_dataset.to_graph(dataset_subject)

# Satisfy GDI SHACL: dcterms:conformsTo must point to a dcterms:Standard,
# so the target of conformsTo is typed explicitly.
conforms_to_typing = (URIRef(conforms_to_uri), RDF.type, DCTERMS.Standard)
example_dataset_graph.add(conforms_to_typing)

# Same steps for the distribution, using `distribution_subject`.
example_distribution = HEALTHDCATAPDistribution(**distribution_definition)
example_distribution_graph = example_distribution.to_graph(distribution_subject)

# The two graphs are merged only for display; they are kept separate for
# everything that follows.
combined_graph = example_dataset_graph + example_distribution_graph
print(combined_graph.serialize(format="turtle"))


Now we can push the dataset to a FAIR Data Point (FDP). For this, we use the
[FAIRClient](https://github.com/Health-RI/fairclient) library, developed by Health-RI.

First, we define a couple of settings. Note: you will need an existing Catalog in the FDP to add
the dataset to. If you don't have one, you can easily create one using the web interface.


In [None]:
# Collect the FDP connection settings interactively.
# The server URL is stripped of surrounding whitespace and trailing slashes so
# that joining it with "/catalog/<id>" never produces "//" in the catalog IRI
# (e.g. when the URL is pasted as "https://fdp.example.org/").
fdp_baseurl = input("Enter FDP server url: ").strip().rstrip("/")
fdp_user = input("Enter FDP username: ").strip()
# getpass reads the password without echoing it; the value is used verbatim.
fdp_pass = getpass(prompt="Password: ")
# Tolerate a catalog ID pasted with stray whitespace or surrounding slashes.
parent_catalog_id = input("Enter parent Catalog ID: ").strip().strip("/")
fdp_parent_catalog = f"{fdp_baseurl}/catalog/{parent_catalog_id}"

# Echo the resolved values so they can be checked before anything is pushed.
print(f"fdp:{fdp_baseurl}")
print(f"parent catalog: {fdp_parent_catalog}")


In [None]:
import fairclient.fdpclient

# Log in to the FAIR Data Point with the settings collected earlier.
fdpclient = fairclient.fdpclient.FDPClient(
    base_url=fdp_baseurl,
    username=fdp_user,
    password=fdp_pass,
)

# Add a reference to the parent catalog to make the FDP happy: the dataset is
# linked to the catalog through dcterms:isPartOf.
parent_catalog_ref = URIRef(fdp_parent_catalog)
example_dataset_graph.add((dataset_subject, DCTERMS.isPartOf, parent_catalog_ref))


In [None]:
# Push the dataset graph to the FDP; the returned value is reused below as the
# target of the distribution's isPartOf link.
new_dataset = fdpclient.create_and_publish(
    resource_type="dataset",
    metadata=example_dataset_graph,
)
print(new_dataset)


To conclude, we can now add the distribution to the dataset. This is done by linking the distribution graph to the newly published dataset (via `dcterms:isPartOf`) and then pushing it to the FDP.


In [None]:
# Link the distribution to the dataset that was just published, using the
# value returned by the FDP for that dataset.
published_dataset_ref = URIRef(f"{new_dataset}")
example_distribution_graph.add((distribution_subject, DCTERMS.isPartOf, published_dataset_ref))

# Push the distribution graph and keep whatever the FDP returns for it.
distribution_fdp_id = fdpclient.create_and_publish(
    resource_type="distribution",
    metadata=example_distribution_graph,
)

print(distribution_fdp_id)


Note: Manually setting the `fdp_id` of the distribution into the dataset's distribution field is still required, as this has not been automated yet!
