# GDI Example dataset


The GDI project has its own specific metadata profile based on HealthDCAT-AP.


Now we are ready to define the dataset. We can do that as a Python dictionary.


In [17]:
from getpass import getpass

from rdflib import URIRef
from rdflib.namespace import DCTERMS

from sempyro.dcat import AccessRights
from sempyro.healthdcatap import (
    HEALTHDCATAPAgent,
    HEALTHDCATAPDataService,
    HEALTHDCATAPDataset,
    HEALTHDCATAPDistribution,
    HEALTHDCATAPKind,
    HEALTHDCATAPHdab,
    HEALTHDCATAPPublisher,
)

# Deployment environment that selects the target servers: "test" or "acc".
env = "test"
server_fdp = f"https://gdi-fdp-{env}.healthdata.nl"
# The sample-level Beacon server is currently disabled, so only the
# allele-frequency (AF) Beacon endpoint is configured.
beacon_server_af = f"https://gdi-beacon-af-{env}.healthdata.nl/api/info"

# Subject IRIs under which the example resources are serialized.
dataset_subject = URIRef("http://example.com/gdi/dataset")
distribution_subject_af = URIRef("http://example.com/distribution_1")
dataservice_definition_subject_af = URIRef("http://example.com/gdi/dataserviceaf")

# Placeholder values shared by the example agents below.
goe_mailbox = "mailto:info@goe.nl"
goe_homepage = "https://GoE.example.nl"
hdab_id = "HDAB-NL"
hdab_mailbox = "mailto:info@hdab.nl"

# Contact details for the Genome of Europe (GoE) NL node; reused by the dataset,
# the data service and both HDAB agents.
contact_point_goe = HEALTHDCATAPKind(
    formatted_name=["GoE - NL"],
    hasEmail=goe_mailbox,
    contact_page=goe_homepage,
)

# Agent credited as creator of the dataset.
creator_goe = HEALTHDCATAPAgent(
    identifier="GOE-NL-HRI-1",
    name=["Genome of Europe - NL node"],
    mbox=[goe_mailbox],
    homepage=goe_homepage,
)

# The HDAB and the publisher carry the same values; in this example both reuse
# the GoE homepage and the GoE contact point.
hdab_definition = HEALTHDCATAPHdab(
    identifier=hdab_id,
    name=[hdab_id],
    mbox=[hdab_mailbox],
    homepage=goe_homepage,
    contact_point=contact_point_goe,
)

publisher_hdab = HEALTHDCATAPPublisher(
    identifier=hdab_id,
    name=[hdab_id],
    mbox=[hdab_mailbox],
    homepage=goe_homepage,
    contact_point=contact_point_goe,
)

# Keyword arguments for HEALTHDCATAPDataset describing the example GoE dataset.
dataset_definition = {
    "access_rights": AccessRights.public,
    # ELI identifiers use the "http://" scheme. The distribution definition cites the
    # same regulation with "http://", so the dataset must use the identical IRI:
    # in RDF, "http://..." and "https://..." denote two different resources.
    "applicable_legislation": ["http://data.europa.eu/eli/reg/2025/327/oj"],
    "conforms_to": ["http://data.gdi.eu/core/p2/ExternallyGoverned"],
    "health_category": [
        "http://data.gdi.eu/core/p2/HealthCategoryHumanGenetic",
        "http://data.gdi.eu/core/p2/HealthCategoryHumanEpigenomic",
        "http://data.gdi.eu/core/p2/HealthCategoryHumanGenomic",
    ],
    "contact_point": [contact_point_goe],
    "hdab": hdab_definition,
    "creator": [creator_goe],
    "description": [
        "The first dummy GoE examplar AF-browser dataset and on 2025-11-27 we decided that also the description "
        "how we arrived at this dataset will be part of the description"
    ],
    # Refers to the distribution by its subject IRI, passed as a plain string.
    "distribution": [str(distribution_subject_af)],
    "identifier": ["GOE-NL-HRI-1"],
    "legal_basis": ["https://w3id.org/dpv#Consent"],
    "number_of_records": 12340,
    "number_of_unique_individuals": 1234,
    "publisher": [publisher_hdab],
    "theme": ["https://publications.europa.eu/resource/authority/data-theme/HEAL"],
    "type": ["https://publications.europa.eu/resource/authority/dataset-type/SYNTHETIC_DATA"],
    "title": ["Genome of Europe Dutch Dummy dataset"],
    "keyword": ["Genome of Europe", "GDI"],
}


# Vocabulary IRIs that describe the VCF payload of the distribution.
vcf_media_type = "https://www.iana.org/assignments/media-types/application/vcf"
vcf_file_type = "https://publications.europa.eu/resource/authority/file-type/VCF"

# Keyword arguments for HEALTHDCATAPDistribution describing the example VCF file.
distribution_definition_af = {
    "title": ["Example VCF from BSC"],
    "description": [
        "VCF file containing GWAS and allele frequency lookup data of synthetic Colorectal cancer cases."
    ],
    "access_url": ["https://rems-goe.nl/2323"],
    "media_type": vcf_media_type,
    "license": "https://creativecommons.org/licenses/by-sa/4.0/",
    "format": vcf_file_type,
    "byte_size": 1024,
    "applicable_legislation": ["http://data.europa.eu/eli/reg/2025/327/oj"],
}


# Theme IRI used for the data service.
health_theme = "https://publications.europa.eu/resource/authority/data-theme/HEAL"

# Keyword arguments for HEALTHDCATAPDataService describing the allele-frequency Beacon.
dataservice_definition_af = {
    "title": ["GoE NL Beacon"],
    "description": ["Beacon allele frequency queries on cancer genomics data"],
    # The same Beacon URL is used as both the endpoint and its description.
    "endpoint_url": [beacon_server_af],
    "endpoint_description": [beacon_server_af],
    "serves_dataset": [str(dataset_subject)],
    "access_rights": AccessRights.public,
    "contact_point": [contact_point_goe],
    "identifier": ["GOE-NL-HRI-2"],
    "license": "https://creativecommons.org/licenses/by-nc/4.0/",
    "publisher": [publisher_hdab],
    "theme": [health_theme],
    "keyword": ["beacon v2", "allele frequency beacon"],
}


Next, we instantiate the dataset, distribution and data service classes and print their combined serialization.


In [None]:
# Build each model from its definition and convert it to an RDF graph under its subject IRI.
example_dataset = HEALTHDCATAPDataset(**dataset_definition)
# Reuse dataset_subject instead of repeating the literal IRI, so this graph's subject
# always matches the subject used when the parent-catalog triple is added later.
example_dataset_graph = example_dataset.to_graph(dataset_subject)

example_distribution_af = HEALTHDCATAPDistribution(**distribution_definition_af)
example_distribution_graph_af = example_distribution_af.to_graph(distribution_subject_af)

example_dataservice_af = HEALTHDCATAPDataService(**dataservice_definition_af)
example_dataservice_graph_af = example_dataservice_af.to_graph(dataservice_definition_subject_af)

# Merge the three graphs for display only; the individual graphs are left as they are.
combined_graph = example_dataset_graph + example_distribution_graph_af + example_dataservice_graph_af
print(combined_graph.serialize(format="turtle"))


Now, we can push the Dataset to a FAIR Data Point. For this, we use the
[FAIRClient](https://github.com/Health-RI/fairclient) library developed by Health-RI.

First, we define a couple of settings. Note: you will need an existing Catalog in the FDP to add
the dataset to. If you don't have one, you can easily create one using the web interface.


In [None]:
# Connection settings for the FAIR Data Point; credentials are asked for interactively.
fdp_baseurl = server_fdp
fdp_user = input("Enter FDP username: ")
fdp_pass = getpass(prompt="Password: ")  # getpass keeps the password off the screen

# IRI of the existing catalog that will become the parent of the new dataset.
parent_catalog_id = input("Enter parent Catalog ID: ")
fdp_parent_catalog = f"{fdp_baseurl}/catalog/{parent_catalog_id}"

print(f"fdp:{fdp_baseurl}")
print(f"parent catalog: {fdp_parent_catalog}")


In [None]:
import fairclient.fdpclient

# Create the FDP client with the credentials collected above.
fdpclient = fairclient.fdpclient.FDPClient(
    base_url=fdp_baseurl,
    username=fdp_user,
    password=fdp_pass,
)

# The FDP needs the dataset to point at its parent catalog, so add dcterms:isPartOf.
parent_catalog_ref = URIRef(fdp_parent_catalog)
example_dataset_graph.add((dataset_subject, DCTERMS.isPartOf, parent_catalog_ref))


In [None]:
# Publish the dataset graph to the FDP and show the value returned for the new resource.
new_dataset = fdpclient.create_and_publish(
    resource_type="dataset",
    metadata=example_dataset_graph,
)
print(new_dataset)


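To conclude, we can now add the distribution and the data service. This is done by linking the distribution graph to the published dataset (and the data service graph to the published distribution) with `dcterms:isPartOf`, and then pushing each graph to the FDP.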


In [None]:
# Link the distribution to the dataset published above, then publish the distribution.
published_dataset_ref = URIRef(f"{new_dataset}")
example_distribution_graph_af.add((distribution_subject_af, DCTERMS.isPartOf, published_dataset_ref))
distribution_fdp_id_af = fdpclient.create_and_publish(
    resource_type="distribution",
    metadata=example_distribution_graph_af,
)
print(distribution_fdp_id_af)

# Link the data service to the distribution that was just published, then publish it too.
published_distribution_ref = URIRef(f"{distribution_fdp_id_af}")
example_dataservice_graph_af.add(
    (dataservice_definition_subject_af, DCTERMS.isPartOf, published_distribution_ref)
)
dataservice_fdp_id_af = fdpclient.create_and_publish(
    resource_type="dataservice",
    metadata=example_dataservice_graph_af,
)
print(dataservice_fdp_id_af)


Note: Manually setting the fdp_id of the distribution and dataservice into the dataset's distribution field and the distribution's access_service field is still required, as this has not been automated yet!
