In [None]:
### load yaml

import yaml
def load_yaml(file_path):
    """
    Load a YAML file and return its content.

    The file is decoded explicitly as UTF-8 instead of with the platform's
    default text encoding, so files containing non-ASCII characters are read
    the same way on every machine.

    :param file_path: Path to the YAML file.
    :return: Content of the YAML file as a Python object.
    """
    # Explicit encoding: without it, open() falls back to the locale encoding,
    # which is not UTF-8 on every platform.
    with open(file_path, 'r', encoding='utf-8') as file:
        # safe_load is used (rather than yaml.load) so only plain data is built.
        return yaml.safe_load(file)
    
# Read the entity-type map; the cells below iterate over `tmp`.
# NOTE(review): this is an absolute, user-specific path, so the notebook only
# runs on a machine that has the file at exactly this location.
entity_type_map_path = '/Users/tplas/Downloads/clean_entity_type_map.yaml'
tmp = load_yaml(entity_type_map_path)

In [7]:
# Collect the union of metadata field names used by any dataset entry in `tmp`
# (a mapping of dataset URL -> {metadata field -> values}).
metadata_fields = set()
for ds_info in tmp.values():
    # Only the field names matter here; the values are not inspected.
    metadata_fields.update(ds_info.keys())

print("Metadata fields found in the YAML file:")
for field in sorted(metadata_fields):
    print(field)

# Sanity check on the input file: stop here, with a readable message, if the
# set of fields is not the size this notebook was written against.
assert len(metadata_fields) == 22, (
    f"Expected 22 metadata fields, found {len(metadata_fields)}"
)

# 'Landing page' is dropped from the fields carried forward to the next cells.
metadata_fields = metadata_fields - {'Landing page'}

print(f"Total metadata fields: {len(metadata_fields)}")

Metadata fields found in the YAML file:
Access rights
Data contact point
Data creator
Data publisher
Description
Distribution access URL
Distribution byte size
Distribution format
Keywords
Landing page
License
Metadata date
Metadata language
Resource type
Responsible organization metadata
Spatial coverage
Spatial reference system
Spatial resolution
Temporal coverage
Temporal resolution
Title
Unique Identifier
Total metadata fields: 21


In [8]:
# Build an empty annotation template: every dataset URL in `tmp` gets every
# field in `metadata_fields`, each initialised to a one-element list holding None.
# The list literal sits inside the comprehension so each field gets its own
# list object (a shared list would be written as YAML anchors/aliases).
new_yml = {
    ds_url: {md_field: [None] for md_field in metadata_fields}
    for ds_url in tmp
}

# Write the template in block style with keys sorted.
with open('outputs/ground_truth_metadata_annotations-2025-05-30.yaml', 'w') as out_file:
    yaml.dump(new_yml, out_file, sort_keys=True, default_flow_style=False)

## Convert YAML annotations

In [27]:
# Convert the annotation file from "one-element list holding a dict" per field
# to a plain dict per field, add a 'findability' key (set to None) to every
# field, and add the three extra columns where they are missing.
#
# NOTE(review): this cell reads and then overwrites the same file. After one
# successful run the stored values are dicts, so running the cell again stops
# at the first "Expected list" assertion below.
annotations_path = 'outputs/annotations/ground_truth_metadata_annotations-2025-06-03.yaml'
tmp = load_yaml(annotations_path)

new_yml = {}
for ds_url, ds_info in tmp.items():
    new_yml[ds_url] = {}
    for md_field, md_values in ds_info.items():
        # Validate the expected input shape before unwrapping it.
        assert isinstance(md_values, list), f"Expected list for {ds_url} {md_field}, got {type(md_values)}"
        assert len(md_values) == 1, f"Expected single value for {ds_url} {md_field}, got {len(md_values)}"
        assert isinstance(md_values[0], dict), f"Expected dict for {ds_url} {md_field}, got {type(md_values[0])}"
        # Copy the entry instead of mutating the loaded data in place;
        # 'findability' is (re)set to None either way.
        new_yml[ds_url][md_field] = {**md_values[0], 'findability': None}
    for new_col in ['Date published', 'Date last modified', 'Same as']:
        # Only add the column when the dataset does not already have it.
        new_yml[ds_url].setdefault(new_col, {'findability': None, 'text': None})

# allow_unicode=True writes non-ASCII characters unescaped, so the file must be
# opened as UTF-8 explicitly rather than with the platform default encoding.
with open(annotations_path, 'w', encoding='utf-8') as file:
    yaml.dump(new_yml, file, default_flow_style=False, sort_keys=True, allow_unicode=True)

In [25]:
# Reload the annotation file from disk into `tmp`.
converted_annotations_path = 'outputs/annotations/ground_truth_metadata_annotations-2025-06-03.yaml'
tmp = load_yaml(converted_annotations_path)