# LLM-for-Metadata-Harvesting

This notebook contains the experimental results from [P6: Groot zeegras (2023)](https://datahuiswadden.openearth.nl/geonetwork/srv/api/records/TF1TbsTxTqykP5rv6MXJEg).  
The results can be found under the last code block. Note that not all code is directly relevant to this experiment; some parts are retained for future development and elaboration.


In [None]:
from cheatsheet import CHEATSHEETS
from prompt import PROMPTS
from webutils import readWebContent, downloadAndParseXML

# Dataset landing-page / record URLs to harvest metadata from.
# The first entry is fetched on its own further down in this cell as a quick check;
# the next cell iterates over the whole list, fetches each page's text and passes it
# to extract_entities.
dataPortalURL = [
    "https://developers.google.com/earth-engine/datasets/catalog/NASA_HLS_HLSS30_v002",
    "https://lpdaac.usgs.gov/products/mod09a1v061/",
    "https://stac.ecodatacube.eu/veg_quercus.robur_anv.eml/collection.json?.language=en",
    "https://stac.ecodatacube.eu/ndvi_glad.landsat.ard2.seasconv/collection.json?.language=en",
    "https://zenodo.org/records/8319440",
    "https://lifesciences.datastations.nl/dataset.xhtml?persistentId=doi:10.17026/dans-2bd-kskz",
    "https://www.gbif.org/dataset/4fa7b334-ce0d-4e88-aaae-2e0c138d049e",
    "https://www.gbif.org/dataset/74196cd9-7ebc-4b20-bc27-3c2d22e31ed7",
    "https://www.gbif.org/dataset/f9ba3c2e-0636-4f66-a4b5-b8c138046e9e",
    "https://www.gbif.org/dataset/bc0acb9a-131f-4085-93ae-a46e08564ac5",
    "https://zenodo.org/records/11440456",
    "https://stac.ecodatacube.eu/blue_glad.landsat.ard2.seasconv.m.yearly/collection.json",
    "https://datahuiswadden.openearth.nl/geonetwork/srv/eng/catalog.search#/metadata/L-mHomzGRuKAHGMkUPjY9g",
    "https://datahuiswadden.openearth.nl/geonetwork/srv/eng/catalog.search#/metadata/0fe7e64b-50b3-4cee-b64a-02659fc2b6c7",
    "https://stac.ecodatacube.eu/green_glad.landsat.ard2.seasconv.m.yearly/collection.json",
    "https://datahuiswadden.openearth.nl/geonetwork/srv/api/records/A0h06_NlSEuNlium5OO3FA",
]

# Get the web content
url = dataPortalURL[0]


# soup = readWebContent(url)
# if soup is None:
#     raise ValueError("Failed to retrieve web content")

# # Extract text from the webpage - adjust the selector based on the webpage structure
# # This is a basic example - you might need to modify based on the specific webpage
# text = soup.get_text(separator='\n', strip=True)

# text_xml, _ = downloadAndParseXML("https://datahuiswadden.openearth.nl/geonetwork/srv/api/records/A0h06_NlSEuNlium5OO3FA/formatters/xml")
# text += "\n" + text_xml
# full_text = text
import nest_asyncio
import asyncio
from webutils import extract_full_page_text

# Apply nest_asyncio to allow asyncio.run() in Jupyter
nest_asyncio.apply()

# Run the async function
full_text = await extract_full_page_text(url)

# Optionally display or save it
print(full_text)  # Print the first 1000 characters

In [None]:
from tqdm import tqdm
from harvester_operations import extract_entities
initial_url_map = {}
clean_url_map = {}

index = 0

for url in tqdm(dataPortalURL):
    index += 1
    # Apply nest_asyncio to allow asyncio.run() in Jupyter
    nest_asyncio.apply()

    # Run the async function
    if url.startswith("https://lpdaac.usgs.gov"):
        soup = readWebContent(url)
        if soup is None:
            raise ValueError("Failed to retrieve web content")

        # Extract text from the webpage - adjust the selector based on the webpage structure
        # This is a basic example - you might need to modify based on the specific webpage
        full_text = soup.get_text(separator='\n', strip=True)
    else:
        full_text = await extract_full_page_text(url)

    # Optionally display or save it
    ###############################################################
    special_interest = CHEATSHEETS.get("special_interests", "Focus on metadata fields and their relationships")
    entity_types = PROMPTS["DEFAULT_ENTITY_TYPES"]
    is_croissant=False
    # special_interest = CHEATSHEETS.get("special_interests_croissant")
    # entity_types = PROMPTS["CROISSANT_ENTITY_TYPES"]
    # is_croissant=True
    ###############################################################

    initial_nodes, clean_nodes = extract_entities(
        text=full_text, 
        entity_types=entity_types, 
        special_interest=special_interest,
        is_croissant=is_croissant,
    )

    initial_entity_type_map = {}

    for entity_group in initial_nodes.values():
        for item in entity_group:
            entity_name = item.get('entity_name')
            entity_type = item.get('entity_type')
            entity_description = item.get('description')

            # Initialize the list for this entity_type if not already present
            if entity_name not in initial_entity_type_map:
                initial_entity_type_map[entity_type] = []

            # Append the (entity_name, description) pair
            initial_entity_type_map[entity_type].append(entity_name + '; ' + entity_description)
    initial_url_map[url] = initial_entity_type_map

    # Create a dictionary to store entity_type: [(entity_name, description), ...]
    clean_entity_type_map = {}

    for entity_group in clean_nodes.values():
        for item in entity_group:
            entity_name = item.get('entity_name')
            entity_value = item.get('entity_value')

            # Initialize the list for this entity_type if not already present
            if entity_name not in clean_entity_type_map:
                clean_entity_type_map[entity_name] = []

            # Append the (entity_name, description) pair
            clean_entity_type_map[entity_name].append(entity_value)
    clean_url_map[url] = clean_entity_type_map

In [None]:
import yaml
import os

from datetime import datetime
# Results are written to outputs/<YYYY-MM-DD>/<prefix>clean_entity_type_map.yaml.
now = datetime.now()
date_str = now.strftime("%Y-%m-%d")
# File-name prefix; the cell that saves initial_url_map reads this variable too.
prefix = "cedar_openai_"

output_file_path = "outputs/" + date_str + "/" + prefix + "clean_entity_type_map.yaml"
os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

# Sort the inner dictionary keys for each dataset URL
sorted_data = {
    url: dict(sorted(fields.items()))
    for url, fields in clean_url_map.items()
}

# Save to YAML. allow_unicode=True emits non-ASCII characters verbatim, so the
# file must be opened as UTF-8 explicitly; the platform default encoding
# (e.g. cp1252 on Windows) could raise UnicodeEncodeError.
with open(output_file_path, "w", encoding="utf-8") as file:
    yaml.dump(sorted_data, file, sort_keys=False, allow_unicode=True)

In [None]:
import yaml
import os

from datetime import datetime
# Results are written to outputs/<YYYY-MM-DD>/<prefix>initial_entity_type_map.yaml.
# `prefix` is not defined in this cell; it comes from the cell that saves
# clean_url_map, which therefore has to run first.
now = datetime.now()
date_str = now.strftime("%Y-%m-%d")

output_file_path = "outputs/" + date_str + "/" + prefix + "initial_entity_type_map.yaml"
os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

# Sort the inner dictionary keys for each dataset URL
sorted_data = {
    url: dict(sorted(fields.items()))
    for url, fields in initial_url_map.items()
}

# Save to YAML. allow_unicode=True emits non-ASCII characters verbatim, so the
# file must be opened as UTF-8 explicitly; the platform default encoding
# (e.g. cp1252 on Windows) could raise UnicodeEncodeError.
with open(output_file_path, "w", encoding="utf-8") as file:
    yaml.dump(sorted_data, file, sort_keys=False, allow_unicode=True)

In [None]:
import yaml

# Path to a previously saved result file (a fixed run date and model prefix).
input_file_path = "outputs/2025-06-04/cedar_gemini_clean_entity_type_map.yaml"

# Load the YAML content. Read it as UTF-8 explicitly: the save cells dump with
# allow_unicode=True, so non-ASCII characters are stored verbatim and would be
# mis-decoded under a different platform default encoding.
with open(input_file_path, "r", encoding="utf-8") as file:
    loaded_data = yaml.safe_load(file)

# Now `loaded_data` is a Python dictionary
print(loaded_data.keys())  # For example, show the top-level URLs

In [None]:
pip install dill