In [18]:
import os
import sys
import pandas as pd
from tqdm import tqdm
from datetime import datetime
from typing import List, Dict, Any, Tuple
from concurrent.futures import ThreadPoolExecutor, as_completed

import httpx
from loguru import logger

In [4]:
BASE_NOMAD_URL = "http://nomad-lab.eu/prod/v1/api/v1"
OUTPUT_DIR = "../data/nomad"

In [5]:
def test_nomad_connection(base_url: str ="http://nomad-lab.eu/prod/v1/api/v1") -> bool:
    """ Test connection to the NOMAD API.

    Parameters:
    -----------
    base_url : str
        The base URL of the NOMAD API. Default is "http://nomad-lab.eu/prod/v1/api/v1".

    Returns:
    --------
    bool
        True if the connection is successful, False otherwise.
    """
    logger.info("Testing connection to NOMAD API...")
    try:
        r = httpx.get(f"{base_url}/entries", timeout=5)
        if r.status_code == 200:
            logger.success("Connected to NOMAD API successfully !")
            return True
    except httpx.RequestException:
        logger.error("Failed to connect to NOMAD API.")
        return False

test_nomad_connection()

[32m2025-10-30 16:25:49.273[0m | [1mINFO    [0m | [36m__main__[0m:[36mtest_nomad_connection[0m:[36m14[0m - [1mTesting connection to NOMAD API...[0m
[32m2025-10-30 16:25:49.676[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mtest_nomad_connection[0m:[36m18[0m - [32m[1mConnected to NOMAD API successfully ![0m


True

In [None]:
def fetch_entries_md_related_by_batch(page_size: int = 50) -> Tuple[List[Dict[str, Any]], str]:
    """
    Fetch all Molecular Dynamics (MD)-related entries from the NOMAD API with pagination.

    Parameters
    ----------
    page_size : int
        Number of entries to fetch per page.

    Returns
    -------
    Tuple[List[Dict[str, Any]], str]:
        - A list of all MD-related entries (JSON objects).
        - The current timestamp in ISO 8601 format.
    """
    logger.info("Fetching Molecular Dynamics related entries from NOMAD API by batch...")
    fetch_time = datetime.now().strftime("%Y-%m-%dT%H:%M:%S")

    all_entries = []
    next_page_value = None
    total_entries = None

    # Fetch the first page
    try:
        url = (
            f"{BASE_NOMAD_URL}/entries"
            "?owner=public"
            f"&page_size={page_size}"
            "&order=asc"
            "&json_query=%7B%22results.method.workflow_name%22%3A%22MolecularDynamics%22%7D"
            "&exclude=quantities&exclude=sections"
        )
        logger.debug(f"Requesting first page: {url}")
        response = httpx.get(url, timeout=1000)
        response.raise_for_status()

        first_50_entries_with_request_md = response.json()
        all_entries.extend(first_50_entries_with_request_md["data"])

        total_entries = first_50_entries_with_request_md["pagination"]["total"]
        next_page_value = first_50_entries_with_request_md["pagination"]["next_page_after_value"]

        logger.debug(f"Fetched first {len(first_50_entries_with_request_md['data'])} entries / {total_entries}")
        
    except httpx.HTTPError as e:
        logger.error(f"HTTP error occurred: {e}")
        return [], fetch_time

    # Paginate through remaining entries
    with tqdm(
        total=total_entries,
        desc="Fetching MD entries from NOMAD",
        colour="blue",
        ncols=100,
        ascii="░▒█",
        unit="entry",
        bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]"
    ) as pbar:
        # Initial update for the first batch already fetched
        pbar.update(len(all_entries))
        while len(all_entries) < total_entries and next_page_value:
            try:
                url = (
                    f"{BASE_NOMAD_URL}/entries"
                    f"?owner=visible"
                    f"&page_size={page_size}"
                    f"&order=asc"
                    f"&filter=results.method.workflow_name:MolecularDynamics"
                    f"&exclude=quantities&exclude=sections"
                    f"&page_after_value={next_page_value}"
                )
                response = httpx.get(url, timeout=1000)
                response.raise_for_status()

                next_batch = response.json()
                entries_count = len(next_batch["data"])
                all_entries.extend(next_batch["data"])

                # Update the bar progression
                pbar.update(entries_count)
                # Update the next entry to begin with
                next_page_value = next_batch["pagination"]["next_page_after_value"]

            except httpx.HTTPError as e:
                logger.error(f"HTTP error occurred while fetching next page: {e}")
                break

    logger.success(f"Fetched {len(all_entries)} Molecular Dynamics entries from NOMAD successfully !")
    return all_entries, fetch_time


nomad_data, fetch_time = fetch_entries_md_related_by_batch()

[32m2025-10-30 16:29:18.634[0m | [1mINFO    [0m | [36m__main__[0m:[36mfetch_entries_md_related_by_batch[0m:[36m16[0m - [1mFetching Molecular Dynamics related entries from NOMAD API by batch...[0m
[32m2025-10-30 16:29:18.634[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mfetch_entries_md_related_by_batch[0m:[36m24[0m - [34m[1mRequesting first 50 entries...[0m
[32m2025-10-30 16:29:18.634[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mfetch_entries_md_related_by_batch[0m:[36m34[0m - [34m[1mRequesting first page: http://nomad-lab.eu/prod/v1/api/v1/entries?owner=public&page_size=50&order=asc&json_query=%7B%22results.method.workflow_name%22%3A%22MolecularDynamics%22%7D&exclude=quantities&exclude=sections[0m
[32m2025-10-30 16:29:19.327[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mfetch_entries_md_related_by_batch[0m:[36m44[0m - [34m[1mFetched first 50 entries.[0m
[32m2025-10-30 16:29:19.327[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:

In [30]:
unique_entry_ids = {f["entry_id"] for f in nomad_data}
len(unique_entry_ids)

15950

In [12]:
nomad_data[0].keys()

dict_keys(['upload_id', 'references', 'origin', 'datasets', 'n_quantities', 'nomad_version', 'upload_create_time', 'nomad_commit', 'section_defs', 'publish_time', 'processing_errors', 'results', 'entry_name', 'last_processing_time', 'parser_name', 'calc_id', 'published', 'writers', 'writer_groups', 'processed', 'mainfile', 'main_author', 'viewers', 'viewer_groups', 'entry_create_time', 'with_embargo', 'domain', 'files', 'comment', 'entry_type', 'entry_id', 'upload_name', 'authors', 'license'])

In [None]:
def parse_nomad_files(batch_json: Dict[str, Any], fetch_time: str) -> List[Dict[str, Any]]:
    """
    Extract file metadata from a NOMAD batch JSON.

    Args:
        batch_json (Dict[str, Any]): JSON object returned by NOMAD API for a page of entries.
        fetch_time (str): Timestamp when the files were fetched.

    Returns:
        List[Dict[str, Any]]: List of dictionaries containing file metadata.
    """
    for entry in batch_json.get("data", []):
        entry_id = entry.get("entry_id")
        for file in entry.get("files", []):
            name_file = file["path"].split("/")[-1]
            file_extension = name_file.split(".")[-1]
            file_path = (
                f"https://nomad-lab.eu/prod/v1/gui/search/entries/entry/id/"
                f"{entry_id}/files/{name_file}"
            )
            size = file.get("size", None)

    return {
            "entry_id": entry_id,
            "name_file": name_file,
            "type": file_extension,
            "size": size,
            "file_path": file_path,
            "date_last_crawled": fetch_time
            }

In [None]:
def fetch_files_metadata(page_size: int = 50) -> List[Dict[str, Any]]:
    """
    Fetch file metadata for NOMAD Molecular Dynamics entries.

    Parameters
    ----------
    page_size : int
        Number of entries to fetch per page.

    Returns:
    --------
    List[Dict[str, Any]]: A list of dictionaries containing file metadata.
        Each dictionary has the following structure:
        {
            "entry_id": str,
            "name_file": str,
            "size": int,
            "file_path": str
        }
    """
    logger.info("Fetching files metadata from NOMAD...")
    fetch_time = datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
    files_metas = []

    # Fetch the first page
    try:
        url = (
            f"{BASE_NOMAD_URL}/entries/rawdir"
            "?owner=public"
            f"&page_size={page_size}"
            "&order=asc"
            "&json_query=%7B%22results.method.workflow_name%22%3A%22MolecularDynamics%22%7D"
        )
        logger.debug(f"Requesting first page: {url}")
        response = httpx.get(url, timeout=1000)
        response.raise_for_status()

        first_files_metas = response.json()
        files_metas.append(parse_nomad_files(first_files_metas, fetch_time))

        total_entries = first_files_metas["pagination"]["total"]
        next_page_value = first_files_metas["pagination"]["next_page_after_value"]
        logger.debug(
            f"Fetched metadata for the first {len(first_files_metas['data'])} entries "
            f"({len(files_metas)} files out of {total_entries} entries)"
        )

    except httpx.HTTPError as e:
        logger.error(f"HTTP error occurred: {e}")
        return []

    # Paginate through remaining entries
    with tqdm(
        desc="Fetching NOMAD Molecular Dynamics files metadatas",
        colour="blue",
        ncols=100,
        ascii="░▒█",
        unit="file",
        bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]"
    ) as pbar:
        # Initial update for the first batch already fetched
        pbar.update(len(files_metas))
        while next_page_value:
            try:
                url = (
                    f"{BASE_NOMAD_URL}/entries/rawdir"
                    "?owner=public"
                    f"&page_size={page_size}"
                    "&order=asc"
                    "&json_query=%7B%22results.method.workflow_name%22%3A%22MolecularDynamics%22%7D"
                    f"&page_after_value={next_page_value}"
                )
                response = httpx.get(url, timeout=1000)
                response.raise_for_status()

                next_batch = response.json()
                files_metas.append(parse_nomad_files(next_batch, fetch_time))
                pbar.update(len(files_metas))

                # Update the next entry to begin with
                next_page_value = next_batch["pagination"]["next_page_after_value"]

            except httpx.HTTPError as e:
                logger.error(f"HTTP error occurred while fetching next page: {e}")
                break

    unique_entry_ids = {f["entry_id"] for f in files_metas}
    logger.success(
        f"Fetched {len(files_metas)} file metadata entries for {len(unique_entry_ids)} NOMAD entries successfully!"
    )
    return files_metas


files_metadatas = fetch_files_metadata()

[32m2025-11-03 14:32:46.136[0m | [1mINFO    [0m | [36m__main__[0m:[36mfetch_files_metadata[0m:[36m21[0m - [1mFetching files metadata from NOMAD...[0m
[32m2025-11-03 14:32:46.137[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mfetch_files_metadata[0m:[36m34[0m - [34m[1mRequesting first page: http://nomad-lab.eu/prod/v1/api/v1/entries/rawdir?owner=public&page_size=50&order=asc&json_query=%7B%22results.method.workflow_name%22%3A%22MolecularDynamics%22%7D[0m
[32m2025-11-03 14:32:56.117[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mfetch_files_metadata[0m:[36m61[0m - [34m[1mFetched metadata for 50 entries (1848 files total so far) out of 15934 entries.[0m
Fetching files metadata from NOMAD: |[34m                        [0m| 480199/? [36:47<00:00, 189.64entry/s][0m

In [27]:
files_metadatas

[{'entry_id': '--SHdo9YOBfq2R6-JPQpNw2nTBtP',
  'name_file': 'npt346.cpt',
  'type': 'cpt',
  'size': 3767344,
  'file_path': 'https://nomad-lab.eu/prod/v1/gui/search/entries/entry/id/--SHdo9YOBfq2R6-JPQpNw2nTBtP/files/npt346.cpt'},
 {'entry_id': '--SHdo9YOBfq2R6-JPQpNw2nTBtP',
  'name_file': 'npt346.edr',
  'type': 'edr',
  'size': 10520,
  'file_path': 'https://nomad-lab.eu/prod/v1/gui/search/entries/entry/id/--SHdo9YOBfq2R6-JPQpNw2nTBtP/files/npt346.edr'},
 {'entry_id': '--SHdo9YOBfq2R6-JPQpNw2nTBtP',
  'name_file': 'npt346.gro',
  'type': 'gro',
  'size': 10824628,
  'file_path': 'https://nomad-lab.eu/prod/v1/gui/search/entries/entry/id/--SHdo9YOBfq2R6-JPQpNw2nTBtP/files/npt346.gro'},
 {'entry_id': '--SHdo9YOBfq2R6-JPQpNw2nTBtP',
  'name_file': 'npt346.log',
  'type': 'log',
  'size': 245185,
  'file_path': 'https://nomad-lab.eu/prod/v1/gui/search/entries/entry/id/--SHdo9YOBfq2R6-JPQpNw2nTBtP/files/npt346.log'},
 {'entry_id': '--SHdo9YOBfq2R6-JPQpNw2nTBtP',
  'name_file': 'npt346.m

In [16]:
def fetch_files_metadata(entries_ids: List[str]) -> List[Dict[str, Any]]:
    """
    Fetch file metadata for a list of NOMAD entry IDs.

    This function retrieves metadata for all files associated with a list of
    NOMAD entries. For each entry, it sends an HTTP request to the NOMAD API
    and collects information such as file name, file size, and download URL.

    Args:
        entries_ids (List[str]): A list of NOMAD entry IDs.

    Returns:
        List[Dict[str, Any]]: A list of dictionaries containing file metadata.
            Each dictionary has the following structure:
            {
                "entry_id": str,
                "name_file": str,
                "size": int,
                "file_path": str
            }
    """
    logger.info("Fetching files metadata from NOMAD...")
    files_metas = []

    with tqdm(
        total=len(entries_ids),
        desc="Fetching files metadata from NOMAD",
        colour="blue",
        ncols=100,
        ascii="░▒█",
        unit="entry",
        bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}]"
    ) as pbar:
        for entry_id in entries_ids:
            url = f"{BASE_NOMAD_URL}/entries/{entry_id}/rawdir"
            response = httpx.get(url, timeout=1000)
            response.raise_for_status()
            data = response.json()

            for file in data.get("files", []):
                name_file = file["path"].split("/")[-1]
                file_extension = name_file.split(".")[-1]
                file_path = (
                    f"https://nomad-lab.eu/prod/v1/gui/search/entries/entry/id/"
                    f"{entry_id}/files/{name_file}"
                )
                size = file.get("size", None)

                files_metas.append({
                    "entry_id": entry_id,
                    "name_file": name_file,
                    "type": file_extension,
                    "size": size,
                    "file_path": file_path
                })
            pbar.update(1)

    logger.success(
        f"Fetched {len(files_metas)} file metadata entries for {len(entries_ids)} NOMAD entries successfully!"
    )
    return files_metas


entries_ids = [d["entry_id"] for d in nomad_data]
files_metadatas = fetch_files_metadata(entries_ids)

[32m2025-11-03 12:09:05.577[0m | [1mINFO    [0m | [36m__main__[0m:[36mfetch_files_metadata[0m:[36m22[0m - [1mFetching files metadata from NOMAD...[0m
Fetching files metadata from NOMAD:   1%|[34m░░░░░░░░░░░░░░░░░░[0m| 180/15950 [04:08<6:03:14,  1.38s/entry][0m


HTTPStatusError: Server error '500 Internal Server Error' for url 'http://nomad-lab.eu/prod/v1/api/v1/entries/-9aKfg-Ou3lteH5w2N4cyzqeoBB3/rawdir'
For more information check: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/500

In [35]:
def fetch_entries_md_related() -> Tuple[List[Dict[str, Any]], str]:
    """
    Fetch all Molecular Dynamics (MD)-related entries from the NOMAD API.

    Returns
    -------
    Tuple[List[Dict[str, Any]], str]:
        - A list of entries related to Molecular Dynamics workflows (JSON objects).
        Returns an empty list if the request fails.
        - The current timestamp in ISO 8601 format (e.g., '2023-03-05T22:01:12').

    """
    logger.info("Fetching Molecular Dynamics related entries from NOMAD API...")
    # Current timestamp in ISO format
    fetch_time: str = datetime.now().strftime("%Y-%m-%dT%H:%M:%S")

    try:
        # Build the request URL with a query filtering for 'MolecularDynamics' workflow
        url = (
            f"{BASE_NOMAD_URL}/entries/export"
            "?owner=visible"
            "&json_query=%7B%22results.method.workflow_name%22%3A%22MolecularDynamics%22%7D"
            "&include=authors&include=files&include=comment&include=entry_create_time&include=datasets&include=references"
        )

        # Perform the HTTP GET request with a long timeout to accommodate large data (usually take less than 3 minutes)
        response = httpx.get(url, timeout=1000)
        response.raise_for_status()

        # Parse JSON data
        entries_md = response.json()
        logger.success(f"Fetched {len(entries_md)} MD-related entries from NOMAD successfully !")
        return entries_md, fetch_time
    
    except httpx.HTTPError as e:
        logger.error(f"HTTP error occurred: {e}")
        return [], fetch_time
   

nomad_data, fetch_time = fetch_entries_md_related()

[32m2025-10-29 14:27:44.363[0m | [1mINFO    [0m | [36m__main__[0m:[36mfetch_entries_md_related[0m:[36m13[0m - [1mFetching Molecular Dynamics related entries from NOMAD API...[0m
[32m2025-10-29 14:27:50.807[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mfetch_entries_md_related[0m:[36m32[0m - [32m[1mFetched 15934 MD-related entries from NOMAD successfully ![0m


In [32]:
nomad_data[0].keys()

dict_keys(['upload_id', 'references', 'origin', 'quantities', 'datasets', 'n_quantities', 'nomad_version', 'upload_create_time', 'nomad_commit', 'section_defs', 'processing_errors', 'results', 'entry_name', 'last_processing_time', 'optimade', 'parser_name', 'calc_id', 'published', 'writers', 'sections', 'processed', 'mainfile', 'main_author', 'viewers', 'entry_create_time', 'with_embargo', 'domain', 'files', 'comment', 'entry_type', 'entry_id', 'upload_name', 'authors', 'license'])

In [36]:
nomad_data[0]

{'upload_id': '-CcbSN00SYWaqFBrFOXuiQ',
 'entry_create_time': '2023-07-05T15:50:41.327000+00:00',
 'references': ['https://doi.org/10.1063/5.0104914',
  'http://doi.org/10.5281/zenodo.6032826'],
 'files': ['NOMAD_upload_21/MOL0/MOL5/MOL0_80_MOL5_320/EQ3.log',
  'NOMAD_upload_21/MOL0/MOL5/MOL0_80_MOL5_320/#init.gro.1#',
  'NOMAD_upload_21/MOL0/MOL5/MOL0_80_MOL5_320/2678648_2.log',
  'NOMAD_upload_21/MOL0/MOL5/MOL0_80_MOL5_320/2683291_2.log',
  'NOMAD_upload_21/MOL0/MOL5/MOL0_80_MOL5_320/EQ1-2.sub',
  'NOMAD_upload_21/MOL0/MOL5/MOL0_80_MOL5_320/EQ1-run.mdp',
  'NOMAD_upload_21/MOL0/MOL5/MOL0_80_MOL5_320/EQ1.cpt',
  'NOMAD_upload_21/MOL0/MOL5/MOL0_80_MOL5_320/EQ1.edr',
  'NOMAD_upload_21/MOL0/MOL5/MOL0_80_MOL5_320/EQ1.gro',
  'NOMAD_upload_21/MOL0/MOL5/MOL0_80_MOL5_320/EQ1.log',
  'NOMAD_upload_21/MOL0/MOL5/MOL0_80_MOL5_320/EQ1.mdp',
  'NOMAD_upload_21/MOL0/MOL5/MOL0_80_MOL5_320/EQ1.sub',
  'NOMAD_upload_21/MOL0/MOL5/MOL0_80_MOL5_320/EQ1.tpr',
  'NOMAD_upload_21/MOL0/MOL5/MOL0_80_MOL5_320

In [None]:
nomad_data[0]

{'upload_id': '-CcbSN00SYWaqFBrFOXuiQ',
 'references': ['https://doi.org/10.1063/5.0104914',
  'http://doi.org/10.5281/zenodo.6032826'],
 'origin': 'Joseph Rudzinski',
 'quantities': ['',
  'metadata',
  'metadata.coauthors',
  'metadata.datasets',
  'metadata.domain',
  'metadata.embargo_length',
  'metadata.entry_coauthors',
  'metadata.entry_create_time',
  'metadata.entry_hash',
  'metadata.entry_id',
  'metadata.entry_name',
  'metadata.entry_timestamp',
  'metadata.entry_timestamp.timestamp',
  'metadata.entry_timestamp.token',
  'metadata.entry_timestamp.token_seed',
  'metadata.entry_timestamp.tsa_server',
  'metadata.entry_type',
  'metadata.files',
  'metadata.last_processing_time',
  'metadata.license',
  'metadata.main_author',
  'metadata.mainfile',
  'metadata.nomad_commit',
  'metadata.nomad_version',
  'metadata.optimade',
  'metadata.optimade.cartesian_site_positions',
  'metadata.optimade.chemical_formula_anonymous',
  'metadata.optimade.chemical_formula_descriptive',

In [7]:
source = "NOMAD"
source_id = f"https://nomad-lab.eu/prod/v1/gui/search/entries?entry_id={nomad_data[0]['entry_id']}"
doi = nomad_data[0]['references'][1]
title = nomad_data[0]['datasets'][0]['dataset_name']
date_creation = nomad_data[0]['datasets'][0]['dataset_create_time']
date_last_modification = nomad_data[0]['datasets'][0]['dataset_modified_time']
nb_files = len(nomad_data[0]['files'])
file_names = nomad_data[0]['files']
authors = [author_info['name'] for author_info in nomad_data[0]['authors']]
license = nomad_data[0]['license']
description = nomad_data[0]['comment']
file_analysises = nomad_data[0]['results']

print(source, source_id, doi, title, date_creation, date_last_modification, nb_files, file_names, authors, license, description, file_analysises) 

NOMAD https://nomad-lab.eu/prod/v1/gui/search/entries?entry_id=-Kc1kuZ5yC5z7mEBkti6TiTSbn46 http://doi.org/10.5281/zenodo.6032826 Atomistic Molecular Dynamics Simulations of Pure Liquids and Binary Mixtures for Representative C7O2 Isomers 2023-07-07T09:08:26.816000+00:00 2023-07-07T09:08:26.816000+00:00 61 ['NOMAD_upload_21/MOL0/MOL5/MOL0_80_MOL5_320/EQ3.log', 'NOMAD_upload_21/MOL0/MOL5/MOL0_80_MOL5_320/#init.gro.1#', 'NOMAD_upload_21/MOL0/MOL5/MOL0_80_MOL5_320/2678648_2.log', 'NOMAD_upload_21/MOL0/MOL5/MOL0_80_MOL5_320/2683291_2.log', 'NOMAD_upload_21/MOL0/MOL5/MOL0_80_MOL5_320/EQ1-2.sub', 'NOMAD_upload_21/MOL0/MOL5/MOL0_80_MOL5_320/EQ1-run.mdp', 'NOMAD_upload_21/MOL0/MOL5/MOL0_80_MOL5_320/EQ1.cpt', 'NOMAD_upload_21/MOL0/MOL5/MOL0_80_MOL5_320/EQ1.edr', 'NOMAD_upload_21/MOL0/MOL5/MOL0_80_MOL5_320/EQ1.gro', 'NOMAD_upload_21/MOL0/MOL5/MOL0_80_MOL5_320/EQ1.log', 'NOMAD_upload_21/MOL0/MOL5/MOL0_80_MOL5_320/EQ1.mdp', 'NOMAD_upload_21/MOL0/MOL5/MOL0_80_MOL5_320/EQ1.sub', 'NOMAD_upload_21/MOL

In [8]:
def parse_entry_metadata(data: Dict[str, Any], fetch_time: str) -> Dict[str, Any]:
    """
    Parse relevant metadata fields from a single NOMAD entry JSON.

    Parameters
    ----------
    data : Dict[str, Any]
        JSON response for a single NOMAD entry.
    fetch_time : str
        Timestamp when the data was fetched.

    Returns
    -------
    Dict[str, Any]
        Flattened metadata dictionary for one entry.
    """
    entry_id = data.get("entry_id")
    dataset = data.get("datasets", [{}])[0] if data.get("datasets") else {}

    return {
        "source": "NOMAD",
        "source_id": f"https://nomad-lab.eu/prod/v1/gui/search/entries?entry_id={entry_id}",
        "doi": data.get("references"),
        "title": dataset.get("dataset_name"),
        "date_creation": dataset.get("dataset_create_time"),
        "date_last_modification": dataset.get("dataset_modified_time"),
        "date_last_crawled": fetch_time,
        "nb_files": len(data.get("files", [])),
        "file_names": data.get("files", []),
        "authors": [a.get("name") for a in data.get("authors", [])],
        "license": data.get("license"),
        "description": data.get("comment"),
        "file_analyses": data.get("results"),
    }


dict = parse_entry_metadata(nomad_data[0], fetch_time)

In [9]:
dict

{'source': 'NOMAD',
 'source_id': 'https://nomad-lab.eu/prod/v1/gui/search/entries?entry_id=-Kc1kuZ5yC5z7mEBkti6TiTSbn46',
 'doi': ['https://doi.org/10.1063/5.0104914',
  'http://doi.org/10.5281/zenodo.6032826'],
 'title': 'Atomistic Molecular Dynamics Simulations of Pure Liquids and Binary Mixtures for Representative C7O2 Isomers',
 'date_creation': '2023-07-07T09:08:26.816000+00:00',
 'date_last_modification': '2023-07-07T09:08:26.816000+00:00',
 'date_last_crawled': '2025-10-29T10:39:45',
 'nb_files': 61,
 'file_names': ['NOMAD_upload_21/MOL0/MOL5/MOL0_80_MOL5_320/EQ3.log',
  'NOMAD_upload_21/MOL0/MOL5/MOL0_80_MOL5_320/#init.gro.1#',
  'NOMAD_upload_21/MOL0/MOL5/MOL0_80_MOL5_320/2678648_2.log',
  'NOMAD_upload_21/MOL0/MOL5/MOL0_80_MOL5_320/2683291_2.log',
  'NOMAD_upload_21/MOL0/MOL5/MOL0_80_MOL5_320/EQ1-2.sub',
  'NOMAD_upload_21/MOL0/MOL5/MOL0_80_MOL5_320/EQ1-run.mdp',
  'NOMAD_upload_21/MOL0/MOL5/MOL0_80_MOL5_320/EQ1.cpt',
  'NOMAD_upload_21/MOL0/MOL5/MOL0_80_MOL5_320/EQ1.edr',
 

In [None]:
def parse_nomad_dataset_parallel(nomad_data: List[Dict[str, Any]], fetch_time: str, max_workers: int = 8) -> pd.DataFrame:
    """
    Parse all NOMAD entries in parallel and return a combined DataFrame.

    Parameters
    ----------
    nomad_data : List[Dict[str, Any]]
        List of NOMAD entry JSON objects.
    fetch_time : str
        Timestamp when data was fetched.
    max_workers : int, optional
        Maximum number of threads to use for parallel parsing (default is 8).

    Returns
    -------
    pd.DataFrame
        DataFrame containing parsed metadata for all entries.
    """
    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(parse_entry_metadata, entry, fetch_time): entry for entry in nomad_data}
        for future in as_completed(futures):
            try:
                results.append(future.result())
            except Exception as e:
                logger.error(f"Error parsing entry: {e}")

    return pd.DataFrame(results)


def save_nomad_metadata(df: pd.DataFrame, output_dir: str = OUTPUT_DIR, filename: str = "nomad_metadata.parquet") -> str:
    """
    Save parsed NOMAD metadata DataFrame to a local file.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame containing parsed NOMAD metadata.
    output_dir : str, optional
        Directory to store the output file (default is 'data/nomad').
    filename : str, optional
        Output filename (default is 'nomad_metadata.parquet').

    Returns
    -------
    str
        Path to the saved file.
    """
    logger.info("Saving NOMAD metadata to local file...")
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, filename)
    df.to_parquet(output_path, index=False)
    logger.success(f"NOMAD metadata saved to: {output_path} successfully!")
    return output_path

In [None]:
def scrap_nomad_data():
    """ Scrap molecular dynamics datasets and files from NOMAD """
    logger.info("Starting Nomad data scraping...")

    if test_nomad_connection:
        # Define output directory
        output_dir = os.path.join("data", "nomad")
        os.makedirs(output_dir, exist_ok=True)

        # Fetch NOMAD entries metadata
        nomad_data, fetch_time = fetch_entries_md_related()
        if nomad_data == []:
            logger.warning("No data fetched from NOMAD.")
            return
        # Parse NOMAD entries metadata in parallel
        nomad_metadata_df = parse_nomad_dataset_parallel(nomad_data, fetch_time)

        # Save parsed metadata to local file
        save_nomad_metadata(nomad_metadata_df, output_dir=output_dir)
    
        logger.success(f"Scrapped NOMAD data successfully and saved to {output_dir} !")

    else:
        logger.error("Cannot scrap data, no connection to NOMAD API.")
        sys.exit()

scrap_nomad_data()

[32m2025-10-28 11:55:39.368[0m | [1mINFO    [0m | [36m__main__[0m:[36mscrap_nomad_data[0m:[36m3[0m - [1mStarting Nomad data scraping...[0m
[32m2025-10-28 11:55:39.368[0m | [1mINFO    [0m | [36m__main__[0m:[36mfetch_entries_md_related[0m:[36m13[0m - [1mFetching Molecular Dynamics related entries from NOMAD API...[0m
[32m2025-10-28 11:57:37.704[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mfetch_entries_md_related[0m:[36m31[0m - [32m[1mFetched 15934 MD-related entries from NOMAD successfully ![0m
[32m2025-10-28 11:57:42.307[0m | [1mINFO    [0m | [36m__main__[0m:[36msave_nomad_metadata[0m:[36m49[0m - [1mSaving NOMAD metadata to local file...[0m


Error parsing entry: list index out of range
Error parsing entry: list index out of range
Error parsing entry: list index out of range
Error parsing entry: list index out of range
Error parsing entry: list index out of range
Error parsing entry: list index out of range
Error parsing entry: list index out of range
Error parsing entry: list index out of range
Error parsing entry: list index out of range
Error parsing entry: list index out of range
Error parsing entry: list index out of range
Error parsing entry: list index out of range
Error parsing entry: list index out of range
Error parsing entry: list index out of range
Error parsing entry: list index out of range
Error parsing entry: list index out of range
Error parsing entry: list index out of range
Error parsing entry: list index out of range
Error parsing entry: list index out of range
Error parsing entry: list index out of range
Error parsing entry: list index out of range
Error parsing entry: list index out of range
Error pars

[32m2025-10-28 11:57:45.006[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36msave_nomad_metadata[0m:[36m53[0m - [32m[1mNOMAD metadata saved to: data/nomad/nomad_metadata.parquet successfully![0m
[32m2025-10-28 11:57:45.006[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mscrap_nomad_data[0m:[36m21[0m - [32m[1mScrapped NOMAD data successfully and saved to data/nomad ![0m
