In [1]:
'''
Script to get the field metadata for all OpenAlex fields
'''

'\nScript to get the field metadata for all OpenAlex fields\n'

In [2]:
import csv
import requests

from helper_funcs import *
from constants import *

import argparse
import gzip
import json
import sys
import time
from pathlib import Path
from typing import Any, Dict, List, Tuple

from __future__ import annotations

In [3]:
# Request settings for paging through the OpenAlex "fields" listing.
ENDPOINT = "https://api.openalex.org/fields"
MAX_PER_PAGE = 200      # page size sent with every request (OpenAlex hard limit)
SLEEP_BETWEEN = 0.2     # pause between page requests, in seconds – be polite
PROGRESS_EVERY = 1000   # print a progress line each time this many records are in

In [4]:
def short_id(uri: str) -> str:
    """Extract the last path segment of a URI.

    Trailing slashes are stripped first, so ``".../fields/17/"`` and
    ``".../fields/17"`` both yield ``"17"``. A string containing no ``/``
    is returned unchanged.
    """
    trimmed = uri.rstrip("/")
    _head, _sep, tail = trimmed.rpartition("/")
    return tail

In [8]:
def extract_field(raw: Dict[str, Any]) -> Dict[str, Any]:
    """Flatten one raw FIELD object with minted IDs.

    Args:
        raw: One decoded JSON object from the fields listing. ``"id"`` is
            required; ``"display_name"``, ``"description"``, ``"siblings"``
            and ``"domain"`` are optional and may also be present as null.

    Returns:
        A dict with the keys ``display_name`` and ``description`` (always
        strings, ``""`` when missing or null), ``openalex_id`` (the ``id``
        value exactly as received), ``siblings`` (list of shortened sibling
        IDs, ``[]`` when missing or null) and ``domain`` (a dict with the
        shortened ``id`` and the ``display_name``, or ``None`` when the raw
        object has no domain).

    Raises:
        KeyError: if ``raw`` has no ``"id"``, a sibling entry has no
            ``"id"``, or a non-empty domain lacks ``"id"`` / ``"display_name"``.
    """
    # dict.get(key, default) only falls back when the key is ABSENT; a key
    # that is present with a JSON null still yields None. `or` covers both
    # cases, so a null "siblings" no longer raises TypeError on iteration
    # and null text fields come out as "" like missing ones do.
    siblings = raw.get("siblings") or []
    domain = raw.get("domain")

    return {
        "display_name": raw.get("display_name") or "",
        "description":  raw.get("description") or "",    # may be empty
        "openalex_id":  raw["id"],                       # full URL kept
        "siblings": [short_id(sib["id"]) for sib in siblings],   # minted IDs
        "domain": {
            "id":           short_id(domain["id"]),
            "display_name": domain["display_name"],
        } if domain else None,
    }

In [9]:
def main() -> None:
    """Download every OpenAlex field and save them to ``openalex_fields.json``.

    Requests successive pages from ``ENDPOINT`` until a page comes back with
    no results. Each record is flattened with ``extract_field`` and stored
    under the shortened form of its ``id``. The resulting mapping is written
    as indented UTF-8 JSON to a path relative to the current working
    directory.

    Raises:
        requests.HTTPError: propagated from ``raise_for_status`` when a page
            request returns an error status.
    """
    collected: Dict[str, Any] = {}
    page_no = 1
    total = 0

    print("\n⏳  Downloading OpenAlex fields …\n")

    while True:
        query = {"page": page_no, "per_page": MAX_PER_PAGE}
        response = requests.get(ENDPOINT, params=query, timeout=30)
        response.raise_for_status()
        batch: List[Dict[str, Any]] = response.json().get("results", [])

        # An empty page marks the end of the listing.
        if not batch:
            break

        # `total` keeps counting across pages: enumerate resumes one past
        # the number of records already stored.
        for total, record in enumerate(batch, start=total + 1):
            field_key = short_id(record["id"])   # minted key
            collected[field_key] = extract_field(record)

            if total % PROGRESS_EVERY == 0:
                print(f"  … {total:,} fields so far")

        page_no += 1
        time.sleep(SLEEP_BETWEEN)

    print(f"\n✅  Finished: {total:,} fields collected")

    destination = Path("openalex_fields.json")
    with destination.open("w", encoding="utf-8") as handle:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        json.dump(collected, handle, indent=2, ensure_ascii=False)

    print(f"✔  wrote → {destination.resolve()}")
    print("🎉  All done!")

In [10]:
# Entry-point guard: call main() only when this code runs as the top-level
# program (not when it is imported as a module).
if __name__ == "__main__":
    main()


⏳  Downloading OpenAlex fields …


✅  Finished: 26 fields collected
✔  wrote → /Users/kaushalamancherla/helio-aws/SocialKG/openalex_fields.json
🎉  All done!
