In [1]:
'''
Script to get the subfield metadata for all OpenAlex topics
'''

'\nScript to get the subfield metadata for all OpenAlex topics\n'

In [2]:
import csv
import requests

from helper_funcs import *
from constants import *

import argparse
import gzip
import json
import sys
import time
from pathlib import Path
from typing import Any, Dict, List, Tuple

from __future__ import annotations

In [3]:
# --- OpenAlex download settings ----------------------------------------------

# URL of the OpenAlex subfields listing that gets paged through.
ENDPOINT = "https://api.openalex.org/subfields"

# Page size requested on every call (OpenAlex maximum).
MAX_PER_PAGE = 200

# Pause between consecutive page requests, in seconds – stay polite.
SLEEP_BETWEEN = 0.2

# A progress line is printed each time this many records have been processed.
PROGRESS_EVERY = 1000

In [9]:
def short_id(uri: str) -> str:
    """Extract the last path segment of a URI.

    Trailing slashes are ignored, so ``".../subfields/1702/"`` and
    ``".../subfields/1702"`` both give ``"1702"``. A string containing no
    ``/`` is returned as-is (after trailing-slash removal).

    Args:
        uri: Any URI-like string.

    Returns:
        The text following the final ``/`` of the trimmed input.
    """
    trimmed = uri.rstrip("/")
    # rpartition yields (head, sep, tail); when no "/" is left the whole
    # string ends up in `tail`, matching a right-split that finds nothing.
    _head, _sep, tail = trimmed.rpartition("/")
    return tail

In [18]:
def extract_subfield(raw: Dict[str, Any]) -> Dict[str, Any]:
    """Flatten one raw subfield object into the record stored per subfield.

    Args:
        raw: One element of the ``results`` list returned by the subfields
            endpoint. Must contain ``"id"``; every other key is optional.

    Returns:
        A dict with the keys:

        * ``display_name`` – ``raw["display_name"]``, or ``""`` when the key
          is absent.
        * ``description`` – ``raw["description"]``, or ``""`` when the key is
          absent (an explicit null is passed through unchanged as ``None``).
        * ``openalex_id`` – the full ``raw["id"]`` value, untouched.
        * ``siblings`` – list of short IDs (last URI segment) of each sibling.
        * ``field`` – ``{"id": <short ID>, "display_name": ...}`` for the
          parent field, or ``None`` when ``raw`` has no truthy ``"field"``.

    Raises:
        KeyError: if ``raw`` lacks ``"id"``, a sibling lacks ``"id"``, or a
            present ``field`` object lacks ``"id"`` / ``"display_name"``.
    """
    # `dict.get(key, default)` only falls back when the key is *missing*; a
    # JSON null would come through as None and break the iteration below,
    # so treat null and missing alike.
    siblings = raw.get("siblings") or []
    parent = raw.get("field")

    return {
        "display_name": raw.get("display_name", ""),
        "description":  raw.get("description", ""),   # not always present
        "openalex_id":  raw["id"],                    # full URL
        "siblings":     [short_id(sib["id"]) for sib in siblings],
        "field": {
            "id":           short_id(parent["id"]),
            "display_name": parent["display_name"],
        } if parent else None,
    }

In [19]:
def _download_subfields() -> Dict[str, Any]:
    """Page through ENDPOINT and return every subfield keyed by its short ID."""
    subfields: Dict[str, Any] = {}
    page, fetched = 1, 0

    # A single Session lets the underlying HTTPS connection be reused for
    # every page instead of being re-established on each request.
    with requests.Session() as session:
        while True:
            resp = session.get(
                ENDPOINT,
                # NOTE(review): OpenAlex's paging docs spell this parameter
                # `per-page`; confirm the underscore form is honoured. The
                # loop terminates correctly either way because it stops on
                # the first empty page, not on a short one.
                params={"page": page, "per_page": MAX_PER_PAGE},
                timeout=30,
            )
            resp.raise_for_status()
            # `or []` also covers a payload whose "results" is null.
            results: List[Dict[str, Any]] = resp.json().get("results") or []

            if not results:      # no more pages
                break

            for raw in results:
                subfields[short_id(raw["id"])] = extract_subfield(raw)
                fetched += 1

                if fetched % PROGRESS_EVERY == 0:
                    print(f"  … {fetched:,} subfields so far")

            page += 1
            time.sleep(SLEEP_BETWEEN)

    return subfields


def main(out_path: Path = Path("openalex_subfields.json")) -> None:
    """Download all OpenAlex subfields and write them to a JSON file.

    Requests successive pages from ENDPOINT until an empty page comes back,
    flattens each record with ``extract_subfield`` and stores it under its
    short ID, then dumps the resulting mapping as indented UTF-8 JSON.

    Args:
        out_path: Destination file (a ``Path`` or plain string). Defaults to
            ``openalex_subfields.json`` in the current working directory,
            which is where the file was always written before this
            parameter existed.

    Raises:
        requests.HTTPError: if any page request returns an error status.
        KeyError: if a returned record lacks the ``"id"`` key.
    """
    print("\n⏳  Downloading OpenAlex subfields …\n")

    subfields = _download_subfields()

    # Report the number of entries that will actually be written; if the API
    # ever repeats a record across pages the dict keeps a single copy.
    print(f"\n✅  Finished: {len(subfields):,} subfields collected")

    out_path = Path(out_path)
    with out_path.open("w", encoding="utf-8") as fh:
        json.dump(subfields, fh, indent=2, ensure_ascii=False)

    print(f"✔  wrote → {out_path.resolve()}")
    print("🎉  All done!")


In [20]:
# Run the download; main() writes openalex_subfields.json relative to the
# current working directory and prints the resolved path when done.
main()


⏳  Downloading OpenAlex subfields …


✅  Finished: 252 subfields collected
✔  wrote → /Users/kaushalamancherla/helio-aws/SocialKG/openalex_subfields.json
🎉  All done!
