In [1]:
'''
Script to get the topic metadata for all OpenAlex topics
'''

'\nScript to get the topic metadata for all OpenAlex topics\n'

In [18]:
import csv
import requests

from helper_funcs import *
from constants import *

import argparse
import gzip
import json
import sys
import time
from pathlib import Path
from typing import Any, Dict, List, Tuple

from __future__ import annotations

In [19]:
# Settings for the OpenAlex topics download.
ENDPOINT: str = "https://api.openalex.org/topics"  # URL the pages are requested from
MAX_PER_PAGE: int = 200        # page size to request (the OpenAlex maximum)
SLEEP_BETWEEN: float = 0.20    # pause between page requests, in seconds, to stay polite
PROGRESS_EVERY: int = 1_000    # print a progress line every this many topics

In [20]:
def short_id(uri: str) -> str:
    """Extract the last path segment of a URI.

    Trailing slashes are stripped first, so ``"https://openalex.org/T123/"``
    and ``"https://openalex.org/T123"`` both give ``"T123"``. A string with
    no ``/`` in it is returned unchanged.
    """
    trimmed = uri.rstrip("/")
    # rpartition yields ("", "", trimmed) when there is no "/", so the
    # last element is always the segment we want.
    _head, _sep, tail = trimmed.rpartition("/")
    return tail

In [21]:
def extract_topic(raw: Dict[str, Any]) -> Dict[str, Any]:
    """Flatten one OpenAlex topic object into the record stored per topic.

    Optional fields are null-safe: ``dict.get(key, default)`` only falls back
    to the default when the key is *absent*, so a key that is present with a
    JSON ``null`` would otherwise come through as ``None`` (and a null
    ``siblings`` would raise ``TypeError`` when iterated). Missing and null
    values are therefore treated the same way here.

    Args:
        raw: A topic object as decoded from the API response. Must contain
            ``"id"``; every other field is optional.

    Returns:
        A dict with these keys:

        * ``display_name`` / ``description`` - strings, ``""`` when missing or null.
        * ``keywords`` - the keywords value, ``[]`` when missing or null.
        * ``openalex_id`` - ``raw["id"]`` unchanged (the full URL).
        * ``siblings`` - short IDs of the sibling topics (``[]`` when none).
        * ``subfield`` - ``{"id": <short id>, "display_name": ...}``, or
          ``None`` when the topic has no subfield.

    Raises:
        KeyError: if ``raw`` has no ``"id"``, a sibling entry has no ``"id"``,
            or a non-empty subfield lacks ``"id"`` or ``"display_name"``.
    """
    # `or` covers both "key missing" and "key present but null".
    siblings = raw.get("siblings") or []
    subfield = raw.get("subfield")

    return {
        "display_name": raw.get("display_name") or "",
        "description":  raw.get("description") or "",
        "keywords":     raw.get("keywords") or [],
        "openalex_id":  raw["id"],                    # full URL kept here

        # Siblings are stored by short ID only.
        "siblings": [short_id(sib["id"]) for sib in siblings],

        "subfield": {
            "id":           short_id(subfield["id"]),
            "display_name": subfield["display_name"],
        } if subfield else None,
    }

In [22]:
def main(out_path: Path = Path("openalex_topics.json")) -> None:
    """Download every OpenAlex topic and write the collection to a JSON file.

    Requests ``ENDPOINT`` page by page (``MAX_PER_PAGE`` topics per request)
    until a page comes back with no results. Each topic is flattened with
    ``extract_topic`` and stored under its short ID, and the resulting
    mapping is written to ``out_path`` as indented UTF-8 JSON. Progress is
    printed every ``PROGRESS_EVERY`` topics.

    Args:
        out_path: Destination file; a plain string path is accepted too.
            Defaults to ``openalex_topics.json`` in the current working
            directory, which is where the file was written before this
            parameter existed.

    Raises:
        requests.HTTPError: if a page request returns an error status.
            Nothing is written in that case, because the file is only
            produced after the last page has been fetched.
    """
    out_path = Path(out_path)
    topics: Dict[str, Any] = {}
    page, fetched = 1, 0

    print("\n⏳  Downloading OpenAlex topics …\n")

    # A single session lets every page request reuse the same connection.
    with requests.Session() as session:
        while True:
            resp = session.get(
                ENDPOINT,
                # The OpenAlex paging docs name the page-size parameter
                # "per-page" (hyphen), so send the documented spelling.
                params={"page": page, "per-page": MAX_PER_PAGE},
                timeout=30,
            )
            resp.raise_for_status()
            results: List[Dict[str, Any]] = resp.json().get("results", [])

            if not results:                    # no more pages, done
                break

            for raw in results:
                topics[short_id(raw["id"])] = extract_topic(raw)
                fetched += 1
                if fetched % PROGRESS_EVERY == 0:
                    print(f"  … {fetched:,} topics so far")

            page += 1
            time.sleep(SLEEP_BETWEEN)          # stay polite between requests

    print(f"\n✅  Finished: {fetched:,} topics collected")

    with out_path.open("w", encoding="utf-8") as fh:
        # ensure_ascii=False keeps non-ASCII names readable in the file.
        json.dump(topics, fh, indent=2, ensure_ascii=False)

    print(f"✔  wrote → {out_path.resolve()}")
    print("🎉  All done!")

In [23]:
# Run the full download: prints progress and writes openalex_topics.json
# relative to the current working directory.
main()


⏳  Downloading OpenAlex topics …

  … 1,000 topics so far
  … 2,000 topics so far
  … 3,000 topics so far
  … 4,000 topics so far

✅  Finished: 4,516 topics collected
✔  wrote → /Users/kaushalamancherla/helio-aws/SocialKG/openalex_topics.json
🎉  All done!
