In [1]:
import os
import re
from pathlib import Path
import requests

In [2]:
# Entity endpoints that the cells below iterate over, in this order.
entity_endpoints = [
    'works', 'authors', 'sources', 'publishers',
    'funders', 'institutions', 'concepts',
]

### From API

In [3]:
# Collect every field name returned by each entity's `valid_fields` endpoint,
# stored as {'key': <field>, 'entityType': <endpoint>} records.
api_valid_fields = []
for entity in entity_endpoints:
    # A timeout keeps a stalled connection from hanging the notebook forever.
    resp = requests.get(f"https://api.openalex.org/{entity}/valid_fields", timeout=30)
    # Fail loudly on an HTTP error: if an error body were a JSON object,
    # iterating it would yield its keys and record them as if they were fields.
    resp.raise_for_status()
    api_valid_fields.extend(
        {'key': item, 'entityType': entity} for item in resp.json()
    )

In [4]:
# Total number of {'key', 'entityType'} records collected from the API.
len(api_valid_fields)

287

In [5]:
# Inspect the collected records (a list of {'key', 'entityType'} dicts).
api_valid_fields

[{'key': 'abstract.search', 'entityType': 'works'},
 {'key': 'apc_list.currency', 'entityType': 'works'},
 {'key': 'apc_list.provenance', 'entityType': 'works'},
 {'key': 'apc_list.value', 'entityType': 'works'},
 {'key': 'apc_list.value_usd', 'entityType': 'works'},
 {'key': 'apc_paid.currency', 'entityType': 'works'},
 {'key': 'apc_paid.provenance', 'entityType': 'works'},
 {'key': 'apc_paid.value', 'entityType': 'works'},
 {'key': 'apc_paid.value_usd', 'entityType': 'works'},
 {'key': 'author.id', 'entityType': 'works'},
 {'key': 'author.orcid', 'entityType': 'works'},
 {'key': 'authors_count', 'entityType': 'works'},
 {'key': 'authorships.author.id', 'entityType': 'works'},
 {'key': 'authorships.author.orcid', 'entityType': 'works'},
 {'key': 'authorships.countries', 'entityType': 'works'},
 {'key': 'authorships.institutions.continent', 'entityType': 'works'},
 {'key': 'authorships.institutions.country_code', 'entityType': 'works'},
 {'key': 'authorships.institutions.id', 'entityTy

### From GUI source code

In [6]:
# Raw facetConfigs.js from the openalex-gui repository (the "search" ref in the URL).
url = "https://raw.githubusercontent.com/ourresearch/openalex-gui/search/src/facetConfigs.js"
# A timeout keeps a stalled connection from hanging the notebook forever.
r = requests.get(url, timeout=30)
# Without this check an error page (e.g. a 404 after the file or ref moves)
# would be parsed as if it were the config and simply yield no matches.
r.raise_for_status()

In [7]:
# Capture each `key: "..."` together with the nearest following
# `entityType: "..."`. DOTALL (same flag as re.S) makes `.` match newlines,
# so a single match may span several lines of the downloaded source.
p = re.compile(
    r'key: "(.*?)",.*?entityType: "(.*?)"',
    flags=re.DOTALL,
)
gui_matches = p.findall(r.text)

In [8]:
# Inspect the body of the most recent response stored in `r`.
r.text

'import {sortByKey} from "./util";\n\nconst facetCategories = {\n    works: [\n        "popular",\n        "author",\n        "source",\n        "funder",\n        "institution",\n\n        "open access",\n        "search",\n        "citation",\n        "other",\n    ],\n    authors: [\n        "popular",\n        "institution",\n        "geo",\n        "ids",\n        "other",\n    ],\n    sources: [\n        "popular",\n        "open access",\n        "other",\n    ],\n    publishers: [\n        "popular",\n        "other",\n    ],\n    institutions: [\n        "popular",\n        "geo",\n        "other",\n    ],\n    concepts: [\n        "popular",\n        "other"\n    ],\n}\n\nconst facetCategoriesIcons = {\n    popular: "mdi-star-outline",\n    author: "mdi-account-outline",\n    institution: "mdi-town-hall",\n    geo: "mdi-map-marker-outline",\n    funder: "mdi-cash-multiple",\n    "source": "mdi-book-multiple-outline",\n    repository: "mdi-package-variant",\n    search: "mdi-m

In [9]:
# Turn each (key, entityType) regex match into a record with the same
# two keys used for the API records, so the two lists can be compared.
gui_filters = [
    {'key': key, 'entityType': entity_type}
    for key, entity_type in gui_matches
]

In [10]:
# Number of (key, entityType) records extracted from the GUI source.
len(gui_filters)

74

In [11]:
import pandas as pd
import numpy as np

In [12]:
# For every GUI filter record, check whether an identical
# {'key', 'entityType'} record exists among the API records, then tally
# how many checks came out True vs. False.
is_api_valid = [gui_filter in api_valid_fields for gui_filter in gui_filters]
pd.Series(is_api_valid).value_counts()

True    74
Name: count, dtype: int64

In [13]:
# Write the GUI filter records to gui_filters.csv, one row per record,
# leaving out the DataFrame index column.
gui_filters_df = pd.DataFrame(gui_filters)
gui_filters_df.to_csv('gui_filters.csv', index=False)

### From docs

In [39]:
# Read the `filter-<entity>.md` page for each entity endpoint, searching
# recursively from two directories above the working directory.
# Result: {file name: file contents}.
docs_txt = {}
for e in entity_endpoints:
    # Only the first match is used; stop searching as soon as it is found.
    fp = next(Path('../../').rglob(f'filter-{e}.md'), None)
    if fp is None:
        # Name the missing page and the search root instead of surfacing a
        # bare IndexError from indexing an empty list.
        raise FileNotFoundError(
            f"no 'filter-{e}.md' found under {Path('../../').resolve()}"
        )
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding.
    txt = fp.read_text(encoding='utf-8')
    docs_txt[fp.name] = txt

In [40]:
# File names of the documentation pages that were loaded.
docs_txt.keys()

dict_keys(['filter-works.md', 'filter-authors.md', 'filter-sources.md', 'filter-publishers.md', 'filter-funders.md', 'filter-institutions.md', 'filter-concepts.md'])

In [42]:
# Every backtick-delimited snippet on a single line of a docs page is a
# candidate filter name; collect them per file, in order of appearance.
p = re.compile(r'`(.+?)`')
docs_filters_candidates = {
    filename: p.findall(txt)
    for filename, txt in docs_txt.items()
}

In [43]:
# Inspect the candidate snippets found in each docs page (file name -> list of strings).
docs_filters_candidates

{'filter-works.md': ['filter',
  'https://api.openalex.org/works?filter=publication\\_year:2020',
  'publication_year',
  '/works',
  'Work',
  'Work',
  'host_venue',
  'alternate_host_venues',
  'primary_location',
  'locations',
  'host_venue',
  'alternate_host_venues',
  'authorships.author.id',
  'author.id',
  'authorships.author.orcid',
  'author.orcid',
  'authorships.countries',
  'authorships.institutions.country_code',
  'institutions.country_code',
  'authorships.institutions.id',
  'institutions.id',
  'authorships.institutions.lineage',
  'authorships.institutions.ror',
  'institutions.ror',
  'authorships.institutions.type',
  'authorships.is_corresponding',
  'is_corresponding',
  'apc_list.value',
  'apc_list.currency',
  'apc_list.provenance',
  'apc_list.value_usd',
  'apc_paid.value',
  'apc_paid.currency',
  'apc_paid.provenance',
  'apc_paid.value_usd',
  'best_oa_location.is_accepted',
  'best_oa_location.is_published',
  'best_oa_location.license',
  'best_oa_l