In [None]:
import requests
import pandas as pd
from time import sleep
from concurrent.futures import ThreadPoolExecutor, as_completed
import os

# Endpoint that every OpenAlex request in this notebook is sent to.
BASE_URL = "https://api.openalex.org/works"

# Look-up terms: each entry is used as one title-search phrase, and the
# comments below group the phrases by broad research field.
Query = [
    # Computer Science / AI
    "machine learning",
    "artificial intelligence",
    "neural networks",
    "deep learning",
    "computer vision",
    "reinforcement learning",
    "natural language processing",
    "AI ethics",
    "robotics",
    "knowledge graphs",
    
    # Physics
    "quantum mechanics",
    "climate modeling",
    "string theory",
    "particle physics",
    "astrophysics",
    "condensed matter physics",
    "gravitational waves",
    "thermodynamics",
    "optics",
    "plasma physics",
    
    # Biology / Medicine
    "genome sequencing",
    "cancer immunotherapy",
    "CRISPR gene editing",
    "stem cell therapy",
    "epigenetics",
    "microbiome research",
    "protein folding",
    "neuroscience",
    "vaccine development",
    "bioinformatics",
    
    # Social Sciences
    "behavioral economics",
    "urban sociology",
    "political polarization",
    "education policy",
    "social networks",
    "gender studies",
    "migration studies",
    "organizational behavior",
    "public health policy",
    "criminology",
    
    # Humanities
    "medieval literature",
    "renaissance art",
    "philosophy of mind",
    "linguistics",
    "cultural anthropology",
    "classical archaeology",
    "music theory",
    "modern literature",
    "history of science",
    "religious studies"
]

In [2]:
def fetch_page(query, page, per_page=20, max_retries=10):
    """Fetch a single page of OpenAlex results with retries.

    Requests open-access works whose title matches ``query`` and reduces each
    result to the fields used by the rest of the notebook.

    Args:
        query: Search phrase placed in the ``title.search`` filter.
        page: Page number sent as the ``page`` query parameter.
        per_page: Number of results requested per page (``per-page``).
        max_retries: Maximum number of request attempts before giving up.

    Returns:
        A list of dicts with the keys ``id``, ``title``, ``abstract`` (the raw
        ``abstract_inverted_index`` from the API), ``year`` and ``concepts``
        (list of concept display names). An empty list is returned when every
        attempt failed.
    """
    # Let requests build and URL-encode the query string; the search phrases
    # contain spaces and must not be pasted into the URL verbatim.
    params = {
        "filter": f"title.search:{query},open_access.is_oa:true",
        "per-page": per_page,
        "page": page,
    }
    for attempt in range(1, max_retries + 1):
        try:
            r = requests.get(BASE_URL, params=params, timeout=10)
            r.raise_for_status()
            # `or []` also covers an explicit null in the payload.
            results = r.json().get("results") or []
            return [
                {
                    "id": d.get("id"),
                    "title": d.get("title"),
                    "abstract": d.get("abstract_inverted_index"),
                    "year": d.get("publication_year"),
                    # Tolerate a missing/null concept list and concept entries
                    # without a display name instead of raising KeyError,
                    # which the except clause below would not catch.
                    "concepts": [
                        c["display_name"]
                        for c in (d.get("concepts") or [])
                        if c.get("display_name") is not None
                    ],
                }
                for d in results
            ]
        except (requests.exceptions.RequestException, ValueError) as e:
            print(f"Page {page} attempt {attempt} failed: {e}")
            # Only wait when another attempt will actually follow.
            if attempt < max_retries:
                sleep(15)
    print(f"Page {page} skipped after {max_retries} retries.")
    return []

In [None]:
# Rebuild the plain-text abstract from its inverted index
def reconstruct_abstract(inv_index):
    """Turn a ``{word: [positions]}`` inverted index back into running text.

    Returns an empty string for a missing or empty index. When two words claim
    the same position, the one iterated last wins.
    """
    if not inv_index:
        return ""
    word_at = {
        position: term
        for term, occurrences in inv_index.items()
        for position in occurrences
    }
    ordered_words = [term for _, term in sorted(word_at.items())]
    return " ".join(ordered_words)

In [None]:
# Fetch the result pages of one query concurrently
def fetch_query_parallel(query, max_pages=50, per_page=20, max_workers=10):
    """Fetch pages 1..max_pages for ``query`` in parallel and merge them.

    Args:
        query: Search phrase forwarded to ``fetch_page``.
        max_pages: Number of pages to request (pages ``1`` to ``max_pages``).
        per_page: Page size forwarded to ``fetch_page``.
        max_workers: Size of the thread pool issuing the requests.

    Returns:
        A list of paper dicts, de-duplicated on their ``"id"`` value (a later
        page overwrites an earlier entry with the same id).
    """
    papers_dict = {}
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(fetch_page, query, page, per_page)
            for page in range(1, max_pages + 1)
        ]
        # Consume pages in completion order; ordering does not matter because
        # the results are keyed by paper id.
        for future in as_completed(futures):
            for paper in future.result():
                papers_dict[paper["id"]] = paper  # deduplication
    return list(papers_dict.values())

In [None]:
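# Optional reset: the collection loop appends to openalex_papers5.csv, so
# uncomment the two lines below to delete an existing file before a fresh run.
# if os.path.exists("openalex_papers5.csv"):
#    os.remove("openalex_papers5.csv")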

In [6]:
# Accumulators for the collection loop:
#   batch            - rows waiting to be written to the CSV
#   global_paper_ids - ids already seen, used to skip duplicates across queries
batch = list()
global_paper_ids = set()

In [None]:
# Output file and flush threshold used by the collection loop.
OUTPUT_CSV = "openalex_papers5.csv"
CHUNK_SIZE = 1000


def _flush_batch(rows, path=OUTPUT_CSV):
    """Append ``rows`` to the CSV at ``path`` and return the written DataFrame.

    The header row is written only when the file does not exist yet, so
    successive appends produce one well-formed CSV.
    """
    chunk = pd.DataFrame(rows)
    chunk.to_csv(path, mode="a", index=False, header=not os.path.exists(path))
    return chunk


# Collect every query, dropping papers already seen in an earlier query, and
# append the rows to the CSV in chunks so the complete result set never has to
# be held in a single DataFrame.
for q_idx, q in enumerate(Query, 1):
    print(f"Processing query {q_idx}/{len(Query)}: {q}")
    papers = fetch_query_parallel(q)
    print(f"Fetched {len(papers)} papers for query '{q}'")

    for paper in papers:
        pid = paper["id"]
        if pid in global_paper_ids:
            continue  # skip duplicates across queries
        global_paper_ids.add(pid)
        # Replace the raw inverted index with the reconstructed abstract text.
        paper["abstract_text"] = reconstruct_abstract(paper.pop("abstract", None))
        batch.append(paper)

        # Write the batch to the CSV once it has reached CHUNK_SIZE rows.
        if len(batch) >= CHUNK_SIZE:
            df_chunk = _flush_batch(batch)
            batch = []

# Write remaining papers, then clear the batch so that re-running this cell
# does not append the same leftover rows a second time.
if batch:
    df_chunk = _flush_batch(batch)
    batch = []

print(f"Finished. Total unique papers saved: {len(global_paper_ids)}")
# The file name is taken from a variable: nesting the same quote character
# inside an f-string is a SyntaxError before Python 3.12.
print(f"CSV file: {OUTPUT_CSV}")


Processing query 1/50: machine learning
Fetched 1000 English papers for query 'machine learning'
Processing query 2/50: artificial intelligence
Fetched 1000 English papers for query 'artificial intelligence'
Processing query 3/50: neural networks
Fetched 1000 English papers for query 'neural networks'
Processing query 4/50: deep learning
Fetched 1000 English papers for query 'deep learning'
Processing query 5/50: computer vision
Fetched 1000 English papers for query 'computer vision'
Processing query 6/50: reinforcement learning
Fetched 1000 English papers for query 'reinforcement learning'
Processing query 7/50: natural language processing
Fetched 1000 English papers for query 'natural language processing'
Processing query 8/50: AI ethics
Fetched 1000 English papers for query 'AI ethics'
Processing query 9/50: robotics
Fetched 1000 English papers for query 'robotics'
Processing query 10/50: knowledge graphs
Fetched 1000 English papers for query 'knowledge graphs'
Processing query 11/5

In [8]:
# Preview the first rows of df_chunk. Note that df_chunk is reassigned on every
# CSV write in the collection loop, so it holds only the most recently written
# chunk, not the whole collected dataset.
df_chunk.head(10)

Unnamed: 0,id,title,year,concepts,abstract_text
0,https://openalex.org/W2886899886,Pattern of inter-marriage in Keningau: a preli...,2016,"[Faith, Ethnic group, Religious conversion, Is...",This paper seeks to discuss the profiling patt...
1,https://openalex.org/W3217800672,Study of expenditure and stay in the segmentat...,2021,"[Tourism, Diversification (marketing strategy)...",Tourist expenditure is an element that is gain...
2,https://openalex.org/W4312205350,Communication Patterns of Gus Baha' Religious ...,2022,"[Clothing, History, Archaeology]",This article aims to describe the communicatio...
3,https://openalex.org/W2947724033,Pope Francis's Laudato Si': A corpus study of ...,2019,"[Encyclical, Relevance (law), Rhetorical quest...",This paper explores aspects of the lexico-gram...
4,https://openalex.org/W3035577798,The Dilemma Between Religious Doctrine and Pol...,2020,"[Politics, Ideology, Doctrine, Islam, Politica...",This paper aims to examine how Hamas as an Isl...
5,https://openalex.org/W3160035937,The Impact of the Church–State Model for an Ef...,2021,"[Coronavirus disease 2019 (COVID-19), Interpre...","During the COVID-19 pandemic, many governments..."
6,https://openalex.org/W4378232595,The influence of cultural and religious factor...,2023,"[Anxiety, Hofstede's cultural dimensions theor...",Introduction Low back pain and neck pain are a...
7,https://openalex.org/W3154589232,Religious tourism in Jordan: current situation...,2012,"[Tourism, Islam, Religious tourism, State (com...",The purpose of this research study is to analy...
8,https://openalex.org/W1973500443,Is it still possible to study religion religio...,2013,"[Mythology, Variety (cybernetics), Philosophy,...",This article reflects on the question whether ...
9,https://openalex.org/W2033397684,"A Comparative Study on Cyber Ethics, Religious...",2013,"[Honesty, Social psychology, Psychology, Happi...","Throughout the internet's evolution , debates ..."


In [None]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict, Counter
import json

In [None]:
# Stop-word set consulted by tokenize() to drop common words.
stop_words = set(stopwords.words("english")) # English stop-word list only; assumes the abstracts are in English

In [None]:
def tokenize(text):
    """Lower-case ``text``, split it into word tokens and filter them.

    Only purely alphabetic tokens that are not in ``stop_words`` are kept, in
    their original order.
    """
    kept = []
    for candidate in word_tokenize(text.lower()):
        if not candidate.isalpha():
            continue
        if candidate in stop_words:
            continue
        kept.append(candidate)
    return kept

In [None]:
def build_inverted_index(text):
    """Build an inverted index from a string.

    The text is passed through ``tokenize`` and each surviving token is mapped
    to the list of positions it occupies in that filtered token sequence.
    Returns an empty dict for empty or missing text.
    """
    if not text:
        return {}

    positions_by_token = {}
    for index, term in enumerate(tokenize(text)):
        positions_by_token.setdefault(term, []).append(index)
    return positions_by_token

In [None]:
# Load the complete collected dataset from disk. df_chunk is not used here
# because it only holds the last chunk written by the collection loop, which
# would silently restrict all further processing to that fragment.
df = pd.read_csv("openalex_papers5.csv")

In [None]:
# Keep only rows that have a non-empty abstract. The explicit .copy() makes the
# result an independent frame, so the column assignments in the following
# cells do not operate on a slice of the original (SettingWithCopyWarning).
df = df[df["abstract_text"].notna() & (df["abstract_text"] != "")].copy()

In [None]:
# Add a column holding, for each abstract, the dict returned by
# build_inverted_index (token -> list of positions).
df["abstract_inverted_index"] = df["abstract_text"].apply(build_inverted_index) # construct inverted index

In [None]:
# Expand each inverted index into a flat token list:
# - every term is repeated once per recorded position
# - every token is lower-cased
# The list follows the index's iteration order, so tokens are grouped by term
# rather than appearing in their original word order.
df["tokens"] = df["abstract_inverted_index"].apply(
    lambda inv: [
        term.lower()
        for term, occurrences in inv.items()
        for _ in range(len(occurrences))
    ]
)

In [None]:
# Serialise each token list to a JSON string before saving: a CSV cell can only
# hold text, and JSON lets the list be restored with json.loads after reading.
df["tokens"] = df["tokens"].apply(json.dumps) # list -> JSON string

In [None]:
# Write the processed frame to disk without the index column. The
# abstract_inverted_index column still holds Python dicts at this point; only
# the tokens column was converted to JSON strings beforehand.
df.to_csv("./openalex10.csv", index=False) # save to csv