In [None]:
import requests
import pandas as pd
from time import sleep

# OpenAlex endpoint that is queried for works.
BASE_URL = "https://api.openalex.org/works"

# Look-up terms, grouped by research field. The grouping is only for
# readability; the flat list below keeps the fields and terms in this order.
_TERMS_BY_FIELD = {
    "Computer Science / AI": (
        "machine learning",
        "artificial intelligence",
        "neural networks",
        "deep learning",
        "computer vision",
        "reinforcement learning",
        "natural language processing",
        "AI ethics",
        "robotics",
        "knowledge graphs",
    ),
    "Physics": (
        "quantum mechanics",
        "climate modeling",
        "string theory",
        "particle physics",
        "astrophysics",
        "condensed matter physics",
        "gravitational waves",
        "thermodynamics",
        "optics",
        "plasma physics",
    ),
    "Biology / Medicine": (
        "genome sequencing",
        "cancer immunotherapy",
        "CRISPR gene editing",
        "stem cell therapy",
        "epigenetics",
        "microbiome research",
        "protein folding",
        "neuroscience",
        "vaccine development",
        "bioinformatics",
    ),
    "Social Sciences": (
        "behavioral economics",
        "urban sociology",
        "political polarization",
        "education policy",
        "social networks",
        "gender studies",
        "migration studies",
        "organizational behavior",
        "public health policy",
        "criminology",
    ),
    "Humanities": (
        "medieval literature",
        "renaissance art",
        "philosophy of mind",
        "linguistics",
        "cultural anthropology",
        "classical archaeology",
        "music theory",
        "modern literature",
        "history of science",
        "religious studies",
    ),
}

# Flat list of every search term, field after field.
Query = [term for field_terms in _TERMS_BY_FIELD.values() for term in field_terms]

In [None]:
def fetch_papers(query, per_page=20, pages=50, max_retries=10):
    """Fetch open-access works whose title matches ``query``.

    Pages ``1..pages`` of ``BASE_URL`` are requested in order. Each page is
    attempted up to ``max_retries`` times; a page that keeps failing is
    skipped. Paging stops at the first page that comes back with no results.

    Args:
        query: Search term placed in the ``title.search`` filter.
        per_page: Value sent as the ``per-page`` request parameter.
        pages: Maximum number of pages to request.
        max_retries: Attempts per page before that page is skipped.

    Returns:
        A list of dicts with the keys ``id``, ``title``, ``abstract`` (the
        ``abstract_inverted_index`` field of the response), ``year`` and
        ``concepts`` (list of concept display names). Each work id appears
        at most once.
    """
    papers_dict = {}
    # Passing the filter through `params` lets requests URL-encode the search
    # term instead of splicing it raw into the query string.
    base_params = {
        "filter": f"title.search:{query},open_access.is_oa:true",
        "per-page": per_page,
    }

    for page in range(1, pages + 1):
        results = None  # stays None only if every attempt for this page fails
        for attempt in range(1, max_retries + 1):
            try:
                r = requests.get(
                    BASE_URL,
                    params={**base_params, "page": page},
                    timeout=10,
                )
                r.raise_for_status()  # Raises HTTPError for 4xx/5xx
                results = r.json().get("results") or []
                break  # success, exit retry loop
            except (requests.exceptions.RequestException, ValueError) as e:
                print(f"Attempt {attempt} failed for page {page}: {e}")
                sleep(15)

        if results is None:
            print(f"Page {page} skipped after {max_retries} failed attempts.")
            continue

        if not results:
            # An empty page means the result set is exhausted; asking for the
            # remaining pages would only produce more empty responses.
            print(f"No results found for page {page}")
            break

        for d in results:
            paper_id = d.get("id")
            if paper_id in papers_dict:
                continue  # keep the first occurrence of each work
            papers_dict[paper_id] = {
                "id": paper_id,
                "title": d.get("title"),
                "abstract": d.get("abstract_inverted_index"),
                "year": d.get("publication_year"),
                # `concepts` may be missing or null; entries without a
                # display name are left out rather than raising KeyError.
                "concepts": [
                    c["display_name"]
                    for c in (d.get("concepts") or [])
                    if c.get("display_name")
                ],
            }

    return list(papers_dict.values())

In [None]:
# Collect the papers for every search term, keyed by work id so that a work
# returned for more than one term is stored only once.
papers_by_id = {}

for q in Query:
    for paper in fetch_papers(q, per_page=20, pages=100):  # list of dicts
        papers_by_id.setdefault(paper["id"], paper)  # first occurrence wins

all_papers = list(papers_by_id.values())  # master list, one entry per work

# Convert to DataFrame once at the end
df = pd.DataFrame(all_papers)
df.head()

In [None]:
# Clean dataframe: keep only the papers that have an abstract. The explicit
# .copy() makes the result an independent frame, so adding columns to it in
# later cells does not trigger pandas' SettingWithCopyWarning.
df = df[df['abstract'].notna()].copy()

In [None]:
# (rows, columns) of the frame after dropping papers without an abstract.
df.shape

In [None]:
def reconstruct_abstract(inverted_index):
    """Rebuild plain text from a ``{word: [positions]}`` inverted index.

    Returns an empty string when ``inverted_index`` is falsy (``None`` or an
    empty mapping). Otherwise the words are placed at their positions and
    joined with single spaces in ascending position order. If two words claim
    the same position, the one iterated last wins.
    """
    if not inverted_index:
        return ""

    # Invert the mapping: one word per position.
    word_at = {
        position: token
        for token, positions in inverted_index.items()
        for position in positions
    }

    # Emit the words in position order.
    return " ".join(word_at[position] for position in sorted(word_at))

In [None]:
# Turn each inverted-index abstract into readable text, one row at a time.
df["abstract_text"] = df["abstract"].map(reconstruct_abstract)

In [None]:
# Preview the first 10 rows, including the abstract_text column.
df.head(10)

In [None]:
# Write the final table to CSV; index=False leaves the DataFrame index out of the file.
# NOTE(review): the "abstract" column still appears to hold the raw inverted-index
# dicts at this point, which CSV would store in their string form — confirm that is intended.
df.to_csv("openalex_papers5.csv", index=False)