In [None]:
import requests
import pandas as pd

# Endpoint that fetch_papers() sends every request to.
BASE_URL = "https://api.openalex.org/works"
# Look-up terms: each entry is searched for separately and matched against
# work titles (see the `title.search` filter built in fetch_papers()).
Query = ["artificial intelligence", "neural networks", "supervised learning", "Unsupervised learning", "AI privacy"]

In [None]:
def fetch_papers(query, per_page=20, pages=5, timeout=30):
    """Fetch works whose title matches ``query`` from the API at ``BASE_URL``.

    Pages are requested one at a time, starting at page 1, and fetching stops
    early at the first page that comes back without results.

    Parameters
    ----------
    query : str
        Search term placed in the ``title.search`` filter.
    per_page : int, default 20
        Number of results requested per page.
    pages : int, default 5
        Maximum number of pages to request.
    timeout : float, default 30
        Seconds to wait for each HTTP response before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    list[dict]
        One record per work with the keys ``id``, ``title``, ``abstract``
        (the raw ``abstract_inverted_index`` value), ``year`` and
        ``concepts`` (list of concept display names).

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status (e.g. rate limiting).
    """
    print(query)
    papers = []
    for page in range(1, pages + 1):
        # Let requests build and percent-encode the query string; interpolating
        # the term into the URL by hand breaks on characters such as "&" or "#".
        params = {
            "filter": f"title.search:{query}",
            "per-page": per_page,
            "page": page,
        }
        r = requests.get(BASE_URL, params=params, timeout=timeout)
        # Fail loudly on HTTP errors instead of misreporting them as
        # "No results found" further down.
        r.raise_for_status()
        data = r.json()
        # `or []` also covers a "results" key that is present but null.
        results = data.get("results") or []
        if not results:
            print(f"No results found for page {page}")
            break
        # Keep only the fields used downstream.
        for d in results:
            papers.append({
                "id": d.get("id"),
                "title": d.get("title"),
                "abstract": d.get("abstract_inverted_index"),
                "year": d.get("publication_year"),
                # A record may carry "concepts": null; treat that as no concepts.
                "concepts": [c["display_name"] for c in (d.get("concepts") or [])],
            })
    return papers

In [None]:
# Gather the records for every look-up term into one flat list.
all_papers = []
for q in Query:
    # One page of 10 hits per term; fetch_papers returns a list of dicts.
    all_papers += fetch_papers(q, per_page=10, pages=1)

# Build the DataFrame in a single step from the accumulated records.
df = pd.DataFrame(all_papers)
print(df.head())

In [None]:
def reconstruct_abstract(inverted_index):
    """Turn a word -> positions mapping back into a plain-text string.

    ``inverted_index`` maps each word to the list of positions at which it
    occurs. A falsy value (``None``, empty dict) yields ``""``. If two words
    claim the same position, the one iterated last wins.
    """
    if inverted_index:
        # Flip the mapping so every position points at its word.
        word_at = {
            slot: token
            for token, slots in inverted_index.items()
            for slot in slots
        }
        # Emit the words in ascending position order.
        return " ".join(word_at[slot] for slot in sorted(word_at))
    return ""

In [None]:
# Add a plain-text "abstract_text" column by running reconstruct_abstract on
# each value of the "abstract" column (the raw inverted index stored by
# fetch_papers; may be None, which reconstruct_abstract maps to "").
df["abstract_text"] = df["abstract"].apply(reconstruct_abstract)

In [None]:
# Write the collected papers to a CSV file in the current working directory;
# index=False leaves the DataFrame's row index out of the file.
df.to_csv("openalex_papers.csv", index=False)