In [24]:
import requests
import pandas as pd
from time import sleep

# Endpoint of the OpenAlex "works" collection; fetch_papers adds the filter and paging parameters per request.
BASE_URL = "https://api.openalex.org/works"
# Search phrases: each entry is passed to fetch_papers as a title-search query.
Query = ["artificial intelligence", "neural networks", "supervised learning", "Unsupervised learning", "AI privacy"]

In [25]:
def fetch_papers(query, per_page=20, pages=5, max_attempts=3):
    """Fetch open-access works from OpenAlex whose title matches ``query``.

    Pages 1..``pages`` are requested in order. Paging stops early at the first
    page that comes back without results.

    Args:
        query: Phrase used for the ``title.search`` filter.
        per_page: Number of works requested per page.
        pages: Maximum number of pages to request.
        max_attempts: How many times a single page is tried before it is
            skipped. Values below 1 are treated as 1.

    Returns:
        A list of dicts with the keys ``id``, ``title``, ``abstract`` (the raw
        ``abstract_inverted_index`` value, possibly ``None``), ``year`` and
        ``concepts`` (list of concept display names).
    """
    print(query)
    papers = []
    for page in range(1, pages + 1):
        # Let requests build the query string so characters such as '&' or '#'
        # inside the search phrase are percent-encoded instead of corrupting the URL.
        params = {
            "filter": f"title.search:{query},open_access.is_oa:true",
            "per-page": per_page,
            "page": page,
        }

        data = None
        for _ in range(max(1, max_attempts)):
            try:
                # The timeout keeps a stalled connection from hanging the cell forever.
                r = requests.get(BASE_URL, params=params, timeout=30)
                # Turn HTTP errors (e.g. 429 rate limiting) into exceptions; otherwise
                # the error payload would be mistaken for an empty result page.
                r.raise_for_status()
                data = r.json()
                break
            except (requests.exceptions.RequestException, ValueError) as e:
                # ValueError covers a non-JSON body on requests versions whose
                # JSON error is not a RequestException subclass.
                print("Attempt failed:", e)
                sleep(2)

        if data is None:
            # Every attempt failed: skip this page and carry on with the next one.
            continue

        results = data.get("results") or []
        if not results:
            print(f"No results found for page {page}")
            break

        # Store them
        for d in results:
            papers.append({
                "id": d.get("id"),
                "title": d.get("title"),
                "abstract": d.get("abstract_inverted_index"),
                "year": d.get("publication_year"),
                # "concepts" may be absent or null; treat both as "no concepts".
                "concepts": [c["display_name"] for c in (d.get("concepts") or [])],
            })
    return papers

In [26]:
# Master list that accumulates the records of every search term
all_papers = []

for q in Query:
    # Each call yields a list of dicts for one search phrase
    papers = fetch_papers(q, pages=20, per_page=10)
    all_papers += papers

# Build the DataFrame in one go once all terms have been fetched
df = pd.DataFrame(data=all_papers)
print(df.head())

artificial intelligence
neural networks
supervised learning
Unsupervised learning
AI privacy
                                 id  \
0  https://openalex.org/W2122410182   
1  https://openalex.org/W2981731882   
2  https://openalex.org/W2803760365   
3  https://openalex.org/W2664267452   
4  https://openalex.org/W2953532875   

                                               title  \
0         Artificial intelligence: a modern approach   
1  Explainable Artificial Intelligence (XAI): Con...   
2               Artificial intelligence in radiology   
3  Artificial intelligence in healthcare: past, p...   
4  The potential for artificial intelligence in h...   

                                            abstract  year  \
0  {'The': [0], 'long-anticipated': [1], 'revisio...  1995   
1                                               None  2019   
2                                               None  2018   
3  {'Artificial': [0], 'intelligence': [1], '(AI)...  2017   
4                        

In [27]:
# Clean dataframe: keep only the rows that have an abstract.
# .copy() makes the result an independent frame, so the later column
# assignment on it does not trigger pandas' SettingWithCopyWarning.
df = df[df['abstract'].notna()].copy()

In [28]:
# Total number of elements (rows x columns) left after dropping rows without an abstract
df.size

3970

In [29]:
def reconstruct_abstract(inverted_index):
    """Turn an inverted-index abstract back into running text.

    Args:
        inverted_index: Mapping of word -> list of positions at which the word
            occurs. Falsy values (``None``, ``{}``) are accepted.

    Returns:
        The words joined by single spaces in ascending position order, or an
        empty string when no index is given.
    """
    if not inverted_index:
        return ""

    # Flip the mapping so there is one entry per position; if a position is
    # listed more than once, the word encountered last wins.
    word_at = {
        slot: token
        for token, slots in inverted_index.items()
        for slot in slots
    }

    # Emit the words in ascending position order
    return " ".join(word_at[slot] for slot in sorted(word_at))

In [30]:
# Add a plain-text version of each abstract, rebuilt from its inverted index
df["abstract_text"] = df["abstract"].apply(reconstruct_abstract)

In [31]:
# Show the frame to check the new abstract_text column against the raw abstract column
df

Unnamed: 0,id,title,abstract,year,concepts,abstract_text
0,https://openalex.org/W2122410182,Artificial intelligence: a modern approach,"{'The': [0], 'long-anticipated': [1], 'revisio...",1995,"[Artificial intelligence, Computer science, In...",The long-anticipated revision of this #1 selli...
3,https://openalex.org/W2664267452,"Artificial intelligence in healthcare: past, p...","{'Artificial': [0], 'intelligence': [1], '(AI)...",2017,"[Cognitive computing, Health care, Data scienc...",Artificial intelligence (AI) aims to mimic hum...
5,https://openalex.org/W2891503716,Peeking Inside the Black-Box: A Survey on Expl...,"{'At': [0], 'the': [1, 4, 28, 45, 57, 101, 130...",2018,"[Transparency (behavior), Sine qua non, Comput...",At the dawn of the fourth industrial revolutio...
6,https://openalex.org/W3017131514,Artificial Intelligence in Education: A Review,"{'The': [0, 110], 'purpose': [1], 'of': [2, 10...",2020,"[Computer science, Adaptability, Personalized ...",The purpose of this study was to assess the im...
8,https://openalex.org/W1543659671,Artificial Intelligence: A Guide to Intelligen...,"{'From': [0], 'the': [1, 4, 12, 60, 76, 87, 95...",2001,"[Computer science, Jargon, Expert system, Inte...",From the Publisher:\r\nVirtually all the liter...
...,...,...,...,...,...,...
993,https://openalex.org/W4399695555,"Evaluating Privacy, Security, and Trust Percep...","{'Conversational': [0], 'AI': [1], '(CAI)': [2...",2024,"[Internet privacy, Perception, Computer securi...",Conversational AI (CAI) systems which encompas...
994,https://openalex.org/W4400022865,Privacy Implications of Explainable AI in Data...,"{'Machine': [0], 'learning': [1, 171], '(ML)':...",2024,"[Internet privacy, Computer science, Informati...","Machine learning (ML) models, demonstrably pow..."
996,https://openalex.org/W4383197555,Translating theory into practice: assessing th...,"{'Artificial': [0], 'Intelligence': [1], '(AI)...",2023,"[Computer science, Context (archaeology), Adve...",Artificial Intelligence (AI) has achieved rema...
997,https://openalex.org/W4393067727,Examining Privacy and Trust Issues at the Edge...,"{'The': [0, 126, 141], 'growing': [1], 'domain...",2023,"[Computer science, Profiling (computer program...",The growing domain of liquidity in computing e...


In [32]:
# Save the result; index=False keeps the DataFrame index out of the file.
# NOTE(review): 'abstract' holds dicts and 'concepts' holds lists, so they are presumably
# written as their string representations — confirm downstream readers expect that.
df.to_csv("openalex_papers4.csv", index=False)