In [8]:
import json
import re

import requests
import pandas as pd

import chromadb
from chromadb.utils import embedding_functions
from sentence_transformers import SentenceTransformer

In [2]:
# Root URL of the OpenAlex REST API; endpoint paths such as "/works" are appended to it.
BASE_OPENALEX = "https://api.openalex.org"

In [3]:
def search_papers(query, n_results=2000):
    """Fetch up to ``n_results`` works from OpenAlex whose abstract matches ``query``.

    Sends page-numbered GET requests to ``{BASE_OPENALEX}/works`` with the
    filter ``abstract.search:<query>`` and keeps the title and the
    reconstructed abstract of every work returned.

    Args:
        query: Search text placed after ``abstract.search:`` in the filter.
        n_results: Maximum number of papers to return. It does not have to be
            a multiple of the page size: the last page is fetched in full and
            the list is truncated. Values <= 0 yield an empty list.

    Returns:
        A list of ``{'title': ..., 'abstract': ...}`` dicts, at most
        ``n_results`` long. Fewer are returned when the API runs out of
        matching works. Works without an abstract index get ``''`` as abstract.

    Raises:
        KeyError: If a response body is a JSON object without a ``"results"``
            key (i.e. the API answered with an error payload).
    """
    # NOTE(review): 200 is believed to be the largest page size OpenAlex
    # accepts, and page-numbered paging is believed to be capped at 10,000
    # records -- confirm against the OpenAlex paging documentation.
    per_page = 200
    url = f"{BASE_OPENALEX}/works"

    params = {
        'filter': f'abstract.search:{query}',
        'per-page': per_page,
    }
    headers = {
        'Accept': 'application/json'
    }

    # Ceiling division so a partial last page is still requested
    # (floor division silently returned nothing for n_results < per_page).
    n_pages = (n_results + per_page - 1) // per_page

    results = []
    for page in range(1, n_pages + 1):
        params['page'] = page
        # Without a timeout a stalled connection would hang the cell forever.
        response = requests.get(url, params=params, headers=headers, timeout=30)
        data = response.json()

        try:
            page_works = data["results"]
        except KeyError:
            # A missing "results" key means an error payload, not a short
            # result set, so surface the status and the API's own message.
            raise KeyError(
                f"OpenAlex response for page {page} has no 'results' "
                f"(HTTP {response.status_code}): {data.get('message', data)}"
            ) from None

        if not page_works:
            # No more matches: further pages would be empty too.
            break

        for work in page_works:
            inverted_index = work.get('abstract_inverted_index')
            # Other fields available on a work: 'doi', 'publication_year',
            # 'cited_by_count'.
            results.append({
                'title': work.get('title'),
                # Guard here so a work without an abstract cannot crash the run.
                'abstract': reconstruct_abstract(inverted_index) if inverted_index else '',
            })

    return results[:n_results]


def reconstruct_abstract(index: dict) -> str:
    """Rebuild an abstract's text from an inverted index.

    Args:
        index: Mapping from each word to the list of integer positions at
            which it occurs in the abstract. ``None`` or an empty mapping is
            accepted.

    Returns:
        The words joined by single spaces in ascending position order, with
        gaps in the position sequence skipped. Returns ``''`` when ``index``
        is ``None`` or empty. A leading lowercase ``abstract`` heading followed
        by whitespace is stripped.
    """
    if not index:
        return ''

    # Key by position instead of pre-sizing a list: this is correct for any
    # (even sparse) set of positions, so no word is ever dropped for being
    # "out of range". If two words claim one position, the later one wins.
    word_at = {}
    for word, positions in index.items():
        for position in positions:
            word_at[position] = word

    abstract_string = ' '.join(word_at[position] for position in sorted(word_at))

    # str.replace() treats its argument literally, so the pattern must go
    # through re.sub() to act as a regex. Kept case-sensitive, as written.
    return re.sub(r'^abstract\s+', '', abstract_string)

In [4]:
# Search OpenAlex for works whose abstract matches the query, asking for up to
# 8000 papers. search_papers fetches them page by page, so this cell issues
# one HTTP request per page and can take a while to run.
query = "dynamical systems"
results = search_papers(query, n_results=8000)

In [13]:
# Persist the fetched papers as pretty-printed JSON.
# NOTE(review): open() does not create directories, so 'data/' must already exist.
with open('data/dynamical_systems.json', 'w') as f:
    json.dump(results, f, indent=2)
