In [1]:
keywords_out = 10

# role of an expert keyword searcher
role_keyword_searcher = f"""You are a scientist, expert and master in constructing text queries for scholarly search engines. 
Your text queries are short and concise, not more than 5 words and most likely about 3 words.
When you create your set of queries you aim for diversity and counterintuitive queries that can potentially unveil unknown but very valuable papers."""

role_summarizer = f"""You are an experienced scientist in the craft of writing review papers that summarize the commonalities and highlight the differences between a given list of papers."""

# formatting explanation for keywords list
explain_list_format_kw = f"""You are going to format your list of keyword searches as a list of strings such as in the following example:
["search_1", "search_2", "search_3", "search_4", "search_5", "search_6", "search_7", "search_8", "search_9", "search_10"]"""

# system prompt for going from a list of abstracts to a list of queries
system_prompt_1 = f"""{role_keyword_searcher}
You are given by the user a list of titles and abstracts and from such list you will produce a list of 10 keyword searches that will unveil interesting results that can greatly impact the research directions given by the user list of titles and abstracts. The list of titles and abstracts is served in the following format: 
title:: text of first title
abstract: text of first abstract
------
title:: text of second title
abstract: text of second abstract 
------
and so on for all the results the user asks for. In any case you are going to produce {keywords_out} and only {keywords_out}.
{explain_list_format_kw}"""

# system prompt for going from a list of abstracts to a summarizing extended abstract
system_prompt_2_a = f"""{role_summarizer}
You are given by the user the task of summarizing from a list of titles and abstracts, an extended abstract for a review paper about user given list of abstracts."""

# system prompt for going from an extended abstract to a list of queries
system_prompt_2_b = f"""{role_keyword_searcher}
You are given by the user an extended abstract from which you have to extract a list of 10 keyword searches that will unveil interesting results that can greatly impact the research directions highlighted int he extended abstract.
{explain_list_format_kw}"""

In [2]:
import os
import pandas as pd

from pydantic import BaseModel
from openai import OpenAI

client = OpenAI(api_key=os.getenv("OPENAI_REFBRO_KEY"))

class SearchList(BaseModel):
    queries: list[str]

def keywords_from_abstracts(formatted_abstracts: str):
    completion = client.beta.chat.completions.parse(
    model="gpt-4o",
    messages=[
        {"role": "system", "content": f"{system_prompt_1}"},
        {"role": "user", "content": f"{formatted_abstracts}"},
    ],
    response_format=SearchList,
    )
    # answer = oai.chat.completion.create(
    #     model="gpt-4o",
    #     store=True,
    #     messages=[
    #         {
    #             "role": "system",
    #             "content": f"{system_prompt_1}"
    #         },
    #         {
    #             "role": "user",
    #             "content": f"{formatted_abstracts}"
    #         }
    #     ],
    #     response_format=SearchList
    # )
    return completion

In [3]:
import aiohttp
import asyncio

BASE_OPENALEX = "https://api.openalex.org"

# TODO: move to openalex.py
async def fetch_papers_async(query: str, n_results=1000):
    query = "%20".join(query.split(" "))
    async with aiohttp.ClientSession() as session:
        tasks = []
        per_page = 200
        pages = (n_results // per_page) + 1
        for page in range(1, pages + 1):
            url = f"{BASE_OPENALEX}/works?search={query}&per-page={per_page}&page={page}"
            tasks.append(session.get(url))
        responses = await asyncio.gather(*tasks)
        results = []
        for response in responses:
            data = await response.json()
            results.extend(data['results']) # TODO: check behavior of extend
    return pd.DataFrame(results)

# TODO: move to openalex.py
async def multi_search(queries, n_results=400) -> pd.DataFrame:
    """ Returns a dataframe with all retrieved papers for all queries """
    results = {}
    for query in queries: 
        results[query] = await fetch_papers_async(query, n_results=n_results)
    return pd.concat(list(results.values()), ignore_index=True)

In [4]:
queries = ["dynamical systems", "quantum entanglement", "many body theorems"]

In [5]:
papers_search = await multi_search(queries)

In [6]:
papers = papers_search[:100]

In [7]:
papers["abstract"] = None

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  papers["abstract"] = None


In [8]:
def reconstruct_abstract(index: dict) -> str:
    if isinstance(index, type(None)): return "MISSING_ABSTRACT" # TODO: expand with scraping methods TODO: decide if to return None instead
    max_position_sum = sum([len(position)+1 for position in index.values()]) + 500 # + 500 for safety 
    abstract_array = max_position_sum*[None]
    for word, positions in index.items():
        for position in positions:
            abstract_array[position] = word
    abstract_array = [i for i in abstract_array if i is not None]
    abstract_string = ' '.join(abstract_array)
    abstract_string = abstract_string.replace(r'^abstract\s+', '')
    return abstract_string

In [20]:
print(papers.columns)
print(papers["primary_location"].iloc[0])

Index(['id', 'doi', 'title', 'display_name', 'relevance_score',
       'publication_year', 'publication_date', 'ids', 'language',
       'primary_location', 'type', 'type_crossref', 'indexed_in',
       'open_access', 'authorships', 'institution_assertions',
       'countries_distinct_count', 'institutions_distinct_count',
       'corresponding_author_ids', 'corresponding_institution_ids', 'apc_list',
       'apc_paid', 'fwci', 'has_fulltext', 'fulltext_origin', 'cited_by_count',
       'citation_normalized_percentile', 'cited_by_percentile_year', 'biblio',
       'is_retracted', 'is_paratext', 'primary_topic', 'topics', 'keywords',
       'concepts', 'mesh', 'locations_count', 'locations', 'best_oa_location',
       'sustainable_development_goals', 'grants', 'datasets', 'versions',
       'referenced_works_count', 'referenced_works', 'related_works',
       'abstract_inverted_index', 'cited_by_api_url', 'counts_by_year',
       'updated_date', 'created_date', 'abstract'],
      dtype=

In [10]:
papers["abstract"] = papers["abstract_inverted_index"].apply(reconstruct_abstract)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  papers["abstract"] = papers["abstract_inverted_index"].apply(reconstruct_abstract)


In [11]:
papers = papers[papers["abstract"] != "MISSING_ABSTRACT"]
print(len(papers))

56


In [12]:
user_prompt = "\n------\n".join(
    f"title:: {pap['title']}\nabstract:: {pap['abstract']}"
    for _, pap in papers.iterrows()
) + "\n------\n"

In [13]:
import tiktoken

In [14]:
enc = tiktoken.encoding_for_model("gpt-4o")

In [15]:
len(enc.encode(user_prompt))

9236

In [16]:
queries = keywords_from_abstracts(user_prompt)

In [17]:
queries.choices[0].message.parsed.queries

['neural network dynamics',
 'situation awareness modeling',
 'hyperbolic invariants',
 'chaotic neuron simulations',
 'digital feedback control',
 'random perturbations analysis',
 'flocking stability equations',
 'turbulence coherent structures',
 'nonlinear system approximation',
 'sparse system identification']