In [1]:
from typing import List
from itertools import combinations

import asyncio
import aiohttp

import json
import time
import requests
import pandas as pd

import numpy as np 

In [2]:
BASE_OPENALEX = "https://api.openalex.org"

In [3]:
queries = [
    "dna kinetics",
    "graph theory",
    "dynamical systems",
    "monte carlo simulations",
    "origins of life",
    "stochastic processes",
    "polymer physics"
]

In [5]:
# Async version
async def fetch_papers_async(query: str, n_results=1000):
    query = "%20".join(query.split(" "))
    async with aiohttp.ClientSession() as session:
        tasks = []
        per_page = 200
        pages = (n_results // per_page) + 1
        for page in range(1, pages + 1):
            url = f"{BASE_OPENALEX}/works?search={query}&per-page={per_page}&page={page}"
            tasks.append(session.get(url))
        responses = await asyncio.gather(*tasks)
        results = []
        for response in responses:
            data = await response.json()
            results.extend(data['results'])
    return pd.DataFrame(results)

def search_papers(query: str, n_results=400) -> pd.DataFrame:
    """ Retrieve n_results out of a query to openalex """
    per_page = min(200, n_results)
    query = "%20".join(query.split(" "))
    url = f"{BASE_OPENALEX}/works?search={query}"
    params = {
        'per-page': per_page,
    }
    dfs = []
    for page in range(1, (n_results // per_page)+1):
        params['page'] = page
        response = requests.get(url, params=params).json()
        df = pd.json_normalize(response["results"])
        df = df.drop(columns=[col for col in df.columns if 'abstract' in col.lower()])
        dfs.append(df)            
    return pd.concat(dfs, ignore_index=True)

async def multi_search(queries: List[str], n_results=400) -> pd.DataFrame:
    """ Returns a dataframe with all retrieved papers for all queries """
    results = {}
    for query in queries: 
        results[query] = await fetch_papers_async(query, n_results=n_results)
    return pd.concat(list(results.values()), ignore_index=True)

def get_topics_set(results: pd.DataFrame):
    topics = results["topics"]
    topic_ids = []
    for topic in topics:
        for t in topic: 
            topic_ids.append(t["id"])
    return set(topic_ids)


In [6]:
results = await multi_search(queries, n_results=1000)

In [25]:
topics = get_topics_set(results)
t_idx = {i:t for i,t in enumerate(topics)}
idx_t = {t:i for i,t in t_idx.items()}

In [8]:
# create a matrix to index through topics 
mutualmatrix = pd.DataFrame(0, index=list(topics), columns=list(topics))

In [9]:
for i, res in results.iterrows():
    # get p(x)
    for t in res["topics"]:
        id = t["id"]
        mutualmatrix.loc[id, id] = mutualmatrix.loc[id, id] + 1
    # get p(x, y)
    for ti, tj in combinations(res["topics"], r=2):
        idi, idj = ti["id"], tj["id"]
        mutualmatrix.loc[idi, idj] = mutualmatrix.loc[idi, idj] + 1
        mutualmatrix.loc[idj, idi] = mutualmatrix.loc[idj, idi] + 1


In [10]:
# Count non-zero elements and total sum in the mutual matrix
nonzero_count = (mutualmatrix != 0).sum().sum()
total_sum = mutualmatrix.sum().sum()
print(f"Number of non-zero elements in mutual matrix: {nonzero_count}")
print(f"Total sum of values in mutual matrix: {total_sum}")

Number of non-zero elements in mutual matrix: 18001
Total sum of values in mutual matrix: 63228


In [11]:
N = len(mutualmatrix)
probmatrix = mutualmatrix/N

In [20]:
pmimatrix = pd.DataFrame(-np.inf, index=probmatrix.index, columns=probmatrix.columns)
for ti, tj in combinations(pmimatrix.index, r=2):
    pmimatrix[ti, tj] = np.log2(probmatrix[ti, tj]/(probmatrix[ti, ti]*probmatrix[tj, tj]))

KeyError: ('https://openalex.org/T11468', 'https://openalex.org/T12561')