In [1]:
from openai import OpenAI
import json
import os
import dotenv

In [None]:
# Load variables from a .env file into the process environment.
dotenv.load_dotenv()
# The API key is read from the environment rather than being written into the notebook.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

In [4]:
abstract1 = {"role": "user", "content": "Associations between modifiable exposures and disease seen in observational epidemiology are sometimes confounded and thus misleading, despite our best efforts to improve the design and analysis of studies. Mendelian randomization-the random assortment of genes from parents to offspring that occurs during gamete formation and conception-provides one method for assessing the causal nature of some environmental exposures. The association between a disease and a polymorphism that mimics the biological link between a proposed exposure and disease is not generally susceptible to the reverse causation or confounding that may distort interpretations of conventional observational studies. Several examples where the phenotypic effects of polymorphisms are well documented provide encouraging evidence of the explanatory power of Mendelian randomization and are described. The limitations of the approach include confounding by polymorphisms in linkage disequilibrium with the polymorphism under study, that polymorphisms may have several phenotypic effects associated with disease, the lack of suitable polymorphisms for studying modifiable exposures of interest, and canalization-the buffering of the effects of genetic variation during development. Nevertheless, Mendelian randomization provides new opportunities to test causality and demonstrates how investment in the human genome project may contribute to understanding and preventing the adverse effects on human health of modifiable exposures."}

abstract2 = {"role": "user", "content": """Background: Dysregulation of circulating metabolites may affect brain function and cognition, associated with alterations in the cerebral cortex architecture. However, the exact cause remains unclear. This study aimed to determine the causal effect of circulating metabolites on the cerebral cortex architecture.

Methods: This study utilized retrieved data from genome-wide association studies to investigate the relationship between blood metabolites and cortical architecture. A total of 1,091 metabolites and 309 metabolite ratios were used for exposure. The brain cortex surface area and cortex thickness were selected as the primary outcomes in this study. In this study, the inverse variance weighting method was used as the main analytical method, complemented by sensitivity analyses that were more robust to pleiotropy. Furthermore, metabolic pathway analysis was performed via MetaboAnalyst 6.0. Finally, reverse Mendelian randomization (MR) analysis was conducted to assess the potential for reverse causation.

Results: After correcting for the false discovery rate (FDR), we identified 37 metabolites and 9 metabolite ratios that showed significant causal associations with cortical structures. Among these, Oxalate was found to be most strongly associated with cortical surface area (β: 2387.532, 95% CI 756.570-4018.495, p = 0.037), while Tyrosine was most correlated with cortical thickness (β: -0.015, 95% CI -0.005 to -0.025, p = 0.025). Furthermore, pathway analysis based on metabolites identified six significant metabolic pathways associated with cortical structures and 13 significant metabolic pathways based on metabolite ratios.

Conclusion: The identified metabolites and relevant metabolic pathways reveal potential therapeutic pathways for reducing the risk of neurodegenerative diseases. These findings will help guide health policies and clinical practice in treating neurodegenerative diseases.

Keywords: Mendelian randomization; brain cortex surficial area; brain cortex thickness; genome-wide association studies; metabolites."""}

abstract3 = {"role": "user", "content": """Background: The association between air pollution, lung function, gastroesophageal reflux disease, and Non-alcoholic fatty liver disease (NAFLD) remains inconclusive. Previous studies were not convincing due to confounding factors and reverse causality. We aim to investigate the causal relationship between air pollution, lung function, gastroesophageal reflux disease, and NAFLD using Mendelian randomization analysis.

Methods: In this study, univariate Mendelian randomization analysis was conducted first. Subsequently, Steiger testing was performed to exclude the possibility of reverse association. Finally, significant risk factors identified from the univariate Mendelian analysis, as well as important factors affecting NAFLD from previous observational studies (type 2 diabetes and body mass index), were included in the multivariable Mendelian randomization analysis.

Results: The results of the univariable Mendelian randomization analysis showed a positive correlation between particulate matter 2.5, gastroesophageal reflux disease, and NAFLD. There was a negative correlation between forced expiratory volume in 1 s, forced vital capacity, and NAFLD. The multivariable Mendelian randomization analysis indicated a direct causal relationship between gastroesophageal reflux disease (OR = 1.537, p = 0.011), type 2 diabetes (OR = 1.261, p < 0.001), and NAFLD.

Conclusion: This Mendelian randomization study confirmed the causal relationships between air pollution, lung function, gastroesophageal reflux, and NAFLD. Furthermore, gastroesophageal reflux and type 2 diabetes were identified as independent risk factors for NAFLD, having a direct causal connection with the occurrence of NAFLD."""}

prompt = {"role": "user", "content": """What are the exposures and outcomes in this abstract? If there are multiple exposures or outcomes, provide them all. If there are no exposures or outcomes, provide an empty list. Also categorize the exposures and outcomes into the following groups using the exact category names provided: 
- molecular
- socioeconomic
- environmental
- behavioural
- anthropometric
- clinical measures
- infectious disease
- neoplasm
- disease of the blood and blood-forming organs
- metabolic disease
- mental disorder
- disease of the nervous system
- disease of the eye and adnexa
- disease of the ear and mastoid process
- disease of the circulatory system
- disease of the digestive system
- disease of the skin and subcutaneous tissue
- disease of the musculoskeletal system and connective tissue
- disease of the genitourinary system
If an exposure or outcome does not fit into any of these groups, specify "Other". 
List the analytical methods used in the abstract. Match the methods to the following list of exact method names. If a method is used that is not in the list, specify "Other" and also provide the name of the method. The list of methods is as follows:
- two-sample mendelian randomization
- multivariable mendelian randomization
- colocalization
- network mendelian randomization
- triangulation
- reverse mendelian randomization
- one-sample mendelian randomization
- negative controls
- sensitivity analysis
- non-linear mendelian randomization
- within-family mendelian randomization
Summarise how many null vs non-null results were found in the abstract.
Provide your answer in strict json format using exactly the format as the example output and without markdown code blocks."""}


# Few-shot assistant reply that demonstrates the required JSON structure.
# Category and method strings are spelled exactly as they appear in the lists inside
# `prompt` (lowercase, "clinical measures", "colocalization"), because the prompt asks
# for the exact names and the model tends to copy the spelling shown in the example.
# NOTE(review): the traits below look like they were taken from abstract3, while
# openai_prompt pairs this reply with abstract4 - confirm which abstract this example
# is meant to answer.
example_output = {"role": "assistant", "content": """
{
  "exposures": [
    {
        "id": "1",
        "trait": "Particulate matter 2.5",
        "category": "environmental"
    },
    {
        "id": "2",
        "trait": "Type 2 diabetes",
        "category": "metabolic disease"
    },
    {
        "id": "3",
        "trait": "Body mass index",
        "category": "anthropometric"
    }
  ],
  "outcomes": [
    {
        "id": "1",
        "trait": "Forced expiratory volume in 1 s",
        "category": "clinical measures"
    },
    {
        "id": "2",
        "trait": "Forced vital capacity",
        "category": "clinical measures"
    },
    {
        "id": "3",
        "trait": "Gastroesophageal reflux disease",
        "category": "disease of the digestive system"
    },
    {
        "id": "4",
        "trait": "Non-alcoholic fatty liver disease (NAFLD)",
        "category": "disease of the digestive system"
    }
  ],
  "methods": ["two-sample mendelian randomization", "multivariable mendelian randomization", "colocalization", "network mendelian randomization"],
  "results": {
    "null": 0,
    "non-null": 6
  }
}
"""}

abstract4 = {"role": "user", "content": """Background: Epidemiological evidence links a close correlation between long-term exposure to air pollutants and autoimmune diseases, while the causality remained unknown.

Methods: Two-sample Mendelian randomization (TSMR) was used to investigate the role of PM10, PM2.5, NO2, and NOX (N = 423,796-456,380) in 15 autoimmune diseases (N = 14,890-314,995) using data from large European GWASs including UKB, FINNGEN, IMSGC, and IPSCSG. Multivariable Mendelian randomization (MVMR) was conducted to investigate the direct effect of each air pollutant and the mediating role of common factors, including body mass index (BMI), alcohol consumption, smoking status, and household income. Transcriptome-wide association studies (TWAS), two-step MR, and colocalization analyses were performed to explore underlying mechanisms between air pollution and autoimmune diseases.

Results: In TSMR, after correction of multiple testing, hypothyroidism was causally associated with higher exposure to NO2 [odds ratio (OR): 1.37, p = 9.08 × 10-4] and NOX [OR: 1.34, p = 2.86 × 10-3], ulcerative colitis (UC) was causally associated with higher exposure to NOX [OR: 2.24, p = 1.23 × 10-2] and PM2.5 [OR: 2.60, p = 5.96 × 10-3], rheumatoid arthritis was causally associated with higher exposure to NOX [OR: 1.72, p = 1.50 × 10-2], systemic lupus erythematosus was causally associated with higher exposure to NOX [OR: 4.92, p = 6.89 × 10-3], celiac disease was causally associated with lower exposure to NOX [OR: 0.14, p = 6.74 × 10-4] and PM2.5 [OR: 0.17, p = 3.18 × 10-3]. The risky effects of PM2.5 on UC remained significant in MVMR analyses after adjusting for other air pollutants. MVMR revealed several common mediators between air pollutants and autoimmune diseases. Transcriptional analysis identified specific gene transcripts and pathways interconnecting air pollutants and autoimmune diseases. Two-step MR revealed that POR, HSPA1B, and BRD2 might mediate from air pollutants to autoimmune diseases. POR pQTL (rs59882870, PPH4=1.00) strongly colocalized with autoimmune diseases.

Conclusion: This research underscores the necessity of rigorous air pollutant surveillance within public health studies to curb the prevalence of autoimmune diseases."""}


abstract5 = {"role": "user", "content": """Background: Circulating C-reactive protein (CRP) is associated with the metabolic syndrome and might be causally linked to it. Our aim was to generate estimates of the association between plasma CRP and metabolic syndrome phenotypes that were free from confounding and reverse causation, to assess the causal role of this protein.

Methods: We examined associations between serum CRP concentration and metabolic syndrome phenotypes in the British Women's Heart and Health Study. We then compared these estimates with those derived from a mendelian randomised framework with common CRP gene haplotypes to generate unconfounded and unbiased estimates of any causal associations.

Findings: In a sample of British women, body-mass index (BMI), systolic blood pressure, waist-to-hip ratio, serum concentrations of HDL cholesterol and triglycerides, and insulin resistance were all associated with plasma CRP concentration. CRP haplotypes were associated with plasma CRP concentration (p<0.0001). With instrumental variable analyses, there was no association between plasma CRP concentration and any of the metabolic syndrome phenotypes analysed. There was strong evidence that linear regression and mendelian randomisation based estimation gave conflicting results for the CRP-BMI association (p=0.0002), and some evidence of conflicting results for the association of CRP with the score for insulin resistance (p=0.0139), triglycerides (p=0.0313), and HDL cholesterol (p=0.0688).

Interpretation: Disparity between estimates of the association between plasma CRP and phenotypes comprising the metabolic syndrome derived from conventional analyses and those from a mendelian randomisation approach suggests that there is no causal association between CRP and the metabolic syndrome phenotypes."""}

abstract6 = {"role": "user", "content": """Background: Previous Mendelian randomization (MR) studies using population samples (population MR) have provided evidence for beneficial effects of educational attainment on health outcomes in adulthood. However, estimates from these studies may have been susceptible to bias from population stratification, assortative mating and indirect genetic effects due to unadjusted parental genotypes. MR using genetic association estimates derived from within-sibship models (within-sibship MR) can avoid these potential biases because genetic differences between siblings are due to random segregation at meiosis.

Methods: Applying both population and within-sibship MR, we estimated the effects of genetic liability to educational attainment on body mass index (BMI), cigarette smoking, systolic blood pressure (SBP) and all-cause mortality. MR analyses used individual-level data on 72 932 siblings from UK Biobank and the Norwegian HUNT study, and summary-level data from a within-sibship Genome-wide Association Study including >140 000 individuals.

Results: Both population and within-sibship MR estimates provided evidence that educational attainment decreased BMI, cigarette smoking and SBP. Genetic variant-outcome associations attenuated in the within-sibship model, but genetic variant-educational attainment associations also attenuated to a similar extent. Thus, within-sibship and population MR estimates were largely consistent. The within-sibship MR estimate of education on mortality was imprecise but consistent with a putative effect.

Conclusions: These results provide evidence of beneficial individual-level effects of education (or liability to education) on adulthood health, independently of potential demographic and family-level confounders."""}



def openai_prompt(abstract, pmid):
    """Ask the chat model for exposures, outcomes, methods and result counts of one abstract.

    The request is few-shot: abstract4, prompt and example_output are sent first as a
    worked example, followed by the target abstract and the same prompt again.

    abstract: abstract text to analyse (str)
    pmid: identifier stored under the 'pmid' key of the returned dict

    Returns the model reply parsed as JSON, with 'pmid' added.
    Raises json.JSONDecodeError if the reply is not valid JSON once markdown code
    fences have been removed.
    """
    # Drop characters that cannot be encoded as UTF-8 (e.g. lone surrogates).
    # The 'ignore' handler has to be on the encode step: decoding bytes that were
    # just produced by a successful encode can never hit an error.
    clean_abstract = abstract.encode('utf-8', 'ignore').decode('utf-8')
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            abstract4,
            prompt,
            example_output,
            {"role": "user", "content": clean_abstract},
            prompt,
        ],
    )
    # The prompt asks for bare JSON, but strip markdown fences in case the model adds them.
    reply = response.choices[0].message.content or ""
    reply = reply.replace("```json", "").replace("```", "").strip()
    o = json.loads(reply)
    o['pmid'] = pmid
    return o

In [5]:
# Try the extraction on one example abstract; 12345678 is presumably a placeholder pmid.
o = openai_prompt(abstract5['content'], 12345678)
print(json.dumps(o, indent=2))


{
  "exposures": [
    {
      "id": "1",
      "trait": "Plasma C-reactive protein (CRP)",
      "category": "Molecular"
    },
    {
      "id": "2",
      "trait": "CRP gene haplotypes",
      "category": "Molecular"
    }
  ],
  "outcomes": [
    {
      "id": "1",
      "trait": "Body-mass index (BMI)",
      "category": "Anthropometric"
    },
    {
      "id": "2",
      "trait": "Systolic blood pressure",
      "category": "Disease of the circulatory system"
    },
    {
      "id": "3",
      "trait": "Waist-to-hip ratio",
      "category": "Anthropometric"
    },
    {
      "id": "4",
      "trait": "Serum concentrations of HDL cholesterol and triglycerides",
      "category": "Metabolic disease"
    },
    {
      "id": "5",
      "trait": "Insulin resistance",
      "category": "Metabolic disease"
    }
  ],
  "methods": [
    "mendelian randomisation",
    "linear regression",
    "instrumental variable analysis"
  ],
  "results": {
    "null": 4,
    "non-null": 1
  },
 

In [6]:
# Abstract records; later cells read the 'pmid', 'ab' and 'author_affil' keys of these entries.
with open("../data/pubmed_abstracts_20250502.json") as f:
    b = json.load(f)

# Existing author records (each entry carries a 'pmid'); used below to skip records already handled.
with open("../data/pubmed_authors.json") as f:
    authors = json.load(f)

len(b)

9999

In [7]:
len(authors)

8210

In [8]:
# Remove entries in b that are already in authors.
# The pmids are collected into a set once, so each membership test is O(1) instead of
# scanning a freshly built list of every author pmid for every entry in b.
known_pmids = {a['pmid'] for a in authors}
b = [entry for entry in b if entry['pmid'] not in known_pmids]
len(b)

5412

In [9]:
b[0]

{'pmid': '40313062',
 'ab': "BackgroundDNA damage and repair (DDR) and structural atrophies in different brain regions were recognized as critical factors in the onset of Alzheimer's disease (AD).ObjectiveWe utilized Mendelian randomization (MR) to examine the causal effects of the DDR-related molecular traits on AD and the potential mediating roles of different brain region volumes.MethodsIn primary analysis, we utilized public genome-wide association studies of AD and summary data from existing molecular traits datasets, including gene expression, DNA methylation, and protein levels quantitative trait loci (eQTL, mQTL, and pQTL) in both blood and brain to examine their causal associations by summary-data-based MR analysis and additional five two-sample MR methods. Subsequently, mediation analysis explored the potential mediate roles of 13 imaging-derived brain volume phenotypes in the associations between the DDR pathways and AD through a network MR design.ResultsWe found that the vo

Send one abstract at a time to the OpenAI API - this is very slow and more expensive.

In [None]:
auth_prompt = {"role": "user", "content": """Extract the university name and country from this text. Provide the result in json format with one field for the 'institution' and one field for the 'country'. If the country is not mentioned, provide an empty string. If the institution is not mentioned, provide an empty string. If the institution is mentioned but not the country, provide an empty string for the country. For the institution, retain only the university name and no department names etc."""}

def openai_prompt_auth(author_affil, pmid):
    """Extract the institution and country from one affiliation string with a single chat request.

    author_affil: affiliation text to send to the model (str)
    pmid: identifier stored under the 'pmid' key of the returned dict

    Returns the model reply parsed as JSON, with 'pmid' added. The module-level
    auth_prompt asks the model for 'institution' and 'country' fields.
    Raises json.JSONDecodeError if the reply is not valid JSON once markdown code
    fences have been removed.
    """
    # Drop characters that cannot be encoded as UTF-8 (e.g. lone surrogates).
    # The 'ignore' handler has to be on the encode step: decoding bytes that were
    # just produced by a successful encode can never hit an error.
    clean_affil = author_affil.encode('utf-8', 'ignore').decode('utf-8')
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": clean_affil},
            auth_prompt,
        ],
    )
    # Strip markdown fences in case the model wraps the JSON in a code block.
    reply = response.choices[0].message.content or ""
    reply = reply.replace("```json", "").replace("```", "").strip()
    o = json.loads(reply)
    o['pmid'] = pmid
    return o

# Try the extraction on the first record's affiliation; 12345678 is presumably a placeholder pmid.
author_affil = b[0]['author_affil']
openai_prompt_auth(author_affil, 12345678)

# result = []
# for i in range(len(a)):
#     print(i)
#     if 'ab' not in a[i].keys():
#         continue
#     try:        
#         o = openai_prompt_auth(a[i]['author_affil'], a[i]['pmid'])
#         result.append(o)
#     except:
#         continue
#     if i % 100 == 0:
#         with open("data/pubmed_authors.json", "w") as f:
#             json.dump(result, f)

# with open("data/pubmed_authors.json", "w") as f:
#     json.dump(result, f)


# result = []
# for i in range(len(b)):
#     print(i)
#     if 'ab' not in b[i].keys():
#         continue
#     try:        
#         o = openai_prompt(b[i]['ab'], b[i]['pmid'])
#         result.append(o)
#     except:
#         continue
#     if i % 100 == 0:
#         with open("../data/abstract_summary_20250502.json", "w") as f:
#             json.dump(result, f)

# with open("../data/abstract_summary_20250502.json", "w") as f:
#     json.dump(result, f)

Batches are faster and cheaper.
1. For each batch create a .jsonl file with the requests
2. Use the openai api to send the requests in the .jsonl file
3. Retrieve the responses, parse and save them

In [26]:
def openai_prompt_auth_batch(abstracts, jsonl_file_root, batch_size=500):
    """
    Write the .jsonl request files used to run author-affiliation extraction as batches.

    abstracts: list of dicts to process; each needs a 'pmid' and a string 'author_affil'.
        Entries without an 'author_affil' key are skipped silently; entries whose
        'pmid' is missing or whose 'author_affil' is not a string are skipped with a message.
    jsonl_file_root: path prefix of the request files e.g. "../data/author_processing_20250502". .<batch>.jsonl will be appended to this filename
    batch_size: maximum number of abstracts per request file

    Returns the list of request file names, in batch order. No file is created for an
    empty slice, so empty input yields an empty list.
    """
    auth_prompt = {"role": "user", "content": """Extract the university name and country from this text. Provide the result in json format with one field for the 'institution' and one field for the 'country'. If the country is not mentioned, provide an empty string. If the institution is not mentioned, provide an empty string. If the institution is mentioned but not the country, provide an empty string for the country. For the institution, retain only the university name and no department names etc."""}

    # Ceiling division: `len // batch_size + 1` would add an empty trailing file
    # whenever the number of abstracts is an exact multiple of batch_size.
    num_batches = (len(abstracts) + batch_size - 1) // batch_size
    print("Number of batches: {}".format(num_batches))
    batch_file_names = ["{}.{}.jsonl".format(jsonl_file_root, i) for i in range(num_batches)]

    for batch, jsonl_file in enumerate(batch_file_names):
        # Slice of abstracts that belongs to this batch
        start = batch * batch_size
        end = min(start + batch_size, len(abstracts))
        print("Processing batch {} of {}".format(batch, num_batches))

        # One JSON request per line
        with open(jsonl_file, "w") as f:
            for entry in abstracts[start:end]:
                if 'author_affil' not in entry:
                    continue

                affil = entry['author_affil']
                if 'pmid' not in entry or not isinstance(affil, str):
                    # Report the skip instead of hiding it behind a bare except.
                    print("Skipping entry with unusable pmid/author_affil: {}".format(entry.get('pmid')))
                    continue

                request = {
                    # custom_id is what ties each response back to its record.
                    'custom_id': str(entry['pmid']),
                    "method": "POST",
                    "url": "/v1/chat/completions",
                    "body": {
                        "model": "gpt-3.5-turbo",
                        "messages": [
                            {"role": "system", "content": "You are a helpful assistant."},
                            # Drop characters that cannot be encoded as UTF-8 (e.g. lone surrogates).
                            {"role": "user", "content": affil.encode('utf-8', 'ignore').decode('utf-8')},
                            auth_prompt
                        ],
                        "max_tokens": 1000
                    }
                }
                f.write(json.dumps(request) + "\n")
    return batch_file_names
# Write the request files for the records in b (batch_size=500) and keep the file names for upload.
author_batches = openai_prompt_auth_batch(b, "../data/author_processing_20250502_batch", batch_size=500)

Number of batches: 11
Processing batch 0 of 11
Processing batch 1 of 11
Processing batch 2 of 11
Processing batch 3 of 11
Processing batch 4 of 11
Processing batch 5 of 11
Processing batch 6 of 11
Processing batch 7 of 11
Processing batch 8 of 11
Processing batch 9 of 11
Processing batch 10 of 11


In [None]:
# Upload every request file; the returned file objects carry the ids needed to create the batches.
batch_input_file = []
for i in range(len(author_batches)):
    # Open inside a context manager so the handle is closed once the upload has returned.
    with open(author_batches[i], "rb") as request_file:
        uploaded = client.files.create(
            file=request_file,
            purpose="batch"
        )
    batch_input_file.append(uploaded)
    

In [75]:
batch_input_file[0].id

'file-4AhPiq8QHmGsMBaVQX5Cht'

In [None]:
# Create one batch job per uploaded request file, tagging each job with its
# position and the request file it was built from.
batches = []
for i, uploaded_file in enumerate(batch_input_file):
    job_metadata = {
        "author processing": "20250502",
        "batch": str(i),
        "batch_input_file": author_batches[i]
    }
    created_batch = client.batches.create(
        input_file_id=uploaded_file.id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata=job_metadata
    )
    batches.append(created_batch)

In [80]:
# Ids of the submitted batches, written out literally (presumably so the status can be
# checked without re-running the cell that created them).
batches = ["batch_682a3e53bfe88190acc35829d83d1fe5", "batch_682a3e541f9881908dfbe48e0fe60e24", "batch_682a3e546d0481909180ac02cd18abd0", "batch_682a3e54b49881908a435724f4452912", "batch_682a3e54fab08190a5fe9496d9d70459", "batch_682a3e5579e08190bcb1f5b1fc4d4021", "batch_682a3e55df80819091dc54a586d6de08", "batch_682a3e5646bc819092a794c144a72fdd", "batch_682a3e56cc948190b41cec169671bf81", "batch_682a3e5767708190ae0d37ae17ff0712", "batch_682a3e57c6cc8190a9b8c88f5c5cf5ae"]
# Inspect the last batch. It is indexed explicitly rather than through a loop variable
# left over from an earlier cell, so the cell also works on a fresh kernel.
batch = client.batches.retrieve(batches[-1])
batch

Batch(id='batch_682a3e57c6cc8190a9b8c88f5c5cf5ae', completion_window='24h', created_at=1747598935, endpoint='/v1/chat/completions', input_file_id='file-Q7KuX3yoZmPMPF1hG1D8yR', object='batch', status='in_progress', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1747685335, failed_at=None, finalizing_at=None, in_progress_at=1747598937, metadata={'author processing': '20250502', 'batch': '10', 'batch_input_file': '../data/author_processing_20250502_batch.10.jsonl'}, output_file_id=None, request_counts=BatchRequestCounts(completed=388, failed=0, total=391))

In [87]:
def retrieve_batch_status(batches):
    """Look up every batch, print how many are in each status, and hand back the output files.

    batches: sequence of batch ids passed to client.batches.retrieve

    Returns the list of output_file_id values (same order as `batches`) when every
    batch has status "completed"; otherwise returns None.
    """
    fetched = [client.batches.retrieve(batch_id) for batch_id in batches]

    # Tally the statuses; dict insertion order keeps them in first-seen order for printing
    tally = {}
    for item in fetched:
        tally[item.status] = tally.get(item.status, 0) + 1
    for name in tally:
        print(f"Status: {name}, Count: {tally[name]}")

    # Output files are only handed back once nothing is still pending
    for item in fetched:
        if item.status != "completed":
            return None
    return [item.output_file_id for item in fetched]

# None unless every batch reports "completed"; otherwise the list of output file ids.
batch_output_files = retrieve_batch_status(batches)
batch_output_files


Status: completed, Count: 11


['file-PnhwmGJ4jGi4LskGmPFqrf',
 'file-HhaBVhwUk1PGNVD37KMeYa',
 'file-1mXH6mEATQd95gV2gV4QYq',
 'file-7T3z5EGF5GqKXzutVWCaqg',
 'file-BoXbm2a27gmAXQFEVc1WqP',
 'file-WaZZoEyQcHyLMrPzhKirKP',
 'file-LeKvjxhhvcgLVAaqCoPkJD',
 'file-DwrRcG4SjushNSn1G7hmYc',
 'file-CN9XN5iem4kg459PFG99yY',
 'file-WzY1JrCsJj7iKrUBP1BqZi',
 'file-9wXcRtASgUFXHfhsDa3ZpT']

In [None]:
def read_output(input):
    """Parse the JSON payload out of a model reply.

    input: reply text (str); may be wrapped in markdown code fences, padded with
        whitespace or null characters, or surrounded by explanatory prose.

    Returns the decoded JSON value.
    Raises json.JSONDecodeError when no valid JSON can be recovered.
    """
    # remove markdown code blocks
    cleaned = input.replace("```json", "").replace("```", "")
    # strip() with no argument already removes spaces, tabs, newlines, carriage
    # returns, form feeds and vertical tabs; null characters need their own pass
    cleaned = cleaned.strip().strip("\0")
    try:
        return json.loads(cleaned)
    except json.JSONDecodeError:
        # Fallback for replies that put prose around the object: parse the span from
        # the first "{" to the last "}". Only reached for text that failed above.
        start = cleaned.find("{")
        end = cleaned.rfind("}")
        if start == -1 or end <= start:
            raise
        return json.loads(cleaned[start:end + 1])

if batch_output_files is not None:
    # Get results from the batches
    batch_results = []
    for i in range(len(batch_output_files)):
        print("Processing batch {}".format(i))
        # Download the output file; each non-empty line is one JSON-encoded response record.
        # The download is bound to the same name that is parsed on the next line.
        file_response = client.files.content(batch_output_files[i])
        batch_output = [json.loads(line) for line in file_response.text.splitlines() if line.strip()]
        for x in batch_output:
            try:
                cont = read_output(x["response"]["body"]["choices"][0]["message"]["content"])
                # pmid first, then the fields returned by the model
                batch_results.append({"pmid": x["custom_id"], **cont})
            except (KeyError, IndexError, TypeError, AttributeError, ValueError):
                # Missing fields, an empty reply or unparseable JSON: report and move on.
                # .get keeps the report itself from failing when custom_id is absent.
                print("Error in response for pmid {}".format(x.get("custom_id")))
                continue
        # Rewrite the results file after every batch so finished work is kept on disk
        with open("../data/author_processing_20250502.json", "w") as f:
            json.dump(batch_results, f)
else:
    print("Not all batches are completed yet.")
    batch_results = []


{'institution': 'Nanchang University', 'country': 'China'}
{'institution': 'Shanxi Medical University', 'country': 'China'}
{'institution': 'Sun Yat-sen University', 'country': 'China'}
{'institution': 'Chung Shan Medical University', 'country': 'Taiwan'}
{'institution': 'Soochow University', 'country': 'China'}
{'institution': 'Changzhou Maternal and Child Health Care Hospital', 'country': 'China'}
{'institution': 'Binzhou Medical University', 'country': 'China'}
{'institution': 'Huazhong University of Science and Technology', 'country': 'China'}
{'institution': 'Westlake University', 'country': 'China'}
{'institution': 'Harbin Medical University', 'country': 'China'}
{'institution': 'Jiangnan University', 'country': 'China'}
{'institution': 'Jilin University', 'country': 'China'}
{'institution': 'Vanderbilt University', 'country': 'USA'}
{'institution': 'Shahid Beheshti University of Medical Sciences', 'country': 'Iran'}
{'institution': 'University of Cambridge', 'country': 'UK'}
{'i

In [89]:
with open("../data/pubmed_authors.json") as f:
    authors = json.load(f)
# merge authors and batch_results
# Stored records whose pmid also appears in batch_results are replaced by the new ones.
# The new pmids are collected into a set once, so the filter does not rebuild a list of
# every pmid for each stored record.
new_pmids = {y['pmid'] for y in batch_results}
authors_full = [x for x in authors if x['pmid'] not in new_pmids]
authors_full.extend(batch_results)
len(authors_full)

13589

In [90]:
# Save the merged records under a new dated file name; pubmed_authors.json itself is not overwritten here.
with open("../data/pubmed_authors_20250502.json", "w") as f:
    json.dump(authors_full, f)