In [1]:
### Importing libraries ###

import re
import xml.etree.ElementTree as ET
from typing import List, Dict

import pandas as pd
import requests

#### Helper that checks whether an affiliation string looks non-academic ####

def is_non_academic(affiliation: str) -> bool:
    """Heuristically decide whether an affiliation string is non-academic.

    The check is a case-insensitive substring match against a list of words
    that typically appear in academic or clinical affiliations. If none of
    them occurs, the affiliation is treated as non-academic (e.g. a company).

    Args:
        affiliation: Free-text affiliation string.

    Returns:
        True if no academic keyword is found in the affiliation.
        False if a keyword is found, or if the affiliation is None, empty or
        whitespace-only (there is nothing to classify as a company).
    """
    if not affiliation or not affiliation.strip():
        return False

    academic_keywords = (
        # Stems: "universi" covers university/universität/universidad/université,
        # "institut" covers institute/institut/instituto/institution.
        "universi", "institut",
        "college", "hospital", "school", "center", "centre", "academy", "faculty",
        # "dept" does not match the spelled-out word, so both forms are listed.
        "dept", "department",
    )
    affiliation_lower = affiliation.lower()
    return not any(keyword in affiliation_lower for keyword in academic_keywords)

### This function retrieves PubMed article IDs (PMIDs) for a given search query by querying NCBI's E-utilities API ###

def get_pubmed_ids(query: str, max_results: int = 10) -> List[str]:
    """Search PubMed and return the IDs of matching articles.

    Sends the query to the NCBI E-utilities ``esearch`` endpoint and reads the
    ID list out of the JSON response.

    Args:
        query: PubMed search term.
        max_results: Maximum number of IDs to request (sent as ``retmax``).

    Returns:
        The list of PubMed IDs, as strings, found under
        ``esearchresult.idlist`` in the response.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (e.g. rate limiting), instead of failing later while decoding.
        requests.Timeout: If the server does not answer within 30 seconds.
    """
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        "db": "pubmed",
        "term": query,
        "retmode": "json",
        "retmax": max_results
    }
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(url, params=params, timeout=30)
    response.raise_for_status()
    data = response.json()
    return data["esearchresult"]["idlist"]

### This function fetches detailed information about PubMed articles given their PubMed IDs (PMIDs), with a specific focus on identifying non-academic authors and their affiliations. #####

def _element_text(element) -> str:
    """Return all text inside an XML element, including text nested in child tags."""
    return "".join(element.itertext())


def _extract_email(text: str) -> str:
    """Return the first e-mail address found in ``text``, or "" if there is none."""
    match = re.search(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", text)
    return match.group(0) if match else ""


def _publication_year(article) -> str:
    """Return the article's publication year, or "Unknown" if none can be found."""
    year = article.findtext(".//PubDate/Year")
    if year:
        return year
    # Some records carry a free-text MedlineDate instead of a Year element;
    # take the first four-digit number from it.
    medline_date = article.findtext(".//PubDate/MedlineDate") or ""
    match = re.search(r"\d{4}", medline_date)
    return match.group(0) if match else "Unknown"


def fetch_paper_details(pubmed_ids: List[str]) -> List[Dict]:
    """Fetch PubMed records and keep those with at least one non-academic author.

    Downloads the records for ``pubmed_ids`` from the NCBI E-utilities
    ``efetch`` endpoint as XML. For every author it inspects all listed
    affiliations; an author is recorded when ``is_non_academic`` returns True
    for any of them, paired with the first such affiliation.

    Args:
        pubmed_ids: PubMed IDs to fetch. An empty list returns ``[]`` without
            making a request.

    Returns:
        One dictionary per article that has a non-academic author, with keys
        "PubmedID", "Title", "Publication Date", "Non-academic Author(s)",
        "Company Affiliation(s)" and "Corresponding Author Email". Authors and
        affiliations are "; "-joined and positionally aligned. The e-mail is
        the first address found in any author affiliation, or "N/A".

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within 30 seconds.
        xml.etree.ElementTree.ParseError: If the response is not valid XML.
    """
    if not pubmed_ids:
        return []

    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {
        "db": "pubmed",
        "id": ",".join(pubmed_ids),
        "retmode": "xml"
    }
    response = requests.get(url, params=params, timeout=30)
    response.raise_for_status()
    # Parse the raw bytes so the XML declaration decides the encoding, rather
    # than whatever text encoding requests guessed from the HTTP headers.
    root = ET.fromstring(response.content)

    papers = []

    for article in root.findall(".//PubmedArticle"):
        pmid = article.findtext(".//PMID")
        # Titles may contain inline markup (<i>, <sub>, ...); findtext() would
        # stop at the first child tag and truncate the title.
        title_elem = article.find(".//ArticleTitle")
        title = _element_text(title_elem) if title_elem is not None else None
        pub_date = _publication_year(article)

        non_academic_authors = []
        company_affiliations = []
        corresponding_email = "N/A"

        for author in article.findall(".//Author"):
            # An author can list several affiliations; look at all of them.
            affiliations = [
                text
                for text in (
                    _element_text(node)
                    for node in author.findall(".//AffiliationInfo/Affiliation")
                )
                if text
            ]

            name_parts = [
                author.findtext("ForeName") or "",
                author.findtext("LastName") or ""
            ]
            full_name = " ".join(name_parts).strip()
            if not full_name:
                # Group authors have a CollectiveName instead of personal names.
                full_name = (author.findtext("CollectiveName") or "").strip()

            company_affiliation = next(
                (text for text in affiliations if is_non_academic(text)), None
            )
            if company_affiliation is not None:
                non_academic_authors.append(full_name)
                company_affiliations.append(company_affiliation)

            # Keep the first e-mail address seen in any affiliation string.
            if corresponding_email == "N/A":
                for text in affiliations:
                    email = _extract_email(text)
                    if email:
                        corresponding_email = email
                        break

        if non_academic_authors:
            papers.append({
                "PubmedID": pmid,
                "Title": title,
                "Publication Date": pub_date,
                "Non-academic Author(s)": "; ".join(non_academic_authors),
                "Company Affiliation(s)": "; ".join(company_affiliations),
                "Corresponding Author Email": corresponding_email
            })

    return papers

### This function saves a list of paper dictionaries to a CSV file using pandas. #####

def save_to_csv(papers: List[Dict], filename: str = "output.csv"):
    """Write paper records to a CSV file and print how many rows were saved.

    Args:
        papers: List of dictionaries; each one becomes a row of the CSV.
        filename: Path of the CSV file to write. The DataFrame index is not
            included in the output.
    """
    df = pd.DataFrame(data=papers)
    df.to_csv(path_or_buf=filename, index=False)
    print(f"Saved {len(df)} papers to {filename}")

#### This code performs a complete PubMed search pipeline, from querying article IDs to saving detailed results in a CSV file. ####

# Search PubMed for up to 20 article IDs matching the query.
query = "vaccine development"
ids = get_pubmed_ids(query, max_results=20)
# Only articles with at least one author flagged as non-academic are returned.
papers = fetch_paper_details(ids)
# Write the filtered records to disk; prints the number of rows saved.
save_to_csv(papers, "vaccine_output.csv")


Saved 9 papers to vaccine_output.csv


In [2]:
#### testing ####

In [3]:
# Spot-check run with a different query, printing intermediate results.
# Note: this rebinds the notebook-level names query, ids and papers.
query = "mRNA vaccine"
ids = get_pubmed_ids(query, max_results=10)
print("PubMed IDs:", ids)

# Only articles with at least one author flagged as non-academic are returned,
# so this list can be shorter than the list of IDs.
papers = fetch_paper_details(ids)
print("Filtered papers:", papers[:2])  # Print first 2 for preview

save_to_csv(papers, "mrna_vaccine_output.csv")


PubMed IDs: ['40626925', '40626318', '40625176', '40624462', '40624061', '40623607', '40622507', '40622505', '40621109', '40620155']
Filtered papers: [{'PubmedID': '40626318', 'Title': 'Leveraged Vaccination to Alleviate Original Antigenic Sin for Enhancing Broad-Neutralizing Antibody Response against SARS-CoV-2 Omicron Subvariants.', 'Publication Date': '2025', 'Non-academic Author(s)': 'Kai Ji; Xishan Lu; Bo Ying', 'Company Affiliation(s)': 'Suzhou Abogen Biosciences Co., Ltd. Suzhou Jiangsu China.; Suzhou Abogen Biosciences Co., Ltd. Suzhou Jiangsu China.; Suzhou Abogen Biosciences Co., Ltd. Suzhou Jiangsu China.', 'Corresponding Author Email': 'N/A'}]
Saved 1 papers to mrna_vaccine_output.csv
