In [1]:
import requests
import pandas as pd
from typing import List, Dict

In [2]:
import requests
from typing import List, Dict

def fetch_papers(query: str, debug: bool = False) -> List[Dict]:
    """
    Fetch research papers from the PubMed API based on a query.
    
    Parameters:
    - query: A string representing the PubMed query.
    - debug: If True, print debug information during execution.
    
    Returns:
    - A list of dictionaries containing details of the fetched papers.
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    details_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
    params = {
        "db": "pubmed",
        "term": query,
        "retmode": "json",
        "retmax": 100  # Fetch up to 100 papers for demonstration purposes
    }

    try:
        if debug:
            print(f"Sending request to PubMed API: {base_url} with params: {params}")
        response = requests.get(base_url, params=params)
        response.raise_for_status()
        search_data = response.json()
        
        paper_ids = search_data.get("esearchresult", {}).get("idlist", [])
        if not paper_ids:
            if debug:
                print("No papers found for the query.")
            return []

        details_params = {
            "db": "pubmed",
            "id": ",".join(paper_ids),
            "retmode": "json"
        }
        
        if debug:
            print(f"Fetching paper details from PubMed API: {details_url} with params: {details_params}")
        details_response = requests.get(details_url, params=details_params)
        details_response.raise_for_status()
        details_data = details_response.json()

        papers = []
        for paper_id, details in details_data.get("result", {}).items():
            if paper_id == "uids":
                continue
            papers.append({
                "PubmedID": paper_id,
                "Title": details.get("title", "N/A"),
                "PublicationDate": details.get("pubdate", "N/A"),
                "Authors": details.get("authors", []),  # To process authors later
                "Affiliation": details.get("source", "N/A")
            })

        return papers

    except requests.exceptions.RequestException as e:
        if debug:
            print(f"Error during API request: {e}")
        return []


In [3]:
from fetch_papers_module import fetch_papers  # Replace with your actual module name

def test_fetch_papers():
    # Define a sample PubMed query
    query = "COVID-19 vaccine"
    debug = True

    # Call the function
    results = fetch_papers(query, debug)

    # Print results
    if results:
        print(f"Fetched {len(results)} papers:")
        for paper in results[:5]:  # Print details of the first 5 papers
            print(paper)
    else:
        print("No papers fetched or an error occurred.")




ModuleNotFoundError: No module named 'fetch_papers_module'

In [5]:
import requests

def fetch_pubmed_ids(query):
    PUBMED_API_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        'db': 'pubmed',
        'term': query,
        'retmode': 'json'
    }
    response = requests.get(PUBMED_API_URL, params=params)
    response.raise_for_status()
    data = response.json()
    return data.get("esearchresult", {}).get("idlist", [])

# Test the function
query = "biotechnology cancer"
pubmed_ids = fetch_pubmed_ids(query)
print(f"PubMed IDs: {pubmed_ids}")


PubMed IDs: ['39772264', '39772162', '39771591', '39771588', '39771501', '39770927', '39770487', '39770472', '39770406', '39770404', '39770249', '39770115', '39769488', '39769470', '39769463', '39769457', '39769428', '39769419', '39769349', '39769237']


In [6]:
def fetch_paper_details(pubmed_ids):
    DETAILS_API_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
    params = {
        'db': 'pubmed',
        'id': ','.join(pubmed_ids),
        'retmode': 'json'
    }
    response = requests.get(DETAILS_API_URL, params=params)
    response.raise_for_status()
    data = response.json()
    return data.get("result", {})

# Test the function
paper_details = fetch_paper_details(pubmed_ids)
for paper_id, details in paper_details.items():
    if paper_id != "uids":  # Skip the 'uids' key from the API response
        print(f"ID: {paper_id}, Title: {details.get('title')}, Authors: {details.get('authors')}")


ID: 39772264, Title: Optimized Directed Virus Evolution to Accelerate the Generation of Oncolytic Coxsackievirus B3 Adapted to Resistant Colorectal Cancer Cells., Authors: [{'name': 'Elsner L', 'authtype': 'Author', 'clusterid': ''}, {'name': 'Dieringer B', 'authtype': 'Author', 'clusterid': ''}, {'name': 'Geisler A', 'authtype': 'Author', 'clusterid': ''}, {'name': 'Girod M', 'authtype': 'Author', 'clusterid': ''}, {'name': 'Van Linthout S', 'authtype': 'Author', 'clusterid': ''}, {'name': 'Kurreck J', 'authtype': 'Author', 'clusterid': ''}, {'name': 'Fechner H', 'authtype': 'Author', 'clusterid': ''}]
ID: 39772162, Title: Isothermal Nucleic Acid Amplification for Point-of-Care Primary Cervical Cancer Screening., Authors: [{'name': 'Lamsisi M', 'authtype': 'Author', 'clusterid': ''}, {'name': 'Benlghazi A', 'authtype': 'Author', 'clusterid': ''}, {'name': 'Kouach J', 'authtype': 'Author', 'clusterid': ''}, {'name': 'Laraqui A', 'authtype': 'Author', 'clusterid': ''}, {'name': 'Ennaji 

In [7]:
def identify_non_academic_authors(authors):
    non_academic_authors = []
    for author in authors:
        affiliation = author.get("affiliation", "").lower()
        # A simple heuristic to exclude academic affiliations
        if "university" not in affiliation and "institute" not in affiliation:
            non_academic_authors.append({
                "name": author.get("name"),
                "affiliation": author.get("affiliation")
            })
    return non_academic_authors

# Test the function
for paper_id, details in paper_details.items():
    if paper_id != "uids" and "authors" in details:
        non_academic_authors = identify_non_academic_authors(details["authors"])
        if non_academic_authors:
            print(f"Paper ID: {paper_id}")
            print(f"Non-academic Authors: {non_academic_authors}")


Paper ID: 39772264
Non-academic Authors: [{'name': 'Elsner L', 'affiliation': None}, {'name': 'Dieringer B', 'affiliation': None}, {'name': 'Geisler A', 'affiliation': None}, {'name': 'Girod M', 'affiliation': None}, {'name': 'Van Linthout S', 'affiliation': None}, {'name': 'Kurreck J', 'affiliation': None}, {'name': 'Fechner H', 'affiliation': None}]
Paper ID: 39772162
Non-academic Authors: [{'name': 'Lamsisi M', 'affiliation': None}, {'name': 'Benlghazi A', 'affiliation': None}, {'name': 'Kouach J', 'affiliation': None}, {'name': 'Laraqui A', 'affiliation': None}, {'name': 'Ennaji MM', 'affiliation': None}, {'name': 'Chauleur C', 'affiliation': None}, {'name': 'Bourlet T', 'affiliation': None}, {'name': 'Li G', 'affiliation': None}]
Paper ID: 39771591
Non-academic Authors: [{'name': 'Bazylevich A', 'affiliation': None}, {'name': 'Miller A', 'affiliation': None}, {'name': 'Tkachenko I', 'affiliation': None}, {'name': 'Merlani M', 'affiliation': None}, {'name': 'Patsenker L', 'affiliat

In [8]:
import csv

def save_to_csv(data, filename):
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=["PubmedID", "Title", "Publication Date", "Non-academic Authors", "Company Affiliations", "Corresponding Author Email"])
        writer.writeheader()
        for row in data:
            writer.writerow(row)

# Test the function
# Prepare the data to save
results = []
for paper_id, details in paper_details.items():
    if paper_id != "uids" and "authors" in details:
        non_academic_authors = identify_non_academic_authors(details["authors"])
        if non_academic_authors:
            results.append({
                "PubmedID": paper_id,
                "Title": details.get("title"),
                "Publication Date": details.get("pubdate"),
                "Non-academic Authors": "; ".join([auth['name'] for auth in non_academic_authors]),
                "Company Affiliations": "; ".join([auth['affiliation'] for auth in non_academic_authors if auth['affiliation']]),
                "Corresponding Author Email": details.get("corresponding_author_email", "N/A")
            })

# Save results to a CSV file
save_to_csv(results, "pubmed_results.csv")
print("Results saved to pubmed_results.csv")


Results saved to pubmed_results.csv


In [9]:
def fetch_affiliations(pubmed_id):
    EFETCH_API_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {
        'db': 'pubmed',
        'id': pubmed_id,
        'retmode': 'xml'
    }
    response = requests.get(EFETCH_API_URL, params=params)
    response.raise_for_status()
    return response.text


In [10]:
xml_data = fetch_affiliations("39772264")



In [11]:
xml_data

'<?xml version="1.0" ?>\n<!DOCTYPE PubmedArticleSet PUBLIC "-//NLM//DTD PubMedArticle, 1st January 2024//EN" "https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_240101.dtd">\n<PubmedArticleSet>\n<PubmedArticle><MedlineCitation Status="MEDLINE" Owner="NLM" IndexingMethod="Automated"><PMID Version="1">39772264</PMID><DateCompleted><Year>2025</Year><Month>01</Month><Day>08</Day></DateCompleted><DateRevised><Year>2025</Year><Month>01</Month><Day>08</Day></DateRevised><Article PubModel="Electronic"><Journal><ISSN IssnType="Electronic">1999-4915</ISSN><JournalIssue CitedMedium="Internet"><Volume>16</Volume><Issue>12</Issue><PubDate><Year>2024</Year><Month>Dec</Month><Day>20</Day></PubDate></JournalIssue><Title>Viruses</Title><ISOAbbreviation>Viruses</ISOAbbreviation></Journal><ArticleTitle>Optimized Directed Virus Evolution to Accelerate the Generation of Oncolytic Coxsackievirus B3 Adapted to Resistant Colorectal Cancer Cells.</ArticleTitle><ELocationID EIdType="pii" ValidYN="Y">1958</ELocation

In [12]:
import xml.etree.ElementTree as ET

def parse_affiliations(xml_data):
    root = ET.fromstring(xml_data)
    authors = []
    for author in root.findall(".//Author"):
        name = author.findtext("LastName", "") + " " + author.findtext("ForeName", "")
        affiliation = author.findtext(".//AffiliationInfo/Affiliation", "")
        authors.append({
            "name": name,
            "affiliation": affiliation
        })
    return authors


In [16]:
authors = parse_affiliations(xml_data)

In [15]:
from fuzzywuzzy import fuzz

def identify_pharma_biotech_authors_fuzzy(authors):
    keywords = ["pharma", "biotech", "company", "corporation", "inc", "gmbh", "s.r.l."]
    threshold = 80  # Minimum similarity score
    pharma_authors = []

    for author in authors:
        affiliation = author.get("affiliation", "").lower()
        for keyword in keywords:
            # Calculate fuzzy match score
            similarity = fuzz.partial_ratio(keyword, affiliation)
            if similarity >= threshold:
                pharma_authors.append(author)
                break  # No need to check other keywords if one matches
    return pharma_authors




In [17]:
identify_pharma_biotech_authors_fuzzy(authors)

[{'name': 'Elsner Leslie',
  'affiliation': 'Department of Applied Biochemistry, Institute of Biotechnology, Technische Universität Berlin, 13355 Berlin, Germany.'},
 {'name': 'Dieringer Babette',
  'affiliation': 'Department of Applied Biochemistry, Institute of Biotechnology, Technische Universität Berlin, 13355 Berlin, Germany.'},
 {'name': 'Geisler Anja',
  'affiliation': 'Department of Applied Biochemistry, Institute of Biotechnology, Technische Universität Berlin, 13355 Berlin, Germany.'},
 {'name': 'Girod Maxim',
  'affiliation': 'Department of Applied Biochemistry, Institute of Biotechnology, Technische Universität Berlin, 13355 Berlin, Germany.'},
 {'name': 'Kurreck Jens',
  'affiliation': 'Department of Applied Biochemistry, Institute of Biotechnology, Technische Universität Berlin, 13355 Berlin, Germany.'},
 {'name': 'Fechner Henry',
  'affiliation': 'Department of Applied Biochemistry, Institute of Biotechnology, Technische Universität Berlin, 13355 Berlin, Germany.'}]