# PubMed Research Paper Fetcher

This notebook demonstrates the code for fetching and filtering research papers from PubMed using a query string. The program identifies papers with at least one author affiliated with a pharmaceutical or biotech company and outputs the results as a CSV file.

In [1]:
# Import necessary libraries
from typing import List, Dict  # For type hinting of function parameters and return types
import requests  # For making HTTP requests to the PubMed API
import csv  # For handling CSV operations
import re  # For regular expressions (not used here but imported)
import logging  # For logging useful information during the script's execution

# PubMedFetcher Class Definition
class PubMedFetcher:
    """A class to fetch and filter research papers from PubMed."""

    # The base URLs for PubMed's search and summary API endpoints
    BASE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    FETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"

    def fetch_papers(self, query: str) -> List[Dict[str, str]]:
        """Fetch and process papers matching the query from PubMed."""
        logging.debug(f"Fetching papers for query: {query}")

        # Step 1: Search for paper IDs
        ids = self._search_papers(query)
        
        # If no papers are found, return an empty list
        if not ids:
            logging.debug("No papers found for the given query.")
            return []

        # Step 2: Fetch details of the papers using their IDs
        papers = self._fetch_details(ids)

        # Step 3: Filter and extract relevant information
        return self._filter_papers(papers)

    def _search_papers(self, query: str) -> List[str]:
        """Search PubMed for paper IDs based on the query."""
        # Parameters for the search request
        params = {
            "db": "pubmed",  # Specify PubMed as the database
            "term": query,  # The search query string
            "retmode": "json",  # Response format as JSON
            "retmax": 100  # Maximum number of results to fetch
        }
        
        # Perform the HTTP GET request to search PubMed
        response = requests.get(self.BASE_URL, params=params)
        
        # Raise an error if the request fails
        response.raise_for_status()

        # Parse the JSON response and extract the list of paper IDs
        data = response.json()
        return data.get("esearchresult", {}).get("idlist", [])

    def _fetch_details(self, ids: List[str]) -> List[Dict]:
        """Fetch details of papers using their PubMed IDs."""
        # Parameters for fetching details of the papers
        params = {
            "db": "pubmed",  # Specify PubMed as the database
            "id": ",".join(ids),  # Join the list of IDs into a comma-separated string
            "retmode": "json"  # Response format as JSON
        }

        # Perform the HTTP GET request to fetch paper details
        response = requests.get(self.FETCH_URL, params=params)

        # Raise an error if the request fails
        response.raise_for_status()

        # Parse the JSON response and return the details of the papers
        data = response.json()
        return data.get("result", {}).values()

    def _filter_papers(self, papers: List[Dict]) -> List[Dict[str, str]]:
        """Filter papers for non-academic authors and extract required fields."""
        results = []

        # Loop through the fetched papers to extract necessary information
        for paper in papers:
            # Skip any invalid or incomplete paper data
            if not isinstance(paper, dict) or "uid" not in paper:
                continue

            # Get the list of authors from the paper
            authors = paper.get("authors", [])

            # Find non-academic authors by checking their affiliations
            non_academic_authors = [
                author for author in authors
                if not self._is_academic_author(author.get("affiliation", ""))
            ]

            # If there are non-academic authors, gather the relevant information
            if non_academic_authors:
                results.append({
                    "PubmedID": paper.get("uid", "N/A"),  # PubMed ID (unique identifier for the paper)
                    "Title": paper.get("title", "N/A"),  # Title of the paper
                    "Publication Date": paper.get("pubdate", "N/A"),  # Publication date
                    "Non-academic Author(s)": ", ".join(  # Names of non-academic authors
                        [author.get("name", "") for author in non_academic_authors]
                    ),
                    "Company Affiliation(s)": ", ".join(  # Affiliation (company, institution) of non-academic authors
                        [author.get("affiliation", "") for author in non_academic_authors]
                    ),
                    "Corresponding Author Email": paper.get("elocationid", "N/A")  # Corresponding author’s email (if available)
                })

        return results

    @staticmethod
    def _is_academic_author(affiliation: str) -> bool:
        """Determine if an author is academic based on their affiliation."""
        # List of keywords to detect academic affiliations
        academic_keywords = ["university", "institute", "college", "school", "lab", "hospital"]
        
        # Check if any of the keywords are present in the author's affiliation
        return any(keyword.lower() in affiliation.lower() for keyword in academic_keywords)


## Usage Example

Below is an example of how to use the `PubMedFetcher` class to fetch and filter research papers.

In [3]:
# Example usage
if __name__ == "__main__":
    # This sets up logging with an information level (INFO)
    logging.basicConfig(level=logging.INFO)
    # an instance of a class called "PubMedFetcher" is created
    fetcher = PubMedFetcher()

    # This line sets the search query for fetching papers from PubMed
    query = "cancer treatment"
    # return a list of dictionaries containing details about the papers related to cancer treatment and store the results in results variable.
    results = fetcher.fetch_papers(query)
    
    # Save to CSV
    # This block opens a file named output.csv in write mode ("w"). If output.csv doesn’t already exist, it will be created.
    with open("output.csv", mode="w", newline="", encoding="utf-8") as file:
        # Used to write the list of dictionaries (results) to the CSV. 
        writer = csv.DictWriter(file, fieldnames=[
            "PubmedID", "Title", "Publication Date", "Non-academic Author(s)", "Company Affiliation(s)", "Corresponding Author Email"
        ])
        writer.writeheader()      # writes the header row to the CSV file (column names).
        writer.writerows(results) # writes the actual data (paper details) to the CSV.
    
    print(f"Fetched {len(results)} papers. Results saved to output.csv.")


Fetched 100 papers. Results saved to output.csv.
