# PubMed Research Paper Fetcher

This notebook demonstrates the code for fetching and filtering research papers from PubMed using a query string. The program identifies papers with at least one author affiliated with a pharmaceutical or biotech company and outputs the results as a CSV file.

In [1]:
# Import necessary libraries
from typing import List, Dict
import requests
import csv
import re
import logging

# PubMedFetcher Class Definition
class PubMedFetcher:
    """A class to fetch and filter research papers from PubMed."""

    BASE_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    FETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"

    def fetch_papers(self, query: str) -> List[Dict[str, str]]:
        """Fetch and process papers matching the query from PubMed."""
        logging.debug(f"Fetching papers for query: {query}")

        # Step 1: Search for paper IDs
        ids = self._search_papers(query)
        if not ids:
            logging.debug("No papers found for the given query.")
            return []

        # Step 2: Fetch details of the papers
        papers = self._fetch_details(ids)

        # Step 3: Filter and extract relevant information
        return self._filter_papers(papers)

    def _search_papers(self, query: str) -> List[str]:
        """Search PubMed for paper IDs based on the query."""
        params = {
            "db": "pubmed",
            "term": query,
            "retmode": "json",
            "retmax": 100
        }
        response = requests.get(self.BASE_URL, params=params)
        response.raise_for_status()

        data = response.json()
        return data.get("esearchresult", {}).get("idlist", [])

    def _fetch_details(self, ids: List[str]) -> List[Dict]:
        """Fetch details of papers using their PubMed IDs."""
        params = {
            "db": "pubmed",
            "id": ",".join(ids),
            "retmode": "json"
        }
        response = requests.get(self.FETCH_URL, params=params)
        response.raise_for_status()

        data = response.json()
        return data.get("result", {}).values()

    def _filter_papers(self, papers: List[Dict]) -> List[Dict[str, str]]:
        """Filter papers for non-academic authors and extract required fields."""
        results = []

        for paper in papers:
            if not isinstance(paper, dict) or "uid" not in paper:
                continue

            authors = paper.get("authors", [])
            non_academic_authors = [
                author for author in authors
                if not self._is_academic_author(author.get("affiliation", ""))
            ]

            if non_academic_authors:
                results.append({
                    "PubmedID": paper.get("uid", "N/A"),
                    "Title": paper.get("title", "N/A"),
                    "Publication Date": paper.get("pubdate", "N/A"),
                    "Non-academic Author(s)": ", ".join(
                        [author.get("name", "") for author in non_academic_authors]
                    ),
                    "Company Affiliation(s)": ", ".join(
                        [author.get("affiliation", "") for author in non_academic_authors]
                    ),
                    "Corresponding Author Email": paper.get("elocationid", "N/A")
                })

        return results

    @staticmethod
    def _is_academic_author(affiliation: str) -> bool:
        """Determine if an author is academic based on their affiliation."""
        academic_keywords = ["university", "institute", "college", "school", "lab", "hospital"]
        return any(keyword.lower() in affiliation.lower() for keyword in academic_keywords)


## Usage Example

Below is an example of how to use the `PubMedFetcher` class to fetch and filter research papers.

In [2]:
# Example usage
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    fetcher = PubMedFetcher()

    query = "cancer treatment"
    results = fetcher.fetch_papers(query)
    
    # Save to CSV
    with open("output.csv", mode="w", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=[
            "PubmedID", "Title", "Publication Date", "Non-academic Author(s)", "Company Affiliation(s)", "Corresponding Author Email"
        ])
        writer.writeheader()
        writer.writerows(results)
    
    print(f"Fetched {len(results)} papers. Results saved to output.csv.")


Fetched 100 papers. Results saved to output.csv.
