### CODE FOR LITERATURE SEARCH IN PUBMED, EUROPE PMC, AND ARXIV BY API

* Developed by Karen Gonçalves - Postdoctoral researcher / ISGLOBAL
* Last update: 21.08.2024

#### Packages

In [1]:
!pip install biopython

Collecting biopython
  Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.84


In [2]:
pip install biopython requests crossrefapi xmltodict

Collecting crossrefapi
  Downloading crossrefapi-1.6.0-py3-none-any.whl.metadata (538 bytes)
Collecting xmltodict
  Downloading xmltodict-0.13.0-py2.py3-none-any.whl.metadata (7.7 kB)
Collecting urllib3<3,>=1.21.1 (from requests)
  Downloading urllib3-1.26.16-py2.py3-none-any.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.4/48.4 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading crossrefapi-1.6.0-py3-none-any.whl (14 kB)
Downloading urllib3-1.26.16-py2.py3-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.1/143.1 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading xmltodict-0.13.0-py2.py3-none-any.whl (10.0 kB)
Installing collected packages: xmltodict, urllib3, crossrefapi
  Attempting uninstall: urllib3
    Found existing installation: urllib3 2.0.7
    Uninstalling urllib3-2.0.7:
      Successfully uninstalled urllib3-2.0.7
Successfully installed crossrefapi-1.6.0 urllib3-1.26.16 xmltodict-0.13.0

### Literature search by API

#### Full search

In [None]:
from Bio import Entrez
import requests
from crossref.restful import Works
import xmltodict
import pandas as pd
import time
import urllib.error

# Set your email for NCBI Entrez (PubMed)
# NOTE: the address below is a placeholder - replace it with your own contact
# address before running the searches.
Entrez.email = "youremail@adress.com"

# Function to fetch articles from PubMed and extract detailed information
def fetch_pubmed(query, max_results=100, retries=3):
    """Search PubMed and return one summary record per article found.

    The query is restricted to publication dates from 2020/01/01 onwards.
    Matching PubMed IDs are obtained with ``Entrez.esearch`` and every record
    is then downloaded individually with ``Entrez.efetch``.

    Args:
        query: PubMed search expression (the date filter is appended here).
        max_results: Maximum number of IDs requested from the search.
        retries: Number of times the whole search is attempted when a
            urllib HTTP/URL error occurs.

    Returns:
        A list of dicts with the keys 'Title', 'Study Design', 'Sample Size',
        'Results', 'Type of Exposures', 'Clinical Assessment' and
        'Year of Publication'. Records that raise KeyError/IndexError while
        being parsed are reported and skipped. If every attempt fails, the
        records gathered during the last attempt are returned.
    """
    # Add a date range to the query
    query_with_date = f"{query} AND (\"2020/01/01\"[Date - Publication] : \"3000\"[Date - Publication])"

    articles = []
    attempt = 0
    while attempt < retries:
        # Start each attempt from an empty list: a network error half-way
        # through the ID loop would otherwise make the retry append the
        # already-collected articles a second time.
        articles = []
        try:
            handle = Entrez.esearch(db="pubmed", term=query_with_date, retmax=max_results)
            try:
                record = Entrez.read(handle)
            finally:
                # Close the handle even when parsing fails.
                handle.close()
            id_list = record['IdList']

            for pub_id in id_list:
                fetch_handle = Entrez.efetch(db="pubmed", id=pub_id, retmode="xml")
                try:
                    fetch_record = Entrez.read(fetch_handle)
                finally:
                    fetch_handle.close()

                try:
                    article = fetch_record['PubmedArticle'][0]['MedlineCitation']['Article']
                    title = article['ArticleTitle']

                    # Join every AbstractText section instead of indexing [0]:
                    # structured abstracts have several sections, and an empty
                    # list must not discard the article with an IndexError.
                    abstract_parts = article.get('Abstract', {}).get('AbstractText', [])
                    abstract = " ".join(str(part) for part in abstract_parts) or "N/A"

                    # Full-text access attempt (you would need to implement this based on your access)
                    # This is a placeholder - replace with actual full-text retrieval
                    full_text = "N/A"

                    study_design = extract_study_design(title, abstract, full_text)
                    sample_size = extract_sample_size(title, abstract, full_text)
                    results = abstract
                    exposures = extract_exposures(title, abstract, full_text)
                    clinical_assessment = extract_clinical_assessment(title, abstract, full_text)
                    year = article['Journal']['JournalIssue']['PubDate'].get('Year', 'N/A')

                    articles.append({
                        'Title': title,
                        'Study Design': study_design,
                        'Sample Size': sample_size,
                        'Results': results,
                        'Type of Exposures': exposures,
                        'Clinical Assessment': clinical_assessment,
                        'Year of Publication': year
                    })
                except (KeyError, IndexError) as e:
                    print(f"Error processing PubMed article ID {pub_id}: {e}")
            return articles
        except (urllib.error.HTTPError, urllib.error.URLError) as e:
            print(f"Error fetching PubMed data: {e}")
            attempt += 1
            if attempt < retries:
                time.sleep(5)  # wait before retrying (no point waiting after the last attempt)
    print("Failed to fetch PubMed data after several attempts.")
    return articles

# Updated helper functions to include full-text
def extract_study_design(title, abstract, full_text):
    """Label the study design from the first design phrase found in the text.

    Title, abstract and full text are concatenated and lower-cased, then the
    phrases below are tried in order; the label of the first phrase present is
    returned, or "N/A" when none of them occurs.
    """
    searchable = f"{title} {abstract} {full_text}".lower()

    # (phrase to look for, label to report) - order defines precedence
    design_markers = (
        ("randomized control", "Randomized Control Trial"),
        ("cohort", "Cohort Study"),
        ("case-control", "Case-Control Study"),
        ("cross-sectional", "Cross-Sectional Study"),
    )
    return next(
        (label for phrase, label in design_markers if phrase in searchable),
        "N/A",
    )

def extract_sample_size(title, abstract, full_text):
    """Return the first "<number> <unit>" pair that looks like a sample size.

    The combined text is split on whitespace and scanned for a number that is
    directly followed by one of the unit words below. Compared with a plain
    token comparison this also recognises:
      * unit words with trailing punctuation ("120 patients." / "patients),")
      * numbers with thousands separators ("1,234 participants")
      * numbers preceded by an opening bracket ("(120 patients")

    Args:
        title, abstract, full_text: Text fragments to search; each is
            converted with normal string formatting, so None is tolerated.

    Returns:
        The matched pair as "<number> <unit>" (surrounding punctuation
        removed), or "N/A" when nothing matches.
    """
    import re

    unit_words = {"participants", "individuals", "patients", "n=", "sample"}
    grouped_number = re.compile(r"\d{1,3}(?:,\d{3})+")

    tokens = f"{title} {abstract} {full_text}".split()
    for raw_number, raw_unit in zip(tokens, tokens[1:]):
        number = raw_number.lstrip("([{")
        if not (number.isdigit() or grouped_number.fullmatch(number)):
            continue
        # Keep the token untouched when it already matches (e.g. "n="),
        # otherwise drop sentence punctuation glued to the end of the word.
        if raw_unit.lower() in unit_words:
            unit = raw_unit
        else:
            unit = raw_unit.rstrip(".,;:!?)]}")
        if unit.lower() in unit_words:
            return f"{number} {unit}"
    return "N/A"

def extract_exposures(title, abstract, full_text):
    """Report the first environmental-exposure keyword present in the text.

    Each of the three inputs is lower-cased and the results are joined with
    single spaces. The keywords are checked in their listed order and the
    first one contained in the text is returned; "N/A" means none was found.
    """
    searchable = " ".join([title.lower(), abstract.lower(), full_text.lower()])
    found = [
        candidate
        for candidate in ('air pollution', 'noise', 'light at night', 'climate', 'circadian rhythm')
        if candidate in searchable
    ]
    if found:
        return found[0]
    return "N/A"

def extract_clinical_assessment(title, abstract, full_text):
    """Return the first clinical-assessment keyword found in the text.

    The combined text is lower-cased before matching. Ordinary keywords are
    matched as substrings (so "interviews" still matches "interview"). The
    acronym keywords ('IQ tests', 'IQ', 'EHR') are lower-cased for the
    comparison as well - they could never match lower-cased text in their
    upper-case spelling - and are matched as whole words only, so that words
    such as "unique" or "technique" do not count as "IQ".

    Args:
        title, abstract, full_text: Text fragments to search; each is
            converted with normal string formatting, so None is tolerated.

    Returns:
        The matching keyword in its listed spelling, or "N/A".
    """
    import re

    phrase_keywords = ['relapse', 'interview', 'clinical scale', 'assessment', 'intelligence tests',
                       'neurological test', 'biological test', 'observations']
    # 'IQ tests' is listed before 'IQ' so the more specific keyword can win.
    acronym_keywords = ['IQ tests', 'IQ', 'EHR']

    combined_text = f"{title} {abstract} {full_text}".lower()

    for keyword in phrase_keywords:
        if keyword in combined_text:
            return keyword
    for keyword in acronym_keywords:
        if re.search(rf"\b{re.escape(keyword.lower())}\b", combined_text):
            return keyword
    return "N/A"

# Fetch functions for CrossRef, Europe PMC, arXiv remain the same, handling only title/abstract
# NOTE: fetch_crossref, fetch_europe_pmc and fetch_arxiv are not defined in the
# cells shown in this notebook. They are therefore looked up at run time: when
# a function exists in the session it is called exactly as before, otherwise
# that source is skipped with a message instead of stopping the run with a
# NameError on a fresh kernel.
def _fetch_if_defined(fetcher_name, search_query):
    """Call the named fetch function if the session defines it, else return []."""
    fetcher = globals().get(fetcher_name)
    if not callable(fetcher):
        print(f"Skipping {fetcher_name}: no such function is defined in this session.")
        return []
    return fetcher(search_query)

# Example usage with corrected query
query = "(bipolar disorder) AND (air pollution OR noise OR light at night OR climate OR circadian rhythm)"

pubmed_articles = fetch_pubmed(query)
crossref_articles = _fetch_if_defined("fetch_crossref", query)
europe_pmc_articles = _fetch_if_defined("fetch_europe_pmc", query)
arxiv_articles = _fetch_if_defined("fetch_arxiv", query)

# Combine all articles
all_articles = pubmed_articles + crossref_articles + europe_pmc_articles + arxiv_articles

# Convert to DataFrame for easier viewing and analysis
articles_df = pd.DataFrame(all_articles)

# Save the DataFrame to an Excel file in the current working directory
output_file = "articles_bipolar_disorder_environmental_exposures_full_text.xlsx"
articles_df.to_excel(output_file, index=False)

print(f"Results saved to {output_file}")


Error processing PubMed article ID 33760504: list index out of range
Results saved to articles_bipolar_disorder_environmental_exposures_full_text.xlsx


#### Top 80 cited papers

In [None]:
import requests
import pandas as pd
from Bio import Entrez
import time

# Set your email for NCBI Entrez (PubMed)
# NOTE: the address below is a placeholder - replace it with your own contact
# address before running the searches.
Entrez.email = "youremail@adress.com"

# Function to fetch articles from PubMed and extract detailed information
def fetch_pubmed(query, max_results=100, retries=3):
    """Search PubMed and return one summary record (with DOI) per article.

    The query is restricted to publication dates from 2020/01/01 onwards.
    Matching PubMed IDs are obtained with ``Entrez.esearch`` and every record
    is then downloaded individually with ``Entrez.efetch``.

    Args:
        query: PubMed search expression (the date filter is appended here).
        max_results: Maximum number of IDs requested from the search.
        retries: Number of times the whole search is attempted when a
            urllib HTTP/URL error occurs.

    Returns:
        A list of dicts with the keys 'Title', 'DOI', 'Study Design',
        'Sample Size', 'Results', 'Type of Exposures', 'Clinical Assessment',
        'Year of Publication' and 'Citations' (always None here; it is a
        placeholder). 'DOI' is "N/A" when the record has no ELocationID of
        type "doi". Records that raise KeyError/IndexError while being parsed
        are reported and skipped. If every attempt fails, the records gathered
        during the last attempt are returned.
    """
    # This cell does not import urllib itself, so import it here rather than
    # relying on an earlier cell having been run.
    import urllib.error

    # Add a date range to the query
    query_with_date = f"{query} AND (\"2020/01/01\"[Date - Publication] : \"3000\"[Date - Publication])"

    articles = []
    attempt = 0
    while attempt < retries:
        # Start each attempt from an empty list: a network error half-way
        # through the ID loop would otherwise make the retry append the
        # already-collected articles a second time.
        articles = []
        try:
            handle = Entrez.esearch(db="pubmed", term=query_with_date, retmax=max_results)
            try:
                record = Entrez.read(handle)
            finally:
                # Close the handle even when parsing fails.
                handle.close()
            id_list = record['IdList']

            for pub_id in id_list:
                fetch_handle = Entrez.efetch(db="pubmed", id=pub_id, retmode="xml")
                try:
                    fetch_record = Entrez.read(fetch_handle)
                finally:
                    fetch_handle.close()

                try:
                    article = fetch_record['PubmedArticle'][0]['MedlineCitation']['Article']
                    title = article['ArticleTitle']

                    # Join every AbstractText section instead of indexing [0]:
                    # structured abstracts have several sections, and an empty
                    # list must not discard the article with an IndexError.
                    abstract_parts = article.get('Abstract', {}).get('AbstractText', [])
                    abstract = " ".join(str(part) for part in abstract_parts) or "N/A"

                    # Extract DOI
                    doi = "N/A"
                    if 'ELocationID' in article:
                        for location in article['ELocationID']:
                            if location.attributes.get('EIdType') == 'doi':
                                doi = str(location)
                                break

                    # Placeholder for full-text access
                    full_text = "N/A"

                    study_design = extract_study_design(title, abstract, full_text)
                    sample_size = extract_sample_size(title, abstract, full_text)
                    results = abstract
                    exposures = extract_exposures(title, abstract, full_text)
                    clinical_assessment = extract_clinical_assessment(title, abstract, full_text)
                    year = article['Journal']['JournalIssue']['PubDate'].get('Year', 'N/A')

                    articles.append({
                        'Title': title,
                        'DOI': doi,
                        'Study Design': study_design,
                        'Sample Size': sample_size,
                        'Results': results,
                        'Type of Exposures': exposures,
                        'Clinical Assessment': clinical_assessment,
                        'Year of Publication': year,
                        'Citations': None  # Placeholder for citation count
                    })
                except (KeyError, IndexError) as e:
                    print(f"Error processing PubMed article ID {pub_id}: {e}")
            return articles
        except (urllib.error.HTTPError, urllib.error.URLError) as e:
            print(f"Error fetching PubMed data: {e}")
            attempt += 1
            if attempt < retries:
                time.sleep(5)  # wait before retrying (no point waiting after the last attempt)
    print("Failed to fetch PubMed data after several attempts.")
    return articles

# Updated helper functions to include full-text
def extract_study_design(title, abstract, full_text):
    """Label the study design from the first design phrase found in the text.

    Title, abstract and full text are concatenated and lower-cased, then the
    phrases below are tried in order; the label of the first phrase present is
    returned, or "N/A" when none of them occurs.
    """
    searchable = f"{title} {abstract} {full_text}".lower()

    # (phrase to look for, label to report) - order defines precedence
    design_markers = (
        ("randomized control", "Randomized Control Trial"),
        ("cohort", "Cohort Study"),
        ("case-control", "Case-Control Study"),
        ("cross-sectional", "Cross-Sectional Study"),
    )
    return next(
        (label for phrase, label in design_markers if phrase in searchable),
        "N/A",
    )

def extract_sample_size(title, abstract, full_text):
    """Return the first "<number> <unit>" pair that looks like a sample size.

    The combined text is split on whitespace and scanned for a number that is
    directly followed by one of the unit words below. Compared with a plain
    token comparison this also recognises:
      * unit words with trailing punctuation ("120 patients." / "patients),")
      * numbers with thousands separators ("1,234 participants")
      * numbers preceded by an opening bracket ("(120 patients")

    Args:
        title, abstract, full_text: Text fragments to search; each is
            converted with normal string formatting, so None is tolerated.

    Returns:
        The matched pair as "<number> <unit>" (surrounding punctuation
        removed), or "N/A" when nothing matches.
    """
    import re

    unit_words = {"participants", "individuals", "patients", "n=", "sample"}
    grouped_number = re.compile(r"\d{1,3}(?:,\d{3})+")

    tokens = f"{title} {abstract} {full_text}".split()
    for raw_number, raw_unit in zip(tokens, tokens[1:]):
        number = raw_number.lstrip("([{")
        if not (number.isdigit() or grouped_number.fullmatch(number)):
            continue
        # Keep the token untouched when it already matches (e.g. "n="),
        # otherwise drop sentence punctuation glued to the end of the word.
        if raw_unit.lower() in unit_words:
            unit = raw_unit
        else:
            unit = raw_unit.rstrip(".,;:!?)]}")
        if unit.lower() in unit_words:
            return f"{number} {unit}"
    return "N/A"

def extract_exposures(title, abstract, full_text):
    """Report the first environmental-exposure keyword present in the text.

    Each of the three inputs is lower-cased and the results are joined with
    single spaces. The keywords are checked in their listed order and the
    first one contained in the text is returned; "N/A" means none was found.
    """
    searchable = " ".join([title.lower(), abstract.lower(), full_text.lower()])
    found = [
        candidate
        for candidate in ('air pollution', 'noise', 'light at night', 'climate', 'circadian rhythm')
        if candidate in searchable
    ]
    if found:
        return found[0]
    return "N/A"

def extract_clinical_assessment(title, abstract, full_text):
    """Return the first clinical-assessment keyword found in the text.

    The combined text is lower-cased before matching. Ordinary keywords are
    matched as substrings (so "interviews" still matches "interview"). The
    acronym keywords ('IQ tests', 'IQ', 'EHR') are lower-cased for the
    comparison as well - they could never match lower-cased text in their
    upper-case spelling - and are matched as whole words only, so that words
    such as "unique" or "technique" do not count as "IQ".

    Args:
        title, abstract, full_text: Text fragments to search; each is
            converted with normal string formatting, so None is tolerated.

    Returns:
        The matching keyword in its listed spelling, or "N/A".
    """
    import re

    phrase_keywords = ['relapse', 'interview', 'clinical scale', 'assessment', 'intelligence tests',
                       'neurological test', 'biological test', 'observations']
    # 'IQ tests' is listed before 'IQ' so the more specific keyword can win.
    acronym_keywords = ['IQ tests', 'IQ', 'EHR']

    combined_text = f"{title} {abstract} {full_text}".lower()

    for keyword in phrase_keywords:
        if keyword in combined_text:
            return keyword
    for keyword in acronym_keywords:
        if re.search(rf"\b{re.escape(keyword.lower())}\b", combined_text):
            return keyword
    return "N/A"

# Function to fetch citation counts using CrossRef
def fetch_citations(doi):
    """Look up how often a DOI is cited according to the CrossRef REST API.

    Args:
        doi: DOI string. None, an empty/blank string and the marker "N/A"
            are treated as "no DOI" and are not sent to the API.

    Returns:
        The 'is-referenced-by-count' value from the CrossRef response, or 0
        when there is no DOI, the request does not return HTTP 200, or any
        error occurs (the error is printed, not raised).
    """
    from urllib.parse import quote

    doi = str(doi).strip() if doi else ""
    if not doi or doi == "N/A":
        return 0

    # DOIs may contain characters such as '<', '>', '#' or '?' that would
    # corrupt the URL if inserted verbatim; '/' is kept as the DOI separator.
    url = f"https://api.crossref.org/works/{quote(doi, safe='/')}"
    try:
        # Without a timeout a stalled connection would block the run forever.
        response = requests.get(url, timeout=30)
        if response.status_code == 200:
            data = response.json()
            return data['message'].get('is-referenced-by-count', 0)
        else:
            print(f"Failed to retrieve citation count for DOI {doi}: {response.status_code}")
            return 0
    except Exception as e:
        # Deliberately best-effort: one failing lookup must not abort the batch.
        print(f"Error fetching citation count for DOI {doi}: {e}")
        return 0

# Combine articles from different sources
def combine_articles(pubmed_articles, crossref_articles, europe_pmc_articles, arxiv_articles):
    """Concatenate the per-source article lists and add citation counts.

    Args:
        pubmed_articles, crossref_articles, europe_pmc_articles,
        arxiv_articles: Lists of article dicts; None is treated as an empty
            list.

    Returns:
        One list holding all records in the order PubMed, CrossRef,
        Europe PMC, arXiv. Records with a usable 'DOI' get 'Citations' set
        from fetch_citations(); records without a 'DOI' key (or with "N/A")
        keep their existing 'Citations' value, or get None if they had none.
        The article dicts are updated in place.
    """
    all_articles = (
        list(pubmed_articles or [])
        + list(crossref_articles or [])
        + list(europe_pmc_articles or [])
        + list(arxiv_articles or [])
    )

    # Fetch citations for each article
    for article in all_articles:
        # .get(): records from other sources may not carry a 'DOI' key at all.
        doi = article.get('DOI', "N/A")
        if doi and doi != "N/A":
            article['Citations'] = fetch_citations(doi)
        else:
            # Guarantee the column exists so later sorting cannot fail on it.
            article.setdefault('Citations', None)

    return all_articles

# Example usage with corrected query
query = "(bipolar disorder) AND (air pollution OR noise OR light at night OR climate OR circadian rhythm)"

# Number of most-cited articles kept in the export
TOP_N = 80

pubmed_articles = fetch_pubmed(query)
# Only PubMed is queried in this cell; the other sources are left empty.
crossref_articles = []
europe_pmc_articles = []
arxiv_articles = []

# Combine all articles
all_articles = combine_articles(pubmed_articles, crossref_articles, europe_pmc_articles, arxiv_articles)

# Convert to DataFrame for easier viewing and analysis
articles_df = pd.DataFrame(all_articles)

# Sort by citation count and keep the TOP_N most cited. When the search
# returned nothing the frame has no 'Citations' column, so skip the sort
# instead of failing with a KeyError.
if 'Citations' in articles_df.columns:
    top_cited_articles = articles_df.sort_values(by='Citations', ascending=False).head(TOP_N)
else:
    top_cited_articles = articles_df.head(TOP_N)

# Previous (misleading) variable name, kept so existing references still work
top_20_cited_articles = top_cited_articles

# Save the TOP_N most cited articles to an Excel file
output_file = f"top_{TOP_N}_cited_articles_bipolar_disorder_environmental_exposures.xlsx"
top_cited_articles.to_excel(output_file, index=False)

print(f"Top {TOP_N} cited articles saved to {output_file}")


Error processing PubMed article ID 33760504: list index out of range
Top 80 cited articles saved to top_80_cited_articles_bipolar_disorder_environmental_exposures.xlsx
