# PubMed Search and Store

## Notebook Description

This notebook, **PubMed Search and Store**, provides a streamlined method for searching PubMed and storing essential information from retrieved articles. The goal is to facilitate the extraction and organization of metadata from PubMed articles and make it available for further analysis and review.

## Objectives

1. **Search PubMed**: Retrieve articles from PubMed based on a specified search query.
2. **Store Essential Metadata**: Extract and save key fields for each article, including:
   - **Title**: Article title.
   - **Authors**: List of authors.
   - **PubMed ID**: Unique identifier for PubMed articles.
   - **DOI**: Digital Object Identifier, if available.
   - **Abstract**: Summary of the article’s content.
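3. **Retrieve PDF URL**: For each article, attempt to locate a URL to the full-text PDF and store it together with a lookup status (the reason is recorded when no URL is found). PDFs that are located are then downloaded into a folder per PICO question.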

## Implementation Details


In [None]:
from pymed import PubMed
import json
import pandas as pd
import uuid
# Load the PICO examples from pico_examples.json.
# JSON text is UTF-8, so the encoding is set explicitly instead of relying on
# the platform default. The `with` block closes the file on exit, so no
# separate close() call is needed.
with open('pico_examples.json', encoding='utf-8') as f:
    data = json.load(f)

pico_df = pd.DataFrame(data)

# Add a column holding a unique identifier for each PICO question so that
# search results can be traced back to the question that produced them.
pico_df['PICO_ID'] = [uuid.uuid4() for _ in range(len(pico_df))]

# PubMed client used for the searches in this notebook.
pm = PubMed(tool="VertexAIPICO", email="james@jameslabadorf.com")

def get_results(pico, max_results=1000):
    """Run a PubMed search and return the matching articles as a list.

    Args:
        pico: PubMed search string to submit.
        max_results: Upper bound on the number of articles requested
            (defaults to 1000, the previously hard-coded value).

    Returns:
        A list of the articles yielded by ``pm.query``.
    """
    # The query result is materialised into a list so its length can be reported.
    results = list(pm.query(pico, max_results=max_results))
    print("PubMed query for: ", pico)
    print("Number of results: ", len(results))
    return results

def create_flat_dicitonary(result):
    """Flatten one article dict into a single-level record.

    The ``authors`` field of the record is a short citation-style string:
    the first available last name followed by " et al." when there are more
    than 3 authors, otherwise all available last names joined by ", ".
    Authors whose last name is ``None`` are skipped.

    Args:
        result: Dict form of an article. ``pubmed_id`` and ``authors`` must
            be present. The other fields are optional and default to
            ``None`` (``keywords`` to an empty list) when missing.
            NOTE(review): some record types (e.g. book entries) may not
            carry ``journal``/``keywords`` — confirm against pymed.

    Returns:
        A dict with the keys ``pubmed_id``, ``authors``, ``title``,
        ``abstract``, ``keywords``, ``journal``, ``doi``,
        ``publication_date``, ``authors_raw`` and ``pubmed_id_raw``.
    """
    authors = result['authors'] or []
    # Keep only usable last names; a None last name would otherwise break
    # both the string concatenation and the join below.
    last_names = [
        name
        for name in (author.get('lastname') for author in authors)
        if name is not None
    ]

    if len(authors) > 3:
        short_authors = f"{last_names[0]} et al." if last_names else ""
    else:
        short_authors = ", ".join(last_names)

    result_dict = {}
    # The raw ID field can span several lines; only the first line is kept.
    result_dict["pubmed_id"] = result['pubmed_id'].split("\n")[0]
    result_dict['authors'] = short_authors
    result_dict['title'] = result.get('title')
    result_dict['abstract'] = result.get('abstract')
    result_dict['keywords'] = str(result.get('keywords', []))
    result_dict['journal'] = result.get('journal')
    result_dict['doi'] = result.get('doi')
    result_dict['publication_date'] = result.get('publication_date')
    # Unprocessed originals are kept alongside the cleaned-up values.
    result_dict['authors_raw'] = str(result['authors'])
    result_dict["pubmed_id_raw"] = result['pubmed_id']
    return result_dict

from tqdm.notebook import tqdm

# Run the PubMed search for every PICO question and gather all flattened
# articles in one list. The DataFrame is built once at the end: concatenating
# inside the loop copies the accumulated frame on every iteration and leaves
# row labels that restart at 0 for each question.
all_rows = []
for index, pico in tqdm(pico_df.iterrows(), total=len(pico_df)):
    results = get_results(pico['PubMedSearchStr'])
    flat_results = [create_flat_dicitonary(result.toDict()) for result in results]
    # Tag every article with the question that retrieved it.
    for result in flat_results:
        result['PICO_ID'] = pico['PICO_ID']
    all_rows.extend(flat_results)

# Building from the row list gives a unique 0..n-1 index.
results_df = pd.DataFrame(all_rows)

#print a summary of the results
print("Summary of results")
print("Number of total results: ", len(results_df))
# With no results at all the frame has no columns, so guard the lookup.
unique_pico_count = len(results_df['PICO_ID'].unique()) if 'PICO_ID' in results_df.columns else 0
print("Number of unique PICO questions: ", unique_pico_count)

results_df.to_csv("PubMed_PICO_Results.csv", index=False)


In [50]:
from metapub import FindIt
import requests
from time import sleep

# Distinct PubMed IDs across all results, in order of first appearance.
pubmed_ids = list(results_df['pubmed_id'].drop_duplicates())

def get_pdf_url(pubmed_id, delay=0.3334):
    """Look up a full-text PDF URL for one PubMed ID.

    Args:
        pubmed_id: PubMed ID passed to ``FindIt``.
        delay: Seconds to pause before the lookup. Defaults to 0.3334, the
            previously hard-coded value (presumably chosen to keep lookups
            to about three per second — confirm against the rate limits of
            the services FindIt calls).

    Returns:
        A ``(url, status)`` tuple: ``(url, "found")`` when FindIt reports a
        URL, ``("", reason)`` with FindIt's reason when it does not, and
        ``("", "error")`` when the lookup raised an exception.
    """
    try:
        sleep(delay)
        # One FindIt lookup supplies both the URL and the reason; repeating
        # it after the URL check only duplicates the request.
        article = FindIt(pubmed_id)
        if article.url:
            return article.url, "found"
        return "", article.reason
    except Exception as e:
        # Best effort: report the failure and let the caller move on to the
        # next ID instead of aborting the whole run.
        print(e)
        return "", "error"

# Look up every distinct PubMed ID once.
pdf_urls = []
pdf_status = []
for pubmed_id in tqdm(pubmed_ids):
    url, status = get_pdf_url(pubmed_id)
    pdf_urls.append(url)
    pdf_status.append(status)

#add the pdf urls and status to the results_df
# The lookups are keyed by PubMed ID and mapped back onto the rows. Assigning
# the lists directly would pair values with rows by position and fails with a
# length mismatch as soon as one article appears under several PICO questions.
url_by_id = dict(zip(pubmed_ids, pdf_urls))
status_by_id = dict(zip(pubmed_ids, pdf_status))
results_df['pdf_url'] = results_df['pubmed_id'].map(url_by_id)
results_df['pdf_status'] = results_df['pubmed_id'].map(status_by_id)
#how many pdfs were found?
print("Number of PDFs found: ", len(results_df[results_df['pdf_status'] == "found"]))

results_df.to_csv("PubMed_PICO_Results.csv", index=False)

  0%|          | 0/548 [00:00<?, ?it/s]

cannot access local variable 'pdfurl' where it is not associated with a value


[32m2024-10-29 13:21:13[0m [35mDESKTOP-NVR8E1L[0m [34mmetapub.DxDOI[6908][0m [1;30mINFO[0m URL is accessible: https://link.springer.com/article/10.1007/s10072-019-03736-3 (Status code: 200)
[32m2024-10-29 13:21:13[0m [35mDESKTOP-NVR8E1L[0m [34mmetapub.DxDOI[6908][0m [1;30mINFO[0m cached results for key 10.1007/s10072-019-03736-3 (10.1007/s10072-019-03736-3) 
[32m2024-10-29 13:21:37[0m [35mDESKTOP-NVR8E1L[0m [34mmetapub.DxDOI[6908][0m [1;30mINFO[0m URL is accessible: https://link.springer.com/article/10.1007/s13760-018-1034-5 (Status code: 200)
[32m2024-10-29 13:21:37[0m [35mDESKTOP-NVR8E1L[0m [34mmetapub.DxDOI[6908][0m [1;30mINFO[0m cached results for key 10.1007/s13760-018-1034-5 (10.1007/s13760-018-1034-5) 
[32m2024-10-29 13:22:15[0m [35mDESKTOP-NVR8E1L[0m [34mmetapub.DxDOI[6908][0m [1;30mINFO[0m URL is accessible: https://link.springer.com/article/10.1007/s00592-018-1132-0 (Status code: 200)
[32m2024-10-29 13:22:15[0m [35mDESKTOP-NVR8E1L[0

cannot access local variable 'pdfurl' where it is not associated with a value


[32m2024-10-29 13:23:54[0m [35mDESKTOP-NVR8E1L[0m [34mmetapub.DxDOI[6908][0m [1;30mINFO[0m URL is accessible: https://linkinghub.elsevier.com/retrieve/pii/S1279770723002014 (Status code: 200)
[32m2024-10-29 13:23:54[0m [35mDESKTOP-NVR8E1L[0m [34mmetapub.DxDOI[6908][0m [1;30mINFO[0m cached results for key 10.1007/s12603-015-0602-0 (10.1007/s12603-015-0602-0) 
[32m2024-10-29 13:23:57[0m [35mDESKTOP-NVR8E1L[0m [34mmetapub.DxDOI[6908][0m [1;30mINFO[0m URL is accessible: https://link.springer.com/article/10.1007/s12282-016-0687-2 (Status code: 200)
[32m2024-10-29 13:23:57[0m [35mDESKTOP-NVR8E1L[0m [34mmetapub.DxDOI[6908][0m [1;30mINFO[0m cached results for key 10.1007/s12282-016-0687-2 (10.1007/s12282-016-0687-2) 
[32m2024-10-29 13:24:20[0m [35mDESKTOP-NVR8E1L[0m [34mmetapub.DxDOI[6908][0m [1;30mINFO[0m URL is accessible: https://link.springer.com/article/10.1007/s00408-015-9751-5 (Status code: 200)
[32m2024-10-29 13:24:20[0m [35mDESKTOP-NVR8E1L[0

cannot access local variable 'pdfurl' where it is not associated with a value


[32m2024-10-29 13:28:46[0m [35mDESKTOP-NVR8E1L[0m [34mmetapub.DxDOI[6908][0m [1;30mINFO[0m URL is accessible: https://link.springer.com/article/10.1007/s00228-018-2562-x (Status code: 200)
[32m2024-10-29 13:28:46[0m [35mDESKTOP-NVR8E1L[0m [34mmetapub.DxDOI[6908][0m [1;30mINFO[0m cached results for key 10.1007/s00228-018-2562-x (10.1007/s00228-018-2562-x) 
[32m2024-10-29 13:28:46[0m [35mDESKTOP-NVR8E1L[0m [34mmetapub.DxDOI[6908][0m [1;30mINFO[0m cached results for key 10.1007/s00228-018-2562-x (10.1007/s00228-018-2562-x) 
[32m2024-10-29 13:28:58[0m [35mDESKTOP-NVR8E1L[0m [34mmetapub.DxDOI[6908][0m [1;30mINFO[0m URL is accessible: https://linkinghub.elsevier.com/retrieve/pii/S1341321X18301776 (Status code: 200)
[32m2024-10-29 13:28:58[0m [35mDESKTOP-NVR8E1L[0m [34mmetapub.DxDOI[6908][0m [1;30mINFO[0m cached results for key 10.1016/j.jiac.2018.06.006 (10.1016/j.jiac.2018.06.006) 
[32m2024-10-29 13:30:17[0m [35mDESKTOP-NVR8E1L[0m [34mmetapub.DxD

In [62]:
#we want to get the pdfs for the articles that were found. Save them to the folder pdfs and create new folder for each PICO question
import os
import urllib.request

# Make the output folder plus one sub-folder per PICO question.
# exist_ok=True makes re-running the cell safe without a separate existence check.
os.makedirs("pdfs", exist_ok=True)

pico_ids = list(results_df['PICO_ID'].unique())

for pico_id in pico_ids:
    os.makedirs(f"pdfs/{pico_id}", exist_ok=True)

for index, row in tqdm(results_df.iterrows(), total=len(results_df)):
    # Only rows with a located PDF URL are downloaded.
    if row['pdf_status'] != "found":
        continue
    pdf_url = row['pdf_url']
    pico_id = row['PICO_ID']
    # Files are named after the PubMed ID rather than the row label: row
    # labels are not guaranteed to be unique, so two articles could otherwise
    # overwrite each other, and the ID keeps each file traceable to its article.
    pdf_name = f"pdfs/{pico_id}/{row['pubmed_id']}.pdf"
    try:
        urllib.request.urlretrieve(pdf_url, pdf_name)
    except Exception as e:
        # Best effort: report the failed download and carry on with the next row.
        print(e)
        print("Error downloading pdf: ", pdf_url)

  0%|          | 0/548 [00:00<?, ?it/s]

HTTP Error 500: Internal Server Error
Error downloading pdf:  http://europepmc.org/backend/ptpmcrender.fcgi?accid=PMC3304299&blobtype=pdf
