## Publications webscraped under the affiliation of the Division of Imaging, Diagnostics, and Software Reliability

In [1]:
#Import libraries

import requests
from bs4 import BeautifulSoup
import pandas as pd

## Input keyword and URL 

In [2]:
# What keyword would I like to search? (Division of Imaging Diagnostics and Software Reliability, Office of Science and Engineering Laboratories[affil]) - 76
keyword = '(Division%20of%20Imaging%20Diagnostics%20and%20Software%20Reliability%2C%20Office%20of%20Science%20and%20Engineering%20Laboratories%5Baffil%5D)'

# For the website I'm checking, what is the base URL for the page where results come up?
baseURL = 'https://pubmed.ncbi.nlm.nih.gov/?term='

# Build the URL
searchURL = baseURL + keyword
print(searchURL)

# Send an HTTP request (internet browser asks server for info needed to load the website), save the results in 'response'
response = requests.get(searchURL).content
#print(response)

# Interpret response.content with Beautifhttps://pubmed.ncbi.nlm.nih.gov/?term=(OfficeulSoup
content = BeautifulSoup(response, 'lxml') #lxml = library to parse HTML and other languages
print(content)

https://pubmed.ncbi.nlm.nih.gov/?term=(Division%20of%20Imaging%20Diagnostics%20and%20Software%20Reliability%2C%20Office%20of%20Science%20and%20Engineering%20Laboratories%5Baffil%5D)
<!DOCTYPE html>
<html lang="en">
<head itemscope="" itemtype="http://schema.org/WebPage" prefix="og: http://ogp.me/ns#">
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<!-- Mobile properties -->
<meta content="True" name="HandheldFriendly"/>
<meta content="320" name="MobileOptimized"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<link href="https://cdn.ncbi.nlm.nih.gov" rel="preconnect"/>
<link href="https://www.ncbi.nlm.nih.gov" rel="preconnect"/>
<link href="https://www.google-analytics.com" rel="preconnect"/>
<link href="https://cdn.ncbi.nlm.nih.gov/pubmed/1e50868b-da01-4bc0-8fb1-7d35b0eecbe6/CACHE/css/output.5ecf62baa0fa.css" rel="stylesheet" type="text/css"/>
<link href="https://cdn.ncbi.nlm.nih.gov/pubmed/1e50868b-da01-4bc0-8fb1-7d35b0eecbe6/

In [3]:
# Find ONE article
article_content = content.find('article', class_ = 'full-docsum')
#print(article_content)

#Extract title
title = article_content.find('a', class_ = 'docsum-title').text.strip()
print(title)

# Extract other citation info
authors = article_content.find('span', class_ = 'docsum-authors full-authors').text.strip()
print(authors)

journal = article_content.find('span', class_ = 'docsum-journal-citation full-journal-citation').text.strip()
print(journal)

pmid = article_content.find('span', class_ = 'citation-part').text.strip()
print(pmid)

Elastography Assessment of Liver Fibrosis: Society of Radiologists in Ultrasound Consensus Conference Statement.
Barr RG, Ferraioli G, Palmeri ML, Goodman ZD, Garcia-Tsao G, Rubin J, Garra B, Myers RP, Wilson SR, Rubens D, Levine D.
Radiology. 2015 Sep;276(3):845-61. doi: 10.1148/radiol.2015150619. Epub 2015 Jun 16.
PMID: 26079489


## Get citation code

In [4]:
# Assemble full citation string
full_citation = authors + ' ' + title + ' ' + journal + ' ' + pmid
print(full_citation)

Barr RG, Ferraioli G, Palmeri ML, Goodman ZD, Garcia-Tsao G, Rubin J, Garra B, Myers RP, Wilson SR, Rubens D, Levine D. Elastography Assessment of Liver Fibrosis: Society of Radiologists in Ultrasound Consensus Conference Statement. Radiology. 2015 Sep;276(3):845-61. doi: 10.1148/radiol.2015150619. Epub 2015 Jun 16. PMID: 26079489


In [5]:
### How many pages of results are there? ###
total_pages = content.find('label', class_ = 'of-total-pages').text.strip()
#print(total_pages)

# Extract the number of pages
p = total_pages.split(' ')
#print(p)

numPages = int(p[1])
#print(numPages)


### Loop through each page ###

# Base URL for multiple pages
pageURL = searchURL + '&page='
print(pageURL)

# Loop through each page and extract the information
citations = []
titles = []

count = 1
while count < (numPages + 1):
    # link to the page we're on
    page = pageURL + str(count)
    print(page)
    
    # Collect response from that page
    response = requests.get(page)

    # Interpret response.content with BeautifulSoup
    content = BeautifulSoup(response.content, 'lxml')
    
    # Find ALL articles on this page
    articles = content.find_all('article', class_ = 'full-docsum')
    
    # Iterate through the list of articles
    for article in articles:
        title = article.find('a', class_ = 'docsum-title').text.strip()
        citation = article.find('div', class_ = 'docsum-citation full-citation')
        authors = citation.find('span', class_ = 'docsum-authors full-authors').text
        journal = citation.find('span', class_ = 'docsum-journal-citation full-journal-citation').text
        pmid = citation.find('span', class_ = 'citation-part').text
        full_citation = authors + ' ' + title + ' ' + journal + ' ' + pmid
    
        citations.append(full_citation)
        titles.append(title)
    
    
    
    count += 1
    
print('Finished')
print(citations)
print(titles)

https://pubmed.ncbi.nlm.nih.gov/?term=(Division%20of%20Imaging%20Diagnostics%20and%20Software%20Reliability%2C%20Office%20of%20Science%20and%20Engineering%20Laboratories%5Baffil%5D)&page=
https://pubmed.ncbi.nlm.nih.gov/?term=(Division%20of%20Imaging%20Diagnostics%20and%20Software%20Reliability%2C%20Office%20of%20Science%20and%20Engineering%20Laboratories%5Baffil%5D)&page=1
https://pubmed.ncbi.nlm.nih.gov/?term=(Division%20of%20Imaging%20Diagnostics%20and%20Software%20Reliability%2C%20Office%20of%20Science%20and%20Engineering%20Laboratories%5Baffil%5D)&page=2
https://pubmed.ncbi.nlm.nih.gov/?term=(Division%20of%20Imaging%20Diagnostics%20and%20Software%20Reliability%2C%20Office%20of%20Science%20and%20Engineering%20Laboratories%5Baffil%5D)&page=3
https://pubmed.ncbi.nlm.nih.gov/?term=(Division%20of%20Imaging%20Diagnostics%20and%20Software%20Reliability%2C%20Office%20of%20Science%20and%20Engineering%20Laboratories%5Baffil%5D)&page=4
https://pubmed.ncbi.nlm.nih.gov/?term=(Division%20of%20I

In [9]:
# Pair each title with its citation
results = list( zip(titles, citations) )

# Data frame
df = pd.DataFrame(results, columns = ['Titles', 'Citation'])

# Save file
df.to_csv('DIDSR_Affil_Search.csv')

## Code to Webscrape Year

In [13]:
# Read the CSV file
didsr = pd.read_csv('DIDSR_Affil_Search.csv')

# Read the CSV file
didsr_data = pd.read_csv('csv-DivisionofDIDSR.csv')

In [14]:
# remove the period after each publication title
didsr['Titles'] = didsr['Titles'].str.rstrip('.')

In [15]:
merged_data = pd.merge(didsr, didsr_data[['Title', 'Publication Year']], left_on='Titles', right_on='Title', how='left')

# turn all nas into 0, so all values can be int
merged_data['Publication Year'] = merged_data['Publication Year'].fillna(0).astype(int)

# remove the 0 after the year
merged_data['Publication Year'] = merged_data['Publication Year'].astype(int).astype(str).str.rstrip('.0')

In [16]:
merged_data.tail()

Unnamed: 0.1,Unnamed: 0,Titles,Citation,Title,Publication Year
73,71,Evaluating the sensitivity of the optimization...,"Zeng R, Park S, Bakic P, Myers KJ. Evaluating ...",Evaluating the sensitivity of the optimization...,2015
74,72,Changes in Circulating Tumor DNA Reflect Clini...,"Vega DM, Nishimura KK, Zariffa N, Thompson JC,...",Changes in Circulating Tumor DNA Reflect Clini...,2022
75,73,Beam orientation optimization for coherent X-r...,"Breedlove S, Badano A. Beam orientation optimi...",Beam orientation optimization for coherent X-r...,2021
76,74,Evaluating whole slide imaging: A working grou...,"Treanor D, Gallas BD, Gavrielides MA, Hewitt S...",Evaluating whole slide imaging: A working grou...,2015
77,75,Pioneers in Medical Imaging: Honoring the Memo...,"Myers KJ, Chen W. Pioneers in Medical Imaging:...",Pioneers in Medical Imaging: Honoring the Memo...,2014


In [23]:
dataset3 = merged_data.to_excel('DIDSR_Affil_Search.xlsx')