## Publications webscraped under the affiliation of the Division of Applied Mechanics

In [1]:
#Import libraries

import requests
from bs4 import BeautifulSoup
import pandas as pd

## Input keyword and URL 1987 - 2023

In [2]:
# What keyword would I like to search? (Division of Applied Mechanics[affil]) - 197 results
keyword = '(Division%20of%20Applied%20Mechanics%5Baffil%5D)'

# For the website I'm checking, what is the base URL for the page where results come up?
baseURL = 'https://pubmed.ncbi.nlm.nih.gov/?term='

# Build the URL
searchURL = baseURL + keyword
print(searchURL)

# Send an HTTP request (internet browser asks server for info needed to load the website), save the results in 'response'
response = requests.get(searchURL).content
#print(response)

# Interpret response.content with Beautifhttps://pubmed.ncbi.nlm.nih.gov/?term=(OfficeulSoup
content = BeautifulSoup(response, 'lxml') #lxml = library to parse HTML and other languages
print(content)

https://pubmed.ncbi.nlm.nih.gov/?term=(Division%20of%20Applied%20Mechanics%5Baffil%5D)
<!DOCTYPE html>
<html lang="en">
<head itemscope="" itemtype="http://schema.org/WebPage" prefix="og: http://ogp.me/ns#">
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<!-- Mobile properties -->
<meta content="True" name="HandheldFriendly"/>
<meta content="320" name="MobileOptimized"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<link href="https://cdn.ncbi.nlm.nih.gov" rel="preconnect"/>
<link href="https://www.ncbi.nlm.nih.gov" rel="preconnect"/>
<link href="https://www.google-analytics.com" rel="preconnect"/>
<link href="https://cdn.ncbi.nlm.nih.gov/pubmed/1e50868b-da01-4bc0-8fb1-7d35b0eecbe6/CACHE/css/output.5ecf62baa0fa.css" rel="stylesheet" type="text/css"/>
<link href="https://cdn.ncbi.nlm.nih.gov/pubmed/1e50868b-da01-4bc0-8fb1-7d35b0eecbe6/CACHE/css/output.452c70ce66f7.css" rel="stylesheet" type="text/css"/>
<link href="https://cdn.n

In [3]:
# Find ONE article
article_content = content.find('article', class_ = 'full-docsum')
#print(article_content)

#Extract title
title = article_content.find('a', class_ = 'docsum-title').text.strip()
print(title)

# Extract other citation info
authors = article_content.find('span', class_ = 'docsum-authors full-authors').text.strip()
print(authors)

journal = article_content.find('span', class_ = 'docsum-journal-citation full-journal-citation').text.strip()
print(journal)

pmid = article_content.find('span', class_ = 'citation-part').text.strip()
print(pmid)

Role of Nanoparticles in Nanofluid Droplet Impact on Solid Surfaces.
Aksoy YT, Liu L, Abboud M, Vetrano MR, Koos E.
Langmuir. 2023 Jan 10;39(1):12-19. doi: 10.1021/acs.langmuir.2c02578. Epub 2022 Dec 22.
PMID: 36548220


In [4]:
# Assemble full citation string
full_citation = authors + ' ' + title + ' ' + journal + ' ' + pmid
print(full_citation)

Aksoy YT, Liu L, Abboud M, Vetrano MR, Koos E. Role of Nanoparticles in Nanofluid Droplet Impact on Solid Surfaces. Langmuir. 2023 Jan 10;39(1):12-19. doi: 10.1021/acs.langmuir.2c02578. Epub 2022 Dec 22. PMID: 36548220


In [5]:
### How many pages of results are there? ###
total_pages = content.find('label', class_ = 'of-total-pages').text.strip()
#print(total_pages)

# Extract the number of pages
p = total_pages.split(' ')
#print(p)

numPages = int(p[1])
#print(numPages)


### Loop through each page ###

# Base URL for multiple pages
pageURL = searchURL + '&page='
print(pageURL)

# Loop through each page and extract the information
citations = []
titles = []

count = 1
while count < (numPages + 1):
    # link to the page we're on
    page = pageURL + str(count)
    print(page)
    
    # Collect response from that page
    response = requests.get(page)

    # Interpret response.content with BeautifulSoup
    content = BeautifulSoup(response.content, 'lxml')
    
    # Find ALL articles on this page
    articles = content.find_all('article', class_ = 'full-docsum')
    
    # Iterate through the list of articles
    for article in articles:
        title = article.find('a', class_ = 'docsum-title').text.strip()
        citation = article.find('div', class_ = 'docsum-citation full-citation')
        authors = citation.find('span', class_ = 'docsum-authors full-authors').text
        journal = citation.find('span', class_ = 'docsum-journal-citation full-journal-citation').text
        pmid = citation.find('span', class_ = 'citation-part').text
        full_citation = authors + ' ' + title + ' ' + journal + ' ' + pmid
    
        citations.append(full_citation)
        titles.append(title)
    
    
    
    count += 1
    
print('Finished')
print(citations)
print(titles)

https://pubmed.ncbi.nlm.nih.gov/?term=(Division%20of%20Applied%20Mechanics%5Baffil%5D)&page=
https://pubmed.ncbi.nlm.nih.gov/?term=(Division%20of%20Applied%20Mechanics%5Baffil%5D)&page=1
https://pubmed.ncbi.nlm.nih.gov/?term=(Division%20of%20Applied%20Mechanics%5Baffil%5D)&page=2
https://pubmed.ncbi.nlm.nih.gov/?term=(Division%20of%20Applied%20Mechanics%5Baffil%5D)&page=3
https://pubmed.ncbi.nlm.nih.gov/?term=(Division%20of%20Applied%20Mechanics%5Baffil%5D)&page=4
https://pubmed.ncbi.nlm.nih.gov/?term=(Division%20of%20Applied%20Mechanics%5Baffil%5D)&page=5
https://pubmed.ncbi.nlm.nih.gov/?term=(Division%20of%20Applied%20Mechanics%5Baffil%5D)&page=6
https://pubmed.ncbi.nlm.nih.gov/?term=(Division%20of%20Applied%20Mechanics%5Baffil%5D)&page=7
https://pubmed.ncbi.nlm.nih.gov/?term=(Division%20of%20Applied%20Mechanics%5Baffil%5D)&page=8
https://pubmed.ncbi.nlm.nih.gov/?term=(Division%20of%20Applied%20Mechanics%5Baffil%5D)&page=9
https://pubmed.ncbi.nlm.nih.gov/?term=(Division%20of%20Applie

In [6]:
# Pair each title with its citation
results = list( zip(titles, citations) )

# Data frame
df = pd.DataFrame(results, columns = ['Titles', 'Citation'])

# Save file
df.to_csv('DAM_Affil_Search.csv')

## Code to Webscrape Year

In [7]:
# Read the CSV file
dam = pd.read_csv('DAM_Affil_Search.csv')

# Read the CSV file
dam_data = pd.read_csv('csv-DivisionofDAM.csv')

In [8]:
# remove the period after each publication title
dam['Titles'] = dam['Titles'].str.rstrip('.')

In [9]:
merged_data = pd.merge(dam, dam_data[['Title', 'Publication Year']], left_on='Titles', right_on='Title', how='left')

# turn all nas into 0, so all values can be int
merged_data['Publication Year'] = merged_data['Publication Year'].fillna(0).astype(int)

# remove the 0 after the year
merged_data['Publication Year'] = merged_data['Publication Year'].astype(int).astype(str).str.rstrip('.0')

In [10]:
merged_data.tail()

Unnamed: 0.1,Unnamed: 0,Titles,Citation,Title,Publication Year
192,192,Electrokinetic model of cochlear hair cell mot...,"Jen DH, Steele CR. Electrokinetic model of coc...",Electrokinetic model of cochlear hair cell mot...,1987
193,193,Casimir problem of spherical dielectrics: nume...,"Brevik I, Aarseth JB, Høye JS. Casimir problem...",Casimir problem of spherical dielectrics: nume...,2002
194,194,Using a Mock Circulatory Loop as a Regulatory ...,"D'Souza GA, Rinaldi J, Meki M, Crusan ARM, Ric...",Using a Mock Circulatory Loop as a Regulatory ...,2023
195,195,Cure Kinetics and Inverse Analysis of Epoxy-Am...,"Abali BE, Zecchini M, Daissè G, Czabany I, Gin...",Cure Kinetics and Inverse Analysis of Epoxy-Am...,2021
196,196,Orthotropic piezoelectric properties of the co...,"Tolomeo JA, Steele CR. Orthotropic piezoelectr...",Orthotropic piezoelectric properties of the co...,1995


In [16]:
dataset2 = merged_data.to_excel('DAM_Affil_Search.xlsx')