 ## Publications webscraped under the affiliation of the Division of Biology, Chemistry, and Material Science

In [7]:
#Import libraries

import requests
from bs4 import BeautifulSoup
import pandas as pd

## Input keyword and URL 

In [9]:
# What keyword would I like to search? 
keyword = '(Division%20of%20Biology%2C%20Chemistry%20and%20Material%20Science%2C%20Office%20of%20Science%20and%20Engineering%20Laboratories%5Baffil%5D)'

# For the website I'm checking, what is the base URL for the page where results come up?
baseURL = 'https://pubmed.ncbi.nlm.nih.gov/?term='

# Build the URL
searchURL = baseURL + keyword
print(searchURL)

# Send an HTTP request (internet browser asks server for info needed to load the website), save the results in 'response'
response = requests.get(searchURL).content
#print(response)

# Interpret response.content with Beautifhttps://pubmed.ncbi.nlm.nih.gov/?term=(OfficeulSoup
content = BeautifulSoup(response, 'lxml') #lxml = library to parse HTML and other languages
print(content)

https://pubmed.ncbi.nlm.nih.gov/?term=(Division%20of%20Biology%2C%20Chemistry%20and%20Material%20Science%2C%20Office%20of%20Science%20and%20Engineering%20Laboratories%5Baffil%5D)
<!DOCTYPE html>
<html lang="en">
<head itemscope="" itemtype="http://schema.org/WebPage" prefix="og: http://ogp.me/ns#">
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<!-- Mobile properties -->
<meta content="True" name="HandheldFriendly"/>
<meta content="320" name="MobileOptimized"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<link href="https://cdn.ncbi.nlm.nih.gov" rel="preconnect"/>
<link href="https://www.ncbi.nlm.nih.gov" rel="preconnect"/>
<link href="https://www.google-analytics.com" rel="preconnect"/>
<link href="https://cdn.ncbi.nlm.nih.gov/pubmed/1e50868b-da01-4bc0-8fb1-7d35b0eecbe6/CACHE/css/output.5ecf62baa0fa.css" rel="stylesheet" type="text/css"/>
<link href="https://cdn.ncbi.nlm.nih.gov/pubmed/1e50868b-da01-4bc0-8fb1-7d35b0eecbe6/CAC

In [10]:
# Find ONE article
article_content = content.find('article', class_ = 'full-docsum')
#print(article_content)

#Extract title
title = article_content.find('a', class_ = 'docsum-title').text.strip()
print(title)

# Extract other citation info
authors = article_content.find('span', class_ = 'docsum-authors full-authors').text.strip()
print(authors)

journal = article_content.find('span', class_ = 'docsum-journal-citation full-journal-citation').text.strip()
print(journal)

pmid = article_content.find('span', class_ = 'citation-part').text.strip()
print(pmid)

Evaluation of optical detection platforms for multiplexed detection of proteins and the need for point-of-care biosensors for clinical use.
Spindel S, Sapsford KE.
Sensors (Basel). 2014 Nov 25;14(12):22313-41. doi: 10.3390/s141222313.
PMID: 25429414


## Get citation code

In [11]:
# Assemble full citation string
full_citation = authors + ' ' + title + ' ' + journal + ' ' + pmid
print(full_citation)

Spindel S, Sapsford KE. Evaluation of optical detection platforms for multiplexed detection of proteins and the need for point-of-care biosensors for clinical use. Sensors (Basel). 2014 Nov 25;14(12):22313-41. doi: 10.3390/s141222313. PMID: 25429414


In [12]:
### How many pages of results are there? ###
total_pages = content.find('label', class_ = 'of-total-pages').text.strip()
#print(total_pages)

# Extract the number of pages
p = total_pages.split(' ')
#print(p)

numPages = int(p[1])
#print(numPages)


### Loop through each page ###

# Base URL for multiple pages
pageURL = searchURL + '&page='
print(pageURL)

# Loop through each page and extract the information
citations = []
titles = []

count = 1
while count < (numPages + 1):
    # link to the page we're on
    page = pageURL + str(count)
    print(page)
    
    # Collect response from that page
    response = requests.get(page)

    # Interpret response.content with BeautifulSoup
    content = BeautifulSoup(response.content, 'lxml')
    
    # Find ALL articles on this page
    articles = content.find_all('article', class_ = 'full-docsum')
    
    # Iterate through the list of articles
    for article in articles:
        title = article.find('a', class_ = 'docsum-title').text.strip()
        citation = article.find('div', class_ = 'docsum-citation full-citation')
        authors = citation.find('span', class_ = 'docsum-authors full-authors').text
        journal = citation.find('span', class_ = 'docsum-journal-citation full-journal-citation').text
        pmid = citation.find('span', class_ = 'citation-part').text
        full_citation = authors + ' ' + title + ' ' + journal + ' ' + pmid
    
        citations.append(full_citation)
        titles.append(title)
    
    
    
    count += 1
    
print('Finished')
print(citations)
print(titles)

https://pubmed.ncbi.nlm.nih.gov/?term=(Division%20of%20Biology%2C%20Chemistry%20and%20Material%20Science%2C%20Office%20of%20Science%20and%20Engineering%20Laboratories%5Baffil%5D)&page=
https://pubmed.ncbi.nlm.nih.gov/?term=(Division%20of%20Biology%2C%20Chemistry%20and%20Material%20Science%2C%20Office%20of%20Science%20and%20Engineering%20Laboratories%5Baffil%5D)&page=1
Finished
['Spindel S, Sapsford KE. Evaluation of optical detection platforms for multiplexed detection of proteins and the need for point-of-care biosensors for clinical use. Sensors (Basel). 2014 Nov 25;14(12):22313-41. doi: 10.3390/s141222313. PMID: 25429414', 'Haddad MR, Choi EY, Zerfas PM, Yi L, Martinelli D, Sullivan P, Goldstein DS, Centeno JA, Brinster LR, Ralle M, Kaler SG. Cerebrospinal Fluid-Directed rAAV9-rsATP7A Plus Subcutaneous Copper Histidinate Advance Survival and Outcomes in a Menkes Disease Mouse Model. Mol Ther Methods Clin Dev. 2018 Jul 9;10:165-178. doi: 10.1016/j.omtm.2018.07.002. eCollection 2018 S

In [13]:
# Pair each title with its citation
results = list( zip(titles, citations) )

# Data frame
df = pd.DataFrame(results, columns = ['Titles', 'Citation'])

# Save file
df.to_csv('DBCMS_Affil_Search.csv')

## Code to Webscrape Year

In [15]:
# Read the CSV file
dbcms = pd.read_csv('DBCMS_Affil_Search.csv')

# Read the CSV file
dbcms_data = pd.read_csv('dbcms.csv')

In [16]:
# remove the period after each publication title
dbcms['Titles'] = dbcms['Titles'].str.rstrip('.')

In [17]:
merged_data = pd.merge(dbcms, dbcms_data[['Title', 'Publication Year']], left_on='Titles', right_on='Title', how='left')

# turn all nas into 0, so all values can be int
merged_data['Publication Year'] = merged_data['Publication Year'].fillna(0).astype(int)

# remove the 0 after the year
merged_data['Publication Year'] = merged_data['Publication Year'].astype(int).astype(str).str.rstrip('.0')

In [18]:
merged_data.tail()

Unnamed: 0.1,Unnamed: 0,Titles,Citation,Title,Publication Year
0,0,Evaluation of optical detection platforms for ...,"Spindel S, Sapsford KE. Evaluation of optical ...",Evaluation of optical detection platforms for ...,2014
1,1,Cerebrospinal Fluid-Directed rAAV9-rsATP7A Plu...,"Haddad MR, Choi EY, Zerfas PM, Yi L, Martinell...",Cerebrospinal Fluid-Directed rAAV9-rsATP7A Plu...,2018


In [26]:
dataset1 = merged_data.to_excel('DBCMS_Affil_Search.xlsx', index=False)

## Combine the webscraped data for all four divisions into one file and check for duplicates

In [30]:
# List of file names
file_names = ['DAM_Affil_Search.xlsx', 'DIDSR_Affil_Search.xlsx', 'DBMP_Affil_Search.xlsx', 'DBCMS_Affil_Search.xlsx']

# Empty list to store dataframes
dfs = []

# Loop through each file and import as dataframe
for file_name in file_names:
    df = pd.read_excel(file_name)
    dfs.append(df)

# Concatenate all dataframes into one
combined_df = pd.concat(dfs, ignore_index=True)

# Now you can use the combined dataframe for further analysis or processing

In [33]:
# Check the number of values
num_values = combined_df.shape[0]

# Print the number of values
print("The dataframe has", num_values, "values.")

The dataframe has 435 values.


In [37]:
# Count the number of duplicates
num_duplicates = combined_df['Titles'].duplicated().sum()

# Print the number of duplicates
print("The dataframe has", num_duplicates, "duplicate publications.")

The dataframe has 9 duplicate publications.


In [38]:
# Check for duplicate publications
duplicate_publications = combined_df.duplicated(subset=['Titles'])

# Print the duplicate publications
print(combined_df[duplicate_publications]['Titles'])

198    Elastography Assessment of Liver Fibrosis: Soc...
255    Elastography Assessment of Liver Fibrosis: Soc...
256    Elastography Assessment of Liver Fibrosis: Soc...
299    Label-free X-ray estimation of brain amyloid b...
309    Survey of Acoustic Output in Neonatal Brain Pr...
321    An ex vivo model of medical device-mediated ba...
326    Colorimetrical uncertainty estimation for the ...
353    Pulsed laser damage of gold nanorods in turbid...
367    Longitudinal Functional Assessment of Brain In...
Name: Titles, dtype: object


In [40]:
# Remove duplicates
combined_df.drop_duplicates(subset=['Titles'], inplace=True)

# Confirm removal
print("Duplicates have been removed from the dataframe.")

Duplicates have been removed from the dataframe.


In [42]:
# Save dataframe as an Excel file
combined_df.to_excel('Affil_divisions_scraped.xlsx', index=False)

The dataframe has been saved as an Excel file.


## Merge this data with the osel webscraped data and check for duplicates withen them

In [44]:
inputs = pd.read_excel('inputs_re-refined101.xlsx')

In [45]:
# Check for duplicates between the two files
duplicates = inputs[inputs.duplicated(keep=False)]

# Print the duplicates
print("Duplicates between the two files:")
print(duplicates)

Duplicates between the two files:
Empty DataFrame
Columns: [Publication Titles, references, Link, Affiliation of OSEL, Number of Unique Submissions, Unique Procodes, Program, Division , Source of Publication Title (Name of Scientist), Search Phrase/Keyword, PublishDate, Publication Year]
Index: []


In [47]:
# Count the number of duplicates between the two files
num_duplicates = duplicates.duplicated().sum()

# Print the number of duplicates
print("Number of duplicates between the two files:", num_duplicates)

Number of duplicates between the two files: 0


In [49]:
# inputs - dataset 1
# import Affil_divisions_scraped.xlsx
# AI - to merge the rwo dataset and called it inputs refined 003 and create a new excel our of the data
div = pd.read_excel('Affil_divisions_scraped.xlsx')


In [54]:
div.head()

Unnamed: 0.1,Unnamed: 0,Titles,Citation,Publication Year
0,0,Role of Nanoparticles in Nanofluid Droplet Imp...,"Aksoy YT, Liu L, Abboud M, Vetrano MR, Koos E....",2023.0
1,1,Decellularized tissue-engineered heart valves ...,"Badria AF, Koutsoukos PG, Mavrilas D. Decellul...",202.0
2,2,"Deciphering the ""Art"" in Modeling and Simulati...","Erdemir A, Besier TF, Halloran JP, Imhauser CW...",2019.0
3,3,3rd International workshop on spinal loading a...,"Shirazi-Adl A, Schmidt H, Kingma I. 3rd Intern...",202.0
4,4,Spine loading and deformation - From loading t...,"Shirazi-Adl A, Schmidt H, Kingma I. Spine load...",2016.0


In [55]:
# Rename the column in the div dataset
div = div.rename(columns={'Titles': 'Publication Titles'})
div = div.rename(columns={'Citation': 'references'})

# Add the div data to the inputs dataset
merged_data = pd.concat([inputs, div], axis=0)

# Save the merged dataset as a new Excel file
merged_data.to_excel('inputs_003.xlsx', index=False)