# NIH SRA Scraper


### Import packages:

In [None]:
import os                        # Set working directory
import pandas as pd              # Managing dataframes
from bs4 import BeautifulSoup    # HTML parsing
from datetime import date        # Extracting date for output version control

from selenium import webdriver                           # Automating web browser interaction
from selenium.webdriver.common.keys import Keys          # For send_keys() command
from selenium.webdriver.common.by import By              # For find_element() command
from selenium.webdriver.chrome.options import Options    # Option to make webdriver not visible


### Load datasheet:

In [11]:
# Set working directory to the folder that contains the datasheet.
# NOTE: a raw string literal cannot end with a backslash -- r'C:\Users\' is a
# SyntaxError -- so leave off the trailing separator when filling in the path.
os.chdir(r'C:\Users')

# Enter name of datasheet (Excel file) and the name of the sheet that lists
# the samples; the scraper expects a column called 'Sample' holding the SRA IDs
sra_samples = pd.read_excel('.xlsx', sheet_name = '')

# View datasheet
sra_samples.head()


Unnamed: 0,Sample,Human alphaherpesvirus 1
0,SRR15964039,19994
1,SRR15964015,12627
2,SRR15963969,9072
3,SRR15963991,8628
4,SRR15964037,7720


### Open webdriver:
*Includes an optional headless mode so that the browser window is not visible while scraping.*

In [17]:
# Launch a Chrome browser session controlled by Selenium; the resulting
# `driver` object is what the web scraper cell uses to load and click pages
driver = webdriver.Chrome()


In [12]:
# OPTIONAL - use this cell when the browser window should stay hidden
# while the scraper runs

# Command-line switches handed to Chrome, applied in this order:
#   --headless     run Chrome in headless mode
#   --disable-gpu  disable GPU acceleration
headless_flags = ("--headless", "--disable-gpu")

# Collect the switches in a Chrome options object
chrome_options = Options()
for flag in headless_flags:
    chrome_options.add_argument(flag)

# Start the Chrome driver with those options applied
driver = webdriver.Chrome(options=chrome_options)


### Web Scraper:
The webdriver opens the NIH SRA search page and searches for each SRA sample. After navigating to the sample's BioSample page, the data from its attributes table is added to a dataframe.

In [19]:
# Scrape NCBI for every sample ID in the 'Sample' column of `sra_samples`.
# For each ID the driver searches the SRA page, follows the SAMN# link on the
# result page, and records the <th> texts (as column names) and <td> texts
# (as values) found on the page that opens.

# One single-row dataframe per sample; they are combined once after the loop,
# which avoids re-copying a growing dataframe on every iteration
scraped_rows = []

try:
    for index, row in sra_samples.iterrows():

        sample_id = str(row['Sample'])

        driver.get('https://www.ncbi.nlm.nih.gov/sra/')

        # Enter sample ID into search bar
        sample_search = driver.find_element(By.XPATH, '/html/body/div[1]/div[1]/form/div[1]/div[2]/div[2]/div/div/div/div/input')
        sample_search.send_keys(sample_id)

        # Click search button
        search_button = driver.find_element(By.XPATH, '/html/body/div[1]/div[1]/form/div[1]/div[2]/div[2]/div/div/button')
        search_button.click()

        # Click SAMN#
        samn_link = driver.find_element(By.XPATH, '/html/body/div[1]/div[1]/form/div[1]/div[4]/div/div[5]/div/div[1]/div[3]/span/div/a[1]')
        samn_link.click()

        # Get HTML code
        html = driver.page_source

        # Create BeautifulSoup object
        soup = BeautifulSoup(html, 'html.parser')

        # Extract column names (<th>); the key column is named 'Sample' so it
        # always matches the merge key used below, whatever the column order
        # of the datasheet is
        column_names = ['Sample'] + [th.text for th in soup.find_all('th')]

        # Extract data values (<td>); pd.DataFrame raises ValueError if the
        # page has a different number of <th> and <td> elements
        dat = [sample_id] + [td.text.strip() for td in soup.find_all('td')]

        # Create dataframe row (kept under its own name so the loop variable
        # `row` is not overwritten)
        scraped_rows.append(pd.DataFrame([dat], columns = column_names))

finally:
    # Close webdriver even when a lookup fails part-way through, so that no
    # Chrome process (invisible in headless mode) is left running
    driver.quit()

# Combine the per-sample rows; samples with different attributes get the union
# of all columns, with missing values left empty
if scraped_rows:
    df = pd.concat(scraped_rows, axis=0, ignore_index=True)
else:
    # Nothing was scraped: keep the merge key so the merge below still works
    df = pd.DataFrame(columns = ['Sample'])

# Merge NIH data with original data
output = sra_samples.merge(df, how = 'left', on = 'Sample')

# View data
output.head()


Unnamed: 0,Sample,Human alphaherpesvirus 1,source name,tissue,disease,sorted compartment
0,SRR15964039,19994,Tumor assocaited CD4+ CD25+ Regulatory T cell ...,Skin,Melanoma,treg
1,SRR15964015,12627,CD45- CD90- CD44- Tumor cells sorted by flow c...,Skin,Melanoma,tumor
2,SRR15963969,9072,Tumor infiltrating viable cells sorted by flow...,Skin,Melanoma,live
3,SRR15963991,8628,CD45- CD90- CD44- Tumor cells sorted by flow c...,Skin,Melanoma,tumor
4,SRR15964037,7720,Tumor assocaited HLA DR+ CD3-CD1920-CD56- myel...,Skin,Melanoma,myeloid


### Export data

In [21]:
# Save the merged table as an Excel workbook; today's date is embedded in the
# file name for output version control, and the dataframe index is left out
export_name = f'PRJNA764510_KRAKEN_JR_{date.today()}.xlsx'
output.to_excel(export_name, index = False)
