In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys # give us access to things like the Enter key
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.select import Select
import time
import pandas as pd

## Web Scraping Clinical Trials Website using Selenium

In [2]:
PATH = "/Desktop/DAC Projects/ProjectFlux/chromedriver"
driver = webdriver.Chrome(PATH)
driver.get("https://clinicaltrials.gov/ct2/about-site/crawling")

data = {}

# tracking information table
last_update_posted_dates = []

# descriptive information table
brief_summaries = []
detailed_descriptions = []
conditions = []
interventions = []

# recruitment information table
eligibility_criterias = []
genders = []
ages = []

try:
    for i in range(0,2867): # 0 is the first link
        
        # to get all the links in the first page (eg. NCT00000001 to NCT00002000 is the 0th index of i)
        dataset = driver.find_elements(By.CSS_SELECTOR,"a[href *= '/ct2/crawl/']") # *= is finding all elements that has the CSS code
        
        # click on the link to go to the second page
        dataset[i].click()
        
        # IN THE SECOND PAGE CURRENTLY
        for j in range(0,2500):
            
            # to get all the links in the second page (eg. NCT00000102 is the 0th index of j)
            elements = WebDriverWait(driver,3).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR,"a[href *= '/ct2/show/']"))
            )
            
            # click on the links to go to the third page
            elements[j].click()
            
            # IN THE THIRD PAGE CURRENTLY
            
            # click on the tabular view to go to the fourth page
            tabular_view = driver.find_elements(By.XPATH, '//*[@id="tabular"]')
            tabular_view[0].click()
            
            # IN THE FOURTH PAGE CURRENTLY
        
        
            ## TRACKING INFORMATION TABLE
            # scraping the last updated date
            
            #checking if header contains "Last Update Posted Date"
            last_update_header = WebDriverWait(driver,3).until(
                EC.visibility_of_element_located((By.XPATH, ".//th[contains(text(), 'Last Update Posted Date')]"))
            )
        
            last_update_posted_date = last_update_header.find_element(By.XPATH, "./following-sibling::td") # getting td element within the header
            last_update_posted_dates.append(last_update_posted_date.text) # adding the text to the list

            ## DESCRIPTIVE INFORMATION TABLE

            # scraping the brief summary
            brief_summary_text = WebDriverWait(driver,3).until(
                EC.visibility_of_element_located((By.XPATH, ".//th[contains(text(), 'Brief Summary')]"))
            )
        
            brief_summary = brief_summary_text.find_element(By.XPATH, "./following-sibling::td") #getting td element within the header
            brief_summaries.append(brief_summary.text) #adding the text to the list
            
            # scraping the detailed description
            detailed_description_text = WebDriverWait(driver,3).until(
                EC.visibility_of_element_located((By.XPATH, ".//th[contains(text(), 'Detailed Description')]"))
            )
        
            detailed_description = detailed_description_text.find_element(By.XPATH, "./following-sibling::td") #getting td element within the header
            detailed_descriptions.append(detailed_description.text) #adding the text to the list

            # scraping the condition
            condition_text = WebDriverWait(driver,3).until(
                EC.visibility_of_element_located((By.XPATH, ".//th[contains(text(), 'Condition')]"))
            )
        
            condition = condition_text.find_element(By.XPATH, "./following-sibling::td") #getting td element within the header
            conditions.append(condition.text) #adding the text to the list

            # scraping the intervention
            intervention_text = WebDriverWait(driver,3).until(
                EC.visibility_of_element_located((By.XPATH, ".//th[contains(text(), 'Intervention')]"))
            )
        
            intervention = intervention_text.find_element(By.XPATH, "./following-sibling::td") #getting td element within the header
            interventions.append(intervention.text) #adding the text to the list

            ## RECRUITMENT INFORMATION TABLE
            
            # scraping the eligibility criteria
            eligibility_criteria_text = WebDriverWait(driver,3).until(
                EC.visibility_of_element_located((By.XPATH, ".//th[contains(text(), 'Intervention')]"))
            )
        
            eligibility_criteria = eligibility_criteria_text.find_element(By.XPATH, "./following-sibling::td") #getting td element within the header
            eligibility_criterias.append(eligibility_criteria.text) #adding the text to the list

            # scraping the gender
            gender_text = WebDriverWait(driver,3).until(
                EC.visibility_of_element_located((By.XPATH, ".//th[contains(text(), 'Sex/Gender')]"))
            )
        
            gender = gender_text.find_element(By.XPATH, "./following-sibling::td") #getting td element within the header
            genders.append(gender.text) #adding the text to the list

            # scraping the age
            age_text = WebDriverWait(driver,3).until(
                EC.visibility_of_element_located((By.XPATH, ".//th[contains(text(), 'Ages')]"))
            )
        
            age = age_text.find_element(By.XPATH, "./following-sibling::td") #getting td element within the header
            ages.append(age.text) #adding the text to the list
            
            # go back to the third page
            driver.back()
            
            # go back to the second page and commence the next jth for-loop (eg. NCT00000104 is the next index (1st index) of j)
            driver.back()
            
            # to break the for-loop once all the elements (links in the second page) have been clicked
            if j+1 >= len(elements):
                break
            else:
                continue
           
        # once the jth for-loop is done, go back to the first page and commence the next ith for-loop (eg. NCT00002001 to NCT00004000 is the next index (1st index) of i)
        driver.back()

except:
    # if the code fails, selenium driver will quit running
    driver.quit()
    
# collating all the lists of information into a data dictionary

## TRACKING INFORMATION TABLE
data['Last Update Posted Date'] = last_update_posted_dates

## DESCRIPTIVE INFORMATION TABLE
data['Brief Summary'] = brief_summaries
data['Detailed Description'] = detailed_descriptions
data['Condition'] = conditions
data['Intervention'] = interventions

## RECRUITMENT INFORMATION TABLE
data['Eligibility Criteria'] = eligibility_criterias
data['Gender'] = genders
data['Ages'] = ages

  driver = webdriver.Chrome(PATH)


In [3]:
df = pd.DataFrame(data)
df

Unnamed: 0,Last Update Posted Date,Brief Summary,Detailed Description,Condition,Intervention,Eligibility Criteria,Gender,Ages
0,"June 24, 2005",This study will test the ability of extended r...,This protocol is designed to assess both acute...,Congenital Adrenal Hyperplasia,Drug: Nifedipine,Drug: Nifedipine,Sexes Eligible for Study: All,"14 Years to 35 Years (Child, Adult)"
1,"June 24, 2005",Inner city children are at an increased risk f...,Not Provided,Lead Poisoning,Procedure: ERP measures of attention and memory,Procedure: ERP measures of attention and memory,Sexes Eligible for Study: Female,"0 Years and older (Child, Adult, Older Adult)"
2,"November 29, 2017",The purpose of this study is to learn how the ...,Patients will receive each vaccine once only c...,Cancer,Biological: Intracel KLH Vaccine\nIntracel KLH...,Biological: Intracel KLH Vaccine\nIntracel KLH...,Sexes Eligible for Study: All,"18 Years and older (Adult, Older Adult)"
3,"June 24, 2005",Recently a non-toxic system for whole body hyp...,Not Provided,Rheumatic Diseases,Device: Whole body hyperthermia unit,Device: Whole body hyperthermia unit,Sexes Eligible for Study: All,"18 Years to 65 Years (Adult, Older Adult)"
4,"June 24, 2005",Adults with cyanotic congenital heart disease ...,Not Provided,"Heart Defects, Congenital",Not Provided,Not Provided,Sexes Eligible for Study: All,"17 Years to 60 Years (Child, Adult)"
...,...,...,...,...,...,...,...,...
3115,"June 23, 2016",RATIONALE: Radiation therapy uses high-energy ...,OBJECTIVES:\nCompare the survival without recu...,Prostate Cancer,Radiation: radiation therapy,Radiation: radiation therapy,Sexes Eligible for Study: Male,"up to 75 Years (Child, Adult, Older Adult)"
3116,"July 13, 2016",RATIONALE: Capsaicin lozenges may be effective...,OBJECTIVES: I. Determine the efficacy of capsa...,Head and Neck Cancer\nRadiation Toxicity,Dietary Supplement: capsaicin\nOther: placebo\...,Dietary Supplement: capsaicin\nOther: placebo\...,Sexes Eligible for Study: All,"18 Years and older (Adult, Older Adult)"
3117,"July 13, 2016",RATIONALE: Chemoprevention therapy is the use ...,OBJECTIVES:\nDetermine the chemopreventive eff...,Non-melanomatous Skin Cancer,Drug: acitretin\nOther: placebo,Drug: acitretin\nOther: placebo,Sexes Eligible for Study: All,"18 Years and older (Adult, Older Adult)"
3118,"October 8, 2019",RATIONALE: Drugs used in chemotherapy use diff...,OBJECTIVES:\nCompare the response rate associa...,Breast Cancer,Biological: trastuzumab\nDrug: carboplatin\nDr...,Biological: trastuzumab\nDrug: carboplatin\nDr...,Sexes Eligible for Study: Female,"18 Years and older (Adult, Older Adult)"


In [None]:
df.to_csv('Selenium_Test_4.csv', index=False)

In [None]:
data