In [2]:
import os
import time
import csv
import math
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC 

In [None]:
# Function to extract data from each project page
def extract_data(url):
    """Load a study page and scrape a fixed set of fields from it.

    Navigates the module-level ``driver`` to ``url``, pauses for two seconds,
    then reads the text of one element per field using absolute XPaths.

    Args:
        url: Address of the study page to load in the current tab/window.

    Returns:
        dict: One entry per field name listed in ``elements`` (for example
        'Study Title' or 'Contact Email'). The value is the element's text,
        or None when the element could not be read.
    """
    driver.get(url)
    time.sleep(2)  # Give the page some time to load
    
    # List of tuples containing the data keys and their corresponding XPaths
    elements = [
        ('Study Title'              , '/html/body/app-root/main/ctg-study-details/div[2]/ctg-study-info/div/div/div[2]/div[1]/ctg-study-overview/div/div/div[1]/div[3]/ctg-long-text/div/div'),
        ('ClinicalTrialsID'         , '/html/body/app-root/main/ctg-study-details/section/ctg-study-details-top-info/div[3]/div[1]/span[2]'),
        ('Start Date'               , '/html/body/app-root/main/ctg-study-details/div[2]/ctg-study-info/div/div/div[2]/div[1]/ctg-study-overview/div/div/div[2]/div/div[1]/div[1]/span'),
        ('End Date'                 , '/html/body/app-root/main/ctg-study-details/div[2]/ctg-study-info/div/div/div[2]/div[1]/ctg-study-overview/div/div/div[2]/div/div[1]/div[3]/span'),
        ('Sample Size'              , '/html/body/app-root/main/ctg-study-details/div[2]/ctg-study-info/div/div/div[2]/div[1]/ctg-study-overview/div/div/div[2]/div/div[2]/div[2]/span'),
        ('Brief Summary'            , '/html/body/app-root/main/ctg-study-details/div[2]/ctg-study-info/div/div/div[2]/div[1]/ctg-study-overview/div/div/div[1]/ctg-long-text[1]/div/div'),
        ('Condition'                , '/html/body/app-root/main/ctg-study-details/div[2]/ctg-study-info/div/div/div[2]/div[1]/ctg-study-overview/div/div/div[1]/ctg-conditions/div/div/span[1]'),
        ('Intervention'             , '/html/body/app-root/main/ctg-study-details/div[2]/ctg-study-info/div/div/div[2]/div[1]/ctg-study-overview/div/div/div[1]/div[5]/div[2]/ul'),
        ('Contact Name'             , '/html/body/app-root/main/ctg-study-details/div[2]/ctg-study-info/div/div/div[2]/div[2]/ctg-study-contacts-and-locations/div/div/div/ctg-study-contact-info/p[1]/span'),
        ('Contact Email'            , '/html/body/app-root/main/ctg-study-details/div[2]/ctg-study-info/div/div/div[2]/div[2]/ctg-study-contacts-and-locations/div/div/div/ctg-study-contact-info/p[3]/ctg-study-contact-email/span/a'),
        ('Principal Investigator'   , '/html/body/app-root/main/ctg-study-details/div[2]/ctg-study-info/div/div/div[2]/div[5]/ctg-collaborators-and-investigators/div[2]/div/div[2]/div/ul/li/div')
    ]
    
    data = {}
    for key, xpath in elements:
        try:
            data[key] = driver.find_element(By.XPATH, xpath).text
        except Exception:
            # Best-effort per field: a missing element must not discard the
            # other fields. `Exception` (rather than a bare `except`) lets
            # KeyboardInterrupt/SystemExit still stop a long-running scrape.
            data[key] = None
    
    return data


In [18]:
# Initialize the WebDriver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

# Saved ClinicalTrials.gov searches, keyed by the label used for each search.
# Keeping them in a dict (instead of commented-out driver.get calls) means the
# notebook always navigates somewhere when run top to bottom.
SEARCH_URLS = {
    "Digital Health Interventions": "https://clinicaltrials.gov/search?locStr=United%20Kingdom&country=United%20Kingdom&term=Digital%20Health%20Intervention&aggFilters=studyType:int",
    "mobile applications": "https://clinicaltrials.gov/search?locStr=United%20Kingdom&country=United%20Kingdom&term=mobile%20application&aggFilters=studyType:int",
    "Internet Delivered Treatment": "https://clinicaltrials.gov/search?locStr=United%20Kingdom&country=United%20Kingdom&aggFilters=studyType:int&intr=Internet%20Delivered%20Treatment",
    "Website Intervention": "https://clinicaltrials.gov/search?locStr=United%20Kingdom&country=United%20Kingdom&aggFilters=studyType:int&intr=Website%20intervention",
}

# Which saved search to scrape on this run; change this one value to switch.
# NOTE(review): defaulting to the website search because the output file is
# named "clintrials_website_trials.csv" -- confirm this is the intended search.
SEARCH_NAME = "Website Intervention"

# Open the URL
driver.get(SEARCH_URLS[SEARCH_NAME])

# Give the page some time to load
time.sleep(5)

In [19]:
# Get the total number of search results
total_results_xpath = "/html/body/app-root/main/ctg-search-results/div[2]/section/div[2]/div/div/div[1]/div[1]/div/p"
total_results_string = driver.find_element(By.XPATH, total_results_xpath).text

# Use regex to find the numbers. The character class accepts thousands
# separators ("out of 1,234"); a plain \d+ would stop at the comma and
# silently report 1 result.
totalpages_match = re.search(r'out of ([\d,]+)', total_results_string)
perpage_match = re.search(r'Viewing [\d,]+-([\d,]+)', total_results_string)

# Fail with a readable message instead of an AttributeError on None when the
# summary text does not have the expected "Viewing A-B out of N" wording.
if totalpages_match is None or perpage_match is None:
    raise ValueError(f"Could not parse result counts from: {total_results_string!r}")

total_results = int(totalpages_match.group(1).replace(',', ''))
print(f"Total Results: {total_results}")

# Calculate the total number of pages
results_per_page = int(perpage_match.group(1).replace(',', ''))
print(f"Results per Page: {results_per_page}")
# Guard the division: a page size of 0 means there is nothing to iterate over.
total_pages = math.ceil(total_results / results_per_page) if results_per_page else 0
print(f"Total Pages: {total_pages}")

Total Results: 34
Results per Page: 10
Total Pages: 4


In [21]:
# Initialize a list to hold the data
results = []

# Iterate over all the pages
for page in range(total_pages):
    # Iterate through the search results on the current page
    for i in range(1, results_per_page + 1):
        if (page * results_per_page) + i > total_results:
            break  # Stop if we've processed all results
        try:
            # Construct XPath to click onto each search result 
            xpath = f"/html/body/app-root/main/ctg-search-results/div[2]/section/div[2]/div/div/div[3]/ctg-search-hit-card[{i}]/div/div[3]/header/a"
            # Use WebDriverWait to wait until the element is present
            link = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, xpath)))
            url = link.get_attribute('href')  # Get the href attribute
            
            # Open the link in a new tab
            driver.execute_script("window.open(arguments[0]);", url)
            driver.switch_to.window(driver.window_handles[-1])  # Switch to the new tab
            
            try:
                # Extract data from the new tab
                project_data = extract_data(url)
                results.append(project_data)
            finally:
                # Close the tab and switch back to the original search page.
                # This runs even when extraction fails; otherwise the driver
                # would stay focused on the study tab and every later
                # search-result lookup would run against the wrong page.
                driver.close()
                driver.switch_to.window(driver.window_handles[0])
            
        except Exception as e:
            print(f"Error processing result {i} on page {page + 1}: {e}")
            continue

    # Click the next button if there are more pages
    if page < total_pages - 1:
        try:
            next_button_xpath = '//nav/ul/li/button/span[contains(text(), "Next")]' # Find XPATH with "Next" in text      
            next_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, next_button_xpath)))
            next_button.click()
            time.sleep(5)  # Wait for the next page to load
        except Exception as e:
            # Without a working "Next" button the remaining pages cannot be
            # reached, so stop paginating and keep what was collected.
            print(f"Error clicking the next button on page {page + 1}: {e}")
            break


In [22]:
# Specify the folder location and filename for saving the CSV
# NOTE(review): this is an absolute, user-specific path; it will only exist on
# the original author's machine. Consider a configurable output directory.
folder_path = r"C:\Users\je116\OneDrive - Imperial College London\PhD-wpca-je116\9. Additional Projects\Funding Awards\09FEB2024 - Imperial BRC Digital Health Trials\3. Survey\Advertising"
filename = "clintrials_website_trials.csv"
csv_file_path = os.path.join(folder_path, filename)

try:
    # Create the directory if it doesn't exist
    os.makedirs(folder_path, exist_ok=True)

    # Save results to CSV
    if results:
        # Every record comes from extract_data(), so the first record's keys
        # serve as the column set for the whole file.
        keys = results[0].keys()
        with open(csv_file_path, 'w', newline='', encoding='utf-8') as output_file:
            dict_writer = csv.DictWriter(output_file, fieldnames=keys)
            dict_writer.writeheader()
            dict_writer.writerows(results)
        print(f"Data saved to {csv_file_path}")
    else:
        # Make the "nothing scraped" case visible instead of ending silently.
        print("No results were collected; no CSV file was written.")
finally:
    # Close the WebDriver even if saving fails (e.g. the CSV is locked by
    # another program), so the browser process is not left running.
    driver.quit()

Data saved to C:\Users\je116\OneDrive - Imperial College London\PhD-wpca-je116\9. Additional Projects\Funding Awards\09FEB2024 - Imperial BRC Digital Health Trials\3. Survey\Advertising\clintrials_website_trials.csv
