In [1]:
import os
import time
import csv
import math
import re
import urllib.parse
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC 
from selenium.common.exceptions import ElementClickInterceptedException

In [2]:
# Function to extract data from each project page
def extract_data(url):
    driver.get(url)
    time.sleep(2)  # Give the page some time to load
    
    read_more_info = {
        'Plain English Summary': '/html/body/div/div/div/div/div/div[2]/div/div[2]/div/div/div[1]/div[1]/span/span[1]/span[3]/span/p[2]',  # Update with the actual XPath  
        'Brief Summary': '/html/body/div/div/div/div/div/div[2]/div/div[2]/div/div/div[1]/div[2]/span/span[1]/span[3]/span/p[2]'  # Update with the actual XPath
    }
    
    # Click on each "Read More" button
    for summary_name, xpath in read_more_info.items():
        try:
            element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, xpath)))
            driver.execute_script("arguments[0].scrollIntoView();", element)
            time.sleep(1)  # Wait briefly for smooth scrolling            
            element.click() # Click the button
            time.sleep(1)  # Wait for the content to load
        except Exception as e:
            print("!", end="")
            #print(f"Could not click 'Read More' for {summary_name}:", e)
    
    # List of tuples containing the data keys and their corresponding XPaths
    elements = [
        ('Study Title'              , '/html/body/div/div/div/div/div/div[2]/div/div[2]/div/div/div[1]/h1'),
        ('ClinicalTrialsID'         , '/html/body/div/div/div/div/div/div[2]/div/div[1]/div/div/div[3]/h4/span'),
        ('Start Date'               , '/html/body/div/div/div/div/div/div[2]/div/div[2]/div/div/div[2]/div/div[2]/div[1]/div[2]/div[2]/span'),
        ('End Date'                 , '/html/body/div/div/div/div/div/div[2]/div/div[2]/div/div/div[2]/div/div[2]/div[2]/div[2]/div[2]/span'),
        ('Brief Summary'            , '/html/body/div/div/div/div/div/div[2]/div/div[2]/div/div/div[1]/div[2]'),
        ('Plain English Summary'    , '/html/body/div/div/div/div/div/div[2]/div/div[2]/div/div/div[1]/div[1]'),
        ('Contact Name'             , '/html/body/div/div/div/div/div/div[2]/div/div[2]/div/div/div[2]/div/div[1]/div[1]/div[2]/div[2]/span/span/a[1]'),
        ('Contact Organisation'     , '/html/body/div/div/div/div/div/div[2]/div/div[2]/div/div/div[2]/div/div[2]/div[3]/div[2]/div[2]/span/span/a'),
        ('Additional Investigators' , '/html/body/div/div/div/div/div/div[2]/div/div[2]/div/div/div[2]/div/div[1]/div[3]/div[2]/div[2]')
    ]
    
    data = {}
    for key, xpath in elements:
        try:
            # Debugging
            #print(key , end = ",")
            data[key] = driver.find_element(By.XPATH, xpath).text
        except:
            data[key] = None
    
    return data


In [10]:
# Initialize the WebDriver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

# List of search terms
search_terms = [
    '("Digital Health" AND "Intervention") AND "Clinical Trial"',
    '("mobile application" AND "Intervention") AND "Clinical Trial"',
    '("smartphone" AND "Intervention") AND "Clinical Trial"',
    '("Internet Delivered" AND "Intervention") AND "Clinical Trial"',
    '("Website" AND "Intervention") AND "Clinical Trial"',
    '("eHealth" AND "Intervention") AND "Clinical Trial', 
    '("mHealth" AND "Intervention") AND "Clinical Trial', 
    '("Digital" AND "Intervention") AND "Clinical Trial"'
]

# Base URL parts
base_url = "https://fundingawards.nihr.ac.uk/?query="

# Initialize a list to hold the data for all search terms
all_results = []

# Iterate through search terms and open the URLs
for term in search_terms:
    # Replace spaces with %20 for URL encoding
    encoded_term = urllib.parse.quote(term)
    
    # Construct the full URL
    url = f"{base_url}{encoded_term}"
    
    # Open the URL
    driver.get(url)
    time.sleep(5)

    print(f"Gathering results for {term}")

    # Get the total number of search results
    total_results_xpath = "/html/body/div[1]/div/div/div/div/div[2]/div/div[2]/section/div[1]/div[1]/strong[2]"
    total_results_string = driver.find_element(By.XPATH, total_results_xpath).text
    total_results = int(total_results_string)
    print(f"Total Results: {total_results}")

    if total_results > 25 :
        results_per_page_xpath = "/html/body/div[1]/div/div/div/div/div[2]/div/div[2]/section/div[2]/div[2]/div/div[2]/button[1]"
        results_per_page_string = driver.find_element(By.XPATH, results_per_page_xpath).text
        results_per_page = int(results_per_page_string)
        total_pages = math.ceil(total_results / results_per_page)
    else:
        results_per_page = total_results
        total_pages = 1
    
    print(f"Results per Page: {results_per_page}")
    print(f"Total Pages: {total_pages}")

    # Iterate over all the pages
    for page in range(total_pages):
        # Iterate through the search results on the current page
        for i in range(1, results_per_page + 1):
            if (page * results_per_page) + i > total_results:
                break  # Stop if we've processed all results
            try:
                # Construct XPath to click onto each search result 
                xpath = f"/html/body/div[1]/div/div/div/div/div[2]/div/div[2]/section/section/article[{i}]/div[1]/div/h2/a"              
                
                # Use WebDriverWait to wait until the element is present
                link = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, xpath)))
                url = link.get_attribute('href')  # Get the href attribute
                
                # Open the link in a new tab
                driver.execute_script("window.open(arguments[0]);", url)
                driver.switch_to.window(driver.window_handles[-1])  # Switch to the new tab
                
                # Extract data from the new tab
                project_data = extract_data(url)
                # Add the search term to the project data
                project_data['search_term'] = term

                all_results.append(project_data)
                
                # Close the tab and switch back to the original search page
                driver.close()
                driver.switch_to.window(driver.window_handles[0])
                
                if i % 10 == 0:
                    print(i * (page+1), end = "")
                else:
                    print(".", end = "")

            except Exception as e:
                print(f"Error processing result {i} on page {page + 1}: {e}")
                continue

        # Click the next button if there are more pages
        if page < total_pages - 1:
            try:
                next_button_xpath = "//a[@class='page-link' and @aria-label='Next']"
                # next_button_xpath = '//nav/ul/li/button/span[contains(text(), "Next")]' # Find XPATH with "Next" in text      
                next_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, next_button_xpath)))
                next_button.click()
                time.sleep(5)  # Wait for the next page to load
            except Exception as e:
                print(f"Error clicking the next button on page {page + 1}: {e}")
                break

# At the end, `all_results` will contain data from all search terms
print(f"Total records gathered: {len(all_results)}")
    

Gathering results for ("Digital Health" AND "Intervention") AND "Clinical Trial"
Total Results: 9
Results per Page: 9
Total Pages: 1
.........Gathering results for ("mobile application" AND "Intervention") AND "Clinical Trial"
Total Results: 1
Results per Page: 1
Total Pages: 1
.Gathering results for ("smartphone" AND "Intervention") AND "Clinical Trial"
Total Results: 38
Results per Page: 25
Total Pages: 2
....!.....10.........20..............20..!.Gathering results for ("Internet Delivered" AND "Intervention") AND "Clinical Trial"
Total Results: 3
Results per Page: 3
Total Pages: 1
...Gathering results for ("Website" AND "Intervention") AND "Clinical Trial"
Total Results: 182
Results per Page: 25
Total Pages: 8
!.........10.........20..............20.........40..............30.........60..............40.........80..............50.........100..............60.........120..............70.........140........!..!..Gathering results for ("eHealth" AND "Intervention") AND "Clinical Trial
To

In [11]:
# Specify the folder location and filename for saving the CSV
folder_path = r"C:\Users\je116\OneDrive - Imperial College London\PhD-wpca-je116\9. Additional Projects\Funding Awards\09FEB2024 - Imperial BRC Digital Health Trials\3. Survey\Advertising"
filename = "WEBSCRAPE_NIHRLibrary_alltrials.csv"
csv_file_path = os.path.join(folder_path, filename)

# Create the directory if it doesn't exist
os.makedirs(folder_path, exist_ok=True)

# Save results to CSV
if all_results:
    keys = all_results[0].keys()
    with open(csv_file_path, 'w', newline='', encoding='utf-8') as output_file:
        dict_writer = csv.DictWriter(output_file, fieldnames=keys)
        dict_writer.writeheader()
        dict_writer.writerows(all_results)
    print(f"Data saved to {csv_file_path}")

# Close the WebDriver
driver.quit()

Data saved to C:\Users\je116\OneDrive - Imperial College London\PhD-wpca-je116\9. Additional Projects\Funding Awards\09FEB2024 - Imperial BRC Digital Health Trials\3. Survey\Advertising\WEBSCRAPE_NIHRLibrary_alltrials.csv
