In [29]:
import os
import time
import csv
import math
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

In [25]:
# Start a Chrome session, handing Selenium the value returned by
# ChromeDriverManager().install() via a Service object.
chromedriver_path = ChromeDriverManager().install()
chrome_service = Service(chromedriver_path)
driver = webdriver.Chrome(service=chrome_service)

# Navigate to the NIHR Funding Awards search for the query of interest.
search_url = "https://fundingawards.nihr.ac.uk/?query=Digital%20Intervention%20Randomised%20Controlled%20Trial"
driver.get(search_url)

# Fixed pause to give the page time to load.
time.sleep(5)

In [26]:
# Absolute XPath of the element whose text is the total number of search results.
total_results_xpath = "/html/body/div[1]/div/div/div/div/div[2]/div/div[2]/section/div[1]/div[1]/strong[2]"

# Poll (up to 10 s) until the counter exists AND has non-empty text, instead of
# assuming the page has finished rendering. WebDriverWait ignores
# NoSuchElementException by default, so a not-yet-present element just retries.
total_results_text = WebDriverWait(driver, 10).until(
    lambda d: d.find_element(By.XPATH, total_results_xpath).text.strip()
)

# Remove thousands separators so a count such as "1,234" still parses.
total_results = int(total_results_text.replace(",", ""))
print(f"Total Results: {total_results}")

# Derive the number of result pages from the page size used for pagination.
results_per_page = 25
total_pages = math.ceil(total_results / results_per_page)

Total Results: 85


In [27]:
# Output column name -> absolute XPath of the element holding that value on a
# project page. Insertion order is the column order of the returned dict.
_FIELD_XPATHS = {
    'NIHR Programme': '/html/body/div/div/div/div/div/div[2]/div/div[2]/div/div/div[2]/div/div[1]/div[3]/div[2]/div[2]',
    'Award ID': '/html/body/div/div/div/div/div/div[2]/div/div[1]/div/div/div[3]/h4/span',
    'Title': '/html/body/div/div/div/div/div/div[2]/div/div[2]/div/div/div[1]/h1',
    'Abstract': '/html/body/div/div/div/div/div/div[2]/div/div[2]/div/div/div[1]/div[2]/span/span[1]/span[3]/span/p[2]',
    'Read More': '/html/body/div/div/div/div/div/div[2]/div/div[2]/div/div/div[1]/div[2]/span[1]/span[1]/span',
    'Chief Investigator': '/html/body/div/div/div/div/div/div[2]/div/div[2]/div/div/div[2]/div/div[1]/div[1]/div[2]/div[2]',
    'Start Date': '/html/body/div/div/div/div/div/div[2]/div/div[2]/div/div/div[2]/div/div[2]/div[1]/div[2]/div[2]/span',
    'End Date': '/html/body/div/div/div/div/div/div[2]/div/div[2]/div/div/div[2]/div/div[2]/div[2]/div[2]/div[2]/span',
}


def extract_data(url):
    """Load a project page in the module-level ``driver`` and scrape its fields.

    Args:
        url: Address of the project page to open.

    Returns:
        dict: One entry per key of ``_FIELD_XPATHS`` (in that order). Each value
        is the ``.text`` of the element found at the field's XPath, or ``None``
        when looking the element up raised an exception.

    Raises:
        Whatever ``driver.get`` raises; only the per-field lookups are
        best-effort.
    """
    driver.get(url)
    time.sleep(2)  # Give the page some time to load

    data = {}
    for field, xpath in _FIELD_XPATHS.items():
        try:
            data[field] = driver.find_element(By.XPATH, xpath).text
        except Exception:
            # Best-effort: a missing/unreadable field becomes None. Catching
            # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
            # propagate so a long scrape can still be interrupted.
            data[field] = None
    return data

In [30]:
# One dict per successfully scraped project page.
results = []

# Remember the search-results window so we can always return to it, even if
# scraping a project page fails part-way through.
search_window = driver.current_window_handle

# Walk through every page of search results.
for page in range(total_pages):
    # A full page of results, except possibly on the last page.
    results_on_page = min(results_per_page, total_results - page * results_per_page)

    for i in range(1, results_on_page + 1):
        try:
            # Absolute XPath of the i-th result's title link on the current page.
            xpath = f"/html/body/div[1]/div/div/div/div/div[2]/div/div[2]/section/section/article[{i}]/div[1]/div/h2/a"
            # Wait (up to 10 s) for the link to be present before reading it.
            link = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, xpath)))
            url = link.get_attribute('href')

            # Open an empty tab and switch to it; extract_data() performs the
            # navigation itself, so the project page is loaded only once.
            driver.switch_to.new_window('tab')
            try:
                results.append(extract_data(url))
            finally:
                # Always close the project tab and go back to the search page;
                # otherwise one failure leaves the driver on the wrong window
                # and every later result lookup fails too.
                driver.close()
                driver.switch_to.window(search_window)
        except Exception as e:
            # Best-effort: report the failure and move on to the next result.
            print(f"Error processing result {i} on page {page + 1}: {e}")

    # Click the next button if there are more pages
    if page < total_pages - 1:
        try:
            next_button_xpath = "/html/body/div[1]/div/div/div/div/div[2]/div/div[2]/section/div[2]/div[1]/ul/li[7]/a"
            next_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, next_button_xpath)))
            next_button.click()
            time.sleep(5)  # Wait for the next page to load
        except Exception as e:
            # Without a working "next" button the remaining pages are unreachable.
            print(f"Error clicking the next button on page {page + 1}: {e}")
            break


In [31]:
# Specify the folder location and filename for saving the CSV
folder_path = r"C:\Users\je116\OneDrive - Imperial College London\PhD-wpca-je116\9. Additional Projects\Funding Awards\09FEB2024 - Imperial BRC Digital Health Trials\3. Survey\Advertising"
filename = "nihr_funding_awards.csv"
csv_file_path = os.path.join(folder_path, filename)

try:
    # Create the directory if it doesn't exist
    os.makedirs(folder_path, exist_ok=True)

    if results:
        # Column order follows the key order of the first scraped record.
        keys = results[0].keys()
        # Explicit encoding: without it open() uses the locale code page, which
        # can raise UnicodeEncodeError on scraped text. "utf-8-sig" writes a BOM
        # so spreadsheet tools detect UTF-8.
        with open(csv_file_path, 'w', newline='', encoding='utf-8-sig') as output_file:
            dict_writer = csv.DictWriter(output_file, fieldnames=keys)
            dict_writer.writeheader()
            dict_writer.writerows(results)
        print(f"Data saved to {csv_file_path}")
    else:
        # Make the empty case visible instead of finishing silently.
        print("No results were collected; no CSV file was written.")
finally:
    # Close the WebDriver even if writing the file failed.
    driver.quit()

Data saved to C:\Users\je116\OneDrive - Imperial College London\PhD-wpca-je116\9. Additional Projects\Funding Awards\09FEB2024 - Imperial BRC Digital Health Trials\3. Survey\Advertising\nihr_funding_awards.csv


In [None]:
# NOTE(review): this cell has no execution count, and nothing in the visible cells
# reads `page` or `url` after this point — it looks like leftover scratch work; confirm
# and consider deleting it. As written it starts a second Chrome session (separate from
# `driver`) that is not quit in the visible cells, and it rebinds `page`, which the
# scraping loop uses as its page index.
page = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
# Same search URL that is opened with `driver` earlier in the notebook.
url = "https://fundingawards.nihr.ac.uk/?query=Digital%20Intervention%20Randomised%20Controlled%20Trial"
