In [21]:
import os
from datetime import datetime
import time
import csv
import math
import re
import shutil
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC 

In [24]:
# Folder the browser saves into: it is passed to Chrome as "download.default_directory"
# and is the folder searched for the newest file after a download link is clicked.
default_download_dir = r"C:\Users\je116\Downloads" 
# Final destination folder: each downloaded file is moved here under a
# "<ClinicalTrialsID>_<file type>.pdf" name.
download_dir = r"C:\Users\je116\OneDrive - Imperial College London\PhD-wpca-je116\9. Additional Projects\Funding Awards\09FEB2024 - Imperial BRC Digital Health Trials\3. Survey\Advertising\WEBSCRAPE_ISRCTN_Relevant Documents"

# Function to download files based on the text label, with a custom filename
def download_file_from_trial_output(clinical_trials_id):
    """Download the first "Protocol" and first "Statistical Analysis Plan" link of the open trial page.

    Looks inside the element with class ``Trial_outputs`` on the page currently
    loaded in the global ``driver`` and clicks the first link whose label
    contains "protocol" and the first whose label contains
    "statistical analysis plan" (case-insensitive). A clicked link that
    downloads a file has that file moved to ``download_dir`` as
    ``<clinical_trials_id>_Protocol.pdf`` / ``<clinical_trials_id>_SAP.pdf``;
    a clicked link that opens a new tab instead has that tab closed and
    nothing is saved.

    Args:
        clinical_trials_id: Identifier used as the filename prefix.

    Returns:
        None. Any exception is caught and reported with ``print`` so that one
        failing trial does not stop the scrape.
    """
    # (label used in the saved filename, text that must appear in the link label),
    # checked in this order so "protocol" wins when a label contains both.
    wanted_documents = [
        ("Protocol", "protocol"),
        ("SAP", "statistical analysis plan"),
    ]
    found_types = set()

    try:
        # Look for the div with the class "Trial_outputs"
        trial_outputs = driver.find_element(By.CLASS_NAME, "Trial_outputs")
        download_links = trial_outputs.find_elements(By.TAG_NAME, "a")  # Assuming the links are inside <a> tags

        for link in download_links:
            link_text = link.text.strip().lower()

            # First document type that is still missing and whose keyword is in the label
            file_type = next(
                (label for label, keyword in wanted_documents
                 if label not in found_types and keyword in link_text),
                None,
            )
            if file_type is None:
                continue

            found_types.add(file_type)
            _click_and_collect_download(link, clinical_trials_id, file_type)

            if len(found_types) == len(wanted_documents):
                break

    except Exception as e:
        print(f"Error downloading file: {e}")


def _click_and_collect_download(link, clinical_trials_id, file_type):
    """Click one output link, then either close the tab it opened or file the download."""
    # Remember where we are so the result does not depend on how many tabs are open
    origin_handle = driver.current_window_handle
    handles_before = set(driver.window_handles)

    link.click()
    time.sleep(2)  # Wait for the download to complete

    opened_handles = [h for h in driver.window_handles if h not in handles_before]
    if opened_handles:
        # The link opened in a new tab instead of downloading: discard the tab(s)
        for handle in opened_handles:
            driver.switch_to.window(handle)
            driver.close()
        driver.switch_to.window(origin_handle)
    else:
        file_name = f"{clinical_trials_id}_{file_type}.pdf"
        download_path = os.path.join(download_dir, file_name)
        move_latest_file_to_destination(default_download_dir, download_path)

# Function to move the most recently downloaded file from default directory to the desired location
def move_latest_file_to_destination(default_download_dir, destination_path):
    """Move the newest completed file modified today out of the download folder.

    Only regular files are considered; sub-directories and downloads that are
    still being written (``.crdownload`` / ``.tmp``) are ignored so that a
    half-finished file is never moved.

    Args:
        default_download_dir: Folder to search for the most recent download.
        destination_path: Full path (including filename) to move the file to.

    Returns:
        None. Problems (including "nothing to move") are reported with
        ``print`` rather than raised.
    """
    # Suffixes Chrome gives to files it has not finished downloading
    partial_suffixes = (".crdownload", ".tmp")

    try:
        today = datetime.now().date()  # Only look through today's downloads

        # Collect (modified time, path) for every finished file touched today
        candidates = []
        for name in os.listdir(default_download_dir):
            path = os.path.join(default_download_dir, name)
            if not os.path.isfile(path) or name.lower().endswith(partial_suffixes):
                continue
            modified = os.path.getmtime(path)
            if datetime.fromtimestamp(modified).date() == today:
                candidates.append((modified, path))

        if not candidates:
            print(f"Error moving file: no completed download from today found in {default_download_dir}")
            return

        # Most recently modified file wins
        latest_file = max(candidates, key=lambda candidate: candidate[0])[1]

        # Move the latest file to the destination path
        shutil.move(latest_file, destination_path)

    except Exception as e:
        print(f"Error moving file: {e}")

In [5]:
# Function to extract data from each project page
def extract_data(url):
    """Scrape one trial page and trigger the download of its documents.

    Loads ``url`` in the global ``driver``, waits two seconds, then reads each
    field below by absolute XPath. A field that cannot be read is stored as
    ``None``. When a ``ClinicalTrialsID`` was found, the page's trial-output
    documents are downloaded via ``download_file_from_trial_output``.

    Args:
        url: Address of the trial page to scrape.

    Returns:
        dict mapping each field name in ``elements`` to its text, or ``None``
        where the element was not found.
    """
    driver.get(url)
    time.sleep(2)  # Give the page some time to load

    # List of tuples containing the data keys and their corresponding XPaths
    elements = [
        ('Study Title'              , '/html/body/div[2]/div/div/header/div/h1'),
        ('ClinicalTrialsID'         , '/html/body/div[2]/div/div/header/div/p/span[1]'),
        ('Start Date'               , '/html/body/div[2]/div/div/div[2]/article/section[5]/div/div/p[8]'),
        ('End Date'                 , '/html/body/div[2]/div/div/div[2]/article/section[5]/div/div/p[9]'),
        ('Sample Size'              , '/html/body/div[2]/div/div/div[2]/article/section[5]/div/div/p[6]'),
        ('Brief Summary'            , '/html/body/div[2]/div/div/div[2]/article/section[1]/div/div/p[1]'),
        ('Design'                   , '/html/body/div[2]/div/div/div[2]/article/section[4]/div/div/p[5]'),
        ('Condition'                , '/html/body/div[2]/div/div/div[2]/article/section[4]/div/div/p[11]'),
        ('Intervention'             , '/html/body/div[2]/div/div/div[2]/article/section[4]/div/div/p[12]'),
        ('Intervention Type'        , '/html/body/div[2]/div/div/div[2]/article/section[4]/div/div/p[13]'),
        ('Contact Name'             , '/html/body/div[2]/div/div/div[2]/article/section[2]/div/div/p[2]'),
        ('Contact Email'            , '/html/body/div[2]/div/div/div[2]/article/section[2]/div/div/p[4]/a'),
        ('Principal Investigator'   , '/html/body/div[2]/div/div/div[2]/article/section[2]/div/div/p[6]')
    ]

    data = {}
    for key, xpath in elements:
        try:
            data[key] = driver.find_element(By.XPATH, xpath).text
        except Exception:
            # Missing field on this page. `Exception` (not a bare except) so that
            # Ctrl-C / kernel interrupt still stops a long scrape.
            data[key] = None

    # Extract the ClinicalTrialsID to use for file naming
    clinical_trials_id = data.get('ClinicalTrialsID')

    # Download the files from "Trial_outputs"
    if clinical_trials_id:
        download_file_from_trial_output(clinical_trials_id)

    return data

In [25]:
# Set up Chrome options to specify the download folder and disable the popup that asks for confirmation
chrome_options = Options()
chrome_options.add_experimental_option("prefs", {
    "download.default_directory": default_download_dir,  # Set default download folder
    "download.prompt_for_download": False,  # Disable download prompt
    "download.directory_upgrade": True,
    "safebrowsing.enabled": True
})

# Initialize the WebDriver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

# List of search terms
search_terms = [
    "Digital Health Intervention",
    "mobile application",
    "Internet Delivered Treatment",
    "Website intervention"
]

# Base URL parts
base_url = "https://www.isrctn.com/"

# Initialize a list to hold the data for all search terms
all_results = []

# Iterate through search terms and open the URLs
for term in search_terms:
    # Encode spaces as "+" for the query string (the terms above contain no other special characters)
    encoded_term = term.replace(" ", "+")

    # Construct the full URL
    url = f"{base_url}search?q={encoded_term}"

    # Open the URL
    driver.get(url)

    # Give the page some time to load
    time.sleep(5)

    # Handle of the tab showing the search results; every result tab is closed back to this one
    search_handle = driver.current_window_handle

    cookies_button_xpath = '/html/body/section/div/div[2]/button[2]'
    cookies_button = driver.find_elements(By.XPATH, cookies_button_xpath)

    if cookies_button:
        # Click the cookies button if it is present
        cookies_button[0].click()
        print("Cookies button clicked.")
    else:
        print("Cookies button not found, no click needed.")

    print(f"Gathering results for {term}")

    # Get the total number of search results
    total_results_xpath = "/html/body/div[2]/div/div/div[1]/h1"
    total_results_string = driver.find_element(By.XPATH, total_results_xpath).text

    # Use regex to find the number; thousands separators ("1,234 results") are tolerated
    # so that a large count is not silently truncated to its last digits
    total_results_match = re.search(r'(\d[\d,]*)\s+results', total_results_string, re.DOTALL)
    if total_results_match is None:
        print(f"Could not read the number of results for {term}; skipping this search term.")
        continue
    total_results = int(total_results_match.group(1).replace(",", ""))
    print(f"Total Results: {total_results}")

    if total_results == 0:
        # Nothing to page through (and no pager to read)
        continue

    # Calculate the total number of pages
    total_pages_xpath = "/html/body/div[2]/div/div/div[1]/div[1]/div/span[3]"
    total_pages_string = driver.find_element(By.XPATH, total_pages_xpath).text
    total_pages_match = re.search(r'of (\d+)', total_pages_string)
    total_pages = int(total_pages_match.group(1)) if total_pages_match else 0
    if total_pages < 1:
        print(f"Could not read the number of pages for {term}; skipping this search term.")
        continue
    results_per_page = math.ceil(total_results / total_pages)

    print(f"Results per Page: {results_per_page}")
    print(f"Total Pages: {total_pages}")

    # Iterate over all the pages
    for page in range(total_pages):
        # Iterate through the search results on the current page
        for i in range(1, results_per_page + 1):
            # Running index of this result across all pages (1-based)
            result_number = (page * results_per_page) + i
            if result_number > total_results:
                break  # Stop if we've processed all results
            try:
                # Construct XPath to click onto each search result
                xpath = f"/html/body/div[2]/div/div/div[1]/ul/li[{i}]/article/div[1]/h3/a"

                # Use WebDriverWait to wait until the element is present
                link = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, xpath)))
                url = link.get_attribute('href')  # Get the href attribute

                # Open the link in a new tab
                driver.execute_script("window.open(arguments[0]);", url)
                driver.switch_to.window(driver.window_handles[-1])  # Switch to the new tab

                # Extract data from the new tab
                project_data = extract_data(url)
                # Add the search term to the project data
                project_data['search_term'] = term

                all_results.append(project_data)

                # Close the tab and switch back to the original search page
                driver.close()
                driver.switch_to.window(search_handle)

                # Progress: a dot per result, the running total every tenth result
                if i % 10 == 0:
                    print(result_number, end = "")
                else:
                    print(".", end = "")

            except Exception as e:
                print(f"Error processing result {i} on page {page + 1}: {e}")
                # A failure may have left a result tab open and focused; close any
                # stray tabs and return to the search page so the next result
                # is looked up in the right window.
                try:
                    for handle in driver.window_handles:
                        if handle != search_handle:
                            driver.switch_to.window(handle)
                            driver.close()
                    driver.switch_to.window(search_handle)
                except Exception as cleanup_error:
                    print(f"Error returning to the search page: {cleanup_error}")

        # Click the next button if there are more pages
        if page < total_pages - 1:
            try:
                next_button_xpath = "//a/span[contains(@class, 'Pager Pager--next')]"
                next_button = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, next_button_xpath)))
                next_button.click()
                time.sleep(5)  # Wait for the next page to load
            except Exception as e:
                print(f"Error clicking the next button on page {page + 1}: {e}")
                break

# At the end, `all_results` will contain data from all search terms
print(f"Total records gathered: {len(all_results)}")

Cookies button clicked.
Gathering results for Digital Health Intervention
Total Results: 735
Results per Page: 10
Total Pages: 74
.........10.........20.........30.........40.........50.........60.........70.........80.........90.........100.........110.........120.........130.........140.........150.........160.........170.........180.........190.........200.........210.........220.........230.........240.........250.........260.........270.........280.........290.........300.........310.........320.........330.........340.........350.........360.........370.........380.........390.........400.........410.........420.........430.........440.........450.........460.........470.........480.........490.........500.........510.........520.........530.........540.........550.........560.........570.........580.........590.........600.........610.........620.........630.........640.........650.........660.........670.........680.........690.........700.........710.........720.........730...

In [50]:
# Specify the folder location and filename for saving the CSV
folder_path = r"C:\Users\je116\OneDrive - Imperial College London\PhD-wpca-je116\9. Additional Projects\Funding Awards\09FEB2024 - Imperial BRC Digital Health Trials\3. Survey\Advertising"
filename = "WEBSCRAPE_ISRCTN_alltrials.csv"
csv_file_path = os.path.join(folder_path, filename)

try:
    # Create the directory if it doesn't exist
    os.makedirs(folder_path, exist_ok=True)

    # Save results to CSV; the column order comes from the first scraped record
    if all_results:
        keys = all_results[0].keys()
        with open(csv_file_path, 'w', newline='', encoding='utf-8') as output_file:
            dict_writer = csv.DictWriter(output_file, fieldnames=keys)
            dict_writer.writeheader()
            dict_writer.writerows(all_results)
        print(f"Data saved to {csv_file_path}")
    else:
        print("No results were gathered; no CSV file was written.")
finally:
    # Close the WebDriver even if writing the CSV fails (e.g. the file is open
    # in another program), so the browser process is not left running
    driver.quit()

Data saved to C:\Users\je116\OneDrive - Imperial College London\PhD-wpca-je116\9. Additional Projects\Funding Awards\09FEB2024 - Imperial BRC Digital Health Trials\3. Survey\Advertising\WEBSCRAPE_ISRCTN_alltrials.csv
