# In [None]:  (Jupyter notebook cell; the top-level `await` calls at the bottom need a notebook/IPython host)
import os
import json
import time
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Root directory under which the downloaded article HTML files are written.
# NOTE(review): left as an empty placeholder here -- set it to a real path
# before running; os.makedirs('') is expected to raise FileNotFoundError.
output_base_folder = ''
os.makedirs(output_base_folder, exist_ok=True)

def gather_existing_dois(output_folder):
    """Return the set of DOIs that already have a saved ``.html`` file.

    ``output_folder`` is walked recursively. Each file name ending in
    ``.html`` is converted back to a DOI by dropping every ``.html``
    occurrence and turning every ``-`` into ``/``.

    NOTE(review): the conversion is lossy -- a DOI that itself contains a
    hyphen comes back with a slash in its place, so it will not compare equal
    to the original DOI string.

    Args:
        output_folder: Directory to scan (a missing directory yields an
            empty set because ``os.walk`` produces nothing for it).

    Returns:
        A ``set`` of DOI strings reconstructed from the file names.
    """
    return {
        name.replace(".html", "").replace("-", "/")
        for _dirpath, _dirnames, filenames in os.walk(output_folder)
        for name in filenames
        if name.endswith(".html")
    }

async def extract_dois_from_folder(folder_path):
    """Collect ``(folder, doi)`` pairs from every JSON file under a folder.

    ``folder_path`` is walked recursively. Each ``.json`` file is parsed and
    its top-level keys are taken as DOIs; every key is paired with the
    directory the file was found in.

    The coroutine contains no ``await`` points; it is ``async`` only so that
    callers can ``await`` it.

    Args:
        folder_path: Directory to search for ``.json`` files.

    Returns:
        A list of ``(directory, doi)`` tuples in walk order, with keys in the
        order they appear in each file.
    """
    collected = []
    for dirpath, _subdirs, filenames in os.walk(folder_path):
        json_names = (name for name in filenames if name.endswith('.json'))
        for name in json_names:
            json_path = os.path.join(dirpath, name)
            with open(json_path, 'r', encoding='utf-8') as handle:
                payload = json.load(handle)
                # The keys of the top-level JSON object are the DOIs.
                collected.extend((dirpath, doi) for doi in payload.keys())
    return collected

def _find_article_html_url(driver):
    """Return the href of the first ``articlehtml`` link on the current page, or None."""
    for link in driver.find_elements(By.TAG_NAME, 'a'):
        href = link.get_attribute('href')
        # Anchors without an href report None; skip them rather than letting
        # the substring test raise TypeError and abort the whole DOI.
        if href and 'articlehtml' in href:
            return href
    return None


async def download_rsc(dois_with_root, output_base_folder, input_folder, *,
                       chrome_driver_path='path to /chromedriver.exe',
                       page_load_wait=3):
    """Save the RSC article HTML page for each DOI using a Chrome WebDriver.

    For every ``(root, doi)`` pair the DOI suffix (the part after the last
    ``/``) is searched on pubs.rsc.org, the first result link whose href
    contains ``articlehtml`` is opened, and the rendered page source is
    written to ``<output_base_folder>/<root relative to input_folder>/<doi
    with '/' replaced by '-'>.html``.

    A DOI is skipped when it is already in the set returned by
    ``gather_existing_dois`` or when its target file already exists. Errors
    for a single DOI are printed and do not stop the run.

    NOTE: the function is ``async`` only so callers can ``await`` it; the
    waits are blocking ``time.sleep`` calls.

    Args:
        dois_with_root: Iterable of ``(root, doi)`` tuples, where ``root`` is
            the folder the DOI was read from.
        output_base_folder: Directory under which HTML files are written.
        input_folder: Base folder that ``root`` is made relative to.
        chrome_driver_path: Path to the chromedriver executable.
        page_load_wait: Seconds to sleep after each page load or refresh.
    """
    # Set up Chrome WebDriver
    service = Service(executable_path=chrome_driver_path)
    driver = webdriver.Chrome(service=service)

    try:
        # Get existing DOIs to avoid re-downloading
        existing_dois = gather_existing_dois(output_base_folder)

        for root, doi_data in tqdm(dois_with_root, desc="Downloading RSC Papers"):
            doi_filename = doi_data.replace("/", "-")

            # Skip if the DOI already exists in the output folder
            if doi_data in existing_dois:
                print(f'Skipping DOI {doi_data} as it already exists.')
                continue

            try:
                # Determine the subfolder within the output base folder
                subfolder = os.path.relpath(root, input_folder)
                output_subfolder = os.path.join(output_base_folder, subfolder)
                output_file_path = os.path.join(output_subfolder, f'{doi_filename}.html')

                # The DOI set is rebuilt from file names and cannot represent
                # DOIs that contain '-', so also check the target file itself.
                if os.path.exists(output_file_path):
                    print(f'Skipping DOI {doi_data} as it already exists.')
                    existing_dois.add(doi_data)
                    continue

                doi_suffix = doi_data.split("/")[-1]
                search_url = f'https://pubs.rsc.org/en/results?searchtext={doi_suffix}'
                driver.get(search_url)
                time.sleep(page_load_wait)  # Wait for the search results page to load

                # Chrome shows "Aw, Snap!" when the tab crashed; refresh once
                # and give up on this DOI only if the page is still broken.
                if "Aw, Snap!" in driver.page_source:
                    print("Page is showing 'Aw, Snap!' error message. Refreshing the page...")
                    driver.refresh()
                    time.sleep(page_load_wait)  # Wait for the page to refresh
                    if "Aw, Snap!" in driver.page_source:
                        print(f'Error: page still crashed after refresh for DOI {doi_data}')
                        continue

                html_url = _find_article_html_url(driver)
                if html_url is None:
                    print(f'Error: HTML link not found for paper with DOI {doi_data}')
                    continue

                driver.get(html_url)
                time.sleep(page_load_wait)  # Wait for the HTML content to load

                os.makedirs(output_subfolder, exist_ok=True)
                with open(output_file_path, 'w', encoding='utf-8') as f:
                    f.write(driver.page_source)

                # Add the newly downloaded DOI to the existing DOIs set
                existing_dois.add(doi_data)
            except Exception as e:
                # Best-effort: report the failure and move on to the next DOI.
                print(f'Error downloading paper with DOI {doi_data}: {str(e)}')
    finally:
        # Always close the browser, even on KeyboardInterrupt or setup errors.
        driver.quit()

# Root folder that is searched (recursively) for the JSON files listing DOIs.
# Left as an empty placeholder here; set it to a real path before running.
input_folder = ''

# Build the list of (folder, DOI) pairs from every JSON file under input_folder.
# NOTE: top-level `await` is only accepted where the host supports it (e.g. a
# Jupyter/IPython cell); a plain `python script.py` run rejects it.
dois_with_root = await extract_dois_from_folder(input_folder)

# Download each paper's HTML into output_base_folder, reproducing the
# sub-folder layout of input_folder.
await download_rsc(dois_with_root, output_base_folder, input_folder)
