# Install necessary packages

In [None]:
pip install webdriver_manager

In [None]:
pip install selenium

# main

In [None]:
import os
import time
import json
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service

# Build ONE Chrome options object that carries every flag.
# Browser flags only take effect when the options object is handed to
# webdriver.Chrome(options=...); the driver Service does not consume them.
chrome_options = Options()
chrome_options.add_argument('--headless')               # run without a visible window
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
# Placeholder: replace with the Chrome profile directory to reuse.
chrome_options.add_argument('--user-data-dir=path/to/your/user-data')

# Kept as an alias so code that refers to either name keeps working.
options = chrome_options

# Path to the JSON file containing DOIs (placeholder: must be filled in)
json_file_path = ''

# Output path for downloaded HTML files (placeholder: must be filled in)
output_path = ''

# Load DOIs from the JSON file.
# NOTE(review): the download loop calls doi.split("/") on each item, so this
# is presumably a JSON array of DOI strings; confirm against the data file.
with open(json_file_path, 'r', encoding='utf-8') as file_obj:
    dois_data = json.load(file_obj)

# Set Chrome driver path (placeholder: point this at the chromedriver binary)
chrome_driver_path = '.../chromedriver'

# Initialize the Chrome WebDriver. The Service only describes how to launch
# the chromedriver executable; the browser flags travel via `options`.
service = Service(executable_path=chrome_driver_path)
driver = webdriver.Chrome(service=service, options=options)

# Keep track of already downloaded indices (positions in dois_data) so that a
# re-run can skip papers whose HTML is already on disk.
already_downloaded_indices = set()

# Check existing files in the output path
existing_files = os.listdir(output_path)

# Legacy naming scheme: '<prefix>_<index>.<ext>' -> take <index> directly.
# Files that do not follow this pattern are simply ignored here.
for existing_file in existing_files:
    try:
        index = int(existing_file.split('_')[1].split('.')[0])
        already_downloaded_indices.add(index)
    except (ValueError, IndexError):
        pass

# Current naming scheme: the download loop saves each paper as
# '<last DOI segment>.html', so map those filenames back to their index.
existing_file_names = set(existing_files)
for index, doi in enumerate(dois_data):
    # Non-string entries are left for the download loop to report.
    if isinstance(doi, str) and f'{doi.split("/")[-1]}.html' in existing_file_names:
        already_downloaded_indices.add(index)

# Iterate through DOIs and download HTML content.
# For each DOI not yet downloaded: search pubs.rsc.org for the DOI suffix,
# follow the first result link whose href contains "articlehtml", and save the
# resulting page source as '<DOI suffix>.html' in output_path.
# The outer try/finally guarantees the browser is shut down even when the run
# is interrupted (e.g. Ctrl+C) or an unexpected error escapes the loop.
try:
    for i, doi in enumerate(tqdm(dois_data, desc="Downloading RSC Papers")):
        if i in already_downloaded_indices:
            continue

        # Best-effort per paper: report the failure and move on to the next DOI.
        try:
            # Extract the last part of the DOI to use in the search URL
            doi_suffix = doi.split("/")[-1]

            # Transform the DOI suffix into the RSC search URL format
            search_url = f'https://pubs.rsc.org/en/results?searchtext={doi_suffix}'

            # Open the search URL in Chrome
            driver.get(search_url)
            time.sleep(3)  # Wait for the search results page to load

            # Pick the first link whose href contains "articlehtml".
            # get_attribute('href') returns None for anchors without an href,
            # so those must be skipped before the substring test.
            html_url = None
            for link in driver.find_elements(By.TAG_NAME, 'a'):
                href = link.get_attribute('href')
                if href and 'articlehtml' in href:
                    html_url = href
                    break

            if html_url is None:
                print(f'Error: HTML link not found for paper with DOI {doi}')
                continue

            # Open the HTML URL in Chrome
            driver.get(html_url)
            time.sleep(3)  # Wait for the HTML content to load

            # Get the HTML source and save it to a file with the DOI as the filename
            doi_filename = doi_suffix.replace("/", "-")
            output_file_path = os.path.join(output_path, f'{doi_filename}.html')
            with open(output_file_path, 'w', encoding='utf-8') as f:
                f.write(driver.page_source)

            # Add the index to the set of downloaded indices
            already_downloaded_indices.add(i)

        except Exception as e:
            print(f'Error downloading paper with DOI {doi}: {str(e)}')
finally:
    # Close the Chrome WebDriver
    driver.quit()
