# In [None]:
import os
import requests
import json
from tqdm import tqdm
from requests.exceptions import ProxyError, SSLError, HTTPError

def get_filenames_dict(output_folder):
    """Index the ``.xml`` files that already exist under *output_folder*.

    The folder is walked recursively. Each file name has its trailing
    ``.xml`` removed and every ``-`` replaced by ``/`` to form the key.

    Note that this reverse mapping is lossy: a hyphen that was part of the
    original DOI also comes back as ``/``.

    Args:
        output_folder: Root directory to scan.

    Returns:
        dict mapping the reconstructed key to the full path of the XML file.
        The first five entries are also printed as a quick sanity check.
    """
    suffix = '.xml'
    filenames_dict = {}

    # tqdm wraps os.walk, so the bar advances once per directory visited.
    for root, _, files in tqdm(os.walk(output_folder), desc="Reading Filenames", unit="file"):
        for file_name in files:
            if not file_name.endswith(suffix):
                continue
            # Strip only the trailing extension; str.replace would also
            # remove a '.xml' that happens to occur inside the stem.
            stem = file_name[:-len(suffix)]
            doi = stem.replace('-', '/')
            filenames_dict[doi] = os.path.join(root, file_name)

    # Print the first five stored filenames
    print("First five stored filenames:")
    for position, (doi, filepath) in zip(range(1, 6), filenames_dict.items()):
        print(f"{position}. {doi}: {filepath}")

    return filenames_dict

def download_pdf(doi, output_folder, filenames_dict, timeout=60):
    """Download an article's full-text XML from the Elsevier API.

    Despite the name, the request asks for ``text/xml`` and the response is
    written to ``<doi with '/' replaced by '-'>.xml`` inside *output_folder*.

    Args:
        doi: DOI of the article to fetch.
        output_folder: Existing directory the XML file is written into.
        filenames_dict: Mapping of already-downloaded DOIs to file paths.
            Keys may be raw DOIs or the form produced by
            ``get_filenames_dict`` (every '-' turned into '/').
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection cannot block the whole run.

    Returns:
        Path of the existing or newly saved XML file, or ``None`` when the
        download failed for any network/HTTP reason.
    """
    # Skip if DOI already exists in filenames dictionary. The index built
    # from file names has '-' replaced by '/', so a DOI that itself contains
    # hyphens must be normalised the same way to be found.
    for lookup_key in (doi, doi.replace('-', '/')):
        if lookup_key in filenames_dict:
            print(f"Skipping {doi}, already downloaded.")
            return filenames_dict[lookup_key]

    # API credentials are left blank here and must be filled in before use.
    headers = {
        "X-ELS-APIKEY": "",
        'Accept': 'text/xml',
        'X-ELS-Insttoken': "",
    }
    _url_base = 'https://api.elsevier.com/content/article/'

    # Replace '/' with '-' in the DOI (also used as the output file name).
    # NOTE(review): the dashed form is sent in the URL as well; confirm the
    # API accepts it instead of the DOI with its original '/'.
    doi_formatted = doi.replace('/', '-')

    url = _url_base + 'doi/' + str(doi_formatted)

    try:
        res = requests.get(url, headers=headers, timeout=timeout)
        res.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)

    except ProxyError as proxy_error:
        print(f"ProxyError: {proxy_error}. Skipping {doi}")
        return None

    except SSLError as ssl_error:
        print(f"SSLError: {ssl_error}. Skipping {doi}")
        return None

    except HTTPError as http_error:
        response = http_error.response
        if response is not None and response.status_code == 404:
            print(f"HTTPError 404: {http_error}. Skipping {doi}")
        else:
            print(f"HTTPError: {http_error}. Skipping {doi}")
        return None

    except requests.exceptions.RequestException as request_error:
        # Timeouts, refused connections, DNS failures, etc. must not abort
        # the whole batch; treat them like the other skipped downloads.
        print(f"RequestError: {request_error}. Skipping {doi}")
        return None

    # raise_for_status only covers 4xx/5xx; anything else that is not a
    # plain 200 is still treated as a failed download.
    if res.status_code != 200:
        print(f"Failed to download {doi}. Status Code: {res.status_code}")
        return None

    print(f"Downloading {doi}...")
    saved_path = os.path.join(output_folder, doi_formatted + '.xml')
    with open(saved_path, 'wb') as f:
        f.write(res.content)
    print(f"Saved {doi_formatted}.xml")
    return saved_path

def process_folder(input_folder, output_base_folder):
    """Download the article XML for every DOI listed under *input_folder*.

    The input tree is walked recursively. Each ``.json`` file is expected to
    hold a JSON object whose keys are DOIs. Downloads go to a folder under
    *output_base_folder* that mirrors the JSON file's location relative to
    *input_folder*.

    Args:
        input_folder: Root directory containing the JSON files.
        output_base_folder: Root directory for downloaded files; it is also
            scanned up front to find files that already exist.
    """
    # Get the dictionary of filenames
    filenames_dict = get_filenames_dict(output_base_folder)

    for root, _, files in tqdm(os.walk(input_folder), desc="Processing Files", unit="file"):
        # The destination depends only on the directory being visited, so it
        # is computed once here instead of once per DOI.
        output_subfolder = os.path.join(output_base_folder, os.path.relpath(root, input_folder))
        output_subfolder = output_subfolder.replace("Processed_filter_three_240102", "downloaded_filtered_three_240102")

        for file_name in files:
            if not file_name.endswith('.json'):
                continue

            json_file_path = os.path.join(root, file_name)
            # Read the DOIs and close the file right away rather than keeping
            # the handle open for the whole (slow) download loop.
            with open(json_file_path, 'r', encoding='utf-8') as f:
                dois_dict = json.load(f)

            # Extract DOIs from the loaded JSON dictionary
            dois = list(dois_dict.keys())

            # Only create the folder when something will be downloaded to it.
            if dois:
                os.makedirs(output_subfolder, exist_ok=True)

            # Process each DOI and download the PDFs
            for doi in tqdm(dois, desc=f"Downloading PDFs for {file_name}"):
                saved_file_path = download_pdf(doi, output_subfolder, filenames_dict)
                if saved_file_path:
                    print(f"File saved at: {saved_file_path}")
                else:
                    print(f"Download failed for {doi}")

# Root folder that is searched recursively for the JSON files listing DOIs
input_folder = 'location to input folder'
# Root folder that receives the downloaded XML files
output_base_folder = 'location to output folder'

# Run the download for every JSON file found under the input folder
process_folder(input_folder=input_folder, output_base_folder=output_base_folder)
