# Method #1: save each article as an XML file (LimeSoup cleaning needed afterwards)

In [None]:
import os
import requests
from tqdm import tqdm
from bs4 import BeautifulSoup
import json
from io import BytesIO

def clean_filepath(filepath):
    """Return *filepath* with each run of consecutive "/" squeezed to one "/".

    Only forward slashes are affected; every other character is kept as is.
    """
    kept = []
    previous = ""
    for ch in filepath:
        # Drop a slash that directly follows another slash.
        if ch == "/" and previous == "/":
            continue
        kept.append(ch)
        previous = ch
    return "".join(kept)

def request_article(doi, timeout=30):
    """Download the JATS XML record for *doi* from the Springer Nature endpoint.

    Args:
        doi: DOI string; inserted verbatim into the ``q=doi:`` query.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The raw response body (bytes) on success. ``False`` when the request
        fails, the server answers with an HTTP error status, or the response
        contains ``<total>0</total>`` (no record found).
    """
    key = ''  # API key (left empty in this file)
    base_url = 'https://spdi.public.springernature.app/xmldata/jats'
    url = f"{base_url}?q=doi:{doi}&api_key={key}/unsw-api"
    try:
        # Without a timeout a stalled connection would block the whole run.
        response = requests.get(url, timeout=timeout)
        # An HTTP error page must not be mistaken for article XML.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error downloading content for DOI {doi}: {e}")
        return False

    soup = BeautifulSoup(response.content, features="lxml")
    title = str(soup.find('total'))
    if title == '<total>0</total>':
        return False
    return response.content

def gather_existing_dois(output_folder):
    """Collect the DOIs that already have an ``.xml`` file under *output_folder*.

    Files are saved with every "/" of the DOI written as "-", which cannot be
    reversed unambiguously because DOIs may contain hyphens themselves. Two
    candidate decodings are therefore returned for each file:

    * every hyphen turned back into a slash (the historical behaviour), and
    * only the first hyphen turned back into a slash, which is right for the
      usual ``prefix/suffix`` DOI whose suffix contains hyphens.

    Args:
        output_folder: Directory that is walked recursively.

    Returns:
        A set of DOI strings (empty if the folder does not exist).
    """
    suffix = ".xml"
    existing_dois = set()
    for _root, _dirs, files in os.walk(output_folder):
        for file in files:
            if not file.endswith(suffix):
                continue
            # Strip only the trailing extension, not every ".xml" occurrence.
            stem = file[:-len(suffix)]
            existing_dois.add(stem.replace("-", "/"))
            existing_dois.add(stem.replace("-", "/", 1))
    return existing_dois

def process_json_files(main_folder, output_folder, fail_record_folder):
    """Download the XML for every DOI listed in the JSON files under *main_folder*.

    Each ``.json`` file is loaded as a mapping whose values carry a ``"doi"``
    key. For every DOI that has neither been saved before nor been recorded
    as failed, the article is requested and written to
    ``<output_folder>/<sub-path relative to main_folder>/<doi with "/" as "-">.xml``.

    Args:
        main_folder: Root folder that is walked recursively for ``.json`` files.
        output_folder: Root folder that receives the ``.xml`` files.
        fail_record_folder: Folder holding ``fail_record.json``; DOIs listed
            there are skipped, and DOIs that fail in this run are added.

    The fail record is written even when the run is interrupted by an error,
    so failures seen so far are not lost.
    """
    fail_record_file_path = os.path.join(fail_record_folder, "fail_record.json")
    failed_dois = set()

    if os.path.exists(fail_record_file_path):
        with open(fail_record_file_path, 'r') as fail_record_file:
            fail_record = json.load(fail_record_file)
            failed_dois = set(fail_record.get("failed_dois", []))

    # gather_existing_dois turns hyphens in file names back into slashes, which
    # is ambiguous for DOIs that contain hyphens themselves. Comparing in
    # file-name form (every "/" written as "-") avoids that ambiguity.
    existing_names = {
        found.replace("/", "-") for found in gather_existing_dois(output_folder)
    }

    try:
        for root, dirs, files in os.walk(main_folder):
            # relpath is correct whether or not main_folder ends with a
            # separator; slicing by len(main_folder) + 1 was not.
            rel_root = os.path.relpath(root, main_folder)
            if rel_root == os.curdir:
                rel_root = ""

            for file in tqdm(files, desc="Processing JSON files", unit="file"):
                if not file.endswith(".json"):
                    continue

                with open(os.path.join(root, file), 'r') as json_file:
                    data = json.load(json_file)

                for entry in data.values():
                    doi = entry["doi"]
                    doi_for_filename = doi.replace("/", "-")

                    # Skip if DOI has been processed or exists
                    if doi_for_filename in existing_names or doi in failed_dois:
                        print(f"Skipping {doi}, already processed.")
                        continue

                    article_downloaded = request_article(doi)
                    if not article_downloaded:
                        # Mark DOI as failed
                        failed_dois.add(doi)
                        continue

                    cleaned_output_file_path = clean_filepath(
                        os.path.join(output_folder, rel_root, f"{doi_for_filename}.xml")
                    )
                    output_subfolder = os.path.dirname(cleaned_output_file_path)
                    if output_subfolder:  # os.makedirs("") would raise
                        os.makedirs(output_subfolder, exist_ok=True)

                    # Save the XML data directly to file
                    with open(cleaned_output_file_path, 'wb') as xml_file:
                        xml_file.write(article_downloaded)

                    # Avoid downloading the same DOI twice within one run.
                    existing_names.add(doi_for_filename)
    finally:
        # Save the fail record, also when the loop above was interrupted.
        fail_record = {"failed_dois": sorted(failed_dois)}
        with open(clean_filepath(fail_record_file_path), 'w') as fail_record_file:
            json.dump(fail_record, fail_record_file, indent=2)

# Specify the main folder path, output folder path, and fail record folder path
main_folder_path = "path to folder containing json metadata files"
output_folder_path = "path to output folder"
# With an empty string, fail_record.json is read from / written to the current
# working directory.
fail_record_folder_path = ""

# Create the fail record folder if it doesn't exist.
# os.makedirs("") raises FileNotFoundError, so skip it when no folder is set.
if fail_record_folder_path:
    os.makedirs(fail_record_folder_path, exist_ok=True)

# Call the main function
process_json_files(main_folder_path, output_folder_path, fail_record_folder_path)


# Method #2: save JSON-formatted content in files that keep the .xml extension (xml2json conversion needed)

In [None]:
import os
import requests
from tqdm import tqdm
from bs4 import BeautifulSoup
import json
from io import BytesIO
from chemdataextractor.doc import Document, Heading, Paragraph
from chemdataextractor.reader import NlmXmlReader

def sanitize_filepath(filepath):
    """Normalise separators and replace colons in *filepath*.

    Backslashes become forward slashes and colons become dots. A leading
    Windows drive specifier (e.g. ``C:``) keeps its colon; replacing it would
    turn an absolute path into a relative one starting with ``C.``. On
    platforms without drive letters ``os.path.splitdrive`` returns an empty
    drive, so every colon is replaced.

    Args:
        filepath: Path string to sanitise.

    Returns:
        The sanitised path string.
    """
    drive, rest = os.path.splitdrive(filepath)
    # Replace backslashes with slashes
    drive = drive.replace("\\", "/")
    rest = rest.replace("\\", "/")
    # Replace colons with dots (outside the drive specifier only)
    rest = rest.replace(":", ".")
    return drive + rest

def clean_filepath(filepath):
    """Run *filepath* through sanitize_filepath, then squeeze repeated "/".

    Each run of consecutive forward slashes in the sanitized path is reduced
    to a single slash.
    """
    sanitized = sanitize_filepath(filepath)
    kept = []
    previous = ""
    for ch in sanitized:
        # Drop a slash that directly follows another slash.
        if ch == "/" and previous == "/":
            continue
        kept.append(ch)
        previous = ch
    return "".join(kept)

def request_article(doi, timeout=30):
    """Download the JATS XML record for *doi* from the Springer Nature endpoint.

    Args:
        doi: DOI string; inserted verbatim into the ``q=doi:`` query.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The raw response body (bytes) on success. ``False`` when the request
        fails, the server answers with an HTTP error status, or the response
        contains ``<total>0</total>`` (no record found).
    """
    key = ''  # API key (left empty in this file)
    base_url = 'https://spdi.public.springernature.app/xmldata/jats'
    url = base_url + '?q=doi:' + doi + '&api_key=' + key + '/unsw-api'
    try:
        # Without a timeout a stalled connection would block the whole run.
        response = requests.get(url, timeout=timeout)
        # An HTTP error page must not be mistaken for article XML.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error downloading content for DOI {doi}: {e}")
        return False

    soup = BeautifulSoup(response.content, features="lxml")
    title = str(soup.find('total'))
    if title == '<total>0</total>':
        return False
    return response.content

def gather_existing_dois(output_folder):
    """Collect the DOIs that already have an ``.xml`` file under *output_folder*.

    Files are saved with every "/" of the DOI written as "-", which cannot be
    reversed unambiguously because DOIs may contain hyphens themselves. Two
    candidate decodings are therefore returned for each file:

    * every hyphen turned back into a slash (the historical behaviour), and
    * only the first hyphen turned back into a slash, which is right for the
      usual ``prefix/suffix`` DOI whose suffix contains hyphens.

    Args:
        output_folder: Directory that is walked recursively.

    Returns:
        A set of DOI strings (empty if the folder does not exist).
    """
    suffix = ".xml"
    existing_dois = set()
    for _root, _dirs, files in os.walk(output_folder):
        for file in files:
            if not file.endswith(suffix):
                continue
            # Strip only the trailing extension, not every ".xml" occurrence.
            stem = file[:-len(suffix)]
            existing_dois.add(stem.replace("-", "/"))
            existing_dois.add(stem.replace("-", "/", 1))
    return existing_dois

def _collect_sections(doc):
    """Group the long paragraphs of *doc* under the heading that precedes them.

    Returns a list of ``{"name": <heading text>, "content": [<paragraph>, ...]}``
    dicts. Paragraphs of 100 or fewer space-separated words and paragraphs
    starting with "Open Access" are left out; sections that end up without
    content are omitted. Paragraphs before the first heading go into a
    section named "".
    """
    sections = []
    current = {"name": "", "content": []}

    for element in doc.elements:
        if isinstance(element, Heading):
            if current["content"]:
                sections.append(current)
            current = {"name": str(element), "content": []}
        if isinstance(element, Paragraph):
            text = str(element)
            if len(text.split(' ')) > 100:
                # Strip newline characters (the pattern used to be the
                # literal two characters "/n", which never matched one).
                text = text.replace('\n', '')
                if not text.startswith("Open Access"):
                    current["content"].append(text)

    # The section that is still open when the document ends must be kept too.
    if current["content"]:
        sections.append(current)
    return sections


def process_json_files(main_folder, output_folder, fail_record_folder):
    """Download, parse and store every DOI listed in the JSON files under *main_folder*.

    Each ``.json`` file is loaded as a mapping whose values carry a ``"doi"``
    key. For every DOI that has neither been saved before nor been recorded
    as failed, the article XML is requested, parsed with ``NlmXmlReader`` and
    reduced to a ``{"Sections": [...]}`` structure, which is written as JSON
    text to
    ``<output_folder>/<sub-path relative to main_folder>/<doi with "/" as "-">.xml``
    (the path is passed through ``clean_filepath`` first).

    Args:
        main_folder: Root folder that is walked recursively for ``.json`` files.
        output_folder: Root folder that receives the output files.
        fail_record_folder: Folder holding ``fail_record.json``; DOIs listed
            there are skipped, and DOIs that fail in this run are added.

    The fail record is written even when the run is interrupted by an error,
    so failures seen so far are not lost.
    """
    fail_record_file_path = os.path.join(fail_record_folder, "fail_record.json")
    failed_dois = set()
    if os.path.exists(fail_record_file_path):
        with open(fail_record_file_path, 'r') as fail_record_file:
            fail_record = json.load(fail_record_file)
            failed_dois = set(fail_record.get("failed_dois", []))

    # gather_existing_dois turns hyphens in file names back into slashes, which
    # is ambiguous for DOIs that contain hyphens themselves. Comparing in
    # file-name form (every "/" written as "-") avoids that ambiguity.
    existing_names = {
        found.replace("/", "-") for found in gather_existing_dois(output_folder)
    }
    extension = ".xml"

    try:
        for root, dirs, files in os.walk(main_folder):
            # relpath is correct whether or not main_folder ends with a
            # separator; slicing by len(main_folder) + 1 was not.
            rel_root = os.path.relpath(root, main_folder)
            if rel_root == os.curdir:
                rel_root = ""

            for file in tqdm(files, desc="Processing JSON files", unit="file"):
                if not file.endswith(".json"):
                    continue

                with open(os.path.join(root, file), 'r') as json_file:
                    data = json.load(json_file)

                # Extract dois
                for entry in data.values():
                    doi = entry["doi"]
                    doi_for_filename = doi.replace("/", "-")
                    cleaned_output_file_path = clean_filepath(
                        os.path.join(output_folder, rel_root, doi_for_filename + extension)
                    )
                    # Name as it will appear on disk (after clean_filepath).
                    saved_name = os.path.basename(cleaned_output_file_path)[:-len(extension)]

                    # Skip if DOI has been processed or exists
                    if saved_name in existing_names or doi in failed_dois:
                        print(f"Skipping {doi}, already processed.")
                        continue

                    article_downloaded = request_article(doi)
                    if not article_downloaded:
                        # Mark DOI as failed
                        failed_dois.add(doi)
                        continue

                    try:
                        doc = Document.from_file(
                            BytesIO(article_downloaded), readers=[NlmXmlReader()]
                        )
                    except Exception:
                        # A bare "except:" would also swallow KeyboardInterrupt.
                        print('Failed to read DOI:', doi)
                        # Mark DOI as failed
                        failed_dois.add(doi)
                        continue

                    # Store as JSON text
                    store_json = {"Sections": _collect_sections(doc)}
                    json_str = json.dumps(store_json, indent=4)

                    output_subfolder = os.path.dirname(cleaned_output_file_path)
                    if output_subfolder:  # os.makedirs("") would raise
                        os.makedirs(output_subfolder, exist_ok=True)
                    with open(cleaned_output_file_path, 'w', encoding='utf-8') as out_file:
                        out_file.write(json_str)

                    # Avoid processing the same DOI twice within one run.
                    existing_names.add(saved_name)
    finally:
        # Save the fail record, also when the loop above was interrupted.
        fail_record = {"failed_dois": sorted(failed_dois)}
        with open(clean_filepath(fail_record_file_path), 'w') as fail_record_file:
            json.dump(fail_record, fail_record_file, indent=2)

# Specify the main folder path, output folder path, and fail record folder path
main_folder_path = ""
output_folder_path = ""
# With an empty string, fail_record.json is read from / written to the current
# working directory.
fail_record_folder_path = ""

# Create the fail record folder if it doesn't exist.
# os.makedirs("") raises FileNotFoundError, so skip it while the path is blank.
if fail_record_folder_path:
    os.makedirs(fail_record_folder_path, exist_ok=True)

# Call the main function
process_json_files(main_folder_path, output_folder_path, fail_record_folder_path)


# Method #3: save each article's full text under a single key/value pair in a JSON file (no cleaning needed)

In [None]:
import os
import json
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

def request_article(doi, timeout=30):
    """Fetch the record for *doi* and return the text of its ``<body>`` element.

    Args:
        doi: DOI string; inserted verbatim into the ``q=doi:`` query.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The body text as a string. ``False`` when the request fails, the
        server answers with an HTTP error status, the response contains
        ``<total>0</total>``, or the parsed response has no ``<body>`` element.
    """
    key = ''  # API key (left empty in this file)
    base_url = 'https://spdi.public.springernature.app/xmldata/jats'
    url = base_url + '?q=doi:' + doi + '&api_key=' + key + '/unsw-api'
    try:
        # Without a timeout a stalled connection would block the whole run.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)
    except requests.exceptions.RequestException as e:
        print(f"Error downloading content for DOI {doi}: {e}")
        return False

    soup = BeautifulSoup(response.content, features="lxml")
    title = str(soup.find('total'))
    if title == '<total>0</total>':
        return False

    body = soup.find('body')
    if body is None:
        # Nothing to extract; calling get_text() on None would raise.
        return False
    return body.get_text()

def process_json_files(main_folder, output_folder, fail_record_folder):
    """Download the text of every DOI keyed in the JSON files under *main_folder*.

    Each ``.json`` file is loaded as a mapping and its keys are used as DOIs.
    For every DOI without an existing output file, the article text is
    requested and stored as ``{"doi": ..., "downloaded_text": ...}`` in
    ``<output_folder>/<sub-path relative to main_folder>/<doi with "/" as "-">.json``.
    DOIs whose download fails are appended to ``fail_record.json`` in
    *fail_record_folder*; previously failed DOIs are retried on each run but
    recorded only once.

    Args:
        main_folder: Root folder that is walked recursively for ``.json`` files.
        output_folder: Root folder that receives the output files.
        fail_record_folder: Folder for ``fail_record.json`` (created on demand).
    """
    # Load the fail record once instead of re-reading it for every failure.
    fail_record_file_path = os.path.join(fail_record_folder, "fail_record.json")
    fail_record = {"failed_dois": []}
    if os.path.exists(fail_record_file_path):
        with open(fail_record_file_path, 'r') as fail_record_file:
            fail_record = json.load(fail_record_file)
    failed_list = fail_record.setdefault("failed_dois", [])
    already_recorded = set(failed_list)

    for root, dirs, files in os.walk(main_folder):
        # relpath is correct whether or not main_folder ends with a
        # separator; slicing by len(main_folder) + 1 was not.
        rel_root = os.path.relpath(root, main_folder)
        if rel_root == os.curdir:
            rel_root = ""
        output_subfolder = os.path.join(output_folder, rel_root)

        for file in tqdm(files, desc="Processing JSON files", unit="file"):
            if not file.endswith(".json"):
                continue

            with open(os.path.join(root, file), 'r') as json_file:
                data = json.load(json_file)

            # NOTE(review): the mapping keys are used as DOIs and the values
            # are ignored here -- confirm this matches the metadata layout.
            for doi in tqdm(data, desc="Processing DOIs", unit="doi"):
                print(f"Processing DOI: {doi}")

                # Check if the file already exists in the output folder
                doi_for_filename = doi.replace("/", "-")
                output_file_path = os.path.join(output_subfolder, f"{doi_for_filename}.json")

                if os.path.exists(output_file_path):
                    print(f"Skipping {doi}, file already exists.")
                    continue

                article_text = request_article(doi)

                if article_text:
                    if output_subfolder:  # os.makedirs("") would raise
                        os.makedirs(output_subfolder, exist_ok=True)

                    # Save downloaded content to a new JSON file
                    with open(output_file_path, 'w') as output_json_file:
                        json.dump({"doi": doi, "downloaded_text": article_text}, output_json_file, indent=2)
                elif doi not in already_recorded:
                    # Record each failed DOI once, and persist immediately so
                    # the record survives an interrupted run.
                    already_recorded.add(doi)
                    failed_list.append(doi)

                    if fail_record_folder:
                        os.makedirs(fail_record_folder, exist_ok=True)
                    with open(fail_record_file_path, 'w') as fail_record_file:
                        json.dump(fail_record, fail_record_file, indent=2)

# Folders used by this run: JSON metadata input, downloaded output, fail record
main_folder_path = ''
output_folder_path = ''
fail_record_folder_path = ''

# Run the download over every JSON file found under the main folder
process_json_files(
    main_folder=main_folder_path,
    output_folder=output_folder_path,
    fail_record_folder=fail_record_folder_path,
)
