<a href="https://colab.research.google.com/github/IKKEM-Lin/colab/blob/main/extractor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data preparation


In [None]:
!curl -Lo "jacs.zip" https://tmp-hd105.vx-cdn.com/file-6694d77ca7e31-6694d7a14f681/jacs.zip
!unzip jacs.zip
!curl -Lo "jacs_dois.json" https://tmp-hd101.vx-cdn.com/file-6694dbd2d3923-6694dbe659cbe/jacs_dois.json

# Main process

In [None]:
!pip install html5lib
!pip install BeautifulSoup4

In [27]:
import gzip
import html5lib
from bs4 import BeautifulSoup
import email
from google.colab import userdata
import json

# CSS selectors used to pull content out of an article page, keyed by DOI prefix
# (the part of the DOI before the first "/").
#   abstract_selector  - list of selectors, tried in order, for the abstract
#   paragraph_selector - list of selectors, tried in order, for body paragraphs
#   ref_selector       - single selector for reference links inside a paragraph
publicationConfig = {
    "10.1021": {
        "abstract_selector": ["p.articleBody_abstractText"],
        "paragraph_selector": [
            "div.NLM_p",
            ".article_content-left > p",
        ],
        "ref_selector": 'a[class^="ref ref"]',
    },
    "10.1002": {
        "abstract_selector": [
            ".article__body .abstract-group .article-section__abstract:last-child .article-section__content",
        ],
        "paragraph_selector": [
            ".article__body .article-section__full :where(.article-section__content > p, .article-section__sub-content > p)",
        ],
        "ref_selector": 'a[data-tab="pane-pcw-references"]',
    },
    "10.1007": {
        "abstract_selector": ["#Abs1-content p"],
        "paragraph_selector": [".main-content .c-article-section__content > p"],
        "ref_selector": 'a[id^="ref"]',
    },
    "10.1039": {
        "abstract_selector": ["article .capsule__text"],
        "paragraph_selector": ["#pnlArticleContentLoaded > p"],
        "ref_selector": 'a[href^="#cit"]',
    },
    "10.1016": {
        "abstract_selector": ["div.abstract.author > div"],
        "paragraph_selector": [
            "div#body > div:first-child > section[id^=s] p[id^=p]",
            "div#body > div:first-child  :where(section[id^=aep-section] > p, section[id^=aep-section] div > p)",
        ],
        "ref_selector": ':where(a.workspace-trigger, a[href^="#bib"])',
    },
    "10.1038": {
        "abstract_selector": ["#Abs1-content"],
        "paragraph_selector": ["article .main-content .c-article-section__content > p"],
        "ref_selector": 'a[data-test="citation-ref"]',
    },
    "10.1126": {
        "abstract_selector": ['[role="doc-abstract"] > [role="paragraph"]'],
        "paragraph_selector": ['#bodymatter [role="paragraph"]'],
        "ref_selector": 'a[role="doc-biblioref"]',
    },
}

In [32]:
def get_article_html(doi, base_dir="./jacs"):
    """Read the gzip-compressed HTML file stored for ``doi``.

    The file is looked up as ``<base_dir>/<doi with "/" replaced by "_">.html.gz``.

    Args:
        doi: Article DOI string; every "/" is replaced by "_" to build the file name.
        base_dir: Directory containing the ``*.html.gz`` files. Defaults to
            ``"./jacs"``, the location that was previously hard-coded.

    Returns:
        bytes: The decompressed file content. It is NOT decoded to ``str``.

    Raises:
        FileNotFoundError: If no file exists for ``doi`` under ``base_dir``.
    """
    file_name = f"{doi.replace('/', '_')}.html.gz"
    with gzip.open(f"{base_dir}/{file_name}", "rb") as archive:
        return archive.read()

def get_abstract(soup, abstract_selector):
    """Return the stripped text of the first abstract element that matches.

    ``abstract_selector`` is an iterable of CSS selectors. They are tried in order
    with ``soup.select_one`` and the search stops at the first truthy match.
    An empty string is returned when nothing matches.
    """
    # Generator keeps the lookup lazy: selectors after the first hit are never queried.
    candidates = (soup.select_one(css) for css in abstract_selector)
    first_hit = next((node for node in candidates if node), None)
    if first_hit is None:
        return ""
    return first_hit.text.strip()

def get_paragraphs(soup, paragraph_selector, ref_selector):
    """Collect body paragraphs and the reference markers found inside them.

    The selectors in ``paragraph_selector`` are tried in order; the first one that
    matches at least one element supplies the paragraphs.

    Note:
        Elements matching ``ref_selector`` are removed from each paragraph with
        ``extract()`` before its text is read, so ``soup`` is mutated and the
        returned paragraph text does not contain the reference markers.

    Args:
        soup: Parsed document exposing ``select``.
        paragraph_selector: Iterable of CSS selectors, tried in order.
        ref_selector: CSS selector for reference links inside a paragraph.

    Returns:
        list[dict]: One dict per paragraph with keys ``"text"`` (stripped paragraph
        text) and ``"refs"`` (de-duplicated, stripped reference texts in
        first-appearance order). Empty list when no selector matches.
    """
    paragraphs = []
    for selector in paragraph_selector:
        paragraphs = soup.select(selector)
        if paragraphs:
            break

    result = []
    for p in paragraphs:
        # Detach the reference links first so they do not leak into p.text below.
        refs = [ref.extract() for ref in p.select(ref_selector)]
        ref_labels = (ref.text.strip() for ref in refs)
        result.append({
            "text": p.text.strip(),
            # dict.fromkeys de-duplicates while keeping document order, unlike set(),
            # whose iteration order for strings can differ between runs.
            "refs": list(dict.fromkeys(ref_labels)),
        })
    return result

# Main function
def get_article_content(doi):
    """Extract the abstract and body paragraphs of the stored article for ``doi``.

    The stored HTML is parsed with html5lib, re-serialized, and then loaded into
    BeautifulSoup. The selector set is chosen from ``publicationConfig`` using the
    DOI prefix (the text before the first "/").

    Returns:
        dict: ``{"abstract": str, "paragraphs": list[str]}``.

    Raises:
        KeyError: If the DOI prefix has no entry in ``publicationConfig``.
    """
    raw_html = get_article_html(doi)
    normalised_html = html5lib.serialize(
        html5lib.parse(raw_html),
        encoding="utf-8",
        omit_optional_tags=False,
    )
    soup = BeautifulSoup(normalised_html, 'html.parser')

    doi_prefix = doi.split("/")[0]
    selectors = publicationConfig[doi_prefix]

    # The abstract is read before get_paragraphs runs, because get_paragraphs
    # removes reference links from the soup.
    abstract_text = get_abstract(soup, selectors["abstract_selector"])
    paragraph_records = get_paragraphs(
        soup,
        selectors["paragraph_selector"],
        selectors["ref_selector"],
    )
    return {
        "abstract": abstract_text,
        "paragraphs": [record["text"] for record in paragraph_records],
    }

# Example

In [29]:
# Load the DOI list downloaded in the data-preparation step. The encoding is given
# explicitly so the read does not depend on the platform's default text encoding.
with open("jacs_dois.json", encoding="utf-8") as doi_file:
    dois = json.load(doi_file)

In [34]:
# Run the extractor on the first entry of the loaded DOI list.
doi = dois[0]
ss = get_article_content(doi)

In [None]:
# Display the extracted result: a dict with "abstract" and "paragraphs" keys.
ss