<a href="https://colab.research.google.com/github/IKKEM-Lin/colab/blob/main/extractor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install minio
!pip install html5lib
!pip install BeautifulSoup4

In [2]:
from minio import Minio
import gzip
import html5lib
from bs4 import BeautifulSoup
import email
from google.colab import userdata

# MinIO configuration. MINIO_ACCESS_KEY and MINIO_SECRET_KEY are credentials and
# must never be shared or exposed; they are read from the Colab `userdata`
# store under the names 'AK' and 'SK' instead of being written in the notebook.

MINIO_ACCESS_KEY = userdata.get('AK')
MINIO_SECRET_KEY = userdata.get('SK')

minio_client = Minio('minio.hzc.pub', access_key=MINIO_ACCESS_KEY, secret_key=MINIO_SECRET_KEY)
# When running locally ("local 306"), use the LAN endpoint below instead
# (note that it passes secure=False):
# minio_client = Minio("192.168.1.219:9000", access_key=MINIO_ACCESS_KEY, secret_key=MINIO_SECRET_KEY, secure=False)

# CSS selectors used to pull content out of an article page, keyed by DOI
# prefix (the part of the DOI before the first "/"). Each entry holds:
#   abstract_selector  - list of selectors tried in order; the first element
#                        found supplies the abstract text
#   paragraph_selector - list of selectors tried in order; the first selector
#                        that matches anything supplies the body paragraphs
#   ref_selector       - one selector for the in-paragraph citation links,
#                        which are removed from the paragraph text
# NOTE(review): the prefixes presumably map to ACS (10.1021), Wiley (10.1002),
# Springer (10.1007), RSC (10.1039), Elsevier (10.1016), Nature (10.1038) and
# Science/AAAS (10.1126) - confirm before relying on this.
publicationConfig = {
    "10.1021": {
        "abstract_selector": ['p.articleBody_abstractText'],
        "paragraph_selector": ['div.NLM_p', ".article_content-left > p"],
        "ref_selector": 'a[class^="ref ref"]',
    },
    "10.1002": {
        "abstract_selector": [".article__body .abstract-group .article-section__abstract:last-child .article-section__content"],
        "paragraph_selector": ['.article__body .article-section__full :where(.article-section__content > p, .article-section__sub-content > p)'],
        "ref_selector": 'a[data-tab="pane-pcw-references"]',
    },
    "10.1007": {
        "abstract_selector": ["#Abs1-content p"],
        "paragraph_selector": ['.main-content .c-article-section__content > p'],
        "ref_selector": 'a[id^="ref"]',
    },
    "10.1039": {
        "abstract_selector": ["article .capsule__text"],
        "paragraph_selector": ['#pnlArticleContentLoaded > p'],
        "ref_selector": 'a[href^="#cit"]',
    },
    "10.1016": {
        "abstract_selector": ['div.abstract.author > div'],
        "paragraph_selector": ['div#body > div:first-child > section[id^=s] p[id^=p]', 'div#body > div:first-child  :where(section[id^=aep-section] > p, section[id^=aep-section] div > p)'],
        "ref_selector": ':where(a.workspace-trigger, a[href^="#bib"])',
    },
    "10.1038": {
        "abstract_selector": ['#Abs1-content'],
        "paragraph_selector": ['article .main-content .c-article-section__content > p'],
        "ref_selector": 'a[data-test="citation-ref"]',
    },
    "10.1126": {
        "abstract_selector": ['[role="doc-abstract"] > [role="paragraph"]'],
        "paragraph_selector": ['#bodymatter [role="paragraph"]'],
        "ref_selector": 'a[role="doc-biblioref"]',
    }
}

In [3]:
def extract_html_from_mhtml(mhtml_text):
    """Return the first ``text/html`` part of an MHTML archive as a string.

    Args:
        mhtml_text: The raw MHTML document as ``bytes`` (despite the name, it
            is passed to ``email.message_from_bytes``).

    Returns:
        The decoded HTML of the first ``text/html`` MIME part that carries a
        payload, or ``None`` when the archive contains no such part.
    """
    msg = email.message_from_bytes(mhtml_text)
    for part in msg.walk():
        if part.get_content_type() != 'text/html':
            continue
        payload = part.get_payload(decode=True)
        if payload is None:
            # No decodable body on this part; keep looking.
            continue
        # Honour the charset declared on the MIME part; pages are not always
        # UTF-8. Fall back to UTF-8 when none is declared.
        charset = part.get_content_charset() or 'utf-8'
        try:
            return payload.decode(charset, errors='replace')
        except LookupError:
            # Declared charset is unknown to Python's codec registry.
            return payload.decode('utf-8', errors='replace')
    return None

# Get html from minio by doi key
def get_article_html(doi):
    """Download the stored page for *doi* from the "chem-brain" bucket.

    The gzip-compressed single-file HTML object ``{doi}/_.sf.html.gz`` is
    tried first; if that request fails, the MHTML object
    ``{doi}/_.mhtml.gz`` is fetched instead and its HTML part is extracted.

    Args:
        doi: DOI string used as the object key prefix.

    Returns:
        The decompressed page. This is ``bytes`` for the single-file HTML
        object and whatever ``extract_html_from_mhtml`` returns for the
        MHTML object.

    Raises:
        Exception: Whatever the MinIO client raises when the fallback MHTML
            object cannot be fetched either.
    """
    try:
        response = minio_client.get_object("chem-brain", f"{doi}/_.sf.html.gz")
        is_mhtml = False
    except Exception:
        # Any failure on the primary object falls back to the MHTML copy,
        # but KeyboardInterrupt/SystemExit are no longer swallowed.
        response = minio_client.get_object("chem-brain", f"{doi}/_.mhtml.gz")
        is_mhtml = True

    try:
        compressed = response.read()
    finally:
        # Return the HTTP connection to the pool even if the read fails.
        response.close()
        response.release_conn()

    page = gzip.decompress(compressed)
    if is_mhtml:
        page = extract_html_from_mhtml(page)
    return page

def get_abstract(soup, abstract_selector):
    """Return the stripped text of the first abstract element found.

    The selectors in *abstract_selector* are tried in order against *soup*;
    the first one that yields an element wins. An empty string is returned
    when none of them match.
    """
    candidates = (soup.select_one(css) for css in abstract_selector)
    first_hit = next((node for node in candidates if node), None)
    if not first_hit:
        return ""
    return first_hit.text.strip()

def get_paragraphs(soup, paragraph_selector, ref_selector):
    """Collect body paragraphs and the citation labels found inside them.

    Args:
        soup: Parsed document exposing ``select``.
        paragraph_selector: List of CSS selectors tried in order; the first
            one that matches anything supplies the paragraphs.
        ref_selector: CSS selector for citation links inside a paragraph.

    Returns:
        A list of dicts, one per paragraph, with keys ``"text"`` (paragraph
        text with the citation links removed, stripped) and ``"refs"`` (the
        distinct stripped citation labels in first-seen order). The list is
        empty when no selector matches.

    Note:
        The citation elements are extracted from the tree, so *soup* is
        modified in place.
    """
    paragraphs = []
    for selector in paragraph_selector:
        paragraphs = soup.select(selector)
        if paragraphs:
            break

    result = []
    for p in paragraphs:
        # extract() detaches each citation link from the paragraph, so this
        # must run before p.text is read.
        refs = [ref.extract() for ref in p.select(ref_selector)]
        labels = [ref.text.strip() for ref in refs]
        result.append({
            "text": p.text.strip(),
            # dict.fromkeys de-duplicates while keeping a stable order.
            "refs": list(dict.fromkeys(labels)),
        })
    return result

# Main function
def get_article_content(doi):
    """Fetch the stored page for *doi* and extract its abstract and body text.

    Args:
        doi: DOI string; the part before the first "/" selects the entry of
            ``publicationConfig`` to use.

    Returns:
        A dict with ``"abstract"`` (a string, empty when nothing matched) and
        ``"paragraphs"`` (a list of paragraph strings with the citation
        links removed).

    Raises:
        KeyError: If the DOI prefix has no entry in ``publicationConfig``.
    """
    # Resolve the selectors first so an unsupported prefix fails before any
    # download or parsing work is done.
    prefix = doi.split("/")[0]
    try:
        publication_config = publicationConfig[prefix]
    except KeyError:
        raise KeyError(f"Unsupported DOI prefix {prefix!r} (from DOI {doi!r})") from None

    html_content = get_article_html(doi)
    # Round-trip through html5lib with optional tags written out, then let
    # BeautifulSoup's html.parser read the re-serialised markup.
    normalized = html5lib.serialize(html5lib.parse(html_content), encoding="utf-8", omit_optional_tags=False)
    soup = BeautifulSoup(normalized, 'html.parser')

    abstract = get_abstract(soup, publication_config["abstract_selector"])
    paragraphs = get_paragraphs(soup, publication_config["paragraph_selector"], publication_config["ref_selector"])
    return {
        "abstract": abstract,
        "paragraphs": [p["text"] for p in paragraphs],
    }

# Example

In [4]:
# Fetch and parse one article. This goes through get_article_content, which
# downloads the page via the MinIO client configured earlier in the notebook.
doi = "10.1002/acm2.12866"
ss = get_article_content(doi)

In [5]:
ss

{'abstract': 'To evaluate the accuracy of a commercial optical surface tracking (OST) system and to demonstrate how it can be implemented to monitor patient positioning during non-coplanar single isocenter stereotactic treatments of brain metastases. A 3-camera OST system was used (Catalyst HD™, C-RAD) on a TruebeamSTx with a 6DoF couch. The setup accuracy and agreement between the OST system, and CBCT and kV-MV imaging at couch angles 0° and 270°, respectively, were examined. Film measurements at 3 depths in the Rando-Alderson phantom were performed using a single isocenter non-coplanar VMAT plan containing 4 brain lesions. Setup of the phantom was performed with CBCT at couch 0° and subsequently monitored by OST at other couch angles. Setup data for 7 volunteers were collected to evaluate the accuracy and reproducibility of the OST system at couch angles 0°, 45°, 90°, 315°, and 270°. These results were also correlated to the couch rotation offsets obtained by a Winston-Lutz (WL) test