In [1]:
import argparse
import sys
import regex as re
import io
import gzip
import json
from bs4 import BeautifulSoup
import lxml
from collections import defaultdict
from tqdm import tqdm
import spacy
import scispacy

# Load the SciSpacy "en_core_sci_sm" pipeline once; sentence_split() below
# calls this shared `nlp` object for every piece of text it segments.
nlp = spacy.load("en_core_sci_sm")



  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


In [7]:
# Section-title lookup tables used by titleMatch() for section tagging.
#
# titleMapsBody: section type -> regex pattern strings. They are compiled
# case-insensitively further down and applied with `search`, so a pattern
# matches anywhere in a body section title unless it is anchored with ^ / $.
# A title may match several section types.
titleMapsBody = {
    'INTRO': ['introduction', 'background', 'related literature', 'literature review', 'objective', 'aim ', 'purpose of this study', 'study (purpose|aim|aims)', r'\d+\. (purpose|aims|aim)', '(aims|aim|purpose) of the study', '(the|drug|systematic|book) review', 'review of literature', 'related work', 'recent advance'],
    'METHODS': ['supplement', 'methods and materials', 'method', 'material', 'experimental procedure', 'implementation', 'methodology', 'treatment', 'statistical analysis', "experimental", r'\d+\. experimental$', 'experimental (section|evaluation|design|approach|protocol|setting|set up|investigation|detail|part|perspective|tool)', "the study", r'\d+\. the study$', "protocol", "protocols", 'study protocol', 'construction and content', r'experiment \d+', '^experiments$', 'analysis', 'utility', 'design', r'\d+\. theory$', "theory", 'theory and ', 'theory of '],
    'RESULTS': ['result', 'finding', 'diagnosis'],
    'DISCUSS': ['discussion', 'management of', r'\d+\. management', 'safety and tolerability', 'limitations', 'perspective', 'commentary', r'\d+\. comment'],
    'CONCL': ['conclusion', 'key message', 'future', 'summary', 'recommendation', 'implications for clinical practice', 'concluding remark'],
    'CASE': ['case study report', 'case report', 'case presentation', 'case description', r'case \d+', r'\d+\. case', 'case summary', 'case history'],
    'ACK_FUND': ['funding', 'acknowledgement', 'acknowledgment', 'financial disclosure'],
    'AUTH_CONT': ["author contribution", "authors' contribution", "author's contribution"],
    'COMP_INT': ['competing interest', 'conflict of interest', 'conflicts of interest', 'disclosure', 'declaration'],
    'ABBR': ['abbreviation'],
    # NOTE(review): several entries below are listed more than once; harmless, but redundant.
    'SUPPL': ['supplemental data', 'supplementary file', 'supplemental file', 'supplementary data', 'supplementary figure', 'supplemental figure', 'supporting information', 'supplemental file', 'supplemental material', 'supplementary material', 'supplement material', 'additional data files', 'supplemental information', 'supplementary information', 'supplemental information', 'supporting information', 'supplemental table', 'supplementary table', 'supplement table', 'supplementary material', 'supplemental material', 'supplement material', 'supplementary video']
}

# titleExactMapsBody: section type -> whole titles. titleMatch() compares the
# lower-cased, stripped body title against these for equality before it tries
# the regex patterns above.
titleExactMapsBody = {
    'INTRO': ["aim", "aims", "purpose", "purposes", "purpose/aim", "purpose of study", "review", "reviews", "minireview"],
    'METHODS': ["experimental", "the study", "protocol", "protocols"],
    'DISCUSS': ["management", "comment", "comments"],
    'CASE': ["case", "cases"]
}

# titleMapsBack: regex pattern strings used by titleMatch() for any secFlag
# other than 'body' (there is no exact-match table for that case).
titleMapsBack = {
    'REF': ['reference', 'literature cited', 'references', 'bibliography'],
    # NOTE(review): 'acknowlegement' appears twice; possibly a different misspelling was intended for one of them.
    'ACK_FUND': ['funding', 'acknowledgement', 'acknowledgment', 'acknowlegement', 'acknowlegement', 'open access', 'financial support', 'grant', 'author note', 'financial disclosure'],
    'ABBR': ['abbreviation', 'glossary'],
    'COMP_INT': ['competing interest', 'conflict of interest', 'conflicts of interest', 'disclosure', 'declaration', 'conflicts', 'interest'],
    'SUPPL': ['supplementary', 'supporting information', 'supplemental', 'web extra material'],
    'APPENDIX': ['appendix', 'appendices'],
    'AUTH_CONT': ['author', 'contribution']
}

# Compile the title tables once so titleMatch() does no regex compilation per title.
def _compile_title_patterns(title_map):
    """Return {section type: [compiled case-insensitive regex, ...]} for *title_map*."""
    return {
        sec_type: [re.compile(source, re.IGNORECASE) for source in sources]
        for sec_type, sources in title_map.items()
    }


compiled_titleMapsBody = _compile_title_patterns(titleMapsBody)

# Exact-match titles stay plain strings; they are lower-cased so they can be
# compared directly with a lower-cased title.
compiled_titleExactMapsBody = {
    sec_type: [name.lower() for name in names]
    for sec_type, names in titleExactMapsBody.items()
}

compiled_titleMapsBack = _compile_title_patterns(titleMapsBack)

def createSecTag(soup, secType):
    """Create a new, unattached ``SecTag`` element whose ``type`` attribute is *secType*.

    Args:
        soup: object providing ``new_tag`` (the document the tag is created for).
        secType: value stored in the tag's ``type`` attribute.

    Returns:
        The newly created tag.
    """
    wrapper = soup.new_tag('SecTag')
    wrapper['type'] = secType
    return wrapper

# Function to read XML or GZ files and split into individual articles
def getfileblocks(file_path, document_flag):
    """Read an XML (optionally gzip-compressed) file and split it into article blocks.

    The content is cut in front of every ``<!DOCTYPE article`` and every
    ``<article`` opening tag (``<articles>`` and ``<article-meta>`` do not
    count). Only chunks that actually begin with an ``<article ...>`` tag are
    returned, so DOCTYPE declarations and whatever precedes the first article
    (XML declaration, a wrapping ``<articles>`` element) never show up as
    bogus article blocks.

    Args:
        file_path: path to the input file; a name ending in '.gz' is read
            through gzip. The file is decoded as UTF-8.
        document_flag: document type; only 'f' and 'a' are accepted.

    Returns:
        List of stripped article strings. An empty list is returned (and a
        message printed) for an unknown flag or when the file cannot be read.
    """
    # Validate the flag first: no point reading a large file we will not split.
    if document_flag not in ('f', 'a'):
        print('ERROR: unknown document type :' + str(document_flag))
        return []

    opener = gzip.open if file_path.endswith('.gz') else open
    try:
        with opener(file_path, 'rt', encoding='utf8') as fh:
            content = fh.read()
        # Zero-width split: each chunk keeps the tag it starts with.
        chunks = re.split(r'(?=<!DOCTYPE article|<article(?![\w-]))', content)
    except Exception as e:
        # Best effort: report the problem and hand back nothing for this file.
        print('Error processing file: ' + str(file_path))
        print(e)
        return []

    sub_file_blocks = []
    for chunk in chunks:
        block = chunk.strip()
        # Keep real articles only; same look-ahead as above so '<articles>' is rejected.
        if re.match(r'<article(?![\w-])', block) and '<!DOCTYPE' not in block:
            sub_file_blocks.append(block)
    return sub_file_blocks

# Function to split text into sentences using spaCy
def sentence_split(text, sent_id):
    """Segment *text* with the module-level ``nlp`` pipeline.

    Args:
        text: text to segment.
        sent_id: running sentence counter before this call.

    Returns:
        Tuple ``(sent_id + number of sentences, sentences)`` where each
        sentence is the whitespace-stripped text of one ``doc.sents`` span.
    """
    pieces = [span.text.strip() for span in nlp(text).sents]
    return sent_id + len(pieces), pieces

# Function to process nested tags and collect sentences
def call_sentence_tags(ch, sent_id):
    """Collect sentences from the element children of *ch*, in document order.

    Strings that sit directly under *ch* are skipped. Each child element is
    handled by its tag name:

    * title-like / cell tags: descended into when they have a direct ``<p>``
      child, otherwise their whole text is sentence-split;
    * structural containers (sec, list, table, ...): descended into;
    * ``<p>``: delegated to ``process_p_tag``;
    * anything else: its whole text is sentence-split.

    Args:
        ch: parent element whose children are walked.
        sent_id: running sentence counter before this call.

    Returns:
        Tuple ``(updated sent_id, list of sentences)``.
    """
    text_tags = ('article-title', 'title', 'subtitle', 'trans-title', 'trans-subtitle',
                 'alt-title', 'label', 'td', 'th')
    container_tags = ("sec", "fig", "statement", "div", "boxed-text", "list", "list-item",
                      "disp-quote", "speech", "fn-group", "fn", "def-list", "def-item", "def",
                      "ack", "array", "table-wrap", "table", "tbody", "thead", "tr", "caption",
                      "answer", "sec-meta", "glossary", "question", "question-wrap")

    collected = []
    for node in ch.children:
        if isinstance(node, str):
            continue  # bare text directly under ch is not collected

        tag = node.name
        if tag in text_tags:
            # Only recurse when the text is organised in paragraphs.
            descend = bool(node.find('p', recursive=False))
        else:
            descend = tag in container_tags

        if descend:
            sent_id, found = call_sentence_tags(node, sent_id)
        elif tag == 'p':
            sent_id, found = process_p_tag(node, sent_id)
        else:
            text = node.get_text(separator=' ', strip=True)
            if not text:
                continue
            sent_id, found = sentence_split(text, sent_id)
        collected.extend(found)
    return sent_id, collected

# Function to process paragraph tags
def process_p_tag(gch, sent_id):
    """Sentence-split the text of one paragraph element.

    A paragraph is ignored when its only content is a single element that has
    no ``.string`` and whose tag is one of the link / media-like tags listed
    below. Paragraphs without any text yield no sentences either.

    Args:
        gch: the paragraph element.
        sent_id: running sentence counter before this call.

    Returns:
        Tuple ``(updated sent_id, list of sentences)``.
    """
    skipped_single_children = ("ext-link", "e-mail", "uri", "inline-supplementary-material",
                               "related-article", "related-object", "address", "alternatives",
                               "array", "funding-source", "inline-graphic")

    parts = gch.contents
    if len(parts) == 1:
        only = parts[0]
        if (not only.string) and only.name in skipped_single_children:
            return sent_id, []

    text = gch.get_text(separator=' ', strip=True)
    if not text:
        return sent_id, []

    sent_id, sents = sentence_split(text, sent_id)
    return sent_id, list(sents)

# Function to process the front section
def process_front(front):
    """Collect sentences and keywords from the ``<article-meta>`` of a front element.

    Only direct children of ``<article-meta>`` named title-group, supplement,
    supplementary-material, abstract, trans-abstract, kwd-group or
    funding-group are read; every other child is ignored.

    Keywords from *all* ``<kwd-group>`` children are accumulated. Articles may
    carry several groups, and the previous implementation kept only the last.

    Args:
        front: the ``<front>`` element.

    Returns:
        Tuple ``(sent_id, sections, keywords)``:
        ``sent_id`` is 1 plus the number of sentences collected here,
        ``sections`` maps the upper-cased tag name to its sentences (tags that
        yield no sentences are omitted), and ``keywords`` is the list of
        stripped ``<kwd>`` texts.
    """
    wanted = ('title-group', 'supplement', 'supplementary-material', 'abstract',
              'trans-abstract', 'kwd-group', 'funding-group')
    sent_id = 1
    sections = {}
    keywords = []

    art_meta = front.find('article-meta')
    if art_meta is None:
        return sent_id, sections, keywords

    for ch in art_meta.find_all(recursive=False):
        if ch.name not in wanted:
            continue  # Ignore other tags
        if ch.name == 'kwd-group':
            # extend, not assign: keep keywords from earlier groups as well
            keywords.extend(kwd.text.strip() for kwd in ch.find_all('kwd'))
            continue
        sent_id, sentences = call_sentence_tags(ch, sent_id)
        if sentences:
            sections.setdefault(ch.name.upper(), []).extend(sentences)

    return sent_id, sections, keywords

# Function to process the back section
def process_back(back, sent_id):
    """Collect sentences from the direct children of a ``<back>`` element.

    ``<ref-list>`` children are read with ``reference_sents`` and stored under
    'REF'. Every other recognised child is stored under its upper-cased
    ``<title>`` text, or under its upper-cased tag name when it has no title.

    Two fixes over the previous version:

    * the incoming ``sent_id`` is honoured instead of being reset to 1;
    * a top-level ``<p>`` is read with ``process_p_tag`` (as ``process_body``
      does). ``call_sentence_tags`` skips strings that sit directly under the
      element it is given, which dropped the paragraph's own text.

    Args:
        back: the ``<back>`` element.
        sent_id: running sentence counter before this call.

    Returns:
        Tuple ``(updated sent_id, sections)`` where ``sections`` maps section
        name to its list of sentences (sections without sentences are omitted).
    """
    handled_tags = ('sec', 'p', 'ack', 'alternatives', 'array', 'preformat', 'fig', 'fig-group',
                    'question-wrap', 'question-wrap-group', 'list', 'table-wrap-group',
                    'table-wrap', 'display-formula', 'display-formula-group', 'def-list',
                    'supplementary-material', 'kwd-group', 'funding-group', 'statement',
                    'ref-list', 'glossary')
    sections = {}
    for ch in back.find_all(recursive=False):
        if ch.name not in handled_tags:
            continue  # Ignore other tags

        if ch.name == 'ref-list':
            section_title = 'REF'
            sent_id, sentences = reference_sents(ch, sent_id)
        else:
            title = ch.find('title')
            if title:
                section_title = title.get_text(separator=' ', strip=True).strip().upper()
            else:
                section_title = ch.name.upper()
            if ch.name == 'p':
                sent_id, sentences = process_p_tag(ch, sent_id)
            else:
                sent_id, sentences = call_sentence_tags(ch, sent_id)

        if sentences:
            sections.setdefault(section_title, []).extend(sentences)
    return sent_id, sections

# Function to process reference sentences
def reference_sents(ref_list, sent_id):
    """Collect sentences from a ``<ref-list>`` element.

    For every ``<ref>`` child, the text of its element children is joined with
    single spaces and sentence-split. Strings directly under ``<ref-list>`` or
    ``<ref>`` are skipped. Nested structural children (sec, list, table, ...)
    are handed to ``call_sentence_tags``; all other children are ignored.

    The reference text is taken with ``get_text``. The previous version joined
    ``d.string`` over all descendants, which yields the same string once for
    the text node and once more for every enclosing tag that has it as its only
    content, so names, years, etc. were duplicated.

    Args:
        ref_list: the ``<ref-list>`` element.
        sent_id: running sentence counter before this call.

    Returns:
        Tuple ``(updated sent_id, list of sentences)``.
    """
    nested_tags = ("sec", "fig", "statement", "div", "boxed-text", "list", "list-item",
                   "disp-quote", "speech", "fn-group", "fn", "def-list", "def-item", "def",
                   "ack", "array", "table-wrap", "table", "tbody", "caption", "answer",
                   "sec-meta", "glossary", "question", "question-wrap")
    sentences = []
    for ch in ref_list.children:
        if isinstance(ch, str):
            continue  # Skip strings directly under ref_list
        if ch.name == 'ref':
            parts = [
                gch.get_text(separator=' ', strip=True)
                for gch in ch.children
                if not isinstance(gch, str)
            ]
            ref_text = ' '.join(part for part in parts if part)
            # Nothing to split for references without any text.
            if ref_text:
                sent_id, sents = sentence_split(ref_text, sent_id)
                sentences.extend(sents)
        elif ch.name in nested_tags:
            sent_id, sub_sentences = call_sentence_tags(ch, sent_id)
            sentences.extend(sub_sentences)
        # Ignore other tags
    return sent_id, sentences

# Function to match section titles to predefined section types
def titleMatch(title, secFlag):
    """Map a section title to one or more section-type labels.

    For ``secFlag == 'body'`` the body tables are used, including the
    exact-title table; for any other flag only the back-matter regex table is
    used. The title is lower-cased and stripped before matching.

    Args:
        title: section title text.
        secFlag: 'body' selects the body tables, anything else the back table.

    Returns:
        The first section type whose exact-title list contains the title;
        otherwise the comma-joined section types whose regex patterns are found
        in the title; ``None`` when nothing matches.
    """
    if secFlag == 'body':
        regex_table, exact_table = compiled_titleMapsBody, compiled_titleExactMapsBody
    else:
        regex_table, exact_table = compiled_titleMapsBack, {}

    cleaned = title.lower().strip()

    # An exact title wins outright and stops the search.
    exact_hit = next((sec_type for sec_type, names in exact_table.items() if cleaned in names), None)
    if exact_hit is not None:
        return exact_hit

    hits = [
        sec_type
        for sec_type, regexes in regex_table.items()
        if any(rx.search(cleaned) for rx in regexes)
    ]
    return ','.join(hits) if hits else None

# Function to apply section tagging to the soup object
def section_tag(soup):
    """Wrap recognised parts of the parsed article in ``SecTag`` elements, in place.

    * every ``fig`` / ``table-wrap`` that contains no further element of the
      same name is wrapped as FIG / TABLE;
    * in the front matter, the abstract and the first ``kwd-group`` found are
      wrapped as ABSTRACT / KEYWORD;
    * direct ``sec`` children of the body are wrapped with the type returned by
      ``titleMatch`` for their title (no wrap when there is no title or match);
    * in the back matter, ``ref-list`` children are wrapped as REF and the
      other listed children by ``titleMatch(..., 'back')``.

    Args:
        soup: the parsed document; it is modified and nothing is returned.
    """
    # Figures first, then tables.
    for tag_name, sec_type in (('fig', 'FIG'), ('table-wrap', 'TABLE')):
        for node in soup.find_all(tag_name, recursive=True):
            if node.find_all(tag_name, recursive=True):
                continue  # has a nested element of the same kind; leave it unwrapped
            node.wrap(createSecTag(soup, sec_type))

    # Front matter
    front = soup.front
    if front:
        if front.abstract:
            front.abstract.wrap(createSecTag(soup, 'ABSTRACT'))
        kwd_group = front.find('kwd-group')
        if kwd_group:
            kwd_group.wrap(createSecTag(soup, 'KEYWORD'))

    # Body: top-level sections only
    body = soup.body
    if body:
        for sec in body.find_all('sec', recursive=False):
            heading = sec.find('title')
            if not heading:
                continue
            sec_type = titleMatch(heading.get_text(separator=' ', strip=True), 'body')
            if sec_type:
                sec.wrap(createSecTag(soup, sec_type))

    # Back matter
    back = soup.back
    if back:
        back_children = ['sec', 'ref-list', 'app-group', 'ack', 'glossary', 'notes', 'fn-group']
        for sec in back.find_all(back_children, recursive=False):
            if sec.name == 'ref-list':
                sec.wrap(createSecTag(soup, 'REF'))
                continue
            heading = sec.find('title')
            if not heading:
                continue
            sec_type = titleMatch(heading.get_text(separator=' ', strip=True), 'back')
            if sec_type:
                sec.wrap(createSecTag(soup, sec_type))

# Function to process the body section
def process_body(body, sent_id):
    """Collect sentences from the direct children of a ``<body>`` element.

    Top-level ``<p>`` children are read with ``process_p_tag`` and stored under
    'BODY'. Other recognised children are read with ``call_sentence_tags`` and
    stored under their upper-cased ``<title>`` text, or under their upper-cased
    tag name when they have no title. Unrecognised children are ignored.

    The incoming ``sent_id`` is continued; the previous version overwrote it
    with 1, which made the parameter meaningless.

    Args:
        body: the ``<body>`` element.
        sent_id: running sentence counter before this call.

    Returns:
        Tuple ``(updated sent_id, sections)`` where ``sections`` maps section
        name to its list of sentences.
    """
    titled_tags = ('sec', 'ack', 'alternatives', 'array', 'preformat', 'fig', 'fig-group',
                   'question-wrap', 'list', 'table-wrap-group', 'table-wrap', 'display-formula',
                   'display-formula-group', 'def-list', 'supplementary-material', 'kwd-group',
                   'funding-group', 'statement')
    sections = {}
    for ch in body.find_all(recursive=False):
        if ch.name == 'p':
            sent_id, sentences = process_p_tag(ch, sent_id)
            sections.setdefault('BODY', []).extend(sentences)
        elif ch.name in titled_tags:
            title = ch.find('title')
            if title:
                section_title = title.get_text(separator=' ', strip=True).strip().upper()
            else:
                section_title = ch.name.upper()
            sent_id, sentences = call_sentence_tags(ch, sent_id)
            if sentences:
                sections.setdefault(section_title, []).extend(sentences)
    return sent_id, sections

# Main function to process each article and collect data
def process_full_text(each_file):
    """Parse one article's XML text and collect its IDs, keywords and sentences per section.

    Args:
        each_file: XML source of a single article, as a string.

    Returns:
        A dict with the keys 'article_ids' (pub-id-type -> id text),
        'open_status', 'article_type', 'keywords' (list of strings) and
        'sections' (section name -> list of sentences, empty sections
        removed). ``None`` is returned when no ``<article-id>`` is found, or
        when any exception is raised inside the ``try`` block (the exception
        message is printed, not re-raised).
    """
    # Replace body tag with orig_body to prevent BeautifulSoup from removing it
    # (the opening tag's attributes, if any, are carried over by the \1 group).
    each_file = re.sub(r'<body(\s[^>]*)?>', '<orig_body\\1>', each_file)
    each_file = each_file.replace('</body>', '</orig_body>')
    try:
        xml_soup = BeautifulSoup(each_file, 'lxml')
        # Remove extra html and body tags added by BeautifulSoup
        if xml_soup.html:
            xml_soup.html.unwrap()
        if xml_soup.body:
            xml_soup.body.unwrap()
        # Restore the article's own body under its real name.
        if xml_soup.find('orig_body'):
            xml_soup.find('orig_body').name = 'body'

        # Extract attributes from the <article> tag
        article_tag = xml_soup.find('article')
        if article_tag:
            open_status = article_tag.get('open-status', '')
            article_type = article_tag.get('article-type', '')
        else:
            open_status = ''
            article_type = ''

        # Extract article IDs
        # (a later <article-id> with the same pub-id-type replaces the earlier one)
        article_ids = {}
        for id_tag in xml_soup.find_all('article-id'):
            id_type = id_tag.get('pub-id-type', 'unknown')
            article_ids[id_type] = id_tag.text.strip()
        if not article_ids:
            print('No article IDs found')
            return None

        # Apply section tagging
        # (modifies xml_soup in place by wrapping elements in SecTag elements)
        section_tag(xml_soup)
        
        sent_id = 1
        sections = {}
        keywords = []

        # Process sections under SecTag
        for sec_tag in xml_soup.find_all('SecTag'):
            sec_type = sec_tag.get('type', 'unknown').strip().upper()
            if sec_type == 'KEYWORD':
                # Extract keywords
                keywords = [kwd.text.strip() for kwd in sec_tag.find_all('kwd')]
                continue  # Skip further processing of keywords here
            if sec_type not in sections:
                sections[sec_type] = []
            # Exclude nested 'SecTag's to avoid duplicate text
            # NOTE(review): the nested tags are detached from the tree here; presumably they are
            # still visited by this outer loop because its result list was built beforehand - confirm.
            for nested_sec in sec_tag.find_all('SecTag', recursive=True):
                nested_sec.extract()
            sent_id, sentences = call_sentence_tags(sec_tag, sent_id)
            sections[sec_type].extend(sentences)

        # Process front section if not already processed
        # NOTE(review): if the document has no <article> element, xml_soup.article is presumably
        # None and the .find call raises; that would be caught by the except below - confirm.
        if xml_soup.article.find('front'):
            # process_front() takes no counter and starts its own at 1, so the
            # sent_id accumulated in the SecTag loop above is replaced here.
            sent_id, front_sections, front_keywords = process_front(xml_soup.article.find('front'))
            for k, v in front_sections.items():
                sections.setdefault(k, []).extend(v)
            if front_keywords:
                keywords.extend(front_keywords)

        # Process body section if not already processed
        if xml_soup.article.find('body'):
            sent_id, body_sections = process_body(xml_soup.article.find('body'), sent_id)
            for k, v in body_sections.items():
                sections.setdefault(k, []).extend(v)
        
        # Process back section if not already processed
        if xml_soup.article.find('back'):
            sent_id, back_sections = process_back(xml_soup.article.find('back'), sent_id)
            for k, v in back_sections.items():
                sections.setdefault(k, []).extend(v)

        # Remove empty sections
        sections = {k: v for k, v in sections.items() if v}

        return {
            'article_ids': article_ids,
            'open_status': open_status,
            'article_type': article_type,
            'keywords': keywords,
            'sections': sections
        }

    except Exception as e:
        # Best effort: one bad article is reported and skipped by the caller.
        print(f"Error processing article: {e}")
        return None

# Function to process each article and write to output file
def process_each_article(each_file_path, out_file, document_flag):
    """Process every article in an input file and write the results as JSON Lines.

    Each article for which ``process_full_text`` returns data becomes one JSON
    object on its own line of *out_file*; articles that return nothing are
    skipped.

    The document type is checked once, before any work is done. The previous
    version checked it inside the loop, printing the same message once per
    article, and truncated *out_file* even though nothing could be written.

    Args:
        each_file_path: path of the XML / XML.gz input file.
        out_file: path of the JSON Lines file to (over)write, UTF-8 encoded.
        document_flag: document type; only 'f' (full text) is supported.

    Returns:
        None.
    """
    if document_flag != 'f':
        print('Document type not supported.')
        return

    files_list = getfileblocks(each_file_path, document_flag)
    with open(out_file, 'w', encoding='utf-8') as out:
        for each_file in tqdm(files_list, desc="Processing Articles", disable=False):
            data = process_full_text(each_file)
            if data:
                out.write(json.dumps(data) + '\n')



In [8]:
!ls

output1.jsonl  patch-07-10-2024-0.xml.gz   sentenciser-Copy2.ipynb
output2.jsonl  patch-28-01-2023-21.xml.gz  sentenciser-Copy3.ipynb
outputa.jsonl  profile_stats		   sentenciser.ipynb
output.jsonl   sentenciser-Copy1.ipynb


In [9]:
# Run configuration used by the processing and inspection cells below.
# These were commented out, so the later cells only worked with leftover
# kernel state and failed with NameError after a kernel restart.
input_file = 'patch-28-01-2023-21.xml.gz'  # alternative input: 'patch-07-10-2024-0.xml.gz'
output_file = 'output.jsonl'
document_flag = 'f'  # 'f' is the only type process_each_article() supports

In [5]:
# Run the pipeline: one JSON line is written to output_file for every article that yields data.
process_each_article(input_file, output_file, document_flag)

Processing Articles:   0%|                             | 0/1000 [00:00<?, ?it/s]

Error processing article: local variable 'sent_id' referenced before assignment
Error processing article: local variable 'sent_id' referenced before assignment
Error processing article: local variable 'sent_id' referenced before assignment
Error processing article: local variable 'sent_id' referenced before assignment
Error processing article: local variable 'sent_id' referenced before assignment
Error processing article: local variable 'sent_id' referenced before assignment
Error processing article: local variable 'sent_id' referenced before assignment
Error processing article: local variable 'sent_id' referenced before assignment
Error processing article: local variable 'sent_id' referenced before assignment
Error processing article: local variable 'sent_id' referenced before assignment
Error processing article: local variable 'sent_id' referenced before assignment
Error processing article: local variable 'sent_id' referenced before assignment
Error processing article: local variable

Processing Articles:   3%|▋                   | 32/1000 [00:02<01:16, 12.72it/s]

Error processing article: local variable 'sent_id' referenced before assignment


Processing Articles:   3%|▋                   | 34/1000 [00:03<01:48,  8.91it/s]

Error processing article: local variable 'sent_id' referenced before assignment
Error processing article: local variable 'sent_id' referenced before assignment
Error processing article: local variable 'sent_id' referenced before assignment
Error processing article: local variable 'sent_id' referenced before assignment
Error processing article: local variable 'sent_id' referenced before assignment
Error processing article: local variable 'sent_id' referenced before assignment
Error processing article: local variable 'sent_id' referenced before assignment
Error processing article: local variable 'sent_id' referenced before assignment
Error processing article: local variable 'sent_id' referenced before assignment
Error processing article: local variable 'sent_id' referenced before assignment
Error processing article: local variable 'sent_id' referenced before assignment
Error processing article: local variable 'sent_id' referenced before assignment
Error processing article: local variable

Processing Articles:   6%|█▎                  | 65/1000 [00:04<00:51, 18.33it/s]

Error processing article: local variable 'sent_id' referenced before assignment


Processing Articles:   6%|█▎                  | 65/1000 [00:04<01:06, 14.12it/s]


KeyboardInterrupt: 

In [10]:
# Keep the list of per-article XML strings around for interactive inspection.
ss = getfileblocks(input_file, document_flag)


In [11]:
# Number of blocks getfileblocks() returned for the input file.
len(ss)

1000

In [12]:
# Spot-check the extraction result for a single block (index 999).
process_full_text(ss[999])

{'article_ids': {'pmcid': '9878535',
  'publisher-id': 'JIAPS-27-741',
  'doi': '10.4103/jiaps.jiaps_84_22'},
 'open_status': 'O',
 'article_type': 'research-article',
 'keywords': ['Angiotensin II receptor type 2 gene',
  'congenital anomalies of the kidney and urinary tract',
  'nephrogenic genes'],
 'sections': {'ABSTRACT': ['Background: Congenital anomalies of the kidney and urinary tract (CAKUT) are a common cause of end-stage renal disease in children.',
   'While certain nephrogenic genes have been incriminated in these malformations, data to identify the frequency of gene polymorphisms in Asian Indian children with CAKUT are scarce.',
   'This study was done to identify the effect of polymorphisms in paired-box gene 2 (PAX2), bone morphogenetic protein (BMP)-4, angiotensin-converting enzyme (ACE), and angiotensin II receptor Type 2 (AGTR2) nephrogenic genes on the development of CAKUT.',
   'Materials and Methods: In this prospective cohort study, 158 children <12 years old (86

In [None]:
# Show the raw XML string of block 194 (the block re-parsed in the scratch cells below).
ss[194]

In [None]:
# TODO: fix keyword extraction. Example <kwd-group> markup to handle:

#<kwd-group><kwd>astrocyte</kwd><kwd>microglia</kwd><kwd>excitability</kwd><kwd>hydrogen sulfide</kwd><kwd>BDNF</kwd><kwd>polyamine</kwd><kwd>dexmedetomidine</kwd><kwd>astrocyte-microglia co-culture</kwd></kwd-group>


In [None]:
# NOTE: section titles such as <title>Author contributions</title> can occur more than once in an article; take care when handling repeated titles.

In [None]:
# Function to process the front section
def process_front(front):
    """Scratch variant of process_front that stores keywords inside ``sections``.

    NOTE: this redefines ``process_front`` with a two-value return. The
    definition earlier in this notebook returns three values
    ``(sent_id, sections, keywords)``, and ``process_full_text`` unpacks three,
    so running this cell breaks ``process_full_text`` until the earlier cell is
    run again.

    Only direct children of ``<article-meta>`` named title-group, supplement,
    supplementary-material, abstract, trans-abstract, kwd-group or
    funding-group are read. Repeated children (e.g. several abstracts or
    keyword groups) are accumulated; the previous version overwrote the
    earlier entry with the later one.

    Args:
        front: the ``<front>`` element.

    Returns:
        Tuple ``(sent_id, sections)``. ``sections`` maps the tag name (as it
        appears in the tree) to its sentences, plus 'KEYWORD' to the list of
        stripped ``<kwd>`` texts when a kwd-group is present.
    """
    wanted = ('title-group', 'supplement', 'supplementary-material', 'abstract',
              'trans-abstract', 'kwd-group', 'funding-group')
    sent_id = 1
    sections = {}

    art_meta = front.find('article-meta')
    if art_meta is None:
        return sent_id, sections

    for ch in art_meta.find_all(recursive=False):
        if ch.name not in wanted:
            continue  # Ignore other tags
        if ch.name == 'kwd-group':
            # Keywords stay a flat list of strings under 'KEYWORD'.
            sections.setdefault('KEYWORD', []).extend(
                kwd.text.strip() for kwd in ch.find_all('kwd')
            )
            continue
        sent_id, sentences = call_sentence_tags(ch, sent_id)
        if sentences:
            sections.setdefault(ch.name, []).extend(sentences)

    return sent_id, sections


In [None]:
# Scratch check: parse block 194 and run process_front() on its <front> element.
# Only the literal strings '<body>' and '</body>' are renamed here; an opening
# <body ...> tag that carries attributes is not covered by this replace.
each_file = ss[194].replace('<body>', '<orig_body>').replace('</body>', '</orig_body>')

xml_soup = BeautifulSoup(each_file, 'lxml')
# Unwrap additional tags to keep the main structure intact
if xml_soup.html:
    xml_soup.html.unwrap()
if xml_soup.body:
    xml_soup.body.unwrap()
if xml_soup.find('orig_body'):
    xml_soup.find('orig_body').name = 'body'

# Two values are unpacked, so this needs the process_front() variant that returns (sent_id, sections).
sent_id, front_sections = process_front(xml_soup.article.find('front'))


In [None]:
# Display the sections dict returned by process_front() in the previous cell.
front_sections

In [None]:
# Scratch check: merge the front sections into a fresh `sections` dict.
sections={}
# Process front, body, and back sections
# (`sections` was just reset, so the 'ABSTRACT' test below is always true here)
if 'ABSTRACT' not in sections and xml_soup.article.find('front'):
    # Two values are unpacked: needs the process_front() variant returning (sent_id, sections).
    sent_id, front_sections = process_front(xml_soup.article.find('front'))
    for k, v in front_sections.items():
        sections.setdefault(k, []).extend(v)  # Ensure that 'KEYWORD' remains a list


In [None]:
# Display the merged result.
sections