In [8]:
import re
import gzip
import json
import json
from bs4 import BeautifulSoup
from tqdm import tqdm
from rapidfuzz import process, fuzz
import spacy
import difflib
from collections import defaultdict
import pprint
# Initialize spaCy model
nlp = spacy.load("en_core_sci_sm", disable=["parser", "ner", "tagger", "lemmatizer"])
nlp.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x7f79553fe080>

In [9]:
# Precompile regex patterns for section tagging
titleMapsBody = {
    'INTRO': [
        'introduction', 'background', 'related literature', 'literature review', 'objective',
        'aim ', 'purpose of this study', 'study (purpose|aim|aims)', r'\d+\. (purpose|aims|aim)',
        '(aims|aim|purpose) of the study', '(the|drug|systematic|book) review', 'review of literature',
        'related work', 'recent advance', 'overview', 'i ntroduction', 'historical overview',
        'scope', 'context', 'rationale', 'hypothesis', 'motivation', 'i ntroduction', 'i ntro', 'i n t r o d u c t i o n'
    ],
    'METHODS': [
        'supplement', 'methods and materials', 'method', 'material', 'experimental procedure',
        'implementation', 'methodology', 'treatment', 'statistical analysis', "experimental",
        r'\d+\. experimental$', 'experimental (section|evaluation|design|approach|protocol|setting|set up|investigation|detail|part|perspective|tool)',
        "the study", r'\d+\. the study$', "protocol", "protocols", 'study protocol',
        'construction and content', r'experiment \d+', '^experiments$', 'analysis', 'utility',
        'design', r'\d+\. theory$', "theory", 'theory and ', 'theory of ',
        'data analysis', 'data collection', 'methodological approach', 'techniques', 'sample',
        'materials and methods', 'analytical methods', 'research methods', 'methodological framework',
        'm aterials and m ethods', 'm a t e r i a l s a n d m e t h o d s', 'm ethods'
    ],
    'RESULTS': [
        'result', 'finding', 'diagnosis', 'outcomes', 'findings', 'observations',
        'key results', 'main results', 'data', 'analysis results', 'primary results',
        'research findings', 'experimental results', 'empirical findings', 'report of results',
        'r esults', 'r e s u l t s'
    ],
    'DISCUSS': [
        'discussion', 'management of', r'\d+\. management', 'safety and tolerability',
        'limitations', 'perspective', 'commentary', r'\d+\. comment', 'interpretation',
        'interpretation of results', 'analysis of findings', 'discussion and implications',
        'contextualization', 'reflection', 'critical analysis', 'discussion and future work',
        'insights', 'consideration', 'comparison with previous studies', 'd iscussion', 'd i s c u s s i o n'
    ],
    'CONCL': [
        'conclusion', 'key message', 'future', 'summary', 'recommendation',
        'implications for clinical practice', 'concluding remark', 'closing remarks',
        'takeaway', 'final remarks', 'overall conclusion', 'summary and conclusion',
        'implications', 'closing statement', 'wrap-up', 'summary of findings',
        'future directions', 'outlook', 'next steps', 'c onclusion', 'c o n c l u s i o n'
    ],
    'CASE': [
        'case study report', 'case report', 'case presentation', 'case description',
        r'case \d+', r'\d+\. case', 'case summary', 'case history', 'case overview',
        'case study', 'case examination', 'case details', 'case documentation',
        'case example', 'case profile', 'c ase', 'c a s e'
    ],
    'ACK_FUND': [
        'funding', 'acknowledgement', 'acknowledgment', 'financial disclosure',
        'funding sources', 'funding support', 'financial support', 'grant support',
        'grant acknowledgement', 'acknowledgement of funding', 'funder', 'acknowledgements',
        'a c k n o w l e d g e m e n t', 'a c k f u n d'
    ],
    'AUTH_CONT': [
        "author contribution", "authors' contribution", "author's contribution",
        "contribution of authors", "authors' roles", "author responsibilities", "authorship contributions",
        'a u t h o r c o n t r i b u t i o n'
    ],
    'COMP_INT': [
        'competing interest', 'conflict of interest', 'conflicts of interest',
        'disclosure', 'declaration', 'competing interests', 'conflict statement',
        'financial conflicts', 'competing financial interests', 'c o m p i n t'
    ],
    'ABBR': [
        'abbreviation', 'abbreviations list', 'acronyms', 'nomenclature',
        'glossary', 'terms', 'terminology', 'abbreviation glossary', 'a b b r e v i a t i o n'
    ],
    'SUPPL': [
        'supplemental data', 'supplementary file', 'supplemental file', 'supplementary data',
        'supplementary figure', 'supplemental figure', 'supporting information',
        'supplemental file', 'supplemental material', 'supplementary material',
        'supplement material', 'additional data files', 'supplemental information',
        'supplementary information', 'supporting files', 'appendix', 'online appendix',
        'supporting documentation', 'extra data', 'additional material', 'annex',
        's u p p l e m e n t', 's u p p l e m e n t a r y'
    ]
}

titleExactMapsBody = {
    'INTRO': [
        "aim", "aims", "purpose", "purposes", "purpose/aim",
        "purpose of study", "review", "reviews", "minireview", "overview", "background",
        'i n t r o d u c t i o n', 'intro'
    ],
    'METHODS': [
        "experimental", "the study", "protocol", "protocols", "procedure", "methodology", "data analysis",
        'm e t h o d s', 'methods'
    ],
    'DISCUSS': [
        "management", "comment", "comments", "discussion", "limitations", "perspectives",
        'd i s c u s s', 'discussion'
    ],
    'CASE': [
        "case", "cases", "case study", "case report", "case overview", 'case'
    ]
}

titleMapsBack = {
    'REF': [
        'reference', 'literature cited', 'references', 'bibliography', 'source list', 'citations',
        'works cited', 'cited literature', 'bibliographical references', 'citations list',
        'r e f e r e n c e s'
    ],
    'ACK_FUND': [
        'funding', 'acknowledgement', 'acknowledgment', 'acknowlegement',
        'acknowlegement', 'open access', 'financial support', 'grant',
        'author note', 'financial disclosure', 'support statement', 'funding acknowledgment',
        'a c k n o w l e d g e'
    ],
    'ABBR': [
        'abbreviation', 'glossary', 'abbreviations list', 'acronyms', 'terminology', 'abbreviation glossary',
        'a b b r e v i a t i o n'
    ],
    'COMP_INT': [
        'competing interest', 'conflict of interest', 'conflicts of interest',
        'disclosure', 'declaration', 'conflicts', 'interest', 'financial conflicts',
        'c o m p i n t'
    ],
    'CASE': [
        'case study report', 'case report', 'case presentation', 'case description',
        r'case \d+', r'\d+\. case', 'case summary', 'case history', 'case overview',
        'case study', 'case examination', 'case details', 'case documentation',
        'case example', 'case profile', 'c ase', 'c a s e'
    ],
    'ACK_FUND': [
        'funding', 'acknowledgement', 'acknowledgment', 'financial disclosure',
        'funding sources', 'funding support', 'financial support', 'grant support',
        'grant acknowledgement', 'acknowledgement of funding', 'funder', 'acknowledgements',
        'a c k n o w l e d g e m e n t', 'a c k f u n d'
    ],
    'AUTH_CONT': [
        "author contribution", "authors' contribution", "author's contribution",
        "contribution of authors", "authors' roles", "author responsibilities", "authorship contributions",
        'a u t h o r c o n t r i b u t i o n'
    ],
    'COMP_INT': [
        'competing interest', 'conflict of interest', 'conflicts of interest',
        'disclosure', 'declaration', 'competing interests', 'conflict statement',
        'financial conflicts', 'competing financial interests', 'c o m p i n t'
    ],
    'ABBR': [
        'abbreviation', 'abbreviations list', 'acronyms', 'nomenclature',
        'glossary', 'terms', 'terminology', 'abbreviation glossary', 'a b b r e v i a t i o n'
    ],
    'SUPPL': [
        'supplemental data', 'supplementary file', 'supplemental file', 'supplementary data',
        'supplementary figure', 'supplemental figure', 'supporting information',
        'supplemental file', 'supplemental material', 'supplementary material',
        'supplement material', 'additional data files', 'supplemental information',
        'supplementary information', 'supporting files', 'appendix', 'online appendix',
        'supporting documentation', 'extra data', 'additional material', 'annex',
        's u p p l e m e n t', 's u p p l e m e n t a r y'
    ]
}

# titleExactMapsBody = {
#     'INTRO': [
#         "aim", "aims", "purpose", "purposes", "purpose/aim",
#         "purpose of study", "review", "reviews", "minireview", "overview", "background",
#         'i n t r o d u c t i o n', 'intro'
#     ],
#     'METHODS': [
#         "experimental", "the study", "protocol", "protocols", "procedure", "methodology", "data analysis",
#         'm e t h o d s', 'methods'
#     ],
#     'DISCUSS': [
#         "management", "comment", "comments", "discussion", "limitations", "perspectives",
#         'd i s c u s s', 'discussion'
#     ],
#     'CASE': [
#         "case", "cases", "case study", "case report", "case overview", 'case'
#     ]
# }

# titleMapsBack = {
#     'REF': [
#         'reference', 'literature cited', 'references', 'bibliography', 'source list', 'citations',
#         'works cited', 'cited literature', 'bibliographical references', 'citations list',
#         'r e f e r e n c e s'
#     ],
#     'ACK_FUND': [
#         'funding', 'acknowledgement', 'acknowledgment', 'acknowlegement',
#         'acknowlegement', 'open access', 'financial support', 'grant',
#         'author note', 'financial disclosure', 'support statement', 'funding acknowledgment',
#         'a c k n o w l e d g e'
#     ],
#     'ABBR': [
#         'abbreviation', 'glossary', 'abbreviations list', 'acronyms', 'terminology', 'abbreviation glossary',
#         'a b b r e v i a t i o n'
#     ],
#     'COMP_INT': [
#         'competing interest', 'conflict of interest', 'conflicts of interest',
#         'disclosure', 'declaration', 'conflicts', 'interest', 'financial conflicts',
#         'c o m p i n t'
#     ],
#     'SUPPL': [
#         'supplementary', 'supporting information', 'supplemental', 'web extra material',
#         'supplemental files', 'online supplement', 'appendix', 'annex', 'additional resources',
#         's u p p l e m e n t', 'supporting info'
#     ],
#     'APPENDIX': [
#         'appendix', 'appendices', 'annex', 'additional material', 'extra material', 'a p p e n d i x'
#     ],
#     'AUTH_CONT': [
#         'author', 'contribution', 'authors’ contributions', 'author contributions', 'roles of authors', 'authorship roles',
#         'a u t h o r s h i p'
#     ]
# }

#     ],
#     'SUPPL': [
#         'supplementary', 'supporting information', 'supplemental', 'web extra material',
#         'supplemental files', 'online supplement', 'appendix', 'annex', 'additional resources',
#         's u p p l e m e n t', 'supporting info'
#     ],
#     'APPENDIX': [
#         'appendix', 'appendices', 'annex', 'additional material', 'extra material', 'a p p e n d i x'
#     ],
#     'AUTH_CONT': [
#         'author', 'contribution', 'authors’ contributions', 'author contributions', 'roles of authors', 'authorship roles',
#         'a u t h o r s h i p'
#     ]
# }


In [10]:
# Precompile regex patterns
compiled_titleMapsBody = {
    key: [re.compile(pattern, re.IGNORECASE) for pattern in patterns]
    for key, patterns in titleMapsBody.items()
}

compiled_titleExactMapsBody = {
    key: [pattern.lower() for pattern in patterns]
    for key, patterns in titleExactMapsBody.items()
}

compiled_titleMapsBack = {
    key: [re.compile(pattern, re.IGNORECASE) for pattern in patterns]
    for key, patterns in titleMapsBack.items()
}

def createSecTag(soup, secType):
    secTag = soup.new_tag('SecTag')
    secTag['type'] = secType
    return secTag

# Function to read XML or GZ files and split into individual articles
def getfileblocks(file_path, document_flag):
    sub_file_blocks = []
    if file_path.endswith('.gz'):
        open_func = lambda x: gzip.open(x, 'rt', encoding='utf8')
    else:
        open_func = lambda x: open(x, 'r', encoding='utf8')

    try:
        with open_func(file_path) as fh:
            content = fh.read()
            if document_flag in ['f', 'a']:
                # Split content by <!DOCTYPE article ...> or <article ...> tags
                articles = re.split(r'(?=<!DOCTYPE article|<article(?![\w-]))', content)
                sub_file_blocks = [article.strip() for article in articles if article.strip() and '<!DOCTYPE' not in article]
            else:
                print('ERROR: unknown document type :' + document_flag)
    except Exception as e:
        print('Error processing file: ' + str(file_path))
        print(e)

    return sub_file_blocks

# Function to split text into sentences using spaCy
def batch_sentence_split(text_segments, batch_size=100):
    """
    General function to process a list of text segments and split them into sentences using spaCy.
    
    Args:
    - text_segments (list of str): List of text segments to be processed.
    - batch_size (int): The number of texts to process in one batch (default is 1000).

    Returns:
    - List of sentences extracted from the text segments.
    """
    sentences = []
    # Use spaCy's pipe function to process all collected texts in one batch
    docs = nlp.pipe(text_segments, batch_size=batch_size, n_process=3)  # Adjust batch_size and n_process as needed

    for doc in docs:
        for sent in doc.sents:
            sentences.append(sent.text.strip())

    return sentences



# Function to process nested tags and collect sentences
def call_sentence_tags(ch):
    """
    Collect all the text from nested tags and process them in batches using spaCy for faster sentence splitting.
    
    Args:
    - ch (BeautifulSoup Tag): The parent tag to process and extract text from nested tags.

    Returns:
    - List of sentences extracted from the nested tags.
    """
    text_segments = []

    # Collect all text segments to be processed
    def collect_text_segments(node):
        for gch in node.children:
            if isinstance(gch, str):
                continue  # Skip strings directly under ch
            if gch.name in ['article-title', 'title', 'subtitle', 'trans-title', 'trans-subtitle', 'alt-title', 'label', 'td', 'th']:
                if gch.find('p', recursive=False):
                    collect_text_segments(gch)
                else:
                    text = gch.get_text(separator=' ', strip=True)
                    if text:
                        text_segments.append(text)
            elif gch.name in ["sec", "fig", "statement", "div", "boxed-text", "list", "list-item", "disp-quote", "speech",
                              "fn-group", "fn", "def-list", "def-item", "def", "ack", "array", "table-wrap", "table",
                              "tbody", "thead", "tr", "caption", "answer", "sec-meta", "glossary", "question", "question-wrap"]:
                collect_text_segments(gch)
            elif gch.name == 'p':
                sub_text = gch.get_text(separator=' ', strip=True)
                if sub_text:
                    text_segments.append(sub_text)
            else:
                text = gch.get_text(separator=' ', strip=True)
                if text:
                    text_segments.append(text)

    # Recursively collect text segments
    collect_text_segments(ch)


    return batch_sentence_split(text_segments)


# Function to process paragraph tags
from bs4 import Tag

def process_p_tag(p_tags):
    """
    Process multiple <p> tags in batch, collecting text and splitting into sentences using spaCy.
    
    Args:
    - p_tags (list of BeautifulSoup Tag): List of <p> tags to process.

    Returns:
    - List of sentences extracted from the <p> tags.
    """
    text_segments = []

    # Collect text from each <p> tag
    for gch in p_tags:
        if isinstance(gch, Tag):  # Ensure gch is a Tag object
            if not (len(gch.contents) == 1 and 
                    (not gch.contents[0].string) and 
                    (gch.contents[0].name in ["ext-link", "e-mail", "uri", "inline-supplementary-material", 
                                              "related-article", "related-object", "address", "alternatives", 
                                              "array", "funding-source", "inline-graphic"])):
                text = gch.get_text(separator=' ', strip=True)
                if text:
                    text_segments.append(text)

    return batch_sentence_split(text_segments)



# Function to process the front section
def process_front(front):
    sections = {}
    keywords = []
    
    if front.find('article-meta'):
        art_meta = front.find('article-meta')
        
        for ch in art_meta.find_all(recursive=False):
            if ch.name in ['title-group', 'supplement', 'supplementary-material', 'abstract', 'trans-abstract', 'kwd-group', 'funding-group']:
                section_title = ch.name.upper()
                
                if section_title == 'KWD-GROUP':
                    # Extract keywords as a list from kwd-group
                    keywords = [kwd.text.strip() for kwd in ch.find_all('kwd')]
                else:
                    sentences = call_sentence_tags(ch)
                    if sentences:
                        sections.setdefault(section_title, []).extend(sentences)
            else:
                pass  # Ignore other tags
    
    return sections, keywords

# Function to process the back section
def process_back(back):
    sections = {}
    for ch in back.find_all(recursive=False):
        if ch.name in ['sec', 'p', 'ack', 'alternatives', 'array', 'preformat', 'fig', 'fig-group', 'question-wrap',
                 'question-wrap-group', 'list', 'table-wrap-group', 'table-wrap', 'display-formula',
                 'display-formula-group', 'def-list', 'list', 'supplementary-material', 'kwd-group',
                 'funding-group', 'statement', 'ref-list', 'glossary']:
            # Sections with titles
            if ch.name == 'ref-list':
                sentences = reference_sents(ch)
                if sentences:
                    sections.setdefault('REF', []).extend(sentences)
            else:
                title = ch.find('title')
                if title:
                    section_title = title.get_text(separator=' ', strip=True).strip().upper()
                else:
                    section_title = ch.name.upper()
                sentences = call_sentence_tags(ch)
                if sentences:
                    sections.setdefault(section_title, []).extend(sentences)
        else:
            pass  # Ignore other tags
    return sections

def reference_sents(ref_list):
    """
    Process reference sentences by collecting text from <ref> tags and using batch processing to split into sentences.
    
    Args:
    - ref_list (BeautifulSoup Tag): The parent tag containing references.

    Returns:
    - List of sentences extracted from the references.
    """
    text_segments = []

    # Collect text from <ref> tags and other relevant nested tags
    for ch in ref_list.children:
        if isinstance(ch, str):
            continue  # Skip strings directly under ref_list
        if ch.name == 'ref':
            sub_text = ''
            for gch in ch.children:
                if isinstance(gch, str):
                    continue
                sub_text += " " + " ".join([d.string for d in gch.descendants if d.string])
            if sub_text:
                text_segments.append(sub_text.strip())
        elif ch.name in ["sec", "fig", "statement", "div", "boxed-text", "list", "list-item", "disp-quote", "speech",
                         "fn-group", "fn", "def-list", "def-item", "def", "ack", "array", "table-wrap", "table",
                         "tbody", "caption", "answer", "sec-meta", "glossary", "question", "question-wrap"]:
            sub_sentences = call_sentence_tags(ch)
            text_segments.extend(sub_sentences)


    return batch_sentence_split(text_segments)


# Function to match section titles to predefined section types
def titleMatch(title, secFlag):
    matchKeys = []
    # Check if the flag is 'body' or 'back' and apply the respective mappings
    if secFlag == 'body':
        titleMaps = compiled_titleMapsBody
        exactMaps = compiled_titleExactMapsBody
    else:
        titleMaps = compiled_titleMapsBack
        exactMaps = {}

    title_lower = title.lower().strip()
    # Check exact matches first
    for key, patterns in exactMaps.items():
        if title_lower in patterns:
            matchKeys.append(key)
            break  # If exact match found, no need to check further

    # If no exact match, check regex patterns
    if not matchKeys:
        for key, patterns in titleMaps.items():
            if any(pattern.search(title_lower) for pattern in patterns):
                matchKeys.append(key)

    return ','.join(matchKeys) if matchKeys else None

# Function to apply section tagging to the soup object
def section_tag(soup):
    # Add Figure sections
    for fig in soup.find_all('fig', recursive=True):
        if not fig.find_all('fig', recursive=True):
            fig_tag = createSecTag(soup, 'FIG')
            fig.wrap(fig_tag)
    
    # Add Table sections
    for table in soup.find_all('table-wrap', recursive=True):
        if not table.find_all('table-wrap', recursive=True):
            table_tag = createSecTag(soup, 'TABLE')
            table.wrap(table_tag)

    # Process front section
    if soup.front:
        if soup.front.abstract:
            secAbs = createSecTag(soup, 'ABSTRACT')
            soup.front.abstract.wrap(secAbs)
        if soup.front.find('kwd-group'):
            secKwd = createSecTag(soup, 'KEYWORD')
            soup.front.find('kwd-group').wrap(secKwd)

    # Process body section
    if soup.body:
        for sec in soup.body.find_all('sec', recursive=False):
            title = sec.find('title')
            if title:
                title_text = title.get_text(separator=' ', strip=True)
                mappedTitle = titleMatch(title_text, 'body')
                if mappedTitle:
                    secBody = createSecTag(soup, mappedTitle)
                    sec.wrap(secBody)
    # Process back sections
    if soup.back:
        for sec in soup.back.find_all(['sec', 'ref-list', 'app-group', 'ack', 'glossary', 'notes', 'fn-group'], recursive=False):
            if sec.name == 'ref-list':
                secRef = createSecTag(soup, 'REF')
                sec.wrap(secRef)
            else:
                title = sec.find('title')
                if title:
                    title_text = title.get_text(separator=' ', strip=True)
                    mappedTitle = titleMatch(title_text, 'back')
                    if mappedTitle:
                        secBack = createSecTag(soup, mappedTitle)
                        sec.wrap(secBack)

# Function to process the body section
def process_body(body):
    sections = {}
    for ch in body.find_all(recursive=False):
        if ch.name == 'p':
            sentences = process_p_tag(ch)
            sections.setdefault('BODY', []).extend(sentences)
        elif ch.name in ['sec', 'ack', 'alternatives', 'array', 'preformat', 'fig', 'fig-group', 'question-wrap', 'list', 'table-wrap-group', 'table-wrap', 'display-formula', 'display-formula-group', 'def-list', 'list', 'supplementary-material', 'kwd-group', 'funding-group', 'statement']:
            title = ch.find('title')
            if title:
                section_title = title.get_text(separator=' ', strip=True).strip().upper()
            else:
                section_title = ch.name.upper()
            sentences = call_sentence_tags(ch)
            if sentences:
                sections.setdefault(section_title, []).extend(sentences)
    return sections

# Main function to process each article and collect data
def process_full_text(each_file):

    # Replace body tag with orig_body to prevent BeautifulSoup from removing it
    each_file = re.sub(r'<body(\s[^>]*)?>', '<orig_body\\1>', each_file)
    each_file = each_file.replace('</body>', '</orig_body>')
    try:
        xml_soup = BeautifulSoup(each_file, 'lxml')
        # Remove extra html and body tags added by BeautifulSoup
        if xml_soup.html:
            xml_soup.html.unwrap()
        if xml_soup.body:
            xml_soup.body.unwrap()
        if xml_soup.find('orig_body'):
            xml_soup.find('orig_body').name = 'body'

        # Extract attributes from the <article> tag
        article_tag = xml_soup.find('article')
        if article_tag:
            open_status = article_tag.get('open-status', '')
            article_type = article_tag.get('article-type', '')
        else:
            open_status = ''
            article_type = ''

        # Extract article IDs
        article_ids = {}
        for id_tag in xml_soup.find_all('article-id'):
            id_type = id_tag.get('pub-id-type', 'unknown')
            article_ids[id_type] = id_tag.text.strip()
        if not article_ids:
            print('No article IDs found')
            return None

        # Apply section tagging
        section_tag(xml_soup)
        
        sections = {}
        keywords = []

        # Process sections under SecTag
        for sec_tag in xml_soup.find_all('SecTag'):
            sec_type = sec_tag.get('type', 'unknown').strip().upper()
            if sec_type == 'KEYWORD':
                # Extract keywords
                keywords = [kwd.text.strip() for kwd in sec_tag.find_all('kwd')]
                continue  # Skip further processing of keywords here
            if sec_type not in sections:
                sections[sec_type] = []
            # Exclude nested 'SecTag's to avoid duplicate text
            for nested_sec in sec_tag.find_all('SecTag', recursive=True):
                nested_sec.extract()
            sentences = call_sentence_tags(sec_tag)
            sections[sec_type].extend(sentences)

        # Process front section if not already processed
        if xml_soup.article.find('front'):
            front_sections, front_keywords = process_front(xml_soup.article.find('front'))
            for k, v in front_sections.items():
                sections.setdefault(k, []).extend(v)
            if front_keywords:
                keywords.extend(front_keywords)

        # Process body section if not already processed
        if xml_soup.article.find('body'):
            body_sections = process_body(xml_soup.article.find('body'))
            for k, v in body_sections.items():
                sections.setdefault(k, []).extend(v)
        
        # Process back section if not already processed
        if xml_soup.article.find('back'):
            back_sections = process_back(xml_soup.article.find('back'))
            for k, v in back_sections.items():
                sections.setdefault(k, []).extend(v)

        # Remove empty sections
        sections = {k: v for k, v in sections.items() if v}

        return {
            'article_ids': article_ids,
            'open_status': open_status,
            'article_type': article_type,
            'keywords': keywords,
            'sections': sections
        }

    except Exception as e:
        print(f"Error processing article: {e}")
        return None

# Function to process each article and write to output file
def process_each_article(each_file_path, out_file, document_flag):
    files_list = getfileblocks(each_file_path, document_flag)
    with open(out_file, 'w', encoding='utf-8') as out:
        for each_file in tqdm(files_list, desc="Processing Articles", disable=False):
            if document_flag == 'f':
                data = process_full_text(each_file)
            else:
                print('Document type not supported.')
                continue
            if data:
                out.write(json.dumps(data) + '\n')



In [11]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

def process_json(data, ordered_labels):
    # Step 1: Initialize sections and directly map TITLE-GROUP to TITLE if present
    sections = data['sections']
    if "TITLE-GROUP" in sections:
        sections["TITLE"] = sections.pop("TITLE-GROUP")
    
    # Step 2: Identify keys in sections not present in ordered_labels
    section_keys = set(sections.keys())
    ordered_labels_set = set(ordered_labels)
    unfound_keys = section_keys - ordered_labels_set  # Keys in sections not in ordered_labels

    # Step 3: Normalize only the unfound section keys (remove spaces, uppercase)
    normalized_unfound_keys = {key.replace(" ", "").upper(): key for key in unfound_keys}
    
    # Step 4: Map unfound normalized keys to ordered labels using fuzzy matching (threshold 80%)
    mapped_labels = {}
    for normalized_key, original_key in normalized_unfound_keys.items():
        # Perform fuzzy matching
        match, score = process.extractOne(normalized_key, ordered_labels, scorer=fuzz.partial_ratio)
        if score >= 80:
            mapped_labels[original_key] = match
        else:
            mapped_labels[original_key] = original_key  # Keep original if no close match
    
    # Step 5: Structure JSON without ordering or sent_id for now
    result_json = {}
    for section_key in sections:
        label = mapped_labels.get(section_key, section_key)  # Use mapped label if exists, else original
        result_json[label] = [{"text": text} for text in sections[section_key]]

    # Step 6: Reorder JSON according to ordered_labels and add any unmapped sections at the end
    ordered_json = {}
    for label in ordered_labels:
        if label in result_json:
            ordered_json[label] = result_json.pop(label)
    ordered_json.update(result_json)  # Add remaining sections in their original order
    
    # Step 7: Assign unique incremental sent_id starting from 1
    sent_id = 1
    for section in ordered_json.values():
        for entry in section:
            entry["sent_id"] = sent_id
            sent_id += 1  # Increment sent_id for each entry uniquely
    
    # Update the original data with the modified sections
    data['sections'] = ordered_json

    return data


In [12]:
!ls

output1.jsonl	    output_no_batch.jsonl	sentenciser-Copy2.ipynb
output2.jsonl	    patch-07-10-2024-0.xml.gz	sentenciser-Copy3.ipynb
outputa.jsonl	    patch-28-01-2023-21.xml.gz	sentenciser-Copy4.ipynb
output_batch.jsonl  profile_stats		sentenciser.ipynb
output.jsonl	    sentenciser-Copy1.ipynb	xx.py


In [13]:
input_file =  'patch-28-01-2023-21.xml.gz' #'patch-07-10-2024-0.xml.gz'
output_file ='output_batch.jsonl' 
document_flag = 'f'

In [19]:
import cProfile
import pstats

if __name__ == '__main__':
    cProfile.run('process_each_article(input_file, output_file, document_flag)', 'profile_stats')

    p = pstats.Stats('profile_stats')
    p.sort_stats('cumtime').print_stats(20)  # Print top 20 functions by cumulative time


Processing Articles: 100%|████████████████████████████████████████████████████████| 1000/1000 [17:17<00:00,  1.04s/it]

Sun Oct 27 01:56:31 2024    profile_stats

         662357480 function calls (662102723 primitive calls) in 1039.090 seconds

   Ordered by: cumulative time
   List reduced from 529 to 20 due to restriction <20>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000 1039.090 1039.090 {built-in method builtins.exec}
        1    0.001    0.001 1039.090 1039.090 <string>:1(<module>)
        1    0.225    0.225 1039.089 1039.089 /tmp/ipykernel_2589526/1224015570.py:412(process_each_article)
     1000    0.777    0.001 1035.152    1.035 /tmp/ipykernel_2589526/1224015570.py:322(process_full_text)
    13827    5.401    0.000  951.370    0.069 /tmp/ipykernel_2589526/1224015570.py:46(batch_sentence_split)
   186078    3.123    0.000  941.021    0.005 /home/stirunag/falconframes_env/lib/python3.10/site-packages/spacy/language.py:1534(pipe)
    13037    0.332    0.000  936.118    0.072 /tmp/ipykernel_2589526/1224015570.py:70(call_sentence_tags)
   18




In [None]:
# process_each_article(input_file, output_file, document_flag)

In [14]:
ss = getfileblocks(input_file, document_flag)


In [15]:
len(ss)

1000

In [17]:
import time 
start_time = time.time()  # Record the start time
tt = process_full_text(ss[191])
end_time = time.time()  # Record the end time
elapsed_time = end_time - start_time  # Calculate the elapsed time
print(f"Execution time: {elapsed_time:.4f} seconds")

print("Structured JSON in strict order with sent_ids starting from 1:")
# pprint.pprint(tt, indent=2)
tt

Execution time: 0.8288 seconds
Structured JSON in strict order with sent_ids starting from 1:


{'article_ids': {'pmcid': '9878372',
  'publisher-id': 'v12i1e41533',
  'pmid': '36630158',
  'doi': '10.2196/41533'},
 'open_status': 'O',
 'article_type': 'research-article',
 'keywords': ['general practice',
  'vital signs/methods',
  'vital signs/standards',
  'photoplethysmography',
  'remote photoplethysmography',
  'rPPG',
  'Lifelight',
  'contactless',
  'software'],
 'sections': {'ABSTRACT': ['Background Measuring vital signs (VS) is an important aspect of clinical care but is time-consuming and requires multiple pieces of equipment and trained staff.',
   'Interest in the contactless measurement of VS has grown since the COVID-19 pandemic, including in nonclinical situations.',
   'Lifelight is an app being developed as a medical device for the contactless measurement of VS using remote photoplethysmography (rPPG) via the camera on smart devices.',
   'The VISION-D (Measurement of Vital Signs by Lifelight Software in Comparison to the Standard of Care—Development) and VISION

In [18]:
tt['sections'].keys()

dict_keys(['ABSTRACT', 'INTRO', 'METHODS', 'TABLE', 'RESULTS', 'DISCUSS', 'ABBR', 'REF', 'TITLE-GROUP', 'ACK'])

In [None]:
ordered_labels = ['TITLE', 'ABSTRACT', 'INTRO', 'METHODS', 'RESULTS', 'DISCUSS', 'CONCL', 'CASE', 'ACK_FUND', 'AUTH_CONT', 'COMP_INT', 'ABBR', 'SUPPL', 'REF', 'ACK_FUND', 'ABBR', 'COMP_INT', 'SUPPL', 'APPENDIX', 'AUTH_CONT']

yy = process_json(tt, ordered_labels)
pprint.pprint(yy, indent=2)

In [None]:
yy

In [None]:
'Title', 'Abstract'. 'Methods', 'Results', 

In [None]:
import time

def measure_execution_time(func, *args, **kwargs):
    """
    Measure the execution time of a function.
    
    Args:
    - func (function): The function to measure.
    - *args: Positional arguments to pass to the function.
    - **kwargs: Keyword arguments to pass to the function.

    Returns:
    - result: The result of the function execution.
    - elapsed_time: The time taken to execute the function in seconds.
    """
    start_time = time.time()  # Record the start time
    result = func(*args, **kwargs)  # Execute the function
    end_time = time.time()  # Record the end time
    elapsed_time = end_time - start_time  # Calculate the elapsed time
    print(f"Execution time: {elapsed_time:.4f} seconds")
    return result, elapsed_time
