In [257]:
#!pip install lxml

In [258]:
# --- 0. Environment Setup & Offline Preparation ---

# Standard Imports
import os
import glob
import re
import pandas as pd
import lxml.etree as etree
from lxml.etree import _Element as Element # Type hinting for lxml.etree.Element
import collections # For deque in parenthesis removal
import fitz # PyMuPDF for PDF processing
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.modeling_utils import PreTrainedModel
from transformers.tokenization_utils import PreTrainedTokenizer
from transformers.utils.quantization_config import BitsAndBytesConfig
from transformers.training_args import TrainingArguments
from trl import SFTTrainer
import torch
from datasets import Dataset # Hugging Face datasets library
import kagglehub
import spacy
import json

# Set device for PyTorch
device = "cuda" if torch and torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")


Using device: cpu


In [259]:
# Uncomment and run this line once to download the model
#!python -m spacy download en_core_web_sm

In [260]:
# Define constants for file paths and model configurations
BASE_INPUT_DIR = './kaggle/input/make-data-count-finding-data-references'
ARTICLE_TRAIN_DIR = os.path.join(BASE_INPUT_DIR, 'train')
ARTICLE_TEST_DIR = os.path.join(BASE_INPUT_DIR, 'test')

# Define directories for articles in train and test sets
LABELED_TRAINING_DATA_CSV_PATH = os.path.join(BASE_INPUT_DIR, 'train_labels.csv')

# Define the base model path
QWEN_BASE_MODEL_PATH = kagglehub.model_download("qwen-lm/qwen-3/transformers/0.6b")

# Output directory for the fine-tuned model and results
BASE_OUTPUT_DIR = "./kaggle/working"
FINE_TUNED_MODEL_OUTPUT_DIR = os.path.join(BASE_OUTPUT_DIR, "qwen_finetuned_dataset_classifier")
FINAL_RESULTS_CSV_PATH = os.path.join(BASE_OUTPUT_DIR, "article_dataset_classification.csv")

# Load a spaCy model (e.g., 'en_core_web_sm')
# python -m spacy download en_core_web_sm 
NLP_SPACY = spacy.load("en_core_web_sm")


In [291]:
# --- 2. Information Extraction (IE) - Dataset Identification ---

# Regex patterns for common dataset identifiers
# DOI_PATTERN = r'10\.\d{4,5}/[-._;()/:A-Za-z0-9\u002D\u2010\u2011\u2012\u2013\u2014\u2015]+'	DOI_PATTERN
# DOI_PATTERN = r'10\.\s?\d{4,5}\/[-._()<>;\/:A-Za-z0-9]+\s?(?:(?![A-Z]+)(?!\d{1,3}\.))+[-._()<>;\/:A-Za-z0-9]+'
#DOI_PATTERN = r'\bhttps://doi.org/10\.\d{4,5}\/[-._\/:A-Za-z0-9]+'
DOI_PATTERN = r'\b10\.\d{4,5}\/[-._\/:A-Za-z0-9]+'
EPI_PATTERN = r'\bEPI[-_A-Z0-9]{2,}'
SAM_PATTERN = r'\bSAMN[0-9]{2,}'          # SAMN07159041
IPR_PATTERN = r'\bIPR[0-9]{2,}'
CHE_PATTERN = r'\bCHEMBL[0-9]{2,}'
PRJ_PATTERN = r'\bPRJ[A-Z0-9]{2,}'
E_G_PATTERN = r'\bE-[A-Z]{4}-[0-9]{2,}'   # E-GEOD-19722 or E-PROT-100
ENS_PATTERN = r'\bENS[A-Z]{4}[0-9]{2,}'
CVC_PATTERN = r'\bCVCL_[A-Z0-9]{2,}'
EMP_PATTERN = r'\bEMPIAR-[0-9]{2,}'
PXD_PATTERN = r'\bPXD[0-9]{2,}'
HPA_PATTERN = r'\bHPA[0-9]{2,}'
SRR_PATTERN = r'\bSRR[0-9]{2,}'
GSE_PATTERN = r'\b(GSE|GSM|GDS|GPL)\d{4,6}\b' # Example for GEO accession numbers (e.g., GSE12345, GSM12345)
GNB_PATTERN = r'\b[A-Z]{1,2}\d{5,6}\b' # GenBank accession numbers (e.g., AB123456, AF000001)
CAB_PATTERN = r'\bCAB[0-9]{2,}'

# Combine all patterns into a list
DATASET_ID_PATTERNS = [
    DOI_PATTERN,
    EPI_PATTERN,
    SAM_PATTERN,
    IPR_PATTERN,
    CHE_PATTERN,
    PRJ_PATTERN,
    E_G_PATTERN,
    ENS_PATTERN,
    CVC_PATTERN,
    EMP_PATTERN,
    PXD_PATTERN,
    HPA_PATTERN,
    SRR_PATTERN,
    GSE_PATTERN,
    GNB_PATTERN,
    CAB_PATTERN,
]

# Compile all patterns for efficiency
COMPILED_DATASET_ID_REGEXES = [re.compile(p) for p in DATASET_ID_PATTERNS]

# Data related keywords to look for in the text
# These keywords help to ensure that the text is relevant to datasets
DATA_RELATED_KEYWORDS = ['data release', 'download', 'program data', 'data availability', 'the data', 'dataset', 'database', 'repository', 'data source', 'data access', 'archive', 'arch.', 'digital']

def is_text_data_related(text: str) -> bool:
    if not text:
        return False
    
    text_lower = text.lower()
    return any(keyword in text_lower for keyword in DATA_RELATED_KEYWORDS)

def text_has_dataset_id(text: str) -> bool:
    """
    Check if the given text contains any dataset identifier.
    
    Args:
        text (str): The text to check for dataset identifiers.
        
    Returns:
        bool: True if any dataset identifier is found, False otherwise.
    """

    occurrences_with_context: list[str] = []
    for regex in COMPILED_DATASET_ID_REGEXES:
        if regex.search(text):
            text_lower = text.lower()
            # Check for specific keywords in the text
            if any(keyword in text_lower for keyword in DATA_RELATED_KEYWORDS):
                return True
    return False

def extract_dataset_ids(text: str, context_chars: int = 250) -> str:
    """
    Extract dataset identifiers with context from the given text.
    
    Args:
        text (str): The text to search for dataset identifiers.
        context_chars (int): Number of characters to include before and after the match for context.
        
    Returns:
        list[str]: A list of extracted dataset identifiers with context.
    """
    text = text.replace('\u200b', '')
    is_small_context = len(text) < context_chars * 2
    dataset_ids: list[str] = []
    occurrences_with_context: list[str] = []
    if is_text_data_related(text):
        for regex in COMPILED_DATASET_ID_REGEXES:
            matches = regex.finditer(text, re.IGNORECASE)
            for match in matches:
                dataset_id = text[match.start() : match.end()]
                if is_small_context:
                    dataset_ids.append(dataset_id)
                else:
                    extracted_snippet = text[max(0, match.start() - context_chars): match.end() + context_chars ]
                    extracted_snippet = extracted_snippet.replace('\n', '').replace('[', '').replace(']', '')
                    extracted_snippet = re.sub(r'\s+', ' ', extracted_snippet).strip()
                    if is_text_data_related(extracted_snippet):
                        occurrences_with_context.append("{" + f'"dataset_ids": {[dataset_id]}, citation_context: "{extracted_snippet}"' + "}")
        if dataset_ids:
            occurrences_with_context.append("{" + f'"dataset_ids": {dataset_ids}, citation_context: "{text}"' + "}")
    
    # If no occurrences found, return an empty string
    # Otherwise, join the occurrences with a specific separator
    if not occurrences_with_context:
        return ""
    return ",".join(occurrences_with_context)

In [262]:
# Use NLP to get sentences from the given text
NON_STD_UNICODE_DASHES = re.compile(r'[\u2010\u2011\u2012\u2013\u2014]')
def get_sentences_from_text(text: str, nlp=NLP_SPACY) -> str:
    if not text:
        return ""
    
    # Replace all non-standard unicode dashes with '-'
    text = NON_STD_UNICODE_DASHES.sub('-', text)
    text = text.replace('-\n', '-').replace('_\n', '_').replace('/\n', '/').replace('\n', ' ').replace('\u200b', '').strip()
    doc_spacy = nlp(text)
    return "\n".join([sent.text for sent in doc_spacy.sents])

In [263]:
def extract_author_names(full_text: str, nlp=NLP_SPACY) -> str:
    """
    Extracts potential author names from the beginning of a research article's text
    using spaCy's Named Entity Recognition. It attempts to isolate the author section
    and applies heuristics to filter out non-author entities.

    Args:
        full_text (str): The complete text content of the research article,
                         typically extracted from a PDF.

    Returns:
        List[str]: A list of unique strings, each representing a potential author name,
                   sorted alphabetically. Returns an empty list if no authors are found.
    """
    if not full_text or not full_text.strip():
        return []

    full_text = full_text.replace('1\n,', ',').replace('1,', ',').replace('\u2019', "'")

    # 1. Isolate the potential author section
    # Authors are typically at the very beginning, before the abstract or introduction.
    # We'll search for common section headers to define the end of the author block.
    # Using regex for case-insensitive search and handling various newline/spacing.
    header_patterns = [
        r"\n\s*Abstract\s*\n",
        r"\n\s*Introduction\s*\n",
        r"\n\s*Summary\s*\n",
        r"\n\s*Keywords\s*\n",
        r"\n\s*Graphical Abstract\s*\n",
        r"\n\s*1\.\s*Introduction\s*\n", # Common for numbered sections
        r"\n\s*DOI:\s*\n" # Sometimes DOI appears before abstract
    ]

    author_section_end_index = len(full_text)
    for pattern in header_patterns:
        match = re.search(pattern, full_text, re.IGNORECASE)
        if match:
            # Take text up to the start of the found header
            author_section_end_index = min(author_section_end_index, match.start())
            break
    
    # As a fallback or if no header is found early, limit the search to the first
    # 2500 characters. This prevents processing the entire document for authors.
    author_section_text = full_text[:min(author_section_end_index, 2500)]

    if not author_section_text.strip():
        return []

    # 2. Process the isolated author section with spaCy
    doc = nlp(author_section_text)

    # 3. Extract PERSON entities and apply initial filtering
    potential_authors: set[str] = set()
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            name = ent.text.strip()
            # Basic filtering to reduce false positives:
            # - Exclude very short strings (e.g., single letters, common conjunctions)
            # - Exclude common stop words (e.g., "The", "And")
            # - Exclude all-uppercase strings that might be acronyms (e.g., "WHO", "NASA")
            # - Ensure it contains at least one space (e.g., "John Doe") or is a capitalized
            #   single word that's longer than 2 characters (e.g., "Smith").
            if (len(name) > 1 and
                name.lower() not in nlp.Defaults.stop_words and
                not name.isupper() and
                (' ' in name or (name[0].isupper() and len(name) > 2))):
                
                potential_authors.add(name)

    # 4. Apply more advanced heuristics to filter out non-author names
    # This step is crucial for accuracy and often requires tuning.
    filtered_authors = set()
    for author in potential_authors:
        # Heuristic 1: Filter out names that contain common affiliation keywords.
        # This is a simple check; more robust solutions might use spaCy's dependency
        # parsing to check if a PERSON entity is part of an ORG entity.
        affiliation_keywords = ["univ", "observ", "institute", "department", "center", "lab",
                                "hospital", "college", "school", "inc.", "ltd.", "company",
                                "corp.", "group", "foundation", "research"]
        if any(keyword in author.lower() for keyword in affiliation_keywords):
            continue # Skip if it looks like an affiliation

        # Heuristic 2: Filter out names that contain email patterns or ORCID patterns.
        if '@' in author or re.search(r'\b\d{4}-\d{4}-\d{4}-\d{3}[\dX]\b', author):
            continue # Skip if it contains an email or ORCID

        # Heuristic 3: Filter out names that are likely just initials or very short.
        # This is partially covered by initial filtering, but can be refined.
        # E.g., "J. D." might be an author, but "J." alone is unlikely.
        if len(author.split()) == 1 and len(author) <= 2 and author.isupper():
            continue # Skip single-letter or two-letter uppercase (e.g., "JD")

        filtered_authors.add(author)

    # Convert to list and sort for consistent output
    return ", ".join(list(filtered_authors))

In [264]:

def extract_element_text(element: Element | None) -> str:
    if element is not None:
        # Use itertext() to get all text content from the <p> tag and its descendants
        # and join them into a single string.
        all_text = " ".join(element.itertext(tag=None)).replace('\u200b', '').strip()
        return all_text[:2000]
    else:
        return ""
    
def extract_next_sibling_text(elements: list[Element] | None, sibling_xpath: str) -> str:
    """
    Extracts text from the next sibling of the given XML element.
    
    Args:
        element (Element | None): The XML element whose next sibling's text is to be extracted.
        sibling_xpath (str): The XPath expression to find the next sibling element. (eg. "following-sibling::passage[1]")
        
    Returns:
        str: A string containing the text from the next sibling element, or an empty string if no sibling exists.

    """
    # Check if the provided elements list is None or empty
    if not elements:
        return ""
    
    # Assuming there's only one such element, take the first one found
    # and find the element immediately following based on the given sibling_xpath.
    first_element = elements[0]
    sibling_elements = first_element.xpath(sibling_xpath)

    if not sibling_elements:
        # print("DEBUG: No following <passage> element found.") # Uncomment for debugging
        return ""
    
    next_sibling = sibling_elements[0]
    if next_sibling is None:
        return ""
    
    return extract_element_text(next_sibling)

def extract_elements_text(elements: list[Element] | None, sep: str = " ") -> str:
    elements_text = []
    if elements is None:
        return ""
    
    for element in elements:
        text = extract_element_text(element)
        if text:
            elements_text.append(text)

    return sep.join(elements_text).strip()

def extract_elements_text_from_xpath_list(root: Element | None, xpath_list: list[str], ns: dict[str, str] | None = None) -> str:
    elements_text = ""
    if root is None or not xpath_list:
        return ""
    
    for xpath in xpath_list:
        element = root.find(xpath, namespaces=ns)
        elements_text += extract_element_text(element)
    return elements_text

def extract_text_from_elements_within_element(element: Element | None, child_xpaths: list[str] = [], ns: dict[str, str] | None = None) -> str:
    """
    Extracts text from elements within a given XML element that match the specified tag names.
    
    Args:
        element (Element | None): The XML element to search within.
        tag_names (list[str]): A list of tag names to search for.
        
    Returns:
        str: A string containing the extracted text from the matching elements.
    """
    if element is None:
        return ""
    
    if not child_xpaths:
        # If no child tag names are provided, return the text of the element itself
        return extract_element_text(element)
    
    extracted_text = []
    for xpath in child_xpaths:
        for child in element.findall(xpath, namespaces=ns):
            text = extract_element_text(child)
            if text:
                extracted_text.append(text)
    
    return "|".join(extracted_text)

def extract_data_related_elements_text(elements: list[Element] | None, child_xpaths: list[str] = [], ns: dict[str, str] | None = None) -> list[str]:
    elements_text = []
    if elements is None:
        return elements_text
    
    for element in elements:
        text = extract_dataset_ids(extract_text_from_elements_within_element(element, child_xpaths, ns))
        if text:
            elements_text.append(text)

    return elements_text

def extract_data_related_elements_text_from_xpath_list(root: Element | None, xpath_list: list[str], ns: dict[str, str] | None = None) -> list[str]:
    """
    Extracts text from elements in the XML tree that match the provided XPath expressions.
    
    Args:
        root (Element | None): The root element of the XML tree.
        xpath_list (list[str]): A list of XPath expressions to search for elements.
        
    Returns:
        list[str]: A list of extracted text from the matching elements.
    """
    elements_text = []
    if root is None or not xpath_list:
        return elements_text
    
    for xpath in xpath_list:
        primary_xpath, *child_xpath_text = xpath.split('||')
        child_xpaths = child_xpath_text[0].split(',') if child_xpath_text else []
        elements = root.findall(primary_xpath, namespaces=ns)
        if elements:
            elements_text.extend(extract_data_related_elements_text(elements, child_xpaths, ns))
    return elements_text


In [265]:
def extract_pdf_text(pdf_doc: fitz.Document)  -> dict[str, str | list[str]]:
    """
    Extracts all text from a PDF document using PyMuPDF.
    
    Args:
        pdf_doc (fitz.Document): The PDF document to extract text from.
        
    Returns:
        str: A JSON string of the article_dict containing specific elements extracted from the PDF.
    """

    # Initialize the article dictionary with empty strings
    article_dict = {
        'title': '',
        'authors': '',
        'abstract': '',
        'data_availability': '',
        'other_dataset_citations': []
    }

    # Initialize variables for text extraction
    p1 = None  # Placeholder for the first page text
    other_dataset_citations = set()  # Use a set to avoid duplicates
    for page in pdf_doc:
        # Extract text from the page
        textpage = page.get_textpage()
        if page.number == 0:
            p1_txt = textpage.extractTEXT()
            p1 = get_sentences_from_text(p1_txt)
            p1 = p1[:int(len(p1)/2)]
            article_dict['authors'] = extract_author_names(p1_txt, nlp=NLP_SPACY)

        # Extract text from all blocks that have an abstract or dataset id's
        blocks = textpage.extractBLOCKS()
        for block in blocks:
            block_text = get_sentences_from_text(block[4])
            block_text_lower = block_text.lower()
            if page.number == 0 and len(block_text) > 100 and "abstract" in block_text_lower:
                # Add the abstract block text to the article dictionary
                article_dict['abstract'] = block_text
            elif "data availability" in block_text_lower or "data accessibility" in block_text_lower:
                # Add the data availability block text to the article dictionary
                article_dict['data_availability'] = block_text
            else:
                context_chars = min(250, len(block_text))  # Use a minimum
                dataset_ids_found = extract_dataset_ids(block_text, context_chars)  # Extract dataset IDs from the block text
                if dataset_ids_found:
                    # Append the dataset IDs found in the block to the other_dataset_citations
                    other_dataset_citations.add(dataset_ids_found)

    article_dict['other_dataset_citations'] = list(other_dataset_citations) if other_dataset_citations else []
    # If an abstract was not found, use the first page text as the abstract
    if not article_dict['abstract'] and p1:
        article_dict['abstract'] = p1

    # Return the article dictionary as a JSON string
    return article_dict


In [266]:

def extract_xml_text_jats(root: Element) -> dict[str, str | list[str]]:
    # Find the title, abstract, and data availablity info for Journal Archiving and Interchange DTD (JATS)
    # The ".//" ensures it searches anywhere in the document, not just direct children of root.
    ns = None  # No namespaces for JATS

    xpath_title = ".//article-title"
    xpath_authors_1 = ".//contrib-group/contrib[@contrib-type='author']/name"
    xpath_authors_2 = ".//biblstruct/analytic/author[@role='corresp']/persname"
    authors = extract_elements_text(root.findall(xpath_authors_1, namespaces=ns), sep=", ")
    if not authors:
        authors = extract_elements_text(root.findall(xpath_authors_2, namespaces=ns), sep=", ")
    xpath_abstract = ".//abstract"
    xpath_data_avails = [".//notes[@notes-type='data-availability']", ".//sec[@sec-type='data-availability']"]
    xpath_citations = [".//element-citation||.article-title,.source,.pub-id", ".//mixed-citation"]  # List of XPath expressions for citations

    return {
        'title': extract_element_text(root.find(xpath_title, ns)),
        'authors': authors,
        'abstract': extract_element_text(root.find(xpath_abstract, ns)),
        'data_availability': extract_elements_text_from_xpath_list(root, xpath_data_avails, ns=ns),
        'other_dataset_citations': extract_data_related_elements_text_from_xpath_list(root, xpath_citations, ns=ns),
    }


In [267]:
def extract_xml_text_tei(root: Element) -> dict[str, str | list[str]]:
    # Find the title, abstract, and data availability info for Text Encoding Initiative (TEI)
    # Set the namespace for TEI
    ns = {'tei': 'http://www.tei-c.org/ns/1.0'}

    xpath_title = ".//tei:title"
    xpath_authors = ".//tei:sourcedesc/tei:biblstruct/tei:analytic/tei:author/tei:persname"
    xpath_abstract = ".//tei:abstract"
    xpath_data_avail = "" #".//tei:biblstruct"
    xpath_citations = [".//tei:biblstruct||.//tei:title,.//tei:idno,.//tei:notes"]  # List of XPath expressions for citations
        
    return {
        'title': extract_element_text(root.find(xpath_title, namespaces=ns)),
        'authors': extract_elements_text(root.findall(xpath_authors, namespaces=ns), sep=", "),
        'abstract': extract_element_text(root.find(xpath_abstract, namespaces=ns)),
        'data_availability': xpath_data_avail,  # No direct extraction for TEI data_availability
        'other_dataset_citations': extract_data_related_elements_text_from_xpath_list(root, xpath_citations, ns=ns),
    }


In [268]:
def extract_xml_text_wiley(root: Element) -> dict[str, str | list[str]]:
    # Find the title, abstract, and data availability info for Wiley XML format
    # Set the namespace for Wiley
    ns = {'ns': 'http://www.wiley.com/namespaces/wiley'}

    xpath_title = ".//ns:publicationMeta[@level='part']/ns:titleGroup"    #<publicationMeta level="part"><titleGroup><title type="main">
    xpath_authors = ".//selfCitationGroup/citation[@type='self']/author"
    xpath_abstract = ".//ns:abstract[@type='main']"  #<abstract type="main"
    xpath_data_avail = ".//ns:section[@type='dataAvailability']"  #<section numbered="no" type="dataAvailability"
    xpath_citations = [".//ns:citation||.//ns:articleTitle,.//ns:journalTitle,.//ns:url"]  # List of XPath expressions for citations
        
    return {
        'title': extract_elements_text(root.findall(xpath_title, namespaces=ns)),
        'authors': extract_elements_text(root.findall(xpath_authors, namespaces=ns), sep=", "),
        'abstract': extract_element_text(root.find(xpath_abstract, namespaces=ns)),
        'data_availability': extract_element_text(root.find(xpath_data_avail, namespaces=ns)),
        'other_dataset_citations': extract_data_related_elements_text_from_xpath_list(root, xpath_citations, ns=ns),
    }

In [269]:
def extract_xml_text_bioc(root: Element) -> dict[str, str | list[str]]:
    # Find the title, abstract, and data availability info for BioC-API XML format
    ns = None  # No namespaces for BioC

    xpath_title = "string(.//passage[infon[@key='section_type' and text()='TITLE']]/text)"
    xpath_authors = "string(.//infon[@key='name_0'] | .//infon[@key='name_1'] | .//infon[@key='name_2'] | .//infon[@key='name_3'])"
    xpath_abstract = "string(.//passage[infon[@key='section_type' and text()='ABSTRACT']]/text)"
    xpath_data_avail = ".//passage[text[text()='DATA ACCESSIBILITY:']]"
    xpath_data_avail_sibling = "following-sibling::passage[1]"
    xpath_citations = []
        
    return {
        'title': root.xpath(xpath_title, namespaces=ns),
        'authors': root.xpath(xpath_authors, namespaces=ns).strip().replace('surname:', '').replace(';given-names:', ' '),
        'abstract': root.xpath(xpath_abstract, namespaces=ns)[:2000],  # Limit to 2000 characters
        'data_availability': extract_next_sibling_text(root.xpath(xpath_data_avail, namespaces=ns), xpath_data_avail_sibling),
        'other_dataset_citations': xpath_citations,
    }

In [270]:
def extract_xml_text_taxonx(root: Element) -> dict[str, str | list[str]]:
    # Find the title, abstract, and data availability info for TaxonX format
    ns = None  # No namespaces for Taxonomic Treatment Publishing DTD

    xpath_title = "string(.//article-meta/title-group/article-title)"
    xpath_authors = ""
    xpath_abstract = "string(.//article-meta/abstract)"
    xpath_data_avail = ""
    xpath_citations = []
        
    return {
        'title': root.xpath(xpath_title, namespaces=ns),
        'authors': xpath_authors,
        'abstract': root.xpath(xpath_abstract, namespaces=ns)[:2000],  # Limit to 2000 characters
        'data_availability': xpath_data_avail,  # No direct extraction for TaxonX data_availability
        'other_dataset_citations': xpath_citations,
    }

In [271]:
# Dictionary mapping XML types to their respective extraction functions
XML_TYPE_EXTRACTORS = {
    'jats': extract_xml_text_jats,
    'tei': extract_xml_text_tei,
    'wiley': extract_xml_text_wiley,
    'bioc': extract_xml_text_bioc,
    'taxonx': extract_xml_text_taxonx,
}

# --- Data Loading ---
def load_file_paths(dataset_type_dir: str) -> pd.DataFrame: 
    pdf_path = os.path.join(dataset_type_dir, 'PDF')
    xml_path = os.path.join(dataset_type_dir, 'XML')
    dataset_type = os.path.basename(dataset_type_dir)
    pdf_files = [f for f in os.listdir(pdf_path) if f.endswith('.pdf')]
    xml_files = [f for f in os.listdir(xml_path) if f.endswith('.xml')]
    df_pdf = pd.DataFrame({
        'article_id': [f.replace('.pdf', '') for f in pdf_files],
        'pdf_file_path': [os.path.join(pdf_path, f) for f in pdf_files]
    })
    df_xml = pd.DataFrame({
        'article_id': [f.replace('.xml', '') for f in xml_files],
        'xml_file_path': [os.path.join(xml_path, f) for f in xml_files]
    })
    merge_df = pd.merge(df_pdf, df_xml, on='article_id', how='outer', suffixes=('_pdf', '_xml'), validate="one_to_many")
    merge_df['dataset_type'] = dataset_type
    return merge_df

def read_pdf_text(file_path: str, xml_type: str | None = None) -> dict[str, str | list[str]]:
    """Extracts all text from a PDF file using PyMuPDF."""
    article_dict = {}
    if file_path and os.path.exists(file_path):
        try:
            with fitz.open(file_path) as doc:
                article_dict = extract_pdf_text(doc)  # Extract text from the PDF document
        except Exception as e:
            print(f"Error reading PDF {file_path}: {e}")
    else:
        print(f"PDF file not found: {file_path}")
    
    return article_dict

def read_xml_text(file_path: str, xml_type: str) -> dict[str, str | list[str]]:
    """Reads and extracts text from an XML file based on the specified XML type.
    Args:
        file_path (str): The path to the XML file.
        xml_type (str): The type of XML format (e.g., 'jats', 'tei', 'wiley', 'bioc', 'taxonx').
    Returns:
        dict: A dictionary containing the extracted text from the XML file.
    """
    # Initialize the article dictionary
    article_dict = {}
    if file_path and os.path.exists(file_path):
        # Disable external entity resolution for security
        parser = etree.XMLParser(resolve_entities=False, no_network=True)
        try:
            tree = etree.parse(file_path, parser)
            root = tree.getroot()
            # Use the appropriate extraction function based on the xml_type
            extract_function = XML_TYPE_EXTRACTORS.get(xml_type, extract_xml_text_jats)  
            article_dict = extract_function(root)
        except Exception as e:
            print(f"Error reading XML {file_path}: {e}")
    else:
        print(f"XML file not found: {file_path}")    
    return article_dict

def process_unsupported_file(file_path: str, xml_type: str | None = None) -> str:
    return f"Unsupported file type for: {file_path}"

# Dictionary mapping file extensions to loading functions
FILE_LOADERS = {
    '.xml': read_xml_text,
    '.pdf': read_pdf_text,
}

def load_article_text(file_path: str, xml_type: str | None = None) -> str:
    """
    Loads text content from a single article file (PDF or XML).
    Returns the text content of the given file.
    """
    text_content = ""

    # Get the file extension (e.g., '.xml', '.pdf')
    _, file_extension = os.path.splitext(file_path)
    file_extension = file_extension.lower() # Ensure lowercase for consistent lookup

    # Get the appropriate function from the dictionary,
    # or fall back to a default 'unsupported' function if not found.
    loader_function = FILE_LOADERS.get(file_extension, process_unsupported_file)

    # Call the selected function
    article_dict = loader_function(file_path, xml_type=xml_type)
    text_content = json.dumps(article_dict, separators=(',', ':'))
    print(f"Extracted text from {file_path}. Length: {len(text_content)} characters")

    return text_content


In [272]:
# Test loading various PDF files
pdf_file_path = os.path.join(ARTICLE_TRAIN_DIR, 'PDF', '10.1002_2017jc013030.pdf')
#pdf_file_path = os.path.join(ARTICLE_TRAIN_DIR, 'PDF', '10.1017_rdc.2022.19.pdf')
#pdf_file_path = os.path.join(ARTICLE_TRAIN_DIR, 'PDF', '10.1017_s0007123423000601.pdf')
#pdf_file_path = os.path.join(ARTICLE_TRAIN_DIR, 'PDF', '10.3389_fcimb.2024.1292467.pdf')
#pdf_file_path = os.path.join(ARTICLE_TRAIN_DIR, 'PDF', '10.1002_esp.5058.pdf') # This one is big
#pdf_file_path = os.path.join(ARTICLE_TRAIN_DIR, 'PDF', '10.1002_esp.5059.pdf') # This one is big
#pdf_file_path = os.path.join(ARTICLE_TRAIN_DIR, 'PDF', '10.1002_ece3.4466.pdf') # dryad
#pdf_file_path = os.path.join(ARTICLE_TRAIN_DIR, 'XML', '10.1002_ece3.4466.xml') # dryad
# pdf_file_path = os.path.join(ARTICLE_TRAIN_DIR, 'XML', '10.1002_mp.14424.xml')
# pdf_file_path = os.path.join(ARTICLE_TRAIN_DIR, 'XML', '10.1007_s00259-022-06053-8.xml')    # jats
pdf_file_path = os.path.join(ARTICLE_TRAIN_DIR, 'XML', '10.1007_s00382-022-06361-7.xml')    # tei
pdf_file_path = os.path.join(ARTICLE_TRAIN_DIR, 'XML', '10.1111_1365-2435.13431.xml')       # wiley
pdf_file_path = os.path.join(ARTICLE_TRAIN_DIR, 'XML', '10.1111_mec.16977.xml')             # bioc
# pdf_file_path = os.path.join(ARTICLE_TRAIN_DIR, 'XML', '10.3897_zoologia.35.e23481.xml')      # taxonx
# pdf_file_path = os.path.join(ARTICLE_TRAIN_DIR, 'XML', '10.1002_ece3.6144.xml')               # jats
file_text = load_article_text(pdf_file_path, 'bioc')
display(len(file_text))
file_text

Extracted text from ./kaggle/input/make-data-count-finding-data-references\train\XML\10.1111_mec.16977.xml. Length: 1824 characters


1824

'{"title":"Genetic basis of ecologically relevant body shape variation among four genera of cichlid fishes","authors":"DeLorenzo Leah","abstract":"Divergence in body shape is one of the most widespread and repeated patterns of morphological variation in fishes and is associated with habitat specification and swimming mechanics. Such ecological diversification is the first stage of the explosive adaptive radiation of cichlid fishes in the East African Rift Lakes. We use two hybrid crosses of cichlids (Metriaclima sp. \\u00c3\\u0097 Aulonocara sp. and Labidochromis sp. \\u00c3\\u0097 Labeotropheus sp., >975 animals total) to determine the genetic basis of body shape diversification that is similar to benthic-pelagic divergence across fishes. Using a series of both linear and geometric shape measurements, we identify 34 quantitative trait loci (QTL) that underlie various aspects of body shape variation. These QTL are spread throughout the genome, each explain 3.2\\u00e2\\u0080\\u00938.6% 

## EDA

In [273]:
# Load the labeled training data CSV file
print(f"Loading labeled training data from: {LABELED_TRAINING_DATA_CSV_PATH}")
train_labels_df = pd.read_csv(LABELED_TRAINING_DATA_CSV_PATH)

print(f"Training labels shape: {train_labels_df.shape}")
display(train_labels_df.head())

# Create a new column 'dataset_id_trim' by extracting the first 3 characters of the 'dataset_id' column
train_labels_df['dataset_id_trim'] = train_labels_df['dataset_id'].str[:3]
# Find the most frequent types of dataset_id's
freq_dataset_id_df = train_labels_df.groupby('dataset_id_trim').count().reset_index()
freq_dataset_id_df = freq_dataset_id_df[['dataset_id_trim', 'article_id']].sort_values(by='article_id', ascending=False)
print(f"Grouped dataset ID counts:\n{freq_dataset_id_df.head(10)}")

# Display the first 10 rows where dataset_id_trim is 'EPI'
display(train_labels_df[train_labels_df['dataset_id_trim'] == 'EPI'].sample(3))

Loading labeled training data from: ./kaggle/input/make-data-count-finding-data-references\train_labels.csv
Training labels shape: (1028, 3)


Unnamed: 0,article_id,dataset_id,type
0,10.1002_2017jc013030,https://doi.org/10.17882/49388,Primary
1,10.1002_anie.201916483,Missing,Missing
2,10.1002_anie.202005531,Missing,Missing
3,10.1002_anie.202007717,Missing,Missing
4,10.1002_chem.201902131,Missing,Missing


Grouped dataset ID counts:
   dataset_id_trim  article_id
52             htt         325
29             Mis         309
20             EPI          64
47             SAM          41
25             IPR          33
11             CHE          29
41             PRJ          26
16             E-G          25
19             ENS          21
26             K02          20


Unnamed: 0,article_id,dataset_id,type,dataset_id_trim
923,10.3390_v11060565,EPI954553,Secondary,EPI
382,10.1128_JVI.01717-21,EPI_ISL_332399,Primary,EPI
925,10.3390_v11060565,EPI954555,Secondary,EPI


In [274]:
# Group training data by article_id to get all datasets for each article
# This creates a dictionary where keys are article_ids and values are lists of dataset dicts
grouped_training_data = {}
for article_id, group_df in train_labels_df.groupby('article_id'):
    grouped_training_data[article_id] = group_df[['dataset_id', 'type']].to_dict('records')

# Example usage of grouped_training_data
print(f"Example grouped training data for article_id '10.1002_2017jc013030': {grouped_training_data['10.1002_2017jc013030']}")


Example grouped training data for article_id '10.1002_2017jc013030': [{'dataset_id': 'https://doi.org/10.17882/49388', 'type': 'Primary'}]


In [275]:
def read_first_line_of_xml(file_path: str) -> str | None:
    """
    Reads and returns the first line of an XML file.

    Args:
        file_path (str): The path to the XML file.

    Returns:
        str | None: The first line of the file, stripped of leading/trailing whitespace,
                    or None if the file cannot be read or is empty.
    """
    if not file_path and not os.path.exists(file_path):
        return None
    
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            first_line = f.readline().replace('<?xml version="1.0" encoding="UTF-8"?>', '').replace('<?xml version="1.0" encoding="UTF-8" standalone="yes"?>', '').strip()
            # If the first line is empty, read the next line
            if not first_line:
                first_line = f.readline()
            return first_line.strip()[:90] if first_line else None
    except UnicodeDecodeError:
        try:
            with open(file_path, 'r', encoding='iso-8859-1') as f:
                first_line = f.readline().replace('<?xml version="1.0" encoding="UTF-8"?>', '').replace('<?xml version="1.0" encoding="UTF-8" standalone="yes"?>', '').strip()
                if not first_line:
                    first_line = f.readline()
                return first_line.strip()[:90] if first_line else None
        except Exception as e:
            return None
    except Exception as e:
        print(f"Error reading file '{file_path}': {e}")
        return None

In [276]:
def identify_xml_type(first_line: str) -> str:
    """
    Identifies the XML type based on the first line of the XML file.
    
    Args:
        file_path (str): The path to the XML file.
        
    Returns:
        str: The identified XML type ('jats', 'tei', 'wiley', 'bioc', or 'unknown').
    """
    if not first_line:
        return "unknown"
    first_line_lower = first_line.lower()
    # Check for specific patterns in the first line
    if 'journal archiving and interchange dtd' in first_line_lower:
        return "jats"
    elif 'xmlns="http://www.tei-c.org/ns/1.0"' in first_line_lower:
        return "tei"
    elif 'xmlns="http://www.wiley.com/namespaces/wiley"' in first_line_lower:
        return "wiley"
    elif 'bioc.dtd' in first_line_lower or 'bioc-api' in first_line_lower:
        return "bioc"
    elif 'taxonomic treatment publishing dtd' in first_line_lower:
        return "taxonx"
    
    return "unknown"

In [277]:
# Load file paths for training and testing datasets
train_file_paths_df = load_file_paths(ARTICLE_TRAIN_DIR)
test_file_paths_df = load_file_paths(ARTICLE_TEST_DIR)
train_file_paths_df['xml_file_path'] = train_file_paths_df['xml_file_path'].fillna('')
test_file_paths_df['xml_file_path'] = test_file_paths_df['xml_file_path'].fillna('')

# Remove rows in train_file_paths_df that have a corresponding article_id in test_file_paths_df
# train_file_paths_df = train_file_paths_df[~train_file_paths_df['article_id'].isin(test_file_paths_df['article_id'])]

# Merge the file paths with the grouped_training_data
train_file_paths_df['dataset_info'] = train_file_paths_df['article_id'].map(grouped_training_data)
test_file_paths_df['dataset_info'] = test_file_paths_df['article_id'].map(grouped_training_data)

# Get first line of XML files to determine the type
train_file_paths_df['xml_first_line'] = train_file_paths_df['xml_file_path'].apply(read_first_line_of_xml)
test_file_paths_df['xml_first_line'] = test_file_paths_df['xml_file_path'].apply(read_first_line_of_xml)

# Identify the xml_type based on the first line of the XML file
train_file_paths_df['xml_type'] = train_file_paths_df['xml_first_line'].apply(lambda x: identify_xml_type(x) if x else None)
test_file_paths_df['xml_type'] = test_file_paths_df['xml_first_line'].apply(lambda x: identify_xml_type(x) if x else None)

print(f"Train files paths shape: {train_file_paths_df.shape}")
display(train_file_paths_df.sample(3))
print(f"Test files paths shape: {test_file_paths_df.shape}")
display(test_file_paths_df.sample(3))

Train files paths shape: (524, 7)


Unnamed: 0,article_id,pdf_file_path,xml_file_path,dataset_type,dataset_info,xml_first_line,xml_type
405,10.1590_0047-2085000000239,./kaggle/input/make-data-count-finding-data-re...,,train,"[{'dataset_id': 'Missing', 'type': 'Missing'}]",,
259,10.1186_s12870-019-2199-7,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,train,"[{'dataset_id': 'Missing', 'type': 'Missing'}]","<!DOCTYPE article PUBLIC ""-//NLM//DTD JATS (Z3...",jats
476,10.3390_microorganisms8121872,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,train,"[{'dataset_id': 'CHEMBL1568820', 'type': 'Seco...","<!DOCTYPE article PUBLIC ""-//NLM//DTD JATS (Z3...",jats


Test files paths shape: (30, 7)


Unnamed: 0,article_id,pdf_file_path,xml_file_path,dataset_type,dataset_info,xml_first_line,xml_type
25,10.1002_esp.5058,./kaggle/input/make-data-count-finding-data-re...,,test,[{'dataset_id': 'https://doi.org/10.5061/dryad...,,
5,10.1002_chem.201903120,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,test,"[{'dataset_id': 'Missing', 'type': 'Missing'}]","<!DOCTYPE article PUBLIC ""-//NLM//DTD JATS (Z3...",jats
7,10.1002_chem.202001412,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,test,"[{'dataset_id': 'Missing', 'type': 'Missing'}]","<!DOCTYPE article PUBLIC ""-//NLM//DTD JATS (Z3...",jats


In [278]:
train_file_paths_df['xml_first_line'].unique()

array(['<html><body><tei xml:space="preserve" xmlns="http://www.tei-c.org/ns/1.0" xmlns:xlink="htt',
       '<!DOCTYPE article PUBLIC "-//NLM//DTD JATS (Z39.96) Journal Archiving and Interchange DTD ',
       '<!DOCTYPE collection SYSTEM "BioC.dtd"><collection><source>BioC-API</source><date>20250509',
       None,
       '<component xmlns="http://www.wiley.com/namespaces/wiley" xmlns:wiley="http://www.wiley.com',
       '<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Archiving and Interchange DTD v3.0 20080202/',
       '<!DOCTYPE collection SYSTEM "BioC.dtd"><collection><source>BioC-API</source><date>20250507',
       '<!DOCTYPE article PUBLIC "-//TaxonX//DTD Taxonomic Treatment Publishing DTD v0 20100105//E'],
      dtype=object)

In [279]:
train_file_paths_df

Unnamed: 0,article_id,pdf_file_path,xml_file_path,dataset_type,dataset_info,xml_first_line,xml_type
0,10.1002_2017jc013030,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,train,[{'dataset_id': 'https://doi.org/10.17882/4938...,"<html><body><tei xml:space=""preserve"" xmlns=""h...",tei
1,10.1002_anie.201916483,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,train,"[{'dataset_id': 'Missing', 'type': 'Missing'}]","<!DOCTYPE article PUBLIC ""-//NLM//DTD JATS (Z3...",jats
2,10.1002_anie.202005531,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,train,"[{'dataset_id': 'Missing', 'type': 'Missing'}]","<!DOCTYPE collection SYSTEM ""BioC.dtd""><collec...",bioc
3,10.1002_anie.202007717,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,train,"[{'dataset_id': 'Missing', 'type': 'Missing'}]","<!DOCTYPE article PUBLIC ""-//NLM//DTD JATS (Z3...",jats
4,10.1002_chem.201902131,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,train,"[{'dataset_id': 'Missing', 'type': 'Missing'}]","<!DOCTYPE article PUBLIC ""-//NLM//DTD JATS (Z3...",jats
...,...,...,...,...,...,...,...
519,10.7554_elife.74937,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,train,[{'dataset_id': 'https://doi.org/10.5281/zenod...,"<!DOCTYPE article PUBLIC ""-//NLM//DTD JATS (Z3...",jats
520,10.7717_peerj.10452,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,train,"[{'dataset_id': 'PRJNA664798', 'type': 'Second...","<!DOCTYPE article PUBLIC ""-//NLM//DTD JATS (Z3...",jats
521,10.7717_peerj.11352,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,train,[{'dataset_id': 'https://doi.org/10.7291/d11m3...,"<!DOCTYPE article PUBLIC ""-//NLM//DTD JATS (Z3...",jats
522,10.7717_peerj.12422,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,train,[{'dataset_id': 'https://doi.org/10.15468/dl.c...,"<!DOCTYPE article PUBLIC ""-//NLM//DTD JATS (Z3...",jats


In [280]:
# Load the PDF text content for each article in the training set
train_file_paths_df['pdf_text'] = train_file_paths_df['pdf_file_path'].apply(lambda x: load_article_text(x, xml_type=None) if x else "")
# Load the XML text content for each article in the training set
train_file_paths_df['xml_text'] = train_file_paths_df.apply(lambda row: load_article_text(row['xml_file_path'], xml_type=row['xml_type']) if row['xml_file_path'] else "", axis=1)
# Load the PDF text content for each article in the test set
test_file_paths_df['pdf_text'] = test_file_paths_df['pdf_file_path'].apply(lambda x: load_article_text(x, xml_type=None) if x else "")
# Load the XML text content for each article in the test set
test_file_paths_df['xml_text'] = test_file_paths_df.apply(lambda row: load_article_text(row['xml_file_path'], xml_type=row['xml_type']) if row['xml_file_path'] else "", axis=1)
# Combine the PDF and XML text content into a single 'text' column for training and test sets
# train_file_paths_df['text'] = train_file_paths_df['pdf_text'] + "\n" + train_file_paths_df['xml_text']
test_file_paths_df['text'] = test_file_paths_df['pdf_text'] + "\n" + test_file_paths_df['xml_text']
# Display the first few rows of the training file paths DataFrame
print(f"Training file paths DataFrame shape: {train_file_paths_df.shape}")
display(train_file_paths_df.head(3))
# Display the first few rows of the training file paths DataFrame
print(f"Training file paths DataFrame shape: {test_file_paths_df.shape}")
display(test_file_paths_df.head(3))
# Save the training and test file paths DataFrames to CSV files
train_file_paths_df.to_csv(os.path.join(BASE_OUTPUT_DIR, 'train_file_paths.csv'), index=False)
test_file_paths_df.to_csv(os.path.join(BASE_OUTPUT_DIR, 'test_file_paths.csv'), index=False)


Extracted text from ./kaggle/input/make-data-count-finding-data-references\train\PDF\10.1002_2017jc013030.pdf. Length: 4956 characters
Extracted text from ./kaggle/input/make-data-count-finding-data-references\train\PDF\10.1002_anie.201916483.pdf. Length: 1355 characters
Extracted text from ./kaggle/input/make-data-count-finding-data-references\train\PDF\10.1002_anie.202005531.pdf. Length: 1736 characters
Extracted text from ./kaggle/input/make-data-count-finding-data-references\train\PDF\10.1002_anie.202007717.pdf. Length: 1734 characters
Extracted text from ./kaggle/input/make-data-count-finding-data-references\train\PDF\10.1002_chem.201902131.pdf. Length: 671 characters
Extracted text from ./kaggle/input/make-data-count-finding-data-references\train\PDF\10.1002_chem.201903120.pdf. Length: 941 characters
Extracted text from ./kaggle/input/make-data-count-finding-data-references\train\PDF\10.1002_chem.202000235.pdf. Length: 734 characters
Extracted text from ./kaggle/input/make-data-c

Unnamed: 0,article_id,pdf_file_path,xml_file_path,dataset_type,dataset_info,xml_first_line,xml_type,pdf_text,xml_text
0,10.1002_2017jc013030,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,train,[{'dataset_id': 'https://doi.org/10.17882/4938...,"<html><body><tei xml:space=""preserve"" xmlns=""h...",tei,"{""title"":"""",""authors"":""Annick Bricaud, Bernard...","{""title"":""Assessing the variability in the rel..."
1,10.1002_anie.201916483,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,train,"[{'dataset_id': 'Missing', 'type': 'Missing'}]","<!DOCTYPE article PUBLIC ""-//NLM//DTD JATS (Z3...",jats,"{""title"":"""",""authors"":""Reiner Anwander, Daniel...","{""title"":""Effective and Reversible Carbon Diox..."
2,10.1002_anie.202005531,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,train,"[{'dataset_id': 'Missing', 'type': 'Missing'}]","<!DOCTYPE collection SYSTEM ""BioC.dtd""><collec...",bioc,"{""title"":"""",""authors"":""Gregory J. P. Perry, Tr...","{""title"":""Trifluoromethyl Sulfoxides: Reagents..."


Training file paths DataFrame shape: (30, 10)


Unnamed: 0,article_id,pdf_file_path,xml_file_path,dataset_type,dataset_info,xml_first_line,xml_type,pdf_text,xml_text,text
0,10.1002_2017jc013030,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,test,[{'dataset_id': 'https://doi.org/10.17882/4938...,"<html><body><tei xml:space=""preserve"" xmlns=""h...",tei,"{""title"":"""",""authors"":""Annick Bricaud, Bernard...","{""title"":""Assessing the variability in the rel...","{""title"":"""",""authors"":""Annick Bricaud, Bernard..."
1,10.1002_anie.201916483,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,test,"[{'dataset_id': 'Missing', 'type': 'Missing'}]","<!DOCTYPE article PUBLIC ""-//NLM//DTD JATS (Z3...",jats,"{""title"":"""",""authors"":""Reiner Anwander, Daniel...","{""title"":""Effective and Reversible Carbon Diox...","{""title"":"""",""authors"":""Reiner Anwander, Daniel..."
2,10.1002_anie.202005531,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,test,"[{'dataset_id': 'Missing', 'type': 'Missing'}]","<!DOCTYPE collection SYSTEM ""BioC.dtd""><collec...",bioc,"{""title"":"""",""authors"":""Gregory J. P. Perry, Tr...","{""title"":""Trifluoromethyl Sulfoxides: Reagents...","{""title"":"""",""authors"":""Gregory J. P. Perry, Tr..."


In [281]:
# Save the training and test file paths DataFrames to CSV files
train_file_paths_df.to_csv(os.path.join(BASE_OUTPUT_DIR, 'train_file_paths.csv'), index=False)
test_file_paths_df.to_csv(os.path.join(BASE_OUTPUT_DIR, 'test_file_paths.csv'), index=False)


In [282]:
# --- QwenModelEval Class ---
# kagglehub.model_download("qwen-lm/qwen-3/transformers/0.6b")
#max_new_tokens=32768
class QwenModelEval:
    def __init__(self, model_name, sys_prompt, enable_thinking=True, max_new_tokens=1024, max_input_length=8200):
        print(f"Loading Qwen model and tokenizer from: {model_name}")
        self.model_name = model_name
        self.sys_prompt = sys_prompt
        self.enable_thinking = enable_thinking  # Enable or disable thinking mode
        self.max_new_tokens = max_new_tokens  # Set the maximum number of new tokens to generate
        self.max_input_length = max_input_length  # Set the maximum input length for the model
        # Load the tokenizer and model
        # Using trust_remote_code=True to allow custom model code execution
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        self.model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto", trust_remote_code=True)
        self.model.eval() # Set the model to evaluation mode here.

    def generate_response(self, user_input):  
        inputs = self._get_inputs(user_input)
        # Disable gradient calculation during inference
        # Generate the response using the model
        with torch.no_grad(): 
            generated_ids = self.model.generate(
                **inputs,
                max_new_tokens=self.max_new_tokens,
                pad_token_id=self.tokenizer.eos_token_id,
                eos_token_id=self.tokenizer.convert_tokens_to_ids("<|im_end|>"),
                # do_sample=False, # Use greedy decoding (fastest)
                # num_beams=1,     # Do not use beam search (fastest)
                # temperature=0.0, # Make output deterministic (if do_sample=False, this has no effect)                
                temperature=0.6 if self.enable_thinking else 0.7,
                top_p=0.95 if self.enable_thinking else 0.8,
                top_k=20,
                min_p=0
            )
        # Parse the response and thinking content
        return self._parse_response(inputs, generated_ids)

    def _get_inputs(self, user_input):
        """Prepare the input for the model based on user input."""
        # Trim the user input to a maximum length for better performance
        user_input = user_input[:self.max_input_length]  # Limit input length to 4096 characters
        print(f"Preparing input with length: {len(user_input)}")
        # Create the messages for the chat template
        messages = [
            {"role": "system", "content": self.sys_prompt},
            {"role": "user", "content": user_input}
        ]
        text = self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
            enable_thinking=self.enable_thinking
        )
        return self.tokenizer(text, return_tensors="pt").to(self.model.device)
    
    def _parse_response(self, inputs, generated_ids):
        print("Parsing response from generated IDs...")
        # Extract the output IDs from the generated IDs
        output_ids = generated_ids[0][len(inputs.input_ids[0]):].tolist()
        try:
            index = len(output_ids) - output_ids[::-1].index(151668)
        except ValueError:
            index = 0

        thinking_content = self.tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n")
        raw_response = self.tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")
        response = self._parse_json(raw_response)
        return response, thinking_content
    
    def _parse_json(self, raw_response: str) -> list[dict[str,str]]:
        # Remove code block markers and leading/trailing whitespace
        cleaned = raw_response.strip()
        if cleaned.startswith("```json"):
            cleaned = cleaned[len("```json"):].strip()
        if cleaned.endswith("```"):
            cleaned = cleaned[:-3].strip()

        # Now parse as JSON
        try:
            return json.loads(cleaned)
        except json.JSONDecodeError as e:
            return []        
        


In [292]:
# Define the one-shot reasoning and task prompt
# This prompt is designed to guide the model through a structured reasoning process
SYS_PROMPT = """
You are an advanced AI research assistant specialized in identifying and classifying datasets used within academic research papers.
Your primary goal is to accurately extract and categorize dataset identifiers (dataset_ids) from provided paper sections.

---

### Input Data Structure

You will receive a JSON string representing key sections of an academic paper, structured as follows:

```json
{
    "title": "Title of the paper",
    "authors": "List of authors, e.g., 'Author A, Author B'",
    "abstract": "Abstract of the paper",
    "data_availability": "Data availability information",
    "other_dataset_citations": [
        {"dataset_ids": ["10.12345/12345"], "citation_context": "Dataset citation context 1"},
        {"dataset_ids": ["10.1234/xxxx.1x1x-xx11", "EPI_ISL_12345678"], "citation_context": "Dataset citation context 2"},
        ...
    ]
}
```

**Guidance on Input Sections:**
*   **`title`**: Provides general context for the paper's topic.
*   **`authors`**: Lists the authors of the paper, which can help in classifying datasets as Primary or Secondary.
*   **`abstract`**: **Crucial** for understanding the research scope and classifying datasets as Primary or Secondary. Use it to determine if a dataset is *directly related* to *this paper's* specific research.
*   **`data_availability`**: This section, if present, often contains **Primary** datasets directly used or generated by the authors for the research of this paper.
*   **`other_dataset_citations`**: A list of potential dataset citations. The `citation_context` is vital to confirm if a `dataset_id` truly refers to a dataset and to aid in classification.

---

### Core Objective & Critical Exclusion

Your overarching objective is to identify and classify **only valid, data-related `dataset_id`s**.

**CRITICAL EXCLUSION**: You **MUST NOT** extract any `dataset_id`s that refer to other academic papers, articles, or the paper itself. Focus strictly on identifiers for *datasets*.

---

### Key Definitions

*   **`dataset_id`**: A unique, persistent identifier for a dataset. There are two main types:

    1.  **Digital Object Identifier (DOI)**:
        *   **Format**: `[prefix]/[suffix]`. The prefix always starts with "10." and is followed by a 4 or 5 digit number. The suffix can contain letters, numbers, and special characters.
        *   May or may not start with "https://doi.org/" or "doi:".
        *   **IMPORTANT DOI VALIDATION RULE**:
            *   Only identify DOIs that are explicitly used as `dataset_id`s.
            *   **DO NOT extract DOIs for academic papers/articles.**
            *   **If a DOI is presented as a reference to a publication (e.g., "as described in [DOI]", "cited in [DOI]", "see [DOI] for details on the method"), it is NOT a dataset_id.**
            *   A DOI is a `dataset_id` ONLY if the surrounding `citation_context` or `data_availability` section clearly indicates it refers to a dataset, data repository, data archive, or similar data-specific entity.

    2.  **Accession ID**:
        *   Typically alphanumeric strings that uniquely identify a dataset within a specific data repository.
        *   Can be found in both `data_availability` and `other_dataset_citations` sections.
        *   *Examples*: `"EPI_ISL_12345678"` (EPI dataset), `"IPR000264"` (InterPro dataset), `"SAMN07159041"` (NCBI Sequence Read Archive dataset), `"CHEMBL1782574"` (ChEMBL dataset)

*   **Dataset Type Classification**:

    *   **Primary**: Raw or processed data *generated or created by the **authors** as part of the research specifically related to the topic of this paper*. This data is central to the findings presented in *this* article.
        *   *Keywords often associated with Primary*: "generated in this study", "our data", "newly sequenced", "collected for this project".
    *   **Secondary**: Raw or processed data *reused from existing records or published data that was only used by the authors and not created by the **authors** for this paper*. This data might be used for comparison, background, or as a starting point, but it's not the novel data produced by *this* specific research.
        *   *Keywords often associated with Secondary*: "publicly available data", "previously published", "re-analyzed data", "obtained from", "data from [another study/database]", "existing dataset", "external database".
    *   **Fallback Rule**: If, after careful consideration of the `abstract` and `citation_context`, the relationship of the dataset to *this paper's* specific research remains ambiguous, then default to "Primary".

---

### Tasks: Step-by-Step Instructions

Follow these three tasks in order:

**Task 1: Identify Valid Dataset IDs**

1.  **Search Priority**: Begin by searching the `data_availability` section. Then, search the `other_dataset_citations` section.
2.  **Validation**: For each potential `dataset_id` (DOI or Accession ID), confirm it is truly data-related.
    *   **For DOIs**: Strictly apply the **IMPORTANT DOI VALIDATION RULE** defined above. If it refers to a publication, **DO NOT** extract it.
    *   **For all IDs**: Look for surrounding terms like "data release", "data availability", "dataset", "database", "repository", "data source", "data access", or "data archive" within the `data_availability` section or the `citation_context`.
3.  **Deduplication**: If the same `dataset_id` is found multiple times, **only process the first instance encountered**.
4.  **Conditional Proceeding**:
    *   If **no valid `dataset_id`s are found** after searching both sections, **skip directly to Task 3** and output the "Missing" JSON structure.
    *   If one or more valid `dataset_id`s are found, proceed to Task 2.

**Task 2: Classify Dataset Types**

1.  For each valid `dataset_id` identified in Task 1, classify its type as either "Primary" or "Secondary".
2.  **Crucially, use the `abstract` section and the `citation_context` (if applicable) to determine if the dataset is *directly related* to the novel research presented in *this paper*.**
3.  Apply the "Key Definitions" for Primary and Secondary types, paying close attention to the associated keywords.
4.  Remember the "Fallback Rule": Only default to "Primary" if, after careful consideration, the classification remains truly ambiguous.

**Task 3: Format and Return Results**

Return your final results as a JSON array of objects.

1.  **Scenario A: No Valid Datasets Found**
    If Task 1 resulted in no valid `dataset_id`s, return a single JSON object with the following structure:
    ```json
    [
        {
            "dataset_id": "Missing",
            "type": "Missing"
        }
    ]
    ```
2.  **Scenario B: One or More Valid Datasets Found**
    If Task 1 identified one or more valid `dataset_id`s, return every valid dataset found in a JSON array of objects, where each object has the following structure:
    ```json
    [
        {
            "dataset_id": "example_id_1",
            "type": "Primary"
        },
        {
            "dataset_id": "example_id_2",
            "type": "Secondary"
        },
        ...
    ]
    ```
"""

In [293]:
# Instantiate the QwenModelEval class with the model path and system prompt
inference_model = QwenModelEval(QWEN_BASE_MODEL_PATH, sys_prompt=SYS_PROMPT, enable_thinking=True, max_new_tokens=1576)

Loading Qwen model and tokenizer from: C:\Users\jim\.cache\kagglehub\models\qwen-lm\qwen-3\transformers\0.6b\1


In [294]:
def process_articles(file_paths_df: pd.DataFrame, model) -> pd.DataFrame:
    results = []
    for i, row in file_paths_df.iterrows():
        article_id = row['article_id']
        dataset_info = row['dataset_info']
        pdf_text = row['pdf_text']
        xml_text = row['xml_text']
        text_type = "XML" if xml_text else "PDF"
        text_content = xml_text if xml_text else pdf_text
        response = ""
        thinking_content = ""

        # Prepare the user input for the model
        user_input = f"Text Content: {text_content}\n"
        
        if text_content:
            print(f"Processing article {i}/{len(file_paths_df)}: {article_id}, type: {text_type}")
            # Generate response from the model
            response, thinking_content = model.generate_response(user_input)

        results.append({
            'article_id': article_id,
            'dataset_info': dataset_info,
            'text_type': text_type,
            'llm_input': user_input,
            'llm_response': response,
            'llm_thinking_content': thinking_content
        })

    return pd.DataFrame(results).sort_values(by=["article_id"]).reset_index(drop=True)




In [286]:
# Load the training file paths DataFrame from the CSV file
train_file_paths_df = pd.read_csv(os.path.join(BASE_OUTPUT_DIR, 'train_file_paths.csv'))
# Load the test file paths DataFrame from the CSV file
test_file_paths_df = pd.read_csv(os.path.join(BASE_OUTPUT_DIR, 'test_file_paths.csv'))
# Display the first few rows of the training file paths DataFrame
print(f"Training file paths DataFrame shape: {train_file_paths_df.shape}")
display(train_file_paths_df.head(3))
# Display the first few rows of the test file paths DataFrame
print(f"Test file paths DataFrame shape: {test_file_paths_df.shape}")
display(test_file_paths_df.head(3))


Training file paths DataFrame shape: (524, 9)


Unnamed: 0,article_id,pdf_file_path,xml_file_path,dataset_type,dataset_info,xml_first_line,xml_type,pdf_text,xml_text
0,10.1002_2017jc013030,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,train,[{'dataset_id': 'https://doi.org/10.17882/4938...,"<html><body><tei xml:space=""preserve"" xmlns=""h...",tei,"{""title"":"""",""authors"":""Annick Bricaud, Bernard...","{""title"":""Assessing the variability in the rel..."
1,10.1002_anie.201916483,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,train,"[{'dataset_id': 'Missing', 'type': 'Missing'}]","<!DOCTYPE article PUBLIC ""-//NLM//DTD JATS (Z3...",jats,"{""title"":"""",""authors"":""Reiner Anwander, Daniel...","{""title"":""Effective and Reversible Carbon Diox..."
2,10.1002_anie.202005531,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,train,"[{'dataset_id': 'Missing', 'type': 'Missing'}]","<!DOCTYPE collection SYSTEM ""BioC.dtd""><collec...",bioc,"{""title"":"""",""authors"":""Gregory J. P. Perry, Tr...","{""title"":""Trifluoromethyl Sulfoxides: Reagents..."


Test file paths DataFrame shape: (30, 10)


Unnamed: 0,article_id,pdf_file_path,xml_file_path,dataset_type,dataset_info,xml_first_line,xml_type,pdf_text,xml_text,text
0,10.1002_2017jc013030,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,test,[{'dataset_id': 'https://doi.org/10.17882/4938...,"<html><body><tei xml:space=""preserve"" xmlns=""h...",tei,"{""title"":"""",""authors"":""Annick Bricaud, Bernard...","{""title"":""Assessing the variability in the rel...","{""title"":"""",""authors"":""Annick Bricaud, Bernard..."
1,10.1002_anie.201916483,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,test,"[{'dataset_id': 'Missing', 'type': 'Missing'}]","<!DOCTYPE article PUBLIC ""-//NLM//DTD JATS (Z3...",jats,"{""title"":"""",""authors"":""Reiner Anwander, Daniel...","{""title"":""Effective and Reversible Carbon Diox...","{""title"":"""",""authors"":""Reiner Anwander, Daniel..."
2,10.1002_anie.202005531,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,test,"[{'dataset_id': 'Missing', 'type': 'Missing'}]","<!DOCTYPE collection SYSTEM ""BioC.dtd""><collec...",bioc,"{""title"":"""",""authors"":""Gregory J. P. Perry, Tr...","{""title"":""Trifluoromethyl Sulfoxides: Reagents...","{""title"":"""",""authors"":""Gregory J. P. Perry, Tr..."


In [287]:
# Fill NaN values in the 'xml_type' column with None
train_file_paths_df['xml_type'] = train_file_paths_df['xml_type'].fillna('')
train_file_paths_df['xml_text'] = train_file_paths_df['xml_text'].fillna('')
test_file_paths_df['xml_type'] = test_file_paths_df['xml_type'].fillna('')
test_file_paths_df['xml_text'] = test_file_paths_df['xml_text'].fillna('')
test_file_paths_df

Unnamed: 0,article_id,pdf_file_path,xml_file_path,dataset_type,dataset_info,xml_first_line,xml_type,pdf_text,xml_text,text
0,10.1002_2017jc013030,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,test,[{'dataset_id': 'https://doi.org/10.17882/4938...,"<html><body><tei xml:space=""preserve"" xmlns=""h...",tei,"{""title"":"""",""authors"":""Annick Bricaud, Bernard...","{""title"":""Assessing the variability in the rel...","{""title"":"""",""authors"":""Annick Bricaud, Bernard..."
1,10.1002_anie.201916483,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,test,"[{'dataset_id': 'Missing', 'type': 'Missing'}]","<!DOCTYPE article PUBLIC ""-//NLM//DTD JATS (Z3...",jats,"{""title"":"""",""authors"":""Reiner Anwander, Daniel...","{""title"":""Effective and Reversible Carbon Diox...","{""title"":"""",""authors"":""Reiner Anwander, Daniel..."
2,10.1002_anie.202005531,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,test,"[{'dataset_id': 'Missing', 'type': 'Missing'}]","<!DOCTYPE collection SYSTEM ""BioC.dtd""><collec...",bioc,"{""title"":"""",""authors"":""Gregory J. P. Perry, Tr...","{""title"":""Trifluoromethyl Sulfoxides: Reagents...","{""title"":"""",""authors"":""Gregory J. P. Perry, Tr..."
3,10.1002_anie.202007717,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,test,"[{'dataset_id': 'Missing', 'type': 'Missing'}]","<!DOCTYPE article PUBLIC ""-//NLM//DTD JATS (Z3...",jats,"{""title"":"""",""authors"":""Al\ufffdO, Lukas Maximi...","{""title"":""Metal\u2013Ligand Cooperativity of t...","{""title"":"""",""authors"":""Al\ufffdO, Lukas Maximi..."
4,10.1002_chem.201902131,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,test,"[{'dataset_id': 'Missing', 'type': 'Missing'}]","<!DOCTYPE article PUBLIC ""-//NLM//DTD JATS (Z3...",jats,"{""title"":"""",""authors"":""Nitrogen Heterocycles, ...","{""title"":""A Synthetic Route Toward Tetrazoles:...","{""title"":"""",""authors"":""Nitrogen Heterocycles, ..."
5,10.1002_chem.201903120,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,test,"[{'dataset_id': 'Missing', 'type': 'Missing'}]","<!DOCTYPE article PUBLIC ""-//NLM//DTD JATS (Z3...",jats,"{""title"":"""",""authors"":""Matthias, Philippe R. M...","{""title"":""Structure\u2013Solubility Relationsh...","{""title"":"""",""authors"":""Matthias, Philippe R. M..."
6,10.1002_chem.202000235,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,test,"[{'dataset_id': 'Missing', 'type': 'Missing'}]","<!DOCTYPE article PUBLIC ""-//NLM//DTD JATS (Z3...",jats,"{""title"":"""",""authors"":""Borane, Lewis acids.[3,...","{""title"":""An Isolable Bis(Silanone\u2013Borane...","{""title"":"""",""authors"":""Borane, Lewis acids.[3,..."
7,10.1002_chem.202001412,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,test,"[{'dataset_id': 'Missing', 'type': 'Missing'}]","<!DOCTYPE article PUBLIC ""-//NLM//DTD JATS (Z3...",jats,"{""title"":"""",""authors"":""R. Huisgen, Helmar G\u0...","{""title"":""Iron(0)\u2010Mediated Stereoselectiv...","{""title"":"""",""authors"":""R. Huisgen, Helmar G\u0..."
8,10.1002_chem.202001668,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,test,"[{'dataset_id': 'Missing', 'type': 'Missing'}]","<!DOCTYPE article PUBLIC ""-//NLM//DTD JATS (Z3...",jats,"{""title"":"""",""authors"":""Anthony B. Carter+,[a, ...","{""title"":""The First Use of a ReX 5 Synthon to...","{""title"":"""",""authors"":""Anthony B. Carter+,[a, ..."
9,10.1002_chem.202003167,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,test,"[{'dataset_id': 'Missing', 'type': 'Missing'}]","<!DOCTYPE article PUBLIC ""-//NLM//DTD JATS (Z3...",jats,"{""title"":"""",""authors"":""Mar, RhI, bimetallic bo...","{""title"":""Metal\u2010only Lewis Pairs of Rhodi...","{""title"":"""",""authors"":""Mar, RhI, bimetallic bo..."


In [288]:
test_file_paths_df_2 = test_file_paths_df.copy().sample(3, random_state=42).reset_index(drop=True)
test_file_paths_df_2

Unnamed: 0,article_id,pdf_file_path,xml_file_path,dataset_type,dataset_info,xml_first_line,xml_type,pdf_text,xml_text,text
0,10.1002_mp.14424,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,test,[{'dataset_id': 'https://doi.org/10.7937/k9/tc...,"<!DOCTYPE article PUBLIC ""-//NLM//DTD JATS (Z3...",jats,"{""title"":"""",""authors"":""Zhao Li, John P., Brand...","{""title"":""PleThora: Pleural effusion and thora...","{""title"":"""",""authors"":""Zhao Li, John P., Brand..."
1,10.1002_ece3.6144,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,test,[{'dataset_id': 'https://doi.org/10.5061/dryad...,"<!DOCTYPE article PUBLIC ""-//NLM//DTD JATS (Z3...",jats,"{""title"":"""",""authors"":""Beng, Elena M. Duke\n |...","{""title"":""Efficacy of metabarcoding for identi...","{""title"":"""",""authors"":""Beng, Elena M. Duke\n |..."
2,10.1002_ejoc.202000139,./kaggle/input/make-data-count-finding-data-re...,,test,"[{'dataset_id': 'Missing', 'type': 'Missing'}]",,,"{""title"":"""",""authors"":""Ann Christin Reiers\u00...",,"{""title"":"""",""authors"":""Ann Christin Reiers\u00..."


In [289]:
processed_articles_df = process_articles(test_file_paths_df_2, inference_model)
processed_articles_df

Processing article 0/3: 10.1002_mp.14424, type: XML
Preparing input with length: 3045
Parsing response from generated IDs...
Processing article 1/3: 10.1002_ece3.6144, type: XML
Preparing input with length: 2203
Parsing response from generated IDs...
Processing article 2/3: 10.1002_ejoc.202000139, type: PDF
Preparing input with length: 985
Parsing response from generated IDs...


Unnamed: 0,article_id,dataset_info,text_type,llm_input,llm_response,llm_thinking_content
0,10.1002_ece3.6144,[{'dataset_id': 'https://doi.org/10.5061/dryad...,XML,"Text Content: {""title"":""Efficacy of metabarcod...","[{'dataset_id': '10.5061/dryad.zw3r22854', 'ty...","<think>\nOkay, let's start by looking at the i..."
1,10.1002_ejoc.202000139,"[{'dataset_id': 'Missing', 'type': 'Missing'}]",PDF,"Text Content: {""title"":"""",""authors"":""Ann Chris...","[{'dataset_id': '10.1002/ejoc.202000139', 'typ...","<think>\nOkay, let's tackle this problem. The ..."
2,10.1002_mp.14424,[{'dataset_id': 'https://doi.org/10.7937/k9/tc...,XML,"Text Content: {""title"":""PleThora: Pleural effu...",[{'dataset_id': '10.7937/K9/TCIA.2015.PF0M9REI...,"<think>\nOkay, let's tackle this dataset class..."


In [295]:
# Save processed_articles_df to CSV
processed_articles_df.to_csv("processed_articles_5.csv", index=False)


In [296]:
test_set_processed_articles_df = process_articles(test_file_paths_df, inference_model)
test_set_processed_articles_df.to_csv("test_set_processed_articles.csv", index=False)
test_set_processed_articles_df


Processing article 0/30: 10.1002_2017jc013030, type: XML
Preparing input with length: 1648
Parsing response from generated IDs...
Processing article 1/30: 10.1002_anie.201916483, type: XML
Preparing input with length: 1139
Parsing response from generated IDs...
Processing article 2/30: 10.1002_anie.202005531, type: XML
Preparing input with length: 249
Parsing response from generated IDs...
Processing article 3/30: 10.1002_anie.202007717, type: XML
Preparing input with length: 1459
Parsing response from generated IDs...
Processing article 4/30: 10.1002_chem.201902131, type: XML
Preparing input with length: 1277
Parsing response from generated IDs...
Processing article 5/30: 10.1002_chem.201903120, type: XML
Preparing input with length: 2051
Parsing response from generated IDs...
Processing article 6/30: 10.1002_chem.202000235, type: XML
Preparing input with length: 882
Parsing response from generated IDs...
Processing article 7/30: 10.1002_chem.202001412, type: XML
Preparing input with 

Unnamed: 0,article_id,dataset_info,text_type,llm_input,llm_response,llm_thinking_content
0,10.1002_2017jc013030,[{'dataset_id': 'https://doi.org/10.17882/4938...,XML,"Text Content: {""title"":""Assessing the variabil...","[{'dataset_id': '10.17882/49388', 'type': 'Pri...","<think>\nOkay, let's tackle this problem step ..."
1,10.1002_anie.201916483,"[{'dataset_id': 'Missing', 'type': 'Missing'}]",XML,"Text Content: {""title"":""Effective and Reversib...","[{'dataset_id': 'Missing', 'type': 'Missing'}]","<think>\nOkay, let's tackle this problem. The ..."
2,10.1002_anie.202005531,"[{'dataset_id': 'Missing', 'type': 'Missing'}]",XML,"Text Content: {""title"":""Trifluoromethyl Sulfox...","[{'dataset_id': 'Missing', 'type': 'Missing'}]","<think>\nOkay, let's tackle this. The user pro..."
3,10.1002_anie.202007717,"[{'dataset_id': 'Missing', 'type': 'Missing'}]",XML,"Text Content: {""title"":""Metal\u2013Ligand Coop...","[{'dataset_id': 'Missing', 'type': 'Missing'}]","<think>\nOkay, let's tackle this problem. The ..."
4,10.1002_chem.201902131,"[{'dataset_id': 'Missing', 'type': 'Missing'}]",XML,"Text Content: {""title"":""A Synthetic Route Towa...","[{'dataset_id': 'Missing', 'type': 'Missing'}]","<think>\nOkay, let's tackle this problem step ..."
5,10.1002_chem.201903120,"[{'dataset_id': 'Missing', 'type': 'Missing'}]",XML,"Text Content: {""title"":""Structure\u2013Solubil...","[{'dataset_id': 'Missing', 'type': 'Missing'}]","<think>\nOkay, let me start by analyzing the p..."
6,10.1002_chem.202000235,"[{'dataset_id': 'Missing', 'type': 'Missing'}]",XML,"Text Content: {""title"":""An Isolable Bis(Silano...","[{'dataset_id': 'Missing', 'type': 'Missing'}]","<think>\nOkay, let's start by analyzing the pr..."
7,10.1002_chem.202001412,"[{'dataset_id': 'Missing', 'type': 'Missing'}]",XML,"Text Content: {""title"":""Iron(0)\u2010Mediated ...","[{'dataset_id': 'Missing', 'type': 'Missing'}]","<think>\nOkay, let's tackle this problem. The ..."
8,10.1002_chem.202001668,"[{'dataset_id': 'Missing', 'type': 'Missing'}]",XML,"Text Content: {""title"":""The First Use of a ReX...","[{'dataset_id': 'Missing', 'type': 'Missing'}]","<think>\nOkay, let's tackle this problem. The ..."
9,10.1002_chem.202003167,"[{'dataset_id': 'Missing', 'type': 'Missing'}]",XML,"Text Content: {""title"":""Metal\u2010only Lewis ...","[{'dataset_id': 'Missing', 'type': 'Missing'}]","<think>\nOkay, let's tackle this problem. The ..."


In [25]:
eval_model = QwenModelEval(QWEN_BASE_MODEL_PATH, sys_prompt="You are a chatbot.",)

Loading Qwen model and tokenizer from: C:\Users\jim\.cache\kagglehub\models\qwen-lm\qwen-3\transformers\0.6b\1


In [None]:

# --- 2. Data Preparation for LLM Training (Revised for Combined Task) ---

def load_base_llm_for_training():
    """Loads the base Qwen model and tokenizer for fine-tuning."""
    global llm_tokenizer, llm_model
    if not AutoModelForCausalLM or not QWEN_BASE_MODEL_PATH:
        print("LLM components not available or base model path not set. Skipping LLM loading.")
        return False
    try:
        print(f"Loading Qwen tokenizer from: {QWEN_BASE_MODEL_PATH}")
        llm_tokenizer = AutoTokenizer.from_pretrained(QWEN_BASE_MODEL_PATH, trust_remote_code=True)
        if llm_tokenizer.pad_token is None:
            llm_tokenizer.pad_token = llm_tokenizer.eos_token
            print("Set tokenizer.pad_token to tokenizer.eos_token")

        print(f"Loading Qwen model from: {QWEN_BASE_MODEL_PATH}")
        llm_model = AutoModelForCausalLM.from_pretrained(
            QWEN_BASE_MODEL_PATH,
            torch_dtype=torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float32,
            device_map="auto", # Automatically uses GPU if available
            trust_remote_code=True,
            # load_in_8bit=True if bnb else False # Uncomment if bitsandbytes is used
        )
        print(f"Base LLM loaded successfully on {llm_model.device}.")
        return True
    except Exception as e:
        print(f"Error loading base LLM for training: {e}")
        llm_tokenizer, llm_model = None, None # Reset to None on failure
        return False

def prepare_training_data_for_llm(
    training_df: pd.DataFrame,
    all_article_texts: dict[str, str],
    tokenizer_max_length: int
) -> Dataset:
    """
    Prepares training data for LLM fine-tuning, aggregating dataset IDs and classifications
    per article and formatting into ChatML JSON output.
    """
    formatted_examples = []

    # Group training data by article_id to get all datasets for each article
    # This creates a dictionary where keys are article_ids and values are lists of dataset dicts
    grouped_training_data = training_df.groupby('article_id').apply(
        lambda x: [{"dataset_id": row['dataset_id'], "classification": row['label']} for _, row in x]
    ).to_dict()

    # Get all article IDs for which we have text content
    all_article_ids_with_text = set(all_article_texts.keys())
    
    # Iterate through all articles for which we have text (these are our potential training examples)
    for article_id in all_article_ids_with_text:
        article_text = all_article_texts.get(article_id, "")
        if not article_text:
            print(f"Warning: Article text for {article_id} not found. Skipping training example.")
            continue

        # Truncate article text to fit within the model's context window
        # Reserve tokens for the prompt and the expected JSON response.
        # A typical Qwen 1.5 model has 32768 max_seq_length.
        # 512 tokens for prompt/response is a safe estimate.
        truncated_article_text = article_text[:tokenizer_max_length - 512] 

        # Determine the ground truth output for this article
        if article_id in grouped_training_data:
            # Article has datasets, format them as JSON
            ground_truth_datasets = grouped_training_data[article_id]
            assistant_response_json = json.dumps(ground_truth_datasets, ensure_ascii=False)
        else:
            # Article has no datasets in training data, so the model should output an empty list.
            # This explicitly trains the model to output '[]' for "Missing" cases.
            assistant_response_json = "[]"
            # print(f"Info: Article {article_id} has no datasets in training data. Training to output '[]'.")

        # Construct the user message for the LLM
        user_message = f"""
Article Text:
{truncated_article_text}

Task: Identify all datasets or databases used in this research article and classify each as "Primary" (if created by the authors for this research) or "Secondary" (if an existing dataset used in this research).

Output Format: Provide a JSON list of objects. Each object should have "dataset_id" and "classification" keys. If no datasets are identified, return an empty JSON list: [].
"""
        # Construct the full ChatML formatted string for SFTTrainer
        # The trainer will use this entire string as the 'text' field.
        chatml_formatted_string = f"<|im_start|>system\nYou are an expert research assistant. Your task is to extract and classify datasets from scientific articles.<|im_end|>\n<|im_start|>user\n{user_message.strip()}<|im_end|>\n<|im_start|>assistant\n{assistant_response_json}<|im_end|>"
        
        formatted_examples.append({"text": chatml_formatted_string})

    if not formatted_examples:
        raise ValueError("No training examples could be prepared. Check your data and article texts.")

    return Dataset.from_list(formatted_examples)

# --- 3. LLM Model Training (Fine-tuning) ---

# Attempt to load tokenizer and model if not already loaded (e.g., if previous training failed or was skipped)
if llm_model is None:
    load_base_llm_for_training()

if llm_model and not training_df.empty and Dataset: # Ensure Dataset is imported
    print("\n--- Preparing data for Fine-tuning (Combined Task) ---")
    # Use the model's max_length for context, or a reasonable default if tokenizer isn't loaded
    max_len = llm_tokenizer.model_max_length if llm_tokenizer else 4096 
    train_dataset = prepare_training_data_for_llm(training_df, all_article_texts, max_len)
    
    print(f"Prepared {len(train_dataset)} examples for fine-tuning.")
    print("Example formatted training instance (first 500 chars):")
    print(train_dataset[0]['text'][:500])

    print("\n--- Starting Fine-tuning (Combined Task) ---")
    try:
        training_args = TrainingArguments(
            output_dir=f"{FINE_TUNED_MODEL_OUTPUT_DIR}/checkpoints",
            num_train_epochs=1,  # Start with 1 epoch, adjust as needed
            per_device_train_batch_size=1, # Adjust based on VRAM
            gradient_accumulation_steps=4, # Effective batch size = 1 * 4 = 4
            learning_rate=2e-5,
            logging_steps=10,
            save_steps=50, # Save checkpoints periodically
            fp16=torch.cuda.is_available() and not torch.cuda.is_bf16_supported(),
            bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
            optim="paged_adamw_8bit", # Good for memory efficiency if bitsandbytes is installed
            # report_to="none", # Disable logging to external services
            # max_steps=100, # For quick testing
        )

        trainer = SFTTrainer(
            model=llm_model,
            tokenizer=llm_tokenizer,
            train_dataset=train_dataset,
            dataset_text_field="text", # This field contains the full ChatML string
            args=training_args,
            max_seq_length=max_len, # Use the model's full context length
            packing=False, # Set to True if your inputs are much shorter than max_seq_length
        )

        trainer.train()
        print("Fine-tuning completed.")

        print(f"Saving fine-tuned model to: {FINE_TUNED_MODEL_OUTPUT_DIR}")
        trainer.save_model(FINE_TUNED_MODEL_OUTPUT_DIR)
        print("Model and tokenizer saved.")

    except Exception as e:
        print(f"An error occurred during fine-tuning: {e}")
        import traceback
        traceback.print_exc()
        llm_model = None # Mark model as failed to load/train
else:
    print("Skipping LLM fine-tuning due to missing training data or LLM components.")


# --- 4. LLM-based Extraction & Classification (Inference) ---

# Load the fine-tuned model for inference (if training was successful)
# If training was skipped or failed, this will attempt to load from the base path or fail.
if inference_model is None: # Only load if not already loaded
    if AutoModelForCausalLM: # Check if transformers is available
        if os.path.exists(FINE_TUNED_MODEL_OUTPUT_DIR) and os.path.isdir(FINE_TUNED_MODEL_OUTPUT_DIR):
            MODEL_TO_LOAD = FINE_TUNED_MODEL_OUTPUT_DIR
            print(f"Loading fine-tuned model for inference from: {MODEL_TO_LOAD}")
        else:
            MODEL_TO_LOAD = QWEN_BASE_MODEL_PATH
            print(f"Fine-tuned model not found. Loading base model for inference from: {MODEL_TO_LOAD}")

        try:
            inference_tokenizer = AutoTokenizer.from_pretrained(MODEL_TO_LOAD, trust_remote_code=True)
            if inference_tokenizer.pad_token is None:
                inference_tokenizer.pad_token = inference_tokenizer.eos_token
            inference_model = AutoModelForCausalLM.from_pretrained(
                MODEL_TO_LOAD,
                torch_dtype=torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float32,
                device_map="auto",
                trust_remote_code=True
            ).eval() # Set to evaluation mode
            print(f"Inference LLM loaded successfully on {inference_model.device}.")
        except Exception as e:
            print(f"Error loading inference LLM from {MODEL_TO_LOAD}: {e}")
            inference_model, inference_tokenizer = None, None
    else:
        print("Transformers library not available. Cannot load LLM for inference.")


def extract_and_classify_with_llm(article_text: str) -> list[dict]:
    """
    Uses the loaded LLM to extract dataset IDs and classify them.
    Returns a list of dictionaries like [{"dataset_id": "...", "classification": "..."}].
    Returns an empty list if LLM is unavailable or parsing fails.
    """
    if not inference_model or not inference_tokenizer:
        print("  LLM unavailable for extraction/classification.")
        return [] # Return empty list if LLM is not loaded

    # Truncate article text for inference if it exceeds model's context window
    # Use the same max_length as during training for consistency
    max_inference_context_length = inference_tokenizer.model_max_length - 256 # Reserve tokens for prompt and response
    truncated_article_text = article_text[:max_inference_context_length]

    user_message = f"""
Article Text:
{truncated_article_text}

Task: Identify all datasets or databases used in this research article and classify each as "Primary" (if created by the authors for this research) or "Secondary" (if an existing dataset used in this research).

Output Format: Provide a JSON list of objects. Each object should have "dataset_id" and "classification" keys. If no datasets are identified, return an empty JSON list: [].
"""
    messages = [
        {"role": "system", "content": "You are an expert research assistant. Your task is to extract and classify datasets from scientific articles."},
        {"role": "user", "content": user_message.strip()}
    ]
    
    input_ids = inference_tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(inference_model.device)

    try:
        with torch.no_grad():
            outputs = inference_model.generate(
                input_ids,
                max_new_tokens=512, # Allow more tokens for multiple dataset outputs
                pad_token_id=inference_tokenizer.eos_token_id,
                eos_token_id=inference_tokenizer.convert_tokens_to_ids("<|im_end|>")
            )
        
        response_text = inference_tokenizer.decode(
            outputs[0][input_ids.shape[1]:],
            skip_special_tokens=False # Keep special tokens to remove <|im_end|> explicitly
        ).strip()
        response_text = response_text.replace("<|im_end|>", "").strip()
        
        print(f"  LLM raw response: '{response_text}'")

        # Attempt to parse the JSON output
        try:
            parsed_data = json.loads(response_text)
            if isinstance(parsed_data, list):
                # Validate structure: each item should be a dict with 'dataset_id' and 'classification'
                valid_datasets = []
                for item in parsed_data:
                    if isinstance(item, dict) and 'dataset_id' in item and 'classification' in item:
                        # Basic validation for classification label
                        if item['classification'] in ["Primary", "Secondary"]:
                            valid_datasets.append(item)
                        else:
                            print(f"  Warning: Invalid classification '{item['classification']}' for dataset '{item.get('dataset_id', 'N/A')}'. Skipping.")
                    else:
                        print(f"  Warning: Malformed JSON object: {item}. Skipping.")
                return valid_datasets
            else:
                print(f"  Warning: LLM did not return a JSON list: {response_text}")
                return []
        except json.JSONDecodeError as jde:
            print(f"  Error decoding JSON from LLM response: {jde}. Raw response: '{response_text}'")
            return []

    except Exception as e:
        print(f"  Error during LLM generation: {e}")
        return []

# --- Main Processing Loop for all articles (Revised) ---
print("\n--- Starting Article Processing and Classification (LLM-driven) ---")
final_results = []

for article_id, article_text in all_article_texts.items():
    print(f"\nProcessing article: {article_id}")
    
    # LLM directly extracts and classifies
    identified_datasets = extract_and_classify_with_llm(article_text)
    
    if not identified_datasets:
        # If LLM returns an empty list, classify the article as "Missing"
        print(f"  LLM identified no datasets for {article_id}. Classifying as 'Missing'.")
        final_results.append({
            "article_id": article_id,
            "dataset_id": "N/A", # Indicate no specific dataset ID
            "classification_label": "Missing"
        })
    else:
        print(f"  LLM identified {len(identified_datasets)} dataset(s) for {article_id}.")
        for item in identified_datasets:
            final_results.append({
                "article_id": article_id,
                "dataset_id": item.get("dataset_id", "Unknown"), # Use .get() for safety
                "classification_label": item.get("classification", "Uncertain_LLM")
            })


# --- 5. Results & Output ---

print("\n--- Final Results ---")
if final_results:
    results_df = pd.DataFrame(final_results)
    print(results_df.head(10)) # Print first 10 rows
    
    # Save to CSV
    results_df.to_csv(FINAL_RESULTS_CSV_PATH, index=False)
    print(f"\nResults saved to: {FINAL_RESULTS_CSV_PATH}")
else:
    print("No results generated.")

print("\nProcessing complete, Jim!")