## 0. Kaggle Environment and Imports

In [136]:
# Imports
import os
import glob
import re
import pandas as pd
import collections
import xml.etree.ElementTree as ET
import PyPDF2
import time
import fitz
import pymupdf4llm

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


## 1. Configuration

In [7]:

# Constants
MAX_TOKENS = 4096  # Adjust based on your model's capabilities
QWEN_MODEL_NAME_OR_PATH = "/kaggle/input/qwen-model-files/qwen-7b-chat" # Example
OUTPUT_CSV_PATH = "/kaggle/working/article_dataset_classification.csv"
ARTICLES_BASE_DIR = './kaggle/input/make-data-count-finding-data-references/'
ARTICLES_TRAIN_DIR = ARTICLES_BASE_DIR + 'train/'
ARTICLES_TEST_DIR = ARTICLES_BASE_DIR + 'test/'
ARTICLE_FORMATS = [{'format':'PDF', 'ext': '.pdf'}, {'format': 'XML', 'ext': '.xml'}]
train_labels_file_path = ARTICLES_BASE_DIR+'train_labels.csv'
sample_submission_file_path = ARTICLES_BASE_DIR+'sample_submission.csv'


## 2. Data Loading

In [8]:
def get_all_article_files(base_directory):
    """
    Returns a list of all article files (PDF and XML) in the specified base_directory.
    """
    # Collect files from all formats and flatten the list
    all_article_files = [glob.glob(os.path.join(base_directory, fmt['format'], f"*{fmt['ext']}")) for fmt in ARTICLE_FORMATS]  
    return [item for sublist in all_article_files for item in sublist]


In [None]:
def read_pdf_text_as_md(file_path):
    """
    Reads a PDF file and returns its text content as a Markdown string.
    This is super slow!!!
    """
    try:
        return pymupdf4llm.to_markdown(file_path)
    except Exception as e:
        print(f"Error reading PDF file {file_path}: {e}")
        return ""

In [9]:
def read_pdf_text(pdf_file_path) -> str:
    """Extracts all text from a PDF file."""
    text = ""
    # Ensure the file path is a string and not NaN or empty
    if pd.isna(pdf_file_path) or not pdf_file_path:
        return text
    
    # Convert to string and strip whitespace
    pdf_file_path = str(pdf_file_path).strip()

    start_time = time.time()  # Start timing

    try:
        with open(pdf_file_path, 'rb') as pdf_file_obj:
            pdf_reader = PyPDF2.PdfReader(pdf_file_obj)
            num_pages = len(pdf_reader.pages)
            for page_num in range(num_pages):
                page_obj = pdf_reader.pages[page_num]
                text += page_obj.extract_text()
    except Exception as e:
        print(f"Error reading {pdf_file_path} with PyPDF2: {e}")

    elapsed = time.time() - start_time
    print(f"Read PDF file '{pdf_file_path}' in {elapsed:.2f} seconds.")
    return text


In [235]:
def read_pdf_text_fitz(pdf_file_path) -> str:
    """Extracts all text from a PDF file."""
    text = ""
    # Ensure the file path is a string and not NaN or empty
    if pd.isna(pdf_file_path) or not pdf_file_path:
        return text
    
    # Convert to string and strip whitespace
    pdf_file_path = str(pdf_file_path).strip()

    start_time = time.time()  # Start timing

    try:
        with fitz.open(pdf_file_path) as doc:
            for page in doc:
                # Extract text from the page
                text_page = page.get_textpage()
                #page_text = text_page.extractText().replace('\u200b', '')  # Replace zero-width space with empty string

                # Extract words from the page
                #words = page.get_textpage().extractWORDS()
                # Join the words into a single string
                #page_text = ' '.join([word[4] for word in words if word[4]])

                # If you want to keep the original block structure, you can use:
                blocks = text_page.extractBLOCKS()
                # Join the text blocks into a single string
                page_text = '<</BLOCK>><<BLOCK>>'.join([block[4] for block in blocks if block[4]])
                # Append the page text to the overall text
                text += page_text.replace('\u200b', '')
    except Exception as e:
        print(f"Error reading {pdf_file_path} with PyPDF2: {e}")

    elapsed = time.time() - start_time
    # print(f"Read PDF file '{pdf_file_path}' in {elapsed:.2f} seconds.")
    return text

In [109]:
def read_xml_text(xml_file_path) -> str:
    """Reads and concatenates all text content from an XML file."""
    # Using your previously developed function
    all_text_parts = []
    try:
        tree = ET.parse(xml_file_path)
        root = tree.getroot()
        for element in root.iter():
            if element.text:
                cleaned_text = element.text.strip().replace('&#x02010;', '-').replace('‐', '-')
                if cleaned_text:
                    all_text_parts.append(cleaned_text)
            if element.tail:
                cleaned_tail = element.tail.strip().replace('&#x02010;', '-').replace('‐', '-')
                if cleaned_tail:
                    all_text_parts.append(cleaned_tail)
        return " ".join(all_text_parts) if all_text_parts else ""
    except Exception as e:
        print(f"Error reading XML {xml_file_path}: {e}")
        return ""


In [12]:
def read_article_text(file_path: str) -> str:
    """Reads text from an article file, handling both PDF and XML formats."""
    if file_path.endswith('.pdf'):
        return read_pdf_text_fitz(file_path)
    elif file_path.endswith('.xml'):
        return read_xml_text(file_path)
    else:
        print(f"Unsupported file format for {file_path}. Only PDF and XML are supported.")
        return ""
    

## 3. Data Extraction

In [82]:
def get_dataset_id_regex(id: str) -> str:
    """
    Formats a regex pattern for a dataset ID, specifically for Dryad DOIs that may have spaces in them.

    Args:
        id (str): The dataset ID, which may include a Dryad DOI.
    Returns:
        str: A regex pattern that matches the DOI, formatted to allow for optional spaces and dots.
    """
    # Regex to capture the DOI part after "doi.org/"
    # It handles optional "https://" and "www."
    regex_id = id.lower().strip()
    dryad_marker = "/dryad."
    dryad_index = regex_id.find(dryad_marker)
    if dryad_index != -1:
        # Calculate the starting point of the suffix (right after "/dryad." + 5 characters)
        start_of_suffix = dryad_index + len(dryad_marker) + 5
        # "/dryad." is found in the DOI candidate
        prefix = regex_id[:start_of_suffix]
        
        # Get the remaining characters for the suffix
        suffix = regex_id[start_of_suffix : ]
        
        # Construct the regex ID
        regex_id = prefix + '\\s?' + suffix

    return regex_id.replace('.', '\\.\\s?').replace('/', '\\/').replace('-', '-\\s?')


In [31]:
def find_regex_with_context(main_string: str, search_regex: str, context_chars: int = 200) -> list[str]:
    """
    Finds all occurrences of search_regex within main_string and returns
    a context window for each. The context window includes the matching search_string
    itself, surrounded by up to 'context_chars' characters from before its occurrence
    in the main_string.

    Args:
        main_string (str): The string to search within.
        search_regex (str): The regular expression to search for.
        context_chars (int): The number of characters to include before and after
                             the search_string in the context window. Defaults to 100.

    Returns:
        List[str]: A list of strings, where each string is an occurrence of
                   search_string surrounded by its context. Returns an empty
                   list if search_string is not found, or if either
                   main_string or search_string is empty.
    """
    # Ensure the main_string and search_regex are valid
    if not main_string or not search_regex:
        return []

    re_doi = re.compile(search_regex, re.IGNORECASE)
    occurrences_with_context: list[str] = []
    len_search: int = len(search_regex)

    doi_matches = re_doi.finditer(main_string, re.IGNORECASE)
    for match in doi_matches:
        extracted_snippet = main_string[max(0, match.start() - context_chars): match.start() + len_search ]
        occurrences_with_context.append(extracted_snippet.lower())
            
    return occurrences_with_context

In [32]:
def remove_unmatched_parentheses(s: str) -> str:
    """
    Removes non-matching '(' and ')' characters from a string.
    A parenthesis is considered matching if it forms a valid pair.

    Args:
        s (str): The input string.

    Returns:
        str: The string with all non-matching parentheses removed.
    """
    if not s:
        return ""

    # Use a deque as a stack to store indices of opening parentheses.
    # When we find a '(', we push its index. When we find a ')', we pop an index.
    open_paren_indices_stack = collections.deque()
    
    # A boolean list to mark characters that should be kept in the final string.
    # Initially, assume all characters are kept. We'll mark unmatched parentheses as False.
    keep_char = [True] * len(s)

    for i, char in enumerate(s):
        if char == '(':
            # This is a potential opening parenthesis. Store its index.
            open_paren_indices_stack.append(i)
        elif char == ')':
            if open_paren_indices_stack:
                # Found a matching opening parenthesis for this closing one.
                # Pop the index of the matched opening parenthesis from the stack.
                open_paren_indices_stack.pop()
            else:
                # This closing parenthesis has no matching opening parenthesis.
                # It is unmatched and should be removed.
                keep_char[i] = False
        # For non-parenthesis characters, keep_char[i] remains True (its default value).
    
    # After iterating through the entire string, any opening parentheses
    # remaining in the stack are unmatched because they never found a closing pair.
    # Mark these for removal.
    while open_paren_indices_stack:
        unmatched_open_idx = open_paren_indices_stack.pop()
        keep_char[unmatched_open_idx] = False
            
    # Construct the final string by iterating through the original string
    # and appending only the characters marked to be kept.
    final_chars = [s[i] for i, should_keep in enumerate(keep_char) if should_keep]
            
    return "".join(final_chars)


In [None]:
def scrub_doi(doi: str) -> str:
    """
    Scrubs a DOI string by removing unwanted characters and formatting it.

    Args:
        doi (str): The DOI string to be scrubbed.
    Returns:
        str: The cleaned DOI string.
    """
    doi = doi.strip()
    # Remove non-matching "(" and ")" characters
    doi = remove_unmatched_parentheses(doi)
    # Remove trailing periods and semicolons
    doi = re.sub(r'[.;]$', '', doi)
    # Remove any leading or trailing whitespace
    doi = doi.strip()
    return doi

In [215]:
def extract_dois_from_text(text) -> list[str]:
    """
    Extracts DOIs from text using a flexible regex.
    Args:
        text (str): The text from which to extract DOIs.
    Returns:
        list[str]: A list of unique DOIs found in the text.
    Note:
        This function uses a regex pattern to find DOIs in the text.
        It assumes that DOIs are formatted as "10.xxxx/yyyy" or similar.
        The regex is designed to be flexible and may need adjustments based on specific DOI formats.
    """
    if not text:
        return []
    
    # Flexible DOI regex pattern to find any string starting with '10.' followed by numbers/dots/slashes.
    #doi_pattern = r'\b10\.\d{4,9}/[-._;()/:A-Za-z0-9]+\b'
    #doi_pattern = r'\b10\.\s?\d{4,9}\/[-._()<>;\/:A-Za-z0-9\s]+[-._()<>;\/:0-9]+'
    #doi_pattern = r'\b10\.\s?\d{4,9}\/[-._()<>;\/:A-Za-z0-9]+\s?(?![A-Z]+)+[-._()<>;\/:A-Za-z0-9]+'
    #doi_pattern = r'\b10\.\s?\d{4,9}\/[-._()<>;\/:A-Za-z0-9\u200b]+\s?(?![A-Z]+)+[-._()<>;\/:A-Za-z0-9\u200b]+'
    #doi_pattern = r'10\.\s?\d{4,9}\/[-._()<>;\/:A-Za-z0-9\\u200b]+\s?(?:(?![A-Z])(?!\d{1,3}\.))+[-._()<>;\/:A-Za-z0-9\\u200b]+'
    doi_pattern = r'10\.\s?\d{4,9}\/[-._()<>;\/:A-Za-z0-9\u200b]+\s?(?:(?![A-Z]+)(?!\d{1,3}\.))+[-._()<>;\/:A-Za-z0-9\u200b]+'
    #doi_pattern = r'10\.\s?\d{4,9}\/[-._()<>;\/:A-Za-z0-9\u200b]+(?:(?![A-Z]+)(?!\d{1,3}\.))+[-._()<>;\/:A-Za-z0-9\u200b]+'
    found_dois = set(re.findall(doi_pattern, text))
    #print(f"Found {len(found_dois)} DOIs in the text. {' , '.join(found_dois)}")
    # Clean up whitespace
    found_dois = ["".join(scrub_doi(doi).split()) for doi in found_dois]  
    # Return unique DOIs
    return list(set(found_dois))  


def extract_dataset_ids(article_id: str, text: str, extracted_dois: list[str]) -> list[str]:
    """
    Checks the list of extracted_dois and returns only those that are related to datasets.

    Args:
        article_id (str): The ID of the article, used to normalize DOIs.
        text (str): The full text of the article, used to find context for dataset IDs.
        extracted_dois (list[str]): A list of DOIs extracted from the article.
    Returns:
        list[str]: A list of dataset IDs that are related to the article.
    Note:
        This function currently assumes that all DOIs in the extracted_dois list are potential dataset IDs.
        It checks if they are related to datasets by looking for keywords like 'dataset' or 'database' in the context.
        If you have specific patterns or additional criteria for dataset IDs, you can modify this logic accordingly.
    """
    # Normalize the article_id to match DOI format
    article_id_slashed = article_id.replace('_', '/')  # Replace dashes with slashes to match DOI format

    # Initialize a list to hold dataset-related DOIs
    # This will hold DOIs that are confirmed to be related to datasets
    dataset_ids = []
    for doi in extracted_dois:
        # print(f"Processing DOI: {doi}")
        # Skip if DOI is empty, NaN, or matches the article_id_slashed
        if pd.isna(doi) or not doi or doi.lower() == article_id_slashed.lower():
            continue
        # Normalize the DOI to a regex format
        regex_id = get_dataset_id_regex(doi)
        # Find occurrences of this DOI in the text with context
        occurrences = find_regex_with_context(text, regex_id)
        # Check if any string in occurrences contains 'data' related keywords (case-insensitive)
        #keywords = ['program data', 'data availability', 'the data', 'dataset', 'database']
        keywords = ['data release', 'download', 'program data', 'data availability', 'the data', 'dataset', 'database']
        if doi == '10.15468/dl.waw9qx':
            print(f"Found DOI: {doi} in article {article_id_slashed}")
        if occurrences and any(keyword in s for s in occurrences for keyword in keywords):
            dataset_ids.append(doi)
    return dataset_ids 


## 4. LLM Classification

In [None]:
# Global LLM model and tokenizer (load once)
llm_model = None
llm_tokenizer = None
device = "cuda" if torch and torch.cuda.is_available() else "cpu"

def load_llm():
    global llm_model, llm_tokenizer
    if not AutoModelForCausalLM or not QWEN_MODEL_NAME_OR_PATH:
        print("LLM components not available or path not set. Skipping LLM loading.")
        return False
    try:
        print(f"Loading Qwen tokenizer from: {QWEN_MODEL_NAME_OR_PATH}")
        llm_tokenizer = AutoTokenizer.from_pretrained(QWEN_MODEL_NAME_OR_PATH, trust_remote_code=True)
        print(f"Loading Qwen model from: {QWEN_MODEL_NAME_OR_PATH}")
        llm_model = AutoModelForCausalLM.from_pretrained(
            QWEN_MODEL_NAME_OR_PATH,
            device_map="auto", # Automatically uses GPU if available
            trust_remote_code=True
        ).eval() # Set to evaluation mode
        print(f"LLM loaded successfully on {llm_model.device}.")
        return True
    except Exception as e:
        print(f"Error loading LLM: {e}")
        return False

def generate_llm_classification(article_text_snippet, dataset_id):
    """
    Uses the LLM to classify dataset usage.
    article_text_snippet: A relevant portion of article text, or full text if manageable.
    """
    if not llm_model or not llm_tokenizer:
        print("LLM not loaded. Cannot classify.")
        return "Error: LLM not loaded"

    prompt = f"""
    You are an expert research assistant. Your task is to determine how a dataset was used in a research article.
    Read the following article context and the dataset identifier carefully.

    Article Context (excerpt):
    "{article_text_snippet[:4000]}"  # Truncate for context window, adjust as needed

    Dataset Identifier: "{dataset_id}"

    Question: Based on the provided article context, was the dataset (identified as "{dataset_id}"):
    1. Created by the authors primarily for the research described in THIS article? (If so, it's "Primary")
    2. An existing dataset that the authors obtained and used for their research in THIS article? (If so, it's "Secondary")

    Please respond with only one word: "Primary" or "Secondary".
    """
    
    # This is a simplified generation example for Qwen.
    # You'll need to adapt this to the specific chat/completion format Qwen expects.
    # For many chat models, it's a list of messages:
    # messages = [{"role": "user", "content": prompt}]
    # inputs = llm_tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to(device)
    
    # Simpler non-chat completion style (check Qwen docs for best practice):
    inputs = llm_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=llm_tokenizer.model_max_length - 50).to(llm_model.device) # Reserve some tokens for generation

    try:
        with torch.no_grad(): # Important for inference
            outputs = llm_model.generate(
                **inputs,
                max_new_tokens=10, # We only expect "Primary" or "Secondary"
                pad_token_id=llm_tokenizer.eos_token_id # Important for some models
            )
        
        # Decode the generated tokens, skipping special tokens and the prompt
        response_text = llm_tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True).strip()
        
        print(f"LLM raw response for {dataset_id}: '{response_text}'")

        if "Primary" in response_text:
            return "Primary"
        elif "Secondary" in response_text:
            return "Secondary"
        else:
            print(f"Warning: LLM response for {dataset_id} not clearly Primary/Secondary: '{response_text}'")
            return "Uncertain" # Or handle as per your logic

    except Exception as e:
        print(f"Error during LLM generation for {dataset_id}: {e}")
        return "Error: LLM generation failed"



In [191]:
def has_data_availability_section(text: str) -> bool:
    """
    Checks if the text contains a 'Data Availability' section.
    Args:
        text (str): The text to check.
    Returns:
        bool: True if 'Data Availability' section is found, False otherwise.
    """
    if not text:
        return False
    # Normalize the text to lowercase for case-insensitive matching
    normalized_text = text.lower()
    # Check for common phrases indicating a Data Availability or References section
    data_availability_phrases = [
        "references",
        "data availability",
        "availability of data",
        "data sharing",
        "data access",
        "data availability statement",
        "data availability section"
    ]
    # Check if any of the phrases are present in the text
    for phrase in data_availability_phrases:
        if phrase in normalized_text:
            return True
    
    return False

## 5. Main Processing Logic

In [170]:
def process_article_text(article_id: str, article_text: str, article_format: str) -> list[dict[str, str]]:
    """ Processes the article text to extract dataset IDs and classify their usage.
    
    Args:
        article_id (str): The unique identifier for the article.
        article_text (str): The text content of the article.
        article_format (str): The format of the article (e.g., 'PDF', 'XML').

    Returns:
        dict[str, str]: A dictionary containing the article ID, format, dataset IDs, and their classifications.
    """
    data_availability_section = has_data_availability_section(article_text)
    # Extract dataset IDs from the article text
    extracted_dois = extract_dois_from_text(article_text)
    #print(f"Found {len(extracted_dois)} potential dataset(s)/DOI(s) in {article_id}: {extracted_dois}")
    dataset_ids = extract_dataset_ids(article_id, article_text, extracted_dois)
    results = []

    # Classify each dataset ID using the LLM
    for dataset_id in dataset_ids:
        #classification = generate_llm_classification(article_text, dataset_id)
        classification = "LLM_Disabled"
        results.append({
            'article_id': article_id,
            'dataset_id': f'https://doi.org/{dataset_id.replace('\u200b', '').lower()}',
            'dataset_id_raw': dataset_id,
            'article_format': article_format,
            'data_availability_section': data_availability_section,
            'classification': classification
        })
    # If no dataset IDs were found, still return the article ID and format
    if not results:
        results.append({
            'article_id': article_id,
            'dataset_id': 'Missing',
            'dataset_id_raw': 'Missing',
            'article_format': article_format,
            'data_availability_section': data_availability_section,
            'classification': 'Missing'
        })
    return results


In [36]:
def process_article(article_file_path: str) -> list[dict[str, str]]:
    """
    Processes a single article file to extract dataset IDs and classify their usage.

    Args:
        article_file_path (str): The path to the article file (PDF or XML).
        
    Returns:
        list[dict[str, str]]: A list of dictionaries containing the article ID, format, dataset IDs, and their classifications.
    """
    # Extract the article ID from the file name
    article_id = os.path.splitext(os.path.basename(article_file_path))[0]
    # Determine the article format based on the file extension
    article_format = os.path.splitext(article_file_path)[1].lower()
    # Read the article text
    article_text = read_article_text(article_file_path)
    # Process the article text to extract dataset IDs and classify their usage
    return process_article_text(article_id, article_text, article_format)


In [37]:

# --- 5. Main Processing Logic ---
def process_articles(articles_directory):
    results = []
    
    # if not load_llm(): # Attempt to load LLM once
    #     print("Proceeding without LLM classification.")

    article_files = get_all_article_files(articles_directory)

    for i, filepath in enumerate(article_files):
        print(f"\nProcessing article {i+1}/{len(article_files)}: {os.path.basename(filepath)}")
        article_results = process_article(filepath)
        results.extend(article_results)

    # Convert results to a DataFrame and sort by article_id, dataset_id, and article_format
    return pd.DataFrame(results).sort_values(by=["article_id", "dataset_id", "article_format"]).reset_index(drop=True)


In [219]:
test_result_df = process_articles(ARTICLES_TEST_DIR)
test_result_df = test_result_df[test_result_df['dataset_id'] != 'Missing'].reset_index(drop=True)
test_result_df = test_result_df.drop_duplicates(subset=['article_id', 'dataset_id'], keep="last").reset_index(drop=True)
test_result_df



Processing article 1/55: 10.1002_2017jc013030.pdf

Processing article 2/55: 10.1002_anie.201916483.pdf

Processing article 3/55: 10.1002_anie.202005531.pdf

Processing article 4/55: 10.1002_anie.202007717.pdf

Processing article 5/55: 10.1002_chem.201902131.pdf

Processing article 6/55: 10.1002_chem.201903120.pdf

Processing article 7/55: 10.1002_chem.202000235.pdf

Processing article 8/55: 10.1002_chem.202001412.pdf

Processing article 9/55: 10.1002_chem.202001668.pdf

Processing article 10/55: 10.1002_chem.202003167.pdf

Processing article 11/55: 10.1002_cssc.202201821.pdf

Processing article 12/55: 10.1002_ece3.3985.pdf

Processing article 13/55: 10.1002_ece3.4466.pdf

Processing article 14/55: 10.1002_ece3.5260.pdf

Processing article 15/55: 10.1002_ece3.5395.pdf

Processing article 16/55: 10.1002_ece3.6144.pdf

Processing article 17/55: 10.1002_ece3.6303.pdf

Processing article 18/55: 10.1002_ece3.6784.pdf

Processing article 19/55: 10.1002_ece3.961.pdf

Processing article 20/55:

Unnamed: 0,article_id,dataset_id,dataset_id_raw,article_format,data_availability_section,classification
0,10.1002_2017jc013030,https://doi.org/10.17882/47142,10.17882/47142,.xml,False,LLM_Disabled
1,10.1002_2017jc013030,https://doi.org/10.17882/49388,10.17882/49388,.xml,False,LLM_Disabled
2,10.1002_2017jc013030,https://doi.org/10.5194/essd-2017-58,10.5194/essd-2017-58,.xml,False,LLM_Disabled
3,10.1002_2017jc013030,https://doi.org/10.5194/essd-9-861-2017,10.5194/essd-9-861-2017,.pdf,True,LLM_Disabled
4,10.1002_cssc.202201821,https://doi.org/10.5281/zenodo.7074790,10.5281/zenodo.7074790,.xml,True,LLM_Disabled
5,10.1002_ece3.4466,https://doi.org/10.5061/dryad.r6nq870,10.5061/dryad.r6nq870,.xml,True,LLM_Disabled
6,10.1002_ece3.5260,https://doi.org/10.5061/dryad.2f62927,10.5061/dryad.2f62927,.xml,True,LLM_Disabled
7,10.1002_ece3.5395,https://doi.org/10.5441/001/1.c42j3js7,10.5441/001/1.c42j3js7,.xml,True,LLM_Disabled
8,10.1002_ece3.5395,https://doi.org/10.5441/001/1.v1cs4nn0,10.5441/001/1.v1cs4nn0,.xml,True,LLM_Disabled
9,10.1002_ece3.6144,https://doi.org/10.5061/dryad.zw3r22854,10.5061/dryad.zw3r22854,.xml,True,LLM_Disabled


In [220]:
train_data_avail_df = process_articles(ARTICLES_TRAIN_DIR)
train_data_avail_df


Processing article 1/924: 10.1002_2017jc013030.pdf

Processing article 2/924: 10.1002_anie.201916483.pdf

Processing article 3/924: 10.1002_anie.202005531.pdf

Processing article 4/924: 10.1002_anie.202007717.pdf

Processing article 5/924: 10.1002_chem.201902131.pdf

Processing article 6/924: 10.1002_chem.201903120.pdf

Processing article 7/924: 10.1002_chem.202000235.pdf

Processing article 8/924: 10.1002_chem.202001412.pdf

Processing article 9/924: 10.1002_chem.202001668.pdf

Processing article 10/924: 10.1002_chem.202003167.pdf

Processing article 11/924: 10.1002_cssc.202201821.pdf

Processing article 12/924: 10.1002_ece3.3985.pdf

Processing article 13/924: 10.1002_ece3.4466.pdf

Processing article 14/924: 10.1002_ece3.5260.pdf

Processing article 15/924: 10.1002_ece3.5395.pdf

Processing article 16/924: 10.1002_ece3.6144.pdf

Processing article 17/924: 10.1002_ece3.6303.pdf

Processing article 18/924: 10.1002_ece3.6784.pdf

Processing article 19/924: 10.1002_ece3.961.pdf

Proces

Unnamed: 0,article_id,dataset_id,dataset_id_raw,article_format,data_availability_section,classification
0,10.1002_2017jc013030,https://doi.org/10.17882/47142,10.17882/47142,.pdf,True,LLM_Disabled
1,10.1002_2017jc013030,https://doi.org/10.17882/47142,10.17882/47142,.xml,False,LLM_Disabled
2,10.1002_2017jc013030,https://doi.org/10.17882/49388,10.17882/49388,.pdf,True,LLM_Disabled
3,10.1002_2017jc013030,https://doi.org/10.17882/49388,10.17882/49388,.xml,False,LLM_Disabled
4,10.1002_2017jc013030,https://doi.org/10.5194/essd-2017-58,10.5194/essd-2017-58,.xml,False,LLM_Disabled
...,...,...,...,...,...,...
1241,10.7717_peerj.12422,Missing,Missing,.xml,True,Missing
1242,10.7717_peerj.12422,https://doi.org/10.7717/peerj.12422/fig-1,10.7717/peerj.12422/fig-1,.pdf,True,LLM_Disabled
1243,10.7717_peerj.12422,https://doi.org/10.7717/peerj.4255,10.7717/peerj.4255,.pdf,True,LLM_Disabled
1244,10.7717_peerj.13193,Missing,Missing,.pdf,True,Missing


In [222]:
#train_data_avail_df = train_data_avail_df[train_data_avail_df['dataset_id'] != 'Missing'].reset_index(drop=True)
train_data_avail_df = train_data_avail_df.drop_duplicates(subset=['article_id'], keep="first").reset_index(drop=True)
# drop columns that are not needed
train_data_avail_df = train_data_avail_df[['article_id', 'article_format', 'data_availability_section', 'classification']]
train_data_avail_df

Unnamed: 0,article_id,article_format,data_availability_section,classification
0,10.1002_2017jc013030,.pdf,True,LLM_Disabled
1,10.1002_anie.201916483,.pdf,False,Missing
2,10.1002_anie.202005531,.pdf,False,Missing
3,10.1002_anie.202007717,.pdf,False,Missing
4,10.1002_chem.201902131,.pdf,False,Missing
...,...,...,...,...
519,10.7554_elife.74937,.pdf,False,Missing
520,10.7717_peerj.10452,.pdf,True,LLM_Disabled
521,10.7717_peerj.11352,.pdf,True,LLM_Disabled
522,10.7717_peerj.12422,.xml,True,Missing


In [224]:
train_result_df = process_articles(ARTICLES_TRAIN_DIR)
train_result_df = train_result_df[train_result_df['dataset_id'] != 'Missing'].reset_index(drop=True)
train_result_df = train_result_df.drop_duplicates(subset=['article_id', 'dataset_id'], keep="last").reset_index(drop=True)
train_result_df


Processing article 1/924: 10.1002_2017jc013030.pdf

Processing article 2/924: 10.1002_anie.201916483.pdf

Processing article 3/924: 10.1002_anie.202005531.pdf

Processing article 4/924: 10.1002_anie.202007717.pdf

Processing article 5/924: 10.1002_chem.201902131.pdf

Processing article 6/924: 10.1002_chem.201903120.pdf

Processing article 7/924: 10.1002_chem.202000235.pdf

Processing article 8/924: 10.1002_chem.202001412.pdf

Processing article 9/924: 10.1002_chem.202001668.pdf

Processing article 10/924: 10.1002_chem.202003167.pdf

Processing article 11/924: 10.1002_cssc.202201821.pdf

Processing article 12/924: 10.1002_ece3.3985.pdf

Processing article 13/924: 10.1002_ece3.4466.pdf

Processing article 14/924: 10.1002_ece3.5260.pdf

Processing article 15/924: 10.1002_ece3.5395.pdf

Processing article 16/924: 10.1002_ece3.6144.pdf

Processing article 17/924: 10.1002_ece3.6303.pdf

Processing article 18/924: 10.1002_ece3.6784.pdf

Processing article 19/924: 10.1002_ece3.961.pdf

Proces

Unnamed: 0,article_id,dataset_id,dataset_id_raw,article_format,data_availability_section,classification
0,10.1002_2017jc013030,https://doi.org/10.17882/47142,10.17882/47142,.xml,False,LLM_Disabled
1,10.1002_2017jc013030,https://doi.org/10.17882/49388,10.17882/49388,.xml,False,LLM_Disabled
2,10.1002_2017jc013030,https://doi.org/10.5194/essd-2017-58,10.5194/essd-2017-58,.xml,False,LLM_Disabled
3,10.1002_2017jc013030,https://doi.org/10.5194/essd-9-861-2017,10.5194/essd-9-861-2017,.pdf,True,LLM_Disabled
4,10.1002_cssc.202201821,https://doi.org/10.5281/zenodo.7074790,10.5281/zenodo.7074790,.xml,True,LLM_Disabled
...,...,...,...,...,...,...
437,10.7717_peerj.11352,https://doi.org/10.7291/d11m38,10.7291/D11M38,.xml,True,LLM_Disabled
438,10.7717_peerj.11352,https://doi.org/10.7717/peerj.11352/supp-4,10.7717/peerj.11352/supp-4,.xml,True,LLM_Disabled
439,10.7717_peerj.11352,https://doi.org/10.7717/peerj.11352/supp-5,10.7717/peerj.11352/supp-5,.xml,True,LLM_Disabled
440,10.7717_peerj.12422,https://doi.org/10.7717/peerj.12422/fig-1,10.7717/peerj.12422/fig-1,.pdf,True,LLM_Disabled


In [225]:
train_labels_df = pd.read_csv(train_labels_file_path)
train_labels_df = train_labels_df[train_labels_df['type'] != 'Missing'].reset_index(drop=True)
train_labels_df

Unnamed: 0,article_id,dataset_id,type
0,10.1002_2017jc013030,https://doi.org/10.17882/49388,Primary
1,10.1002_ece3.4466,https://doi.org/10.5061/dryad.r6nq870,Primary
2,10.1002_ece3.5260,https://doi.org/10.5061/dryad.2f62927,Primary
3,10.1002_ece3.6144,https://doi.org/10.5061/dryad.zw3r22854,Primary
4,10.1002_ece3.6303,https://doi.org/10.5061/dryad.37pvmcvgb,Primary
...,...,...,...
714,10.7717_peerj.12422,https://doi.org/10.15468/dl.t3h8b4,Secondary
715,10.7717_peerj.12422,https://doi.org/10.15468/dl.yak5vd,Secondary
716,10.7717_peerj.13193,https://doi.org/10.6073/pasta/02e2764efb408a8b...,Secondary
717,10.7717_peerj.13193,https://doi.org/10.6073/pasta/275ad28a2f31356c...,Secondary


In [226]:

# Compare the difference between train_result_df and train_labels_df to see if there are any missing dataset_ids
missing_dataset_ids = train_labels_df[~train_labels_df['dataset_id'].isin(train_result_df['dataset_id'])].sort_values(by='dataset_id').reset_index(drop=True)
# merge the missing dataset_ids with the train_data_avail_df
missing_dataset_ids = missing_dataset_ids.merge(train_data_avail_df, on='article_id', how='left')
missing_dataset_ids

Unnamed: 0,article_id,dataset_id,type,article_format,data_availability_section,classification
0,10.1371_journal.pone.0262974,2nrj,Secondary,.pdf,True,LLM_Disabled
1,10.1093_nar_gkp1049,3.10.180.10,Secondary,.pdf,True,Missing
2,10.1093_nar_gkp1049,3.20.20.120,Secondary,.pdf,True,Missing
3,10.1093_nar_gkp1049,3.20.20.140,Secondary,.pdf,True,Missing
4,10.1093_nar_gkp1049,3.30.450.20,Secondary,.pdf,True,Missing
...,...,...,...,...,...,...
529,10.1038_s41597-022-01555-4,https://doi.org/10.7937/q1ee-j082,Secondary,.pdf,True,Missing
530,10.1007_s00259-022-06053-8,https://doi.org/10.7937/tcia.2019.30ilqfcl,Secondary,.xml,True,Missing
531,10.1038_s41597-022-01555-4,https://doi.org/10.7937/tcia.e3sv-re93,Secondary,.pdf,True,Missing
532,10.1038_s41597-022-01555-4,https://doi.org/10.7937/tcia.xc7a-qt20,Primary,.pdf,True,Missing


In [202]:

# Merge train results with labels to get the ground truth
train_result_df_2 = train_labels_df.merge(train_result_df, on=['article_id', 'dataset_id'], how='left')
# train_result_df_2 = train_result_df_2.rename(columns={'classification_y': 'ground_truth_classification', 'classification_x': 'predicted_classification'})
# train_result_df_2 = train_result_df_2[['article_id', 'dataset_id', 'dataset_id_raw', 'article_format', 'ground_truth_classification', 'predicted_classification']]  
# Convert nan values in 'ground_truth_classification' to 'Missing'
train_result_df_2['dataset_id_raw'] = train_result_df_2['dataset_id_raw'].fillna('Missing')
train_result_df_2
# Save the results to a CSV file
# train_result_df_2.to_csv(OUTPUT_CSV_PATH, index=False)
# print(f"Train results saved to {OUTPUT_CSV_PATH}")

Unnamed: 0,article_id,dataset_id,type,dataset_id_raw,article_format,data_availability_section,classification
0,10.1002_2017jc013030,https://doi.org/10.17882/49388,Primary,10.17882/49388,.xml,False,LLM_Disabled
1,10.1002_ece3.4466,https://doi.org/10.5061/dryad.r6nq870,Primary,10.5061/dryad.r6nq870,.xml,True,LLM_Disabled
2,10.1002_ece3.5260,https://doi.org/10.5061/dryad.2f62927,Primary,10.5061/dryad.2f62927,.xml,True,LLM_Disabled
3,10.1002_ece3.6144,https://doi.org/10.5061/dryad.zw3r22854,Primary,10.5061/dryad.zw3r22854,.xml,True,LLM_Disabled
4,10.1002_ece3.6303,https://doi.org/10.5061/dryad.37pvmcvgb,Primary,10.5061/dryad.37pvmcvgb,.xml,True,LLM_Disabled
...,...,...,...,...,...,...,...
714,10.7717_peerj.12422,https://doi.org/10.15468/dl.t3h8b4,Secondary,Missing,,,
715,10.7717_peerj.12422,https://doi.org/10.15468/dl.yak5vd,Secondary,Missing,,,
716,10.7717_peerj.13193,https://doi.org/10.6073/pasta/02e2764efb408a8b...,Secondary,Missing,,,
717,10.7717_peerj.13193,https://doi.org/10.6073/pasta/275ad28a2f31356c...,Secondary,Missing,,,


In [203]:
missing_values_df = train_result_df_2[train_result_df_2['classification'].isna()]
if not missing_values_df.empty:
    print(f"Warning: There are {len(missing_values_df)} missing values in the classification column.")
missing_values_df



Unnamed: 0,article_id,dataset_id,type,dataset_id_raw,article_format,data_availability_section,classification
8,10.1002_esp.5058,https://doi.org/10.5061/dryad.jh9w0vt9t,Primary,Missing,,,
9,10.1002_esp.5090,https://doi.org/10.5066/p9353101,Secondary,Missing,,,
12,10.1002_nafm.10870,https://doi.org/10.5066/p9gtumay,Primary,Missing,,,
13,10.1007_s00259-022-06053-8,https://doi.org/10.7937/k9/tcia.2017.7hs46erv,Secondary,Missing,,,
14,10.1007_s00259-022-06053-8,https://doi.org/10.7937/tcia.2019.30ilqfcl,Secondary,Missing,,,
...,...,...,...,...,...,...,...
714,10.7717_peerj.12422,https://doi.org/10.15468/dl.t3h8b4,Secondary,Missing,,,
715,10.7717_peerj.12422,https://doi.org/10.15468/dl.yak5vd,Secondary,Missing,,,
716,10.7717_peerj.13193,https://doi.org/10.6073/pasta/02e2764efb408a8b...,Secondary,Missing,,,
717,10.7717_peerj.13193,https://doi.org/10.6073/pasta/275ad28a2f31356c...,Secondary,Missing,,,


In [None]:
# compare the diferences between train results and train labels
train_result_df['match'] = train_result_df['ground_truth_classification'] == train_result_df['predicted_classification']
train_result_df['match'] = train_result_df['match'].replace({True: 'Match', False: 'Mismatch'})
train_result_df = train_result_df.sort_values(by=['article_id', 'dataset_id']).reset_index(drop=True)
print(train_result_df.head(10))


In [236]:
# Simple testing of the process_article function with a sample XML file
filepath = os.path.join(ARTICLES_TRAIN_DIR+'PDF', '10.1107_s2059798322005691.pdf')
result_2_df = process_article(filepath)
result_2_df

[{'article_id': '10.1107_s2059798322005691',
  'dataset_id': 'Missing',
  'dataset_id_raw': 'Missing',
  'article_format': '.pdf',
  'data_availability_section': True,
  'classification': 'Missing'}]

In [162]:
text = read_pdf_text_as_md(filepath)
print(text[:1000])  # Print the first 1000 characters of the text

Received: 18 April 2019 | Revised: 28 May 2019 | Accepted: 4 June 2019

DOI: 10.1002/ece3.5395

O R I G I N A L R E S E A R C H
# **“Closer‐to‐home” strategy benefits juvenile survival in a long‐** **distance migratory bird**

**[Yachang Cheng](mailto:﻿)** **[1,2]** | **Wolfgang Fiedler** **[1,2]** | **Martin Wikelski** **[1,2,3]** | **[Andrea Flack](mailto:﻿)** **[[1,2,3]](mailto:﻿)**


1 Department of Migration, Max Planck

Institute of Animal Behavior, Radolfzell,

Germany

2 Department of Biology, University of

Konstanz, Konstanz, Germany

3 Centre for the Advanced Study of

Collective Behaviour, University of

Konstanz, Konstanz, Germany

**Correspondence**

Yachang Cheng and Andrea Flack,

Department of Migration, Max Planck

Institute of Animal Behavior, D‐78315

Radolfzell, Germany.

[Emails: ycheng@ab.mpg.de (YC) and aflack@](mailto:ycheng@ab.mpg.de)
[ab.mpg.de (AF)](mailto:aflack@ab.mpg.de)

**1** | **INTRODUCTION**


**Abstract**

Human‐induced changes in the climate and en

In [164]:
text2 = read_pdf_text_fitz(filepath)
print(text2[:1000])  # Print the first 1000 characters of the text

 
Received: 18 April 2019 | Revised: 28 May 2019 | Accepted: 4 June 2019
DOI: 10.1002/ece3.5395  
O R I G I N A L  R E S E A R C H
“Closer‐to‐home” strategy benefits juvenile survival in a long‐
distance migratory bird
Yachang Cheng1,2
 |   Wolfgang Fiedler1,2 |   Martin Wikelski1,2,3 |   Andrea Flack1,2,3
Abstract
1Department of Migration, Max Planck 
Institute of Animal Behavior, Radolfzell, 
Germany
Human‐induced changes in the climate and environment that occur at an unprec‐
edented speed are challenging the existence of migratory species. Faced with these 
2Department of Biology, University of 
Konstanz, Konstanz, Germany
new challenges, species with diverse and flexible migratory behaviors may suffer less 
from population decline, as they may be better at responding to these changes by 
3Centre for the Advanced Study of 
Collective Behaviour, University of 
Konstanz, Konstanz, Germany
altering their migratory behavior. At the individual level, variations in migratory be‐
havior m

In [100]:
text = """

10.5061/dryad.zw3r2\u200b2854.

"""
#doi_pattern = r'\b10\.\s?\d{4,9}\/[-._()<>;\/:A-Za-z0-9\u200b]+\s?(?![A-Z]+)+[-._()<>;\/:A-Za-z0-9\u200b]+'
doi_pattern = r'10\.\s?\d{4,9}\/[-._()<>;\/:A-Za-z0-9\u200b]+\s?(?:(?![A-Z]+)(?!\d{1,3}\.))+[-._()<>;\/:A-Za-z0-9\u200b]+'
found_dois = set(re.findall(doi_pattern, text))
print(f"Found DOIs: {found_dois}")

regex_id = get_dataset_id_regex(found_dois.pop())
print(f"Regex ID: {regex_id}")


Found DOIs: {'10.5061/dryad.zw3r2\u200b2854.'}
Regex ID: 10\.\s?5061\/dryad\.\s?zw3r2\s?​2854\.\s?


In [None]:

# --- 6. Execution ---
if __name__ == "__main__":
    # Ensure the ARTICLES_DIR exists or adjust path
    if not os.path.isdir(ARTICLES_DIR):
        print(f"Articles directory not found: {ARTICLES_DIR}")
        print("Please create dummy files or point to a valid directory for testing.")
        # Create dummy files for a quick test if ARTICLES_DIR is missing
        # This part is for local testing, remove or adapt for Kaggle
        if ARTICLES_DIR == "/kaggle/input/research-articles-dataset/articles/": # Be careful with this
             print("Cannot create dummy files in /kaggle/input. Please provide data via Kaggle Datasets.")
        else: # Local testing
            os.makedirs(ARTICLES_DIR, exist_ok=True)
            with open(os.path.join(ARTICLES_DIR, "article1.pdf"), "w") as f: f.write("Dummy PDF with DOI 10.1234/foo.bar and dataset created by us.") # Needs actual PDF content
            with open(os.path.join(ARTICLES_DIR, "article2.xml"), "w") as f: f.write("<root><text>Used dataset 10.5678/baz.qux from another study.</text></root>")
    
    print("Starting article processing...")
    df_results = process_articles(ARTICLES_DIR)
    
    print("\n--- Results ---")
    print(df_results.head())
    
    df_results.to_csv(OUTPUT_CSV_PATH, index=False)
    print(f"\nResults saved to {OUTPUT_CSV_PATH}")

    # If you have training data, you can load it here and compare/evaluate
    # Example:
    # if os.path.exists(TRAINING_DATA_PATH):
    #     df_train = pd.read_csv(TRAINING_DATA_PATH)
    #     print("\nTraining Data Head:")
    #     print(df_train.head())
    #     # ... further evaluation logic ...