In [281]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

#%pip install pandas

from typing import List, Optional

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Text processing
import re
import PyPDF2

In [282]:
print(os.getenv('KAGGLE_IS_COMPETITION_RERUN'))

None


## Data File Paths and Names

In [283]:
base_path = './kaggle/input/make-data-count-finding-data-references/'
data_type_names = ['train', 'test']
file_type_names = ['PDF', 'XML']
train_labels_file_path = base_path+'train_labels.csv'
sample_submission_file_path = base_path+'sample_submission.csv'

## Load Label Data

In [284]:
# Create DF from training labels and drop 'Missing' type
# This is the file that contains the mapping of article_id to dataset_id and type
train_labels_df = pd.read_csv(train_labels_file_path)
train_labels_df


Unnamed: 0,article_id,dataset_id,type
0,10.1002_2017jc013030,https://doi.org/10.17882/49388,Primary
1,10.1002_anie.201916483,Missing,Missing
2,10.1002_anie.202005531,Missing,Missing
3,10.1002_anie.202007717,Missing,Missing
4,10.1002_chem.201902131,Missing,Missing
...,...,...,...
1023,10.7717_peerj.12422,https://doi.org/10.15468/dl.t3h8b4,Secondary
1024,10.7717_peerj.12422,https://doi.org/10.15468/dl.yak5vd,Secondary
1025,10.7717_peerj.13193,https://doi.org/10.6073/pasta/02e2764efb408a8b...,Secondary
1026,10.7717_peerj.13193,https://doi.org/10.6073/pasta/275ad28a2f31356c...,Secondary


## Preprocess Label Data

In [285]:
# Function to normalize dataset IDs
# This function takes a dataset ID as input and normalizes it by removing the "doi.org/" prefix if it exists.
def normalze_dataset_id(id: str) -> str:
    # Regex to capture the DOI part after "doi.org/"
    # It handles optional "https://" and "www."
    dryad_marker = "/dryad."
    regex = r"(?:https://)?(?:www\.)?doi\.org/(.+)"
    match = re.search(regex, str(id).lower())
    if match:
        # The DOI is in the first capturing group
        full_doi_candidate = match.group(1)
        dryad_index = full_doi_candidate.find(dryad_marker)
        if dryad_index != -1:
            # If "dryad." is found, return the part after it
            return full_doi_candidate[:dryad_index + len(dryad_marker) + 5]
        else:
            # Otherwise, return the full DOI
            return full_doi_candidate
    # If no match is found, return the original ID
    else:
        return id
    
    

In [286]:
# Drop 'Missing' type from train_labels_df
train_labels_df = train_labels_df[train_labels_df['type'] != 'Missing'].reset_index(drop=True)

# Normalize dataset IDs in train_labels_df
train_labels_df['dataset_id_norm'] = train_labels_df['dataset_id'].apply(normalze_dataset_id)

train_labels_df

Unnamed: 0,article_id,dataset_id,type,dataset_id_norm
0,10.1002_2017jc013030,https://doi.org/10.17882/49388,Primary,10.17882/49388
1,10.1002_ece3.4466,https://doi.org/10.5061/dryad.r6nq870,Primary,10.5061/dryad.r6nq8
2,10.1002_ece3.5260,https://doi.org/10.5061/dryad.2f62927,Primary,10.5061/dryad.2f629
3,10.1002_ece3.6144,https://doi.org/10.5061/dryad.zw3r22854,Primary,10.5061/dryad.zw3r2
4,10.1002_ece3.6303,https://doi.org/10.5061/dryad.37pvmcvgb,Primary,10.5061/dryad.37pvm
...,...,...,...,...
714,10.7717_peerj.12422,https://doi.org/10.15468/dl.t3h8b4,Secondary,10.15468/dl.t3h8b4
715,10.7717_peerj.12422,https://doi.org/10.15468/dl.yak5vd,Secondary,10.15468/dl.yak5vd
716,10.7717_peerj.13193,https://doi.org/10.6073/pasta/02e2764efb408a8b...,Secondary,10.6073/pasta/02e2764efb408a8b66b7f486eac14dc0
717,10.7717_peerj.13193,https://doi.org/10.6073/pasta/275ad28a2f31356c...,Secondary,10.6073/pasta/275ad28a2f31356cf9c2648531a16a2b


In [287]:
# Number of unique articles and datasets
print(f"unique articles: {train_labels_df['article_id'].nunique()}")
print(f"unique datasets: {train_labels_df['dataset_id'].nunique()}")

unique articles: 214
unique datasets: 719


## Merge article_id's with related file names for articles that have a dataset_id

In [288]:
# Function to get files related to article_id's
def get_files(data_type_name: str) -> pd.DataFrame:
    # Get PDF files
    pdf_path = os.path.join(base_path + data_type_name, 'PDF')
    pdf_ext = '.pdf'
    pdf_files = [f for f in os.listdir(pdf_path) if f.endswith(pdf_ext)]
    pdf_df = pd.DataFrame({
        'article_id': [f.replace(pdf_ext, '') for f in pdf_files],
        'article_file_path_pdf': [os.path.join(pdf_path, f) for f in pdf_files],
    })

    # Get XML files
    xml_path = os.path.join(base_path + data_type_name, 'XML')
    xml_ext = '.xml'
    xml_files = [f for f in os.listdir(xml_path) if f.endswith(xml_ext)]
    xml_df = pd.DataFrame({
        'article_id': [f.replace(xml_ext, '') for f in xml_files],
        'article_file_path_xml': [os.path.join(xml_path, f) for f in xml_files],
    })

    # Merge on article_id to get one row per article_id
    merged_df = pd.merge(pdf_df, xml_df, on='article_id', how='outer', validate='one_to_one')

    # Add data_type_name column
    merged_df['data_type_name'] = data_type_name
    # Reorder columns to have article_id first
    merged_df = merged_df[['article_id', 'data_type_name', 'article_file_path_pdf', 'article_file_path_xml']]
    return merged_df

In [289]:
# Function to merge article_id's with related files for articles that have a dataset_id
# This will return a DataFrame with article_id, dataset_id, and file paths for both PDF and XML files
# It will drop any articles that do not have a dataset_id
def merge_articles_and_files(data_type_name: str) -> pd.DataFrame:
    files_df = get_files(data_type_name)
    return pd.merge(files_df, train_labels_df, on='article_id', how='left', validate='many_to_many').dropna(subset=['dataset_id']).sort_values(['article_id', 'dataset_id']).reset_index(drop=True)

In [290]:
train_df = merge_articles_and_files('train')
train_df

Unnamed: 0,article_id,data_type_name,article_file_path_pdf,article_file_path_xml,dataset_id,type,dataset_id_norm
0,10.1002_2017jc013030,train,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,https://doi.org/10.17882/49388,Primary,10.17882/49388
1,10.1002_ece3.4466,train,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,https://doi.org/10.5061/dryad.r6nq870,Primary,10.5061/dryad.r6nq8
2,10.1002_ece3.5260,train,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,https://doi.org/10.5061/dryad.2f62927,Primary,10.5061/dryad.2f629
3,10.1002_ece3.6144,train,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,https://doi.org/10.5061/dryad.zw3r22854,Primary,10.5061/dryad.zw3r2
4,10.1002_ece3.6303,train,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,https://doi.org/10.5061/dryad.37pvmcvgb,Primary,10.5061/dryad.37pvm
...,...,...,...,...,...,...,...
714,10.7717_peerj.12422,train,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,https://doi.org/10.15468/dl.t3h8b4,Secondary,10.15468/dl.t3h8b4
715,10.7717_peerj.12422,train,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,https://doi.org/10.15468/dl.yak5vd,Secondary,10.15468/dl.yak5vd
716,10.7717_peerj.13193,train,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,https://doi.org/10.6073/pasta/02e2764efb408a8b...,Secondary,10.6073/pasta/02e2764efb408a8b66b7f486eac14dc0
717,10.7717_peerj.13193,train,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,https://doi.org/10.6073/pasta/275ad28a2f31356c...,Secondary,10.6073/pasta/275ad28a2f31356cf9c2648531a16a2b


In [291]:
# Number of unique training articles and datasets
print(f"unique articles: {train_df['article_id'].nunique()}")
print(f"unique datasets: {train_df['dataset_id'].nunique()}")

unique articles: 214
unique datasets: 719


In [292]:
test_df = merge_articles_and_files('test')
test_df

Unnamed: 0,article_id,data_type_name,article_file_path_pdf,article_file_path_xml,dataset_id,type,dataset_id_norm
0,10.1002_2017jc013030,test,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,https://doi.org/10.17882/49388,Primary,10.17882/49388
1,10.1002_ece3.4466,test,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,https://doi.org/10.5061/dryad.r6nq870,Primary,10.5061/dryad.r6nq8
2,10.1002_ece3.5260,test,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,https://doi.org/10.5061/dryad.2f62927,Primary,10.5061/dryad.2f629
3,10.1002_ece3.6144,test,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,https://doi.org/10.5061/dryad.zw3r22854,Primary,10.5061/dryad.zw3r2
4,10.1002_ece3.6303,test,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,https://doi.org/10.5061/dryad.37pvmcvgb,Primary,10.5061/dryad.37pvm
5,10.1002_ece3.9627,test,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,https://doi.org/10.5061/dryad.b8gtht7h3,Primary,10.5061/dryad.b8gth
6,10.1002_ecs2.1280,test,./kaggle/input/make-data-count-finding-data-re...,,https://doi.org/10.5061/dryad.p3fg9,Primary,10.5061/dryad.p3fg9
7,10.1002_ecs2.4619,test,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,https://doi.org/10.25349/d9qw5x,Primary,10.25349/d9qw5x
8,10.1002_esp.5058,test,./kaggle/input/make-data-count-finding-data-re...,,https://doi.org/10.5061/dryad.jh9w0vt9t,Primary,10.5061/dryad.jh9w0
9,10.1002_esp.5090,test,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,https://doi.org/10.5066/p9353101,Secondary,10.5066/p9353101


In [293]:
# Number of unique test articles and datasets
print(f"unique articles: {test_df['article_id'].nunique()}")
print(f"unique datasets: {test_df['dataset_id'].nunique()}")

unique articles: 12
unique datasets: 13


In [294]:
# Display the row for a specific article_id in the training DataFrame
# train_df[train_df['article_id'] == '10.1029_2019jg005297']
train_df[train_df['article_id'] == '10.1002_mp.14424']


Unnamed: 0,article_id,data_type_name,article_file_path_pdf,article_file_path_xml,dataset_id,type,dataset_id_norm
10,10.1002_mp.14424,train,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,https://doi.org/10.7937/k9/tcia.2015.pf0m9rei,Secondary,10.7937/k9/tcia.2015.pf0m9rei
11,10.1002_mp.14424,train,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,https://doi.org/10.7937/tcia.2020.6c7y-gq39,Primary,10.7937/tcia.2020.6c7y-gq39


In [295]:
# Display the row for a specific article_id in the test DataFrame
test_df[test_df['article_id'] == '10.1002_mp.14424']

Unnamed: 0,article_id,data_type_name,article_file_path_pdf,article_file_path_xml,dataset_id,type,dataset_id_norm
10,10.1002_mp.14424,test,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,https://doi.org/10.7937/k9/tcia.2015.pf0m9rei,Secondary,10.7937/k9/tcia.2015.pf0m9rei
11,10.1002_mp.14424,test,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,https://doi.org/10.7937/tcia.2020.6c7y-gq39,Primary,10.7937/tcia.2020.6c7y-gq39


## Extract text from files

In [296]:
def extract_text_from_pdf_pypdf2(pdf_file_path):
    """
    Extracts text from a PDF file using PyPDF2.
    """
    text = ""
    try:
        with open(pdf_file_path, 'rb') as pdf_file_obj:
            pdf_reader = PyPDF2.PdfReader(pdf_file_obj)
            num_pages = len(pdf_reader.pages)
            for page_num in range(num_pages):
                page_obj = pdf_reader.pages[page_num]
                text += page_obj.extract_text()
    except Exception as e:
        print(f"Error reading {pdf_file_path} with PyPDF2: {e}")
        # Optionally, you could try to extract text even if there are warnings/errors
        # For example, by passing strict=False to PdfReader, though this might hide issues
        # pdf_reader = PyPDF2.PdfReader(pdf_file_obj, strict=False)
    return text

In [297]:
def read_xml_file_contents_as_string(xml_file_path: str, encoding: str = 'utf-8') -> Optional[str]:
    """
    Reads the entire content of an XML (or any text) file into a single string.

    Args:
        xml_file_path (str): The path to the XML file.
        encoding (str): The character encoding to use when reading the file.
                        Defaults to 'utf-8', which is common for XML.

    Returns:
        Optional[str]: A string containing the full content of the file,
                       or None if an error occurs (e.g., file not found).
    """
    try:
        with open(xml_file_path, 'r', encoding=encoding) as file:
            content = file.read()
        return content
    except FileNotFoundError:
        print(f"Error: File not found at '{xml_file_path}'")
        return None
    except IOError as e:
        print(f"Error reading file '{xml_file_path}': {e}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred while reading '{xml_file_path}': {e}")
        return None

In [298]:
def find_string_with_context(main_string: str, search_string: str, context_chars: int = 100) -> List[str]:
    """
    Finds all occurrences of search_string within main_string and returns
    a context window for each. The context window includes the search_string
    itself, surrounded by up to 'context_chars' characters from before and
    after its occurrence in the main_string.

    Args:
        main_string (str): The string to search within.
        search_string (str): The string to search for.
        context_chars (int): The number of characters to include before and after
                             the search_string in the context window. Defaults to 100.

    Returns:
        List[str]: A list of strings, where each string is an occurrence of
                   search_string surrounded by its context. Returns an empty
                   list if search_string is not found, or if either
                   main_string or search_string is empty.
    """
    if not main_string or not search_string:
        return []

    occurrences_with_context: List[str] = []
    current_search_start_index: int = 0
    len_main: int = len(main_string)
    len_search: int = len(search_string)
    main_string_lower: str = "".join(main_string.lower().splitlines())  # Join lines to remove breaks and convert to lowercase

    while current_search_start_index < len_main:
        found_at_index: int = main_string_lower.find(search_string, current_search_start_index)
        
        if found_at_index == -1:  # No more occurrences
            break
        
        # Determine the start of the context snippet
        # It's 'context_chars' before the found string, but not less than 0
        context_slice_start: int = max(0, found_at_index - context_chars)
        
        # Determine the end of the context snippet
        # It's 'context_chars' after the end of the found string, but not beyond main_string length
        context_slice_end: int = min(len_main, found_at_index + len_search + context_chars)
        
        # Extract the substring that includes the search_string and its context
        extracted_snippet: str = main_string_lower[context_slice_start:context_slice_end]
        occurrences_with_context.append(extracted_snippet)
        
        # Move the starting point for the next search to just after the current find
        # This allows finding overlapping occurrences if search_string can overlap itself
        current_search_start_index = found_at_index + 1
            
    return occurrences_with_context

In [299]:
def get_file_text_related_to_dataset_id(article_id: str) -> pd.DataFrame:
    """
    Get text related to all dataset_id's for the given article_id.
    """
    # Filter the DataFrame for the given dataset_id
    filtered_df = train_df[train_df['article_id'] == article_id].reset_index(drop=True)
    
    # Initialize a list to hold the results
    results = []
    pdf_full_text = None
    xml_full_text = None
    
    # Iterate over each row in the filtered DataFrame
    for _, row in filtered_df.iterrows():
        dataset_id_norm = row['dataset_id_norm']
        
        # Extract text from the PDF file
        if not pdf_full_text:
            pdf_file_path = row['article_file_path_pdf']
            pdf_full_text = extract_text_from_pdf_pypdf2(pdf_file_path) if pdf_file_path else ''
        pdf_text = find_string_with_context(pdf_full_text, dataset_id_norm, context_chars=100) if pdf_full_text else ''
        
        # Extract text from the XML file
        if not xml_full_text:
            xml_file_path = row['article_file_path_xml']
            xml_full_text = read_xml_file_contents_as_string(xml_file_path) if xml_file_path else ''
        xml_text = find_string_with_context(xml_full_text, dataset_id_norm, context_chars=100) if xml_full_text else ''

        # Append the result to the list
        results.append({
            'article_id': article_id,
            'dataset_id_norm': dataset_id_norm,
            'pdf_text': pdf_text,
            'xml_text': xml_text,
        })
    
    # Create a DataFrame from the results
    return pd.DataFrame(results)

In [300]:
file_text_df = get_file_text_related_to_dataset_id('10.1002_esp.5058')
file_text_df

An unexpected error occurred while reading 'nan': expected str, bytes or os.PathLike object, not float


Unnamed: 0,article_id,dataset_id_norm,pdf_text,xml_text
0,10.1002_esp.5058,10.5061/dryad.jh9w0,[openly available [usgs data release] at http:...,


In [301]:
def get_file_text(orig_df: pd.DataFrame) -> pd.DataFrame:
    """
    Get text related to all dataset_id's for each article_id in the given orig_df.
    """
    new_df = orig_df.copy()
    pdf_full_text = None
    xml_full_text = None
    curr_article_id = None

    pdf_texts = []
    xml_texts = []

    def get_pdf_text(row, pdf_full_text):
        if not pdf_full_text:
            pdf_file_path = row['article_file_path_pdf']
            pdf_full_text = extract_text_from_pdf_pypdf2(pdf_file_path) if pdf_file_path else ''
        pdf_text = find_string_with_context(pdf_full_text, row['dataset_id_norm'], context_chars=100) if pdf_full_text else ''
        return pdf_text, pdf_full_text

    def get_xml_text(row, xml_full_text):
        if not xml_full_text:
            xml_file_path = row['article_file_path_xml']
            if pd.notna(xml_file_path) and isinstance(xml_file_path, str) and xml_file_path:
                xml_full_text = read_xml_file_contents_as_string(xml_file_path)
            else:
                xml_full_text = ''
        xml_text = find_string_with_context(xml_full_text, row['dataset_id_norm'], context_chars=100) if xml_full_text else ''
        return xml_text, xml_full_text

    for _, row in new_df.iterrows():
        article_id = row['article_id']
        if curr_article_id != article_id:
            pdf_full_text = None
            xml_full_text = None
            curr_article_id = article_id

        pdf_text, pdf_full_text = get_pdf_text(row, pdf_full_text)
        xml_text, xml_full_text = get_xml_text(row, xml_full_text)

        pdf_texts.append(pdf_text)
        xml_texts.append(xml_text)

    new_df['pdf_text'] = pdf_texts
    new_df['xml_text'] = xml_texts

    return new_df

In [302]:
file_text_df_2 = get_file_text(test_df)
file_text_df_2

Unnamed: 0,article_id,data_type_name,article_file_path_pdf,article_file_path_xml,dataset_id,type,dataset_id_norm,pdf_text,xml_text
0,10.1002_2017jc013030,test,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,https://doi.org/10.17882/49388,Primary,10.17882/49388,[al argo ﬂoat measurements for biogeochemical ...,[f> (doi:10.17882/47142)</s><s>and <ref target...
1,10.1002_ece3.4466,test,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,https://doi.org/10.5061/dryad.r6nq870,Primary,10.5061/dryad.r6nq8,[cation.data accessibilitythe dataset supporti...,[rting this article are available in dryad <ex...
2,10.1002_ece3.5260,test,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,https://doi.org/10.5061/dryad.2f62927,Primary,10.5061/dryad.2f629,[or probe design experiments and adephaga_2.9k...,[ed/bam files for probe design experiments and...
3,10.1002_ece3.6144,test,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,https://doi.org/10.5061/dryad.zw3r22854,Primary,10.5061/dryad.zw3r2,[gator.data availability statementdata are ava...,[available at dryad digital repository at: <ex...
4,10.1002_ece3.6303,test,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,https://doi.org/10.5061/dryad.37pvmcvgb,Primary,10.5061/dryad.37pvm,[ data that support the findings of this study...,[udy have been deposited in dryad with doi:<ex...
5,10.1002_ece3.9627,test,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,https://doi.org/10.5061/dryad.b8gtht7h3,Primary,10.5061/dryad.b8gth,[has\tearned\topen\tdata\tand\topen\tmaterials...,[nd open materials badges. data and materials ...
6,10.1002_ecs2.1280,test,./kaggle/input/make-data-count-finding-data-re...,,https://doi.org/10.5061/dryad.p3fg9,Primary,10.5061/dryad.p3fg9,[fo data availabilitydata associated with this...,
7,10.1002_ecs2.4619,test,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,https://doi.org/10.25349/d9qw5x,Primary,10.25349/d9qw5x,[ilability statementdata and novel code (owens...,"[<ref target=""#b50"" type=""bibr"">(owens et al.,..."
8,10.1002_esp.5058,test,./kaggle/input/make-data-count-finding-data-re...,,https://doi.org/10.5061/dryad.jh9w0vt9t,Primary,10.5061/dryad.jh9w0,[openly available [usgs data release] at http:...,
9,10.1002_esp.5090,test,./kaggle/input/make-data-count-finding-data-re...,./kaggle/input/make-data-count-finding-data-re...,https://doi.org/10.5066/p9353101,Secondary,10.5066/p9353101,[015) running onthe university of iowa argon s...,[on the university of iowa argon supercomputer...
