In [55]:
import os
import spacy
import pandas as pd
import spacy

# Load the small English spaCy pipeline once for the whole notebook; this same
# object is handed to ConcurrentFileProcessor in the demo cell further down.
NLP_SPACY = spacy.load("en_core_web_sm")



In [56]:
# Import classes from local utility file
import mdc_data_processing_utils

# Reload the module so that edits saved to mdc_data_processing_utils.py
# are picked up without restarting the kernel:
import importlib
importlib.reload(mdc_data_processing_utils)

# Now, any calls to functions from mdc_data_processing_utils
# will use the newly reloaded code.
from mdc_data_processing_utils import (
    ArticleData,
    MdcFileTextExtractor,
    ConcurrentFileProcessor,
)


In [57]:
# Filesystem layout of the competition data, relative to the notebook's working directory.
BASE_INPUT_DIR = './kaggle/input/make-data-count-finding-data-references'
BASE_OUTPUT_DIR = "./kaggle/working"

# One article folder per split (load_file_paths expects 'PDF' and 'XML' subfolders inside each).
TRAIN_DATA_DIR, TEST_DATA_DIR = (
    os.path.join(BASE_INPUT_DIR, split_name) for split_name in ('train', 'test')
)

# CSV holding the labeled dataset references for the training articles.
TRAIN_LABELS_PATH = os.path.join(BASE_INPUT_DIR, 'train_labels.csv')

In [58]:
def _index_article_files(directory: str, extension: str, path_column: str) -> pd.DataFrame:
    """List the files in `directory` ending with `extension`, one row per file.

    The article id is the file name with only the trailing `extension` removed.
    """
    file_names = [f for f in os.listdir(directory) if f.endswith(extension)]
    return pd.DataFrame({
        # Slice off the suffix only; str.replace would also strip the extension
        # text from the middle of a name (e.g. 'b.pdf.v2.pdf').
        'article_id': [f[:-len(extension)] for f in file_names],
        path_column: [os.path.join(directory, f) for f in file_names],
    })


def load_file_paths(dataset_type_dir: str) -> pd.DataFrame:
    """Build a table of the PDF and XML files available for each article.

    Args:
        dataset_type_dir: Directory of one data split. It must contain a 'PDF'
            and an 'XML' subdirectory.

    Returns:
        DataFrame with columns 'article_id', 'pdf_file_path', 'xml_file_path'
        and 'dataset_type'. Articles that exist in only one of the two formats
        are kept (outer join) and have NaN in the missing path column.
        'dataset_type' is the final component of `dataset_type_dir`.

    Raises:
        FileNotFoundError: If the 'PDF' or 'XML' subdirectory does not exist
            (raised by os.listdir).
    """
    pdf_path = os.path.join(dataset_type_dir, 'PDF')
    xml_path = os.path.join(dataset_type_dir, 'XML')
    # normpath drops a trailing separator; without it basename('.../test/') is ''.
    dataset_type = os.path.basename(os.path.normpath(dataset_type_dir))

    df_pdf = _index_article_files(pdf_path, '.pdf', 'pdf_file_path')
    df_xml = _index_article_files(xml_path, '.xml', 'xml_file_path')

    # File names within a directory are unique, so each side has at most one row
    # per article; one_to_one makes pandas enforce that on both sides.
    merge_df = pd.merge(df_pdf, df_xml, on='article_id', how='outer', validate='one_to_one')
    merge_df['dataset_type'] = dataset_type
    return merge_df

# Single article used for this smoke test. Defined once so the example print and
# the row filter below cannot drift apart.
DEBUG_ARTICLE_ID = '10.1002_2017jc013030'

# Load the labeled training data CSV file
print(f"Loading labeled training data from: {TRAIN_LABELS_PATH}")
train_labels_df = pd.read_csv(TRAIN_LABELS_PATH)
print(f"Training labels shape: {train_labels_df.shape}")

# Map each article_id to the list of its labeled datasets, each a
# {'dataset_id': ..., 'type': ...} dict.
grouped_training_data = {
    article_id: group_df[['dataset_id', 'type']].to_dict('records')
    for article_id, group_df in train_labels_df.groupby('article_id')
}

# Example usage of grouped_training_data. .get() prints None instead of raising
# KeyError when the labels file does not contain the debug article.
print(f"Example grouped training data for article_id '{DEBUG_ARTICLE_ID}': {grouped_training_data.get(DEBUG_ARTICLE_ID)}")

# Just for testing, always set to the TEST_DATA_DIR
base_file_dir = TEST_DATA_DIR

# Load file paths for the base directory; an article missing one format gets ''
# instead of NaN in that path column.
file_paths_df = load_file_paths(base_file_dir).fillna({'pdf_file_path': '', 'xml_file_path': ''})

# Restrict the run to the debug article.
file_paths_df = file_paths_df[file_paths_df['article_id'] == DEBUG_ARTICLE_ID].copy()

# Attach the ground-truth dataset list; articles without labels map to NaN.
file_paths_df['ground_truth_dataset_info'] = file_paths_df['article_id'].map(grouped_training_data)

# Keep only articles that have ground truth dataset info.
# (To work on a random subset instead, sample here, e.g.
#  file_paths_df.sample(frac=.1, random_state=42).)
file_paths_df = file_paths_df[file_paths_df['ground_truth_dataset_info'].notna()].reset_index(drop=True)
print(f"Files paths shape: {file_paths_df.shape}")

Loading labeled training data from: ./kaggle/input/make-data-count-finding-data-references\train_labels.csv
Training labels shape: (1028, 3)
Example grouped training data for article_id '10.1002_2017jc013030': [{'dataset_id': 'https://doi.org/10.17882/49388', 'type': 'Primary'}]
Files paths shape: (1, 5)


In [59]:
# Parallel inputs for the processor: one PDF path per article and, at the same
# position, that article's list of ground-truth dataset records.
filepaths = file_paths_df['pdf_file_path'].tolist()
ground_truth_list_of_lists = file_paths_df['ground_truth_dataset_info'].tolist()

# Both lists come from the same frame, so the two printed lengths should match.
for collected in (filepaths, ground_truth_list_of_lists):
    print(len(collected))


1
1


In [60]:

# --- Demonstration ---
# Runs the same file list through the processor twice: once in inference mode
# (paths only) and once in training mode (paths plus ground-truth records).
if __name__ == "__main__":
    # Timeout forwarded to both runs as `markdown_timeout_seconds`
    # (specific argument for PDF logic).
    markdown_timeout = 3.0

    # --- DEMO 1: Process filepaths concurrently for inference ---
    print("\n\n===== DEMO 1: Processing PDFs for inference with Markdown Timeout =====")
    concurrent_pdf_processor = ConcurrentFileProcessor(NLP_SPACY, max_workers=4)
    concurrent_pdf_processor.process_files_for_inference(
        filepaths,
        markdown_timeout_seconds=markdown_timeout,
    )

    # --- DEMO 2: Process the same PDFs concurrently for training ---
    print("\n\n===== DEMO 2: Processing PDFs for training with Markdown Timeout =====")
    concurrent_pdf_processor.process_files_for_training(
        filepaths,
        ground_truth_list_of_lists,
        markdown_timeout_seconds=markdown_timeout,
    )





===== DEMO 1: Processing PDFs for inference with Markdown Timeout =====

--- Starting Concurrent File Processing For Inference Data ---


  0%|          | 0/1 [00:00<?, ?it/s]

Processing 10.1002_2017jc013030.pdf...
Saved article_data for 10.1002_2017jc013030.pdf.
Inference Data processing finished in 2.51 seconds.


===== DEMO 2: Processing PDFs for training with Markdown Timeout =====

--- Starting Concurrent File Processing For Training Data ---


  0%|          | 0/1 [00:00<?, ?it/s]

Processing 10.1002_2017jc013030.pdf...
Saved article_data for 10.1002_2017jc013030.pdf.
Training Data processing finished in 2.44 seconds.
