In [1]:
import pymupdf4llm
import os
import re
import json
import spacy

# Import classes from local utility file
import mdc_data_processing_utils

# If mdc_data_processing_utils.py has been edited and saved, reload it here
# so the changes take effect without restarting the kernel:
import importlib
importlib.reload(mdc_data_processing_utils)

# Now, any calls to functions from mdc_data_processing_utils
# will use the newly reloaded code.
from mdc_data_processing_utils import (
    ArticleData,
    DatasetCitation,
    LlmTrainingData,
    SubmissionData,
    MdcFileTextExtractor,
)

# spaCy pipeline used for sentence segmentation and potentially other NLP tasks.
# The model has to be installed first:
#   python -m spacy download en_core_web_sm
SPACY_MODEL_NAME = "en_core_web_sm"
NLP_SPACY = spacy.load(SPACY_MODEL_NAME)



# Base directories for the input data and for working output.
BASE_INPUT_DIR = './kaggle/input/make-data-count-finding-data-references'
BASE_OUTPUT_DIR = "./kaggle/working"

# Article directories for the train and test sets, both derived from the
# input base directory.
TRAIN_DATA_DIR, TEST_DATA_DIR = (
    os.path.join(BASE_INPUT_DIR, split_name) for split_name in ('train', 'test')
)

# Article to inspect. Exactly one (data directory, article id) pair is active;
# the alternatives stay commented out so they can be swapped in quickly.
# Previously the path was assigned several times in a row (only the last
# assignment had any effect) and the article id was repeated by hand in the
# extractor call, so the two could silently drift apart.
# article_dir, article_id = TRAIN_DATA_DIR, '10.1002_2017jc013030'
# article_dir, article_id = TEST_DATA_DIR, '10.1002_ece3.5260'
# article_dir, article_id = TRAIN_DATA_DIR, '10.7554_elife.29944'
article_dir, article_id = TRAIN_DATA_DIR, '10.20944_preprints202009.0353.v1'

# The PDF file name is derived from the article id, so both always agree.
pdf_file_path = os.path.join(article_dir, 'PDF', f'{article_id}.pdf')

# Extract the document text. NOTE(review): the recorded cell output shows the
# extractor falling back to plain text when markdown conversion fails, so
# `md_text` is not guaranteed to contain markdown markup.
file_extractor = MdcFileTextExtractor(article_id, pdf_file_path)
md_text = file_extractor.extract_text_from_file()
print(f"md_text: {md_text}")

# Build the structured article record from the extracted text, using the
# spaCy pipeline loaded above.
article_data = file_extractor.extract_article_data_from_text(full_text=md_text, nlp=NLP_SPACY)
print(f"article_data: {article_data}")

# Regex-based abstract extraction: find the first (case-insensitive)
# occurrence of "Abstract", skip the whitespace after it, and capture the text
# up to the next blank line or the end of the document.
abstract_match = re.search(
    r"Abstract\s*(.*?)(?=\n\n|\Z)",
    md_text,
    flags=re.IGNORECASE | re.DOTALL,
)
if abstract_match is None:
    abstract = "No Abstract"
else:
    # Keep only the text before the first blank line of the captured group.
    abstract = abstract_match.group(1).partition('\n\n')[0]
print(abstract)

# Index-based alternative to the regex: locate landmark strings with str.find
# and slice the text between them. str.find returns -1 when the marker is
# absent, and a -1 used directly as a slice bound or search offset silently
# means "last character", so every miss is handled explicitly below.
idx_1 = md_text.find('###')       # first '###' marker; only reported, not used for slicing
idx_2 = md_text.find('Abstract')  # start of the abstract, -1 if absent

if idx_2 == -1:
    # No "Abstract" marker at all: there is no paragraph to slice out.
    idx_2e = -1
    abstract = "No Abstract"
else:
    idx_2e = md_text.find('\n\n', idx_2)
    # No blank line after the marker: the abstract runs to the end of the text.
    abstract = md_text[idx_2:] if idx_2e == -1 else md_text[idx_2:idx_2e]
print(abstract[:400])

# Look for the reference list only after the abstract; an occurrence of
# "References" earlier in the document would otherwise produce an empty body.
body_start = max(idx_2, 0)
idx_3 = md_text.find('References', body_start)
print(f"{idx_1},{idx_2},{idx_3}")

# Keep the text from the abstract (or from the start of the document when no
# abstract marker exists) up to the reference list (or to the end of the
# document when no reference list is found).
body_end = len(md_text) if idx_3 == -1 else idx_3
md_text = md_text[body_start:body_end]

print(f"md Text len: {len(md_text)}")
print(f"md Text: {md_text}")

Extracting md text from file: ./kaggle/input/make-data-count-finding-data-references\train\PDF\10.20944_preprints202009.0353.v1.pdf
Error converting PDF to markdown: Can't get local object 'WeakValueDictionary.__init__.<locals>.remove'. Returing plain text.
pages.stop: 19
article_data: ArticleData(article_id='10.20944_preprints202009.0353.v1', article_doi='10.20944/preprints202009.0353.v1', title='', author='', abstract='Monarch butterflies are a species of conservation priority due to declining overwintering populations in both eastern and western North America. Declines in western overwintering monarchs-more than 99% since monitoring began-are especially acute. However, the degree to which the western monarch is a distinct biological entity is uncertain.', dataset_citations=[])
Abstract Monarch butterflies are a species of conservation priority due to declining overwintering populations in both eastern and western North America. Declines in western overwintering monarchs-more than 99