# Simple EDA

In [8]:
# Standard Imports
import os
import re
import pandas as pd
import lxml.etree as etree
from lxml.etree import _Element as Element # Type hinting for lxml.etree.Element
import fitz # PyMuPDF for PDF processing
import spacy
import json
from tqdm.auto import tqdm
from dataclasses import dataclass, field, asdict
from typing import List, Set, Dict, Union



In [3]:
# Define constants for file paths and model configurations.
# All paths are relative to the notebook's working directory.
BASE_INPUT_DIR = './kaggle/input/make-data-count-finding-data-references'
# Directories for articles in the train and test sets
ARTICLE_TRAIN_DIR = os.path.join(BASE_INPUT_DIR, 'train')
ARTICLE_TEST_DIR = os.path.join(BASE_INPUT_DIR, 'test')

# CSV file with the labels for the training articles
LABELED_TRAINING_DATA_CSV_PATH = os.path.join(BASE_INPUT_DIR, 'train_labels.csv')

# Base model path (disabled).
# NOTE(review): re-enabling this line requires importing kagglehub, which the import cell does not do.
#QWEN_BASE_MODEL_PATH = kagglehub.model_download("qwen-lm/qwen-3/transformers/0.6b")

# Output directory for the fine-tuned model and results
BASE_OUTPUT_DIR = "./kaggle/working"
FINE_TUNED_MODEL_OUTPUT_DIR = os.path.join(BASE_OUTPUT_DIR, "qwen_finetuned_dataset_classifier")
FINAL_RESULTS_CSV_PATH = os.path.join(BASE_OUTPUT_DIR, "article_dataset_classification.csv")

# Load the spaCy pipeline once at module level; it is the default `nlp` of get_sentences_from_text.
# The model must be installed beforehand:
#   python -m spacy download en_core_web_sm
NLP_SPACY = spacy.load("en_core_web_sm")

## Data Extraction

In [16]:
@dataclass
class DatasetCitation:
    """Dataset identifiers found in a piece of text, together with that text.

    Attributes:
        dataset_ids: Unique dataset identifiers collected so far.
        citation_context: Normalized text accumulated via add_citation_context.
    """
    dataset_ids: Set[str] = field(default_factory=set)  # Set to store unique dataset IDs
    citation_context: str = ""

    def add_dataset_id(self, dataset_id: str) -> None:
        """Records a dataset identifier; the set silently drops duplicates."""
        self.dataset_ids.add(dataset_id)

    def add_citation_context(self, context: str) -> None:
        """Normalizes ``context`` and appends it to the citation context.

        Square brackets are removed, every run of whitespace (newlines
        included) becomes a single space, and the result is stripped.
        Successive contexts are separated by one space so that words from
        different calls, or from adjacent lines, are never glued together.
        Empty input, or input that is empty after normalization, is ignored.
        """
        if not context:
            return
        context = context.replace('[', '').replace(']', '')
        # Newlines are handled here as ordinary whitespace instead of being
        # deleted, which would join the last word of a line to the next one.
        context = re.sub(r'\s+', ' ', context).strip()
        if not context:
            return
        if self.citation_context:
            self.citation_context += ' '
        self.citation_context += context

    def has_dataset(self) -> bool:
        """Returns True if there are both dataset IDs and a non-blank citation context."""
        return bool(self.dataset_ids and self.citation_context.strip())

@dataclass
class ArticleData:
    """Metadata of one article plus the dataset citations collected for it.

    Attributes:
        article_id: Identifier of the article.
        article_doi: DOI of the article; derived from ``article_id`` when omitted.
        title, author, abstract: Free-text metadata fields.
        datasets: Unique dataset identifiers attached to the article.
        citation_context: Citation contexts, each one followed by ``"||"``.
    """
    article_id: str = ""
    article_doi: str = ""
    title: str = ""
    author: str = ""
    abstract: str = ""
    datasets: Set[str] = field(default_factory=set)  # Set to store unique dataset IDs
    citation_context: str = ""

    def __post_init__(self):
        """Derives ``article_doi`` from ``article_id`` when only the id is given."""
        if self.article_id and not self.article_doi:
            # Every underscore becomes a slash and the result is lower-cased.
            # NOTE(review): this rewrites ALL underscores, not only the first;
            # confirm that article ids never carry an underscore inside the DOI suffix.
            self.article_doi = self.article_id.replace("_", "/").lower()

    def add_dataset(self, dataset_id: str) -> None:
        """Adds a dataset identifier to the article; duplicates are ignored."""
        self.datasets.add(dataset_id)

    def add_dataset_citation(self, dataset_citation: "DatasetCitation") -> None:
        """Merges a dataset citation into the article.

        Nothing happens unless ``dataset_citation.has_dataset()`` is true.
        Otherwise its context is appended followed by the ``"||"`` delimiter
        and all of its dataset ids are added to ``datasets``.
        """
        if dataset_citation.has_dataset():
            self.citation_context += dataset_citation.citation_context + "||"
            for dataset_id in dataset_citation.dataset_ids:
                self.add_dataset(dataset_id)

    def to_dict(self) -> dict:
        """Returns the article as a plain dict with ``datasets`` as a sorted list.

        Sorting makes the output independent of set iteration order, so the
        dict (and the JSON built from it) is identical from run to run.
        """
        d = asdict(self)
        d["datasets"] = sorted(self.datasets)
        return d

    def to_json(self) -> str:
        """Serializes ``to_dict()`` as compact JSON (no spaces after separators)."""
        return json.dumps(self.to_dict(), separators=(',', ':'))

    def has_data(self) -> bool:
        """Returns True if there is any dataset id or any non-blank citation context."""
        return bool(self.datasets or self.citation_context.strip())

In [None]:
# --- 2. Information Extraction (IE) - Dataset Identification ---
# U+2010..U+2014: hyphen, non-breaking hyphen, figure dash, en dash, em dash
NON_STD_UNICODE_DASHES = re.compile(r'[\u2010\u2011\u2012\u2013\u2014]')
# U+201C / U+201D: left and right double quotation marks
NON_STD_UNICODE_TICKS = re.compile(r'[\u201c\u201d]')
def clean_text(text: str) -> str:
    """
    Normalize text before searching it for dataset identifiers.

    Steps, in order:
      1. Remove zero-width spaces (U+200B).
      2. Replace the Unicode dashes U+2010..U+2014 with an ASCII '-'.
      3. Re-join tokens broken across lines right after '-', '_' or '/'.
      4. Replace curly double quotes with a single quote (').
      5. Collapse every run of whitespace to one space and strip the ends.

    Dashes are normalized before the line-break join so that an identifier
    split after a Unicode hyphen (e.g. 'E\u2010GEOD\u2010' + newline + '19722')
    is re-joined as well.

    Args:
        text (str): The text to clean.

    Returns:
        str: The cleaned text, or "" when ``text`` is empty or None.
    """
    if not text:
        return ""
    text = text.replace('\u200b', '')
    # Replace all non-standard unicode dashes with '-'
    text = NON_STD_UNICODE_DASHES.sub('-', text)
    # Undo line breaks that fall inside an identifier
    text = text.replace('-\n', '-').replace('_\n', '_').replace('/\n', '/')
    text = NON_STD_UNICODE_TICKS.sub("'", text)
    # Remove extra whitespace
    return re.sub(r'\s+', ' ', text).strip()

# Regex patterns for common dataset identifiers.
# The patterns are compiled below without any flags, so they are case-sensitive
# as written. Except for E_G_PATTERN and GNB_PATTERN/GSE_PATTERN, none of them
# ends with \b, so a match may end in the middle of a longer token.
#
# Earlier DOI pattern variants, kept for reference:
# DOI_PATTERN = r'10\.\d{4,5}/[-._;()/:A-Za-z0-9\u002D\u2010\u2011\u2012\u2013\u2014\u2015]+'
# DOI_PATTERN = r'10\.\s?\d{4,5}\/[-._()<>;\/:A-Za-z0-9]+\s?(?:(?![A-Z]+)(?!\d{1,3}\.))+[-._()<>;\/:A-Za-z0-9]+'
#DOI_PATTERN = r'\bhttps://doi.org/10\.\d{4,5}\/[-._\/:A-Za-z0-9]+'
# "10." + 4-5 digits + "/" + suffix characters.
# NOTE(review): the suffix class contains '.', ':' and '/', so punctuation that
# directly follows a DOI in running text becomes part of the match.
DOI_PATTERN = r'\b10\.\d{4,5}\/[-._\/:A-Za-z0-9]+'
EPI_PATTERN = r'\bEPI[-_A-Z0-9]{2,}'      # "EPI" + 2 or more of '-', '_', A-Z, 0-9
SAM_PATTERN = r'\bSAMN[0-9]{2,}'          # SAMN07159041
IPR_PATTERN = r'\bIPR[0-9]{2,}'           # "IPR" + 2 or more digits
CHE_PATTERN = r'\bCHEMBL[0-9]{2,}'        # "CHEMBL" + 2 or more digits
PRJ_PATTERN = r'\bPRJ[A-Z0-9]{2,}'        # "PRJ" + 2 or more uppercase letters/digits
E_G_PATTERN = r'\bE-[A-Z]{4}-[0-9]{2,}'   # E-GEOD-19722 or E-PROT-100
ENS_PATTERN = r'\bENS[A-Z]{4}[0-9]{2,}'   # "ENS" + exactly 4 uppercase letters + 2 or more digits
CVC_PATTERN = r'\bCVCL_[A-Z0-9]{2,}'      # "CVCL_" + 2 or more uppercase letters/digits
EMP_PATTERN = r'\bEMPIAR-[0-9]{2,}'       # "EMPIAR-" + 2 or more digits
PXD_PATTERN = r'\bPXD[0-9]{2,}'           # "PXD" + 2 or more digits
HPA_PATTERN = r'\bHPA[0-9]{2,}'           # "HPA" + 2 or more digits
SRR_PATTERN = r'\bSRR[0-9]{2,}'           # "SRR" + 2 or more digits
# The prefix sits in a capturing group: re.findall would return only the prefix,
# so use finditer / match.group(0) to obtain the full accession.
GSE_PATTERN = r'\b(GSE|GSM|GDS|GPL)\d{4,6}\b' # Example for GEO accession numbers (e.g., GSE12345, GSM12345)
# NOTE(review): very broad - any standalone token of 1-2 uppercase letters followed
# by 5-6 digits matches, whether or not it is an accession number.
GNB_PATTERN = r'\b[A-Z]{1,2}\d{5,6}\b' # GenBank accession numbers (e.g., AB123456, AF000001)
CAB_PATTERN = r'\bCAB[0-9]{2,}'           # "CAB" + 2 or more digits

# Combine all patterns into a list; the order here is the order in which
# COMPILED_DATASET_ID_REGEXES is built.
DATASET_ID_PATTERNS = [
    DOI_PATTERN,
    EPI_PATTERN,
    SAM_PATTERN,
    IPR_PATTERN,
    CHE_PATTERN,
    PRJ_PATTERN,
    E_G_PATTERN,
    ENS_PATTERN,
    CVC_PATTERN,
    EMP_PATTERN,
    PXD_PATTERN,
    HPA_PATTERN,
    SRR_PATTERN,
    GSE_PATTERN,
    GNB_PATTERN,
    CAB_PATTERN,
]

# Compile all patterns once for efficiency (no flags are passed, so matching is case-sensitive)
COMPILED_DATASET_ID_REGEXES = [re.compile(p) for p in DATASET_ID_PATTERNS]

# Data related keywords to look for in the text
# These keywords help to ensure that the text is relevant to datasets
# Entries are lower-case because they are matched against lower-cased text.
DATA_RELATED_KEYWORDS = [
    'data release',
    'data associated',
    'data availability',
    'data access',
    'download',
    'program data',
    'the data',
    'dataset',
    'database',
    'repository',
    'data source',
    'archive',
    'arch.',
    'digital',
]

def is_text_data_related(text: str) -> bool:
    """
    Tell whether the text mentions any of DATA_RELATED_KEYWORDS.

    The check is a case-insensitive substring test, so a keyword also matches
    inside a longer word (e.g. 'dataset' matches 'datasets').

    Args:
        text (str): The text to inspect.

    Returns:
        bool: True if at least one keyword occurs in the text; False for
        empty or None input or when no keyword occurs.
    """
    if not text:
        return False

    text_lower = text.lower()
    return any(keyword in text_lower for keyword in DATA_RELATED_KEYWORDS)

def extract_dataset_citation(text: str, context_chars: int = 250) -> DatasetCitation:
    """
    Extract dataset identifiers and their citation context from the given text.

    The text is first normalized with clean_text. If it contains none of the
    data-related keywords, an empty DatasetCitation is returned. Otherwise the
    whole cleaned text becomes the citation context and every match of every
    pattern in COMPILED_DATASET_ID_REGEXES is recorded as a dataset id.

    Args:
        text (str): The text to search for dataset identifiers.
        context_chars (int): Currently unused; the full cleaned text is stored
            as context. Kept so existing callers keep working.

    Returns:
        DatasetCitation: The collected dataset ids and citation context; empty
        when ``text`` is empty or not data related.
    """
    dataset_citation = DatasetCitation()
    if not text:
        return dataset_citation

    text = clean_text(text)
    if not is_text_data_related(text):
        return dataset_citation

    dataset_citation.add_citation_context(text)
    for regex in COMPILED_DATASET_ID_REGEXES:
        # A compiled pattern's finditer takes (string, pos, endpos). Passing
        # re.IGNORECASE there set pos=2, which skipped identifiers at the very
        # start of the text and never enabled case-insensitive matching.
        # Matching stays case-sensitive, as the patterns are compiled without flags.
        for match in regex.finditer(text):
            dataset_citation.add_dataset_id(match.group(0))

    return dataset_citation

# Use NLP to get sentences from the given text
def get_sentences_from_text(text: str, nlp=NLP_SPACY) -> str:
    """
    Split the text into sentences and return them joined by single spaces.

    Args:
        text (str): The text to segment.
        nlp: Callable applied to the cleaned text; its result must expose a
            ``sents`` iterable whose items have a ``text`` attribute.
            Defaults to the module-level NLP_SPACY pipeline.

    Returns:
        str: The sentence texts joined with " ", or "" for empty input.
    """
    if not text:
        return ""

    # clean_text already collapses all whitespace (newlines included) to single
    # spaces and strips the ends, so no further newline handling is needed.
    text = clean_text(text)
    doc_spacy = nlp(text)
    return " ".join(sent.text for sent in doc_spacy.sents)