## **Configuration & Imports**

In [13]:
"""
Configuration & Imports
========================
Load all required modules and set pipeline configuration.
"""

# Standard Library
import os
import sys
import json
import pickle
import pandas as pd
from pathlib import Path
from collections import defaultdict
from typing import Dict, List, Any

# Add src directory to Python path
current_dir = os.getcwd()
if current_dir not in sys.path:
    sys.path.insert(0, current_dir)

# Third-party
from tqdm.auto import tqdm

# Project Modules - Parser
from parser.node_normalizer import normalize_node, cleanup_latex
from parser.id_assigner import assign_ids
from parser.content_index import build_content_index
from parser.deduplicator import deduplicate_tree
from parser.reference_extractor import deduplicate_references

# Project Modules - Matcher
from matcher.reference_cleaner import clean_bibtex_entry, clean_arxiv_reference
from matcher.reference_matcher import find_best_match

# Configuration
RAW_ROOT = "../30-paper"
INTERMEDIATE_DIR = "intermediate"
OUTPUT_DIR = "output"
MANUAL_LABELS_FILE = "manual_groundtruth.json"

# Create directories
os.makedirs(INTERMEDIATE_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("All imports loaded successfully")
print(f"Configuration: RAW_ROOT='{RAW_ROOT}', INTERMEDIATE_DIR='{INTERMEDIATE_DIR}', OUTPUT_DIR='{OUTPUT_DIR}'")

All imports loaded successfully
Configuration: RAW_ROOT='../30-paper', INTERMEDIATE_DIR='intermediate', OUTPUT_DIR='output'


---

## **STEP 1 - Load Intermediate Data**

Load parsed trees, references, and arXiv database from previous pipeline.

In [14]:
"""
STEP 1: Load Intermediate Data
================================
Load parsed trees and references from Parser Core pipeline.
"""

# Load Parsed Trees
parsed_trees_path = Path(INTERMEDIATE_DIR) / "parsed_trees.pkl"
if not parsed_trees_path.exists():
    raise FileNotFoundError(f"Required file not found: {parsed_trees_path}")

with open(parsed_trees_path, "rb") as f:
    parsed_trees = pickle.load(f)

# Load Raw References
raw_references_path = Path(INTERMEDIATE_DIR) / "raw_references.pkl"
if raw_references_path.exists():
    with open(raw_references_path, "rb") as f:
        raw_references = pickle.load(f)
else:
    raw_references = {}

# Load arXiv References Database
arxiv_references = {}

for pub_id in tqdm(parsed_trees, desc="STEP 1: Loading arXiv database", unit="pub"):
    pub_folder = Path(RAW_ROOT) / pub_id["publication_id"]
    ref_path = pub_folder / "references.json"
    
    if ref_path.exists():
        with open(ref_path, 'r', encoding='utf-8') as f:
            refs = json.load(f)
            # Convert dict to list format with arxiv_id
            ref_list = [
                {'arxiv_id': arxiv_id, **ref_data} 
                for arxiv_id, ref_data in refs.items()
            ]
            arxiv_references[pub_id["publication_id"]] = ref_list

# Summary
total_raw_refs = sum(len(refs) for refs in raw_references.values())
total_arxiv = sum(len(refs) for refs in arxiv_references.values())

print(f"{'='*80}")
print(f"STEP 1: Data Loading Complete")
print(f"{'='*80}")
print(f"  Total publications: {len(parsed_trees)}")
print(f"  Raw references: {total_raw_refs}")
print(f"  arXiv entries: {total_arxiv}")
print(f"{'='*80}")

STEP 1: Loading arXiv database:   0%|          | 0/42 [00:00<?, ?pub/s]

STEP 1: Data Loading Complete
  Total publications: 42
  Raw references: 1946
  arXiv entries: 781


---

## **STEP 2 - Tree Standardization & Deduplication**

Normalize, assign IDs, and deduplicate content trees across versions.

In [15]:
"""
STEP 2: Tree Standardization & Deduplication
=============================================
Normalize, assign IDs, and merge multiple versions into single tree.
"""

# Group trees by publication
pub_groups = defaultdict(list)
for item in parsed_trees:
    pub_groups[item["publication_id"]].append(item)

# Normalize, deduplicate, and assign IDs
final_trees = {}

for pub_id, versions in tqdm(pub_groups.items(), desc="STEP 2: Processing trees", unit="pub"):
    # Sort versions by version number (v1, v2, ...)
    versions.sort(key=lambda x: int(x["version"].split("v")[-1]))
    
    # Use first version as base
    base = versions[0]
    base_root = base["root"]
    
    # Normalize content
    normalize_node(base_root)
    
    # Assign global IDs
    assign_ids(base_root, pub_id, base["version"])
    
    # Build content index for deduplication
    content_index = build_content_index(base_root)
    
    # Merge remaining versions into base
    for v in versions[1:]:
        root = v["root"]
        normalize_node(root)
        assign_ids(root, pub_id, v["version"])
        deduplicate_tree(
            target_root=base_root,
            source_root=root,
            content_index=content_index
        )
    
    final_trees[pub_id] = base_root

# Deduplicate references
final_references = {}
reference_id_counter = 1

for pub_id in tqdm(raw_references.keys(), desc="STEP 2: Deduplicating references", unit="pub"):
    refs = raw_references[pub_id]
    deduplicated = deduplicate_references(refs)
    
    # Assign unique global IDs
    for ref in deduplicated:
        ref['ref_id'] = f"REF-{reference_id_counter:06d}"
        reference_id_counter += 1
    
    final_references[pub_id] = deduplicated

# Summary
total_unique_refs = sum(len(refs) for refs in final_references.values())

print(f"\nSTEP 2 Complete: {len(final_trees)} publications standardized, {total_unique_refs} unique references")

STEP 2: Processing trees:   0%|          | 0/29 [00:00<?, ?pub/s]

STEP 2: Deduplicating references:   0%|          | 0/24 [00:00<?, ?pub/s]


STEP 2 Complete: 29 publications standardized, 1946 unique references


---

## **STEP 3 - Reference Cleaning & Normalization**

Clean and normalize BibTeX and arXiv references for matching.

In [16]:
"""
STEP 3: Reference Cleaning & Normalization
===========================================
Apply text normalization to BibTeX and arXiv references.
"""

# Clean BibTeX references
cleaned_bibtex = {}

for pub_id in tqdm(final_references.keys(), desc="STEP 3: Cleaning BibTeX", unit="pub"):
    cleaned_entries = []
    for ref in final_references[pub_id]:
        cleaned_ref = clean_bibtex_entry(ref)
        # Preserve original metadata
        cleaned_ref['ref_id'] = ref['ref_id']
        cleaned_ref['key'] = ref.get('key', '')
        cleaned_entries.append(cleaned_ref)
    cleaned_bibtex[pub_id] = cleaned_entries

# Clean arXiv references
cleaned_arxiv = {}

for pub_id in tqdm(arxiv_references.keys(), desc="STEP 3: Cleaning arXiv", unit="pub"):
    cleaned_entries = []
    for ref in arxiv_references[pub_id]:
        cleaned_ref = clean_arxiv_reference(ref)
        cleaned_entries.append(cleaned_ref)
    cleaned_arxiv[pub_id] = cleaned_entries

# Summary
total_bibtex_cleaned = sum(len(refs) for refs in cleaned_bibtex.values())
total_arxiv_cleaned = sum(len(refs) for refs in cleaned_arxiv.values())

print(f"\nSTEP 3 Complete: {total_bibtex_cleaned} BibTeX entries cleaned, {total_arxiv_cleaned} arXiv entries cleaned")

STEP 3: Cleaning BibTeX:   0%|          | 0/24 [00:00<?, ?pub/s]

STEP 3: Cleaning arXiv:   0%|          | 0/29 [00:00<?, ?pub/s]


STEP 3 Complete: 1946 BibTeX entries cleaned, 781 arXiv entries cleaned


---

## **STEP 4 - Labeling & Dataset Construction**

Generate labeled dataset using heuristic matching and manual ground truth.

In [17]:
"""
STEP 4: Labeling & Dataset Construction
========================================
Generate labeled pairs using heuristic matching and manual labels.
"""

# Automatic heuristic matching
automatic_labels = []
match_stats = {
    'total_bibtex': 0,
    'matched': 0,
    'unmatched': 0,
    'high_confidence': 0,  # >= 0.9
    'medium_confidence': 0  # 0.7 - 0.9
}

for pub_id in tqdm(cleaned_bibtex.keys(), desc="STEP 4: Heuristic matching", unit="pub"):
    if pub_id not in cleaned_arxiv:
        continue
    
    bibtex_entries = cleaned_bibtex[pub_id]
    arxiv_pool = cleaned_arxiv[pub_id]
    
    for bib_entry in bibtex_entries:
        match_stats['total_bibtex'] += 1
        
        # Find best match using similarity scoring (threshold=0.7)
        result = find_best_match(bib_entry, arxiv_pool, threshold=0.7)
        
        if result:
            arxiv_id, score, breakdown = result
            
            automatic_labels.append({
                'pub_id': pub_id,
                'bib_key': bib_entry.get('key', ''),
                'bib_ref_id': bib_entry.get('ref_id', ''),
                'arxiv_id': arxiv_id,
                'confidence_score': score,
                'title_score': breakdown.get('title_score', 0.0),
                'author_score': breakdown.get('author_score', 0.0),
                'year_score': breakdown.get('year_score', 0.0),
                'label': 1,  # Positive match
                'source': 'automatic',
                'bib_title_clean': bib_entry.get('normalized_title', ''),
                'bib_authors_clean': ', '.join(bib_entry.get('normalized_authors', [])),
                'bib_year': bib_entry.get('normalized_year', '')
            })
            
            match_stats['matched'] += 1
            if score >= 0.9:
                match_stats['high_confidence'] += 1
            else:
                match_stats['medium_confidence'] += 1
        else:
            match_stats['unmatched'] += 1

# Load manual ground truth (if exists)
manual_labels = []
manual_labels_path = Path(MANUAL_LABELS_FILE)

if manual_labels_path.exists():
    try:
        with open(manual_labels_path, 'r', encoding='utf-8') as f:
            manual_data = json.load(f)
        
        # Convert from dict format to list format
        for pub_id, labels in manual_data.items():
            for bib_key, arxiv_id in labels.items():
                # Find corresponding cleaned entry
                bib_entry = None
                if pub_id in cleaned_bibtex:
                    bib_entry = next((e for e in cleaned_bibtex[pub_id] if e.get('key') == bib_key), None)
                
                manual_labels.append({
                    'pub_id': pub_id,
                    'bib_key': bib_key,
                    'bib_ref_id': bib_entry.get('ref_id', '') if bib_entry else '',
                    'arxiv_id': arxiv_id,
                    'confidence_score': 1.0,
                    'title_score': None,
                    'author_score': None,
                    'year_score': None,
                    'label': 1,
                    'source': 'manual',
                    'bib_title_clean': bib_entry.get('normalized_title', '') if bib_entry else '',
                    'bib_authors_clean': ', '.join(bib_entry.get('normalized_authors', [])) if bib_entry else '',
                    'bib_year': bib_entry.get('normalized_year', '') if bib_entry else ''
                })
    except Exception as e:
        pass  # Silent fail

# Combine all labels (manual overrides automatic)
all_labels = {}

# Add automatic labels first
for label in automatic_labels:
    key = (label['pub_id'], label['bib_key'])
    all_labels[key] = label

# Override with manual labels (higher priority)
for label in manual_labels:
    key = (label['pub_id'], label['bib_key'])
    all_labels[key] = label

combined_labels = list(all_labels.values())

# Enrich with candidate information from cleaned_arxiv
enriched_labels = []

for label in tqdm(combined_labels, desc="STEP 4: Enriching labels", unit="label"):
    pub_id = label['pub_id']
    arxiv_id = label['arxiv_id']
    
    # Find corresponding arXiv entry
    candidate_info = {}
    if pub_id in cleaned_arxiv:
        arxiv_entry = next(
            (e for e in cleaned_arxiv[pub_id] if e.get('arxiv_id') == arxiv_id), 
            None
        )
        if arxiv_entry:
            candidate_info = {
                'candidate_arxiv_id': arxiv_id,
                'candidate_title_clean': arxiv_entry.get('normalized_title', ''),
                'candidate_authors_clean': ', '.join(arxiv_entry.get('normalized_authors', [])),
                'candidate_year': arxiv_entry.get('normalized_year', '')
            }
    
    # Merge all information
    enriched_label = {
        'pub_id': label['pub_id'],
        'bib_key': label['bib_key'],
        'bib_ref_id': label['bib_ref_id'],
        'bib_title_clean': label['bib_title_clean'],
        'bib_authors_clean': label['bib_authors_clean'],
        'bib_year': label['bib_year'],
        **candidate_info,
        'label': label['label'],
        'confidence_score': label['confidence_score'],
        'title_score': label.get('title_score'),
        'author_score': label.get('author_score'),
        'year_score': label.get('year_score'),
        'source': label['source']
    }
    
    enriched_labels.append(enriched_label)

# Create DataFrame
labeled_data = pd.DataFrame(enriched_labels)

# Reorder columns for clarity
column_order = [
    'pub_id', 'bib_key', 'bib_ref_id',
    'bib_title_clean', 'bib_authors_clean', 'bib_year',
    'candidate_arxiv_id', 'candidate_title_clean', 
    'candidate_authors_clean', 'candidate_year',
    'label', 'confidence_score',
    'title_score', 'author_score', 'year_score',
    'source'
]

existing_columns = [col for col in column_order if col in labeled_data.columns]
labeled_data = labeled_data[existing_columns]

# Summary
total_labels = len(labeled_data)
positive_labels = len(labeled_data[labeled_data['label'] == 1])
match_rate = (match_stats['matched'] / match_stats['total_bibtex'] * 100) if match_stats['total_bibtex'] > 0 else 0
manual_count = len(labeled_data[labeled_data['source'] == 'manual'])

print(f"{'='*80}")
print(f"STEP 4: Labeling Complete")
print(f"{'='*80}")
print(f"  Total labeled pairs: {total_labels}")
print(f"  Positive matches: {positive_labels} ({match_rate:.1f}%)")
print(f"  High confidence (>=0.9): {match_stats['high_confidence']}")
print(f"  Manual labels: {manual_count}")
print(f"{'='*80}")

STEP 4: Heuristic matching:   0%|          | 0/24 [00:00<?, ?pub/s]

STEP 4: Enriching labels:   0%|          | 0/51 [00:00<?, ?label/s]

STEP 4: Labeling Complete
  Total labeled pairs: 51
  Positive matches: 51 (2.6%)
  High confidence (>=0.9): 16
  Manual labels: 0


---

## **STEP 5 - Export Results**

Export structured JSON files and labeled dataset for modeling.

In [18]:
"""
STEP 5: Export Results
=======================
Export structured JSON files and labeled dataset for modeling.
"""

def serialize_node(node: Any) -> Dict[str, Any]:
    """
    Recursively serialize node to dict with CLEANED text only.
    Prioritizes node.full_text (cleaned) over node.content (raw LaTeX).
    """
    # Priority: full_text (cleaned) > content (fallback) > empty
    text_content = ""
    if hasattr(node, 'full_text') and node.full_text:
        text_content = node.full_text
    elif hasattr(node, 'content') and node.content:
        # Fallback: clean raw content if full_text not available
        text_content = cleanup_latex(node.content)
    
    # Build base node data
    node_data = {
        "id": getattr(node, 'id', ''),
        "type": getattr(node, 'node_type', 'unknown'),
        "text": text_content,
    }
    
    # Type-specific fields (already cleaned by normalize_node)
    if node.node_type in {"section", "subsection", "subsubsection"}:
        if hasattr(node, "title") and node.title:
            node_data["title"] = node.title
    
    if node.node_type in {"figure", "table"}:
        if hasattr(node, "caption") and node.caption:
            caption = node.caption
            if '\\' in caption:  # Safety check
                caption = cleanup_latex(caption)
            node_data["caption"] = caption
        
        if hasattr(node, "label") and node.label:
            label = node.label
            if '\\' in label:  # Safety check
                label = cleanup_latex(label)
            node_data["label"] = label
    
    # Recursive children
    if hasattr(node, 'children') and node.children:
        node_data["children"] = [serialize_node(child) for child in node.children]
    else:
        node_data["children"] = []
    
    return node_data


# Export structured JSON files
json_exported = 0
metadata_exported = 0

for pub_id, root in tqdm(final_trees.items(), desc="STEP 5: Exporting JSON", unit="pub"):
    # Content tree + references
    content_json = {
        "publication_id": pub_id,
        "content_tree": serialize_node(root),
        "references": final_references.get(pub_id, [])
    }
    
    output_json = Path(OUTPUT_DIR) / f"{pub_id}.json"
    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(content_json, f, ensure_ascii=False, indent=2)
    json_exported += 1
    
    # Copy metadata from raw
    raw_metadata = Path(RAW_ROOT) / pub_id / "metadata.json"
    if raw_metadata.exists():
        with open(raw_metadata, 'r', encoding='utf-8') as f:
            metadata = json.load(f)
        output_metadata = Path(OUTPUT_DIR) / f"{pub_id}.metadata.json"
        with open(output_metadata, "w", encoding="utf-8") as f:
            json.dump(metadata, f, ensure_ascii=False, indent=2)
        metadata_exported += 1

# Export labeled dataset for modeling (CSV)
csv_output = Path(INTERMEDIATE_DIR) / "labeled_dataset.csv"
labeled_data.to_csv(csv_output, index=False, encoding='utf-8')

# Save intermediate pickle files
with open(f"{INTERMEDIATE_DIR}/cleaned_bibtex.pkl", "wb") as f:
    pickle.dump(cleaned_bibtex, f)

with open(f"{INTERMEDIATE_DIR}/cleaned_arxiv.pkl", "wb") as f:
    pickle.dump(cleaned_arxiv, f)

with open(f"{INTERMEDIATE_DIR}/final_trees.pkl", "wb") as f:
    pickle.dump(final_trees, f)

with open(f"{INTERMEDIATE_DIR}/final_references.pkl", "wb") as f:
    pickle.dump(final_references, f)

print(f"{'='*80}")
print(f"STEP 5: Export Complete")
print(f"{'='*80}")
print(f"  JSON files exported: {json_exported}")
print(f"  Metadata files exported: {metadata_exported}")
print(f"  Labeled dataset: {csv_output} ({len(labeled_data)} rows)")
print(f"  Intermediate pickles: 4 files saved")
print(f"{'='*80}")

STEP 5: Exporting JSON:   0%|          | 0/29 [00:00<?, ?pub/s]

STEP 5: Export Complete
  JSON files exported: 29
  Metadata files exported: 29
  Labeled dataset: intermediate\labeled_dataset.csv (51 rows)
  Intermediate pickles: 4 files saved
