## **Configuration & Imports**

In [6]:
"""
Configuration & Imports
========================
Load all required modules and set pipeline configuration.
"""

# Standard Library
import os
import sys
import json
import pickle
import random
import pandas as pd
from pathlib import Path
from collections import defaultdict
from typing import Dict, List, Any

# Add src directory to Python path
current_dir = os.getcwd()
if current_dir not in sys.path:
    sys.path.insert(0, current_dir)

# Third-party
from tqdm.auto import tqdm

# Project Modules - Parser
from parser.node_normalizer import normalize_node, cleanup_latex
from parser.id_assigner import assign_ids
from parser.content_index import build_content_index
from parser.deduplicator import deduplicate_tree
from parser.reference_extractor import deduplicate_references

# Project Modules - Matcher
from matcher.reference_cleaner import clean_bibtex_entry, clean_arxiv_reference
from matcher.reference_matcher import find_best_match, calculate_similarity_components, find_hard_negative

# Configuration
RAW_ROOT = "../../30-paper"
INTERMEDIATE_DIR = "intermediate"
STUDENT_ID = "22127XXX"  # TODO: Change to your student ID
OUTPUT_DIR = f"../{STUDENT_ID}"  # Submission folder structure
MANUAL_LABELS_FILE = "manual_groundtruth.json"

# Create directories
os.makedirs(INTERMEDIATE_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("All imports loaded successfully")
print(f"Configuration: RAW_ROOT='{RAW_ROOT}', INTERMEDIATE_DIR='{INTERMEDIATE_DIR}', OUTPUT_DIR='{OUTPUT_DIR}'")

All imports loaded successfully
Configuration: RAW_ROOT='../../30-paper', INTERMEDIATE_DIR='intermediate', OUTPUT_DIR='../22127XXX'


---

## **STEP 1 - Load Intermediate Data**

Load parsed trees, references, and arXiv database from previous pipeline.

In [7]:
"""
STEP 1: Load Intermediate Data
================================
Load parsed trees and references from Parser Core pipeline.
"""

# Load Parsed Trees
parsed_trees_path = Path(INTERMEDIATE_DIR) / "parsed_trees.pkl"
if not parsed_trees_path.exists():
    raise FileNotFoundError(f"Required file not found: {parsed_trees_path}")

with open(parsed_trees_path, "rb") as f:
    parsed_trees = pickle.load(f)

# Load Raw References
raw_references_path = Path(INTERMEDIATE_DIR) / "raw_references.pkl"
if raw_references_path.exists():
    with open(raw_references_path, "rb") as f:
        raw_references = pickle.load(f)
else:
    raw_references = {}

# Load arXiv References Database
arxiv_references = {}

for pub_id in tqdm(parsed_trees, desc="STEP 1: Loading arXiv database", unit="pub"):
    pub_folder = Path(RAW_ROOT) / pub_id["publication_id"]
    ref_path = pub_folder / "references.json"
    
    if ref_path.exists():
        with open(ref_path, 'r', encoding='utf-8') as f:
            refs = json.load(f)
            # Convert dict to list format with arxiv_id
            ref_list = [
                {'arxiv_id': arxiv_id, **ref_data} 
                for arxiv_id, ref_data in refs.items()
            ]
            arxiv_references[pub_id["publication_id"]] = ref_list

# Summary
total_raw_refs = sum(len(refs) for refs in raw_references.values())
total_arxiv = sum(len(refs) for refs in arxiv_references.values())

print(f"{'='*80}")
print(f"STEP 1: Data Loading Complete")
print(f"{'='*80}")
print(f"  Total publications: {len(parsed_trees)}")
print(f"  Raw references: {total_raw_refs}")
print(f"  arXiv entries: {total_arxiv}")
print(f"{'='*80}")

STEP 1: Loading arXiv database:   0%|          | 0/42 [00:00<?, ?pub/s]

STEP 1: Data Loading Complete
  Total publications: 42
  Raw references: 1946
  arXiv entries: 781


---

## **STEP 2 - Tree Standardization & Deduplication**

Normalize, assign IDs, and deduplicate content trees across versions.

In [8]:
"""
STEP 2: Tree Standardization & Deduplication
=============================================
Normalize, assign IDs, and merge multiple versions into single tree.
"""

# Group trees by publication
pub_groups = defaultdict(list)
for item in parsed_trees:
    pub_groups[item["publication_id"]].append(item)

# Normalize, deduplicate, and assign IDs
final_trees = {}

for pub_id, versions in tqdm(pub_groups.items(), desc="STEP 2: Processing trees", unit="pub"):
    # Sort versions by version number (v1, v2, ...)
    versions.sort(key=lambda x: int(x["version"].split("v")[-1]))
    
    # Use first version as base
    base = versions[0]
    base_root = base["root"]
    
    # Normalize content
    normalize_node(base_root)
    
    # Assign global IDs
    assign_ids(base_root, pub_id, base["version"])
    
    # Build content index for deduplication
    content_index = build_content_index(base_root)
    
    # Merge remaining versions into base
    for v in versions[1:]:
        root = v["root"]
        normalize_node(root)
        assign_ids(root, pub_id, v["version"])
        deduplicate_tree(
            target_root=base_root,
            source_root=root,
            content_index=content_index
        )
    
    final_trees[pub_id] = base_root

# Deduplicate references
final_references = {}
reference_id_counter = 1

for pub_id in tqdm(raw_references.keys(), desc="STEP 2: Deduplicating references", unit="pub"):
    refs = raw_references[pub_id]
    deduplicated = deduplicate_references(refs)
    
    # Assign unique global IDs
    for ref in deduplicated:
        ref['ref_id'] = f"REF-{reference_id_counter:06d}"
        reference_id_counter += 1
    
    final_references[pub_id] = deduplicated

# Summary
total_unique_refs = sum(len(refs) for refs in final_references.values())

print(f"\nSTEP 2 Complete: {len(final_trees)} publications standardized, {total_unique_refs} unique references")

STEP 2: Processing trees:   0%|          | 0/29 [00:00<?, ?pub/s]

STEP 2: Deduplicating references:   0%|          | 0/24 [00:00<?, ?pub/s]


STEP 2 Complete: 29 publications standardized, 1946 unique references


---

## **STEP 3 - Reference Cleaning & Normalization**

Clean and normalize BibTeX and arXiv references for matching.

In [9]:
"""
STEP 3: Reference Cleaning & Normalization
===========================================
Apply text normalization to BibTeX and arXiv references.
"""

# Clean BibTeX references
cleaned_bibtex = {}

for pub_id in tqdm(final_references.keys(), desc="STEP 3: Cleaning BibTeX", unit="pub"):
    cleaned_entries = []
    for ref in final_references[pub_id]:
        cleaned_ref = clean_bibtex_entry(ref)
        # Preserve original metadata
        cleaned_ref['ref_id'] = ref['ref_id']
        cleaned_ref['key'] = ref.get('key', '')
        cleaned_entries.append(cleaned_ref)
    cleaned_bibtex[pub_id] = cleaned_entries

# Clean arXiv references
cleaned_arxiv = {}

for pub_id in tqdm(arxiv_references.keys(), desc="STEP 3: Cleaning arXiv", unit="pub"):
    cleaned_entries = []
    for ref in arxiv_references[pub_id]:
        cleaned_ref = clean_arxiv_reference(ref)
        cleaned_entries.append(cleaned_ref)
    cleaned_arxiv[pub_id] = cleaned_entries

# Summary
total_bibtex_cleaned = sum(len(refs) for refs in cleaned_bibtex.values())
total_arxiv_cleaned = sum(len(refs) for refs in cleaned_arxiv.values())

print(f"\nSTEP 3 Complete: {total_bibtex_cleaned} BibTeX entries cleaned, {total_arxiv_cleaned} arXiv entries cleaned")

STEP 3: Cleaning BibTeX:   0%|          | 0/24 [00:00<?, ?pub/s]

STEP 3: Cleaning arXiv:   0%|          | 0/29 [00:00<?, ?pub/s]


STEP 3 Complete: 1946 BibTeX entries cleaned, 781 arXiv entries cleaned


---

## **STEP 4 - Labeling & Dataset Construction**

Generate labeled dataset using heuristic matching and manual ground truth.

In [10]:
"""
STEP 4.1: Manual Labeling Process (Exhaustive Negative Sampling)
=================================================================
Generate ground truth from hardcoded manual data.
Strategy: For each Reference, create 1 Positive + ALL Negatives from arXiv pool.
"""

# Fixed manual labels (Priority 1 - Ground Truth)
FIXED_MANUAL_DATA = {
    "2211-13768": {
        "2008MNRAS.391.1685S": "0809-0898",
        "2014MNRAS.441.3359D": "1402-7073",
        "2021MNRAS.503..920C": "2007-02958",
        "2022arXiv220405981K": "2204-05981",
        "Agrawal11611004611": "1610-04611"
    },
    "2211-13757": {
        "3DiM": "2210-04628",
        "acronym": "2011-09584",
        "attention": "1706-03762",
        "autosdf": "2203-09516",
        "cascaded-point-completion": "2004-03327"
    },
    "2211-13767": {
        "Bapat2018": "1812-02746",
        "Bittel_2021": "2101-07267",
        "Brady2021": "2107-01218",
        "Crosson_2021": "2008-09913",
        "Farhi2016": "1602-07674"
    },
    "2211-13755": {
        "AANet": "2004-09548",
        "ACVNet": "2203-02146",
        "AcfNet": "1909-03751",
        "AnyNet": "1810-11408",
        "BI3D": "2005-07274"
    },
    "2211-13766": {
        "BittencourtDamping2022": "2301-11920",
        "Marius_Schrodinger_2022": "2211-00449",
        "asjad2022magnon": "2203-10767",
        "bourcin2022strong": "2209-14643",
        "chan2011laser": "1106-3614"
    }
}

print(f"{'='*80}")
print("STEP 4.1: Processing MANUAL Labels (Exhaustive Negative Sampling)")
print(f"{'='*80}")

manual_pairs = []

# Process each manual label
for pub_id, labels in FIXED_MANUAL_DATA.items():
    arxiv_pool = cleaned_arxiv.get(pub_id, [])
    if not arxiv_pool:
        continue
    
    for bib_key, positive_arxiv_id in labels.items():
        # Find cleaned entries
        bib_entry = next((e for e in cleaned_bibtex.get(pub_id, []) if e.get('key') == bib_key), None)
        if not bib_entry:
            continue
        
        # Common BibTeX fields
        bib_base = {
            'pub_id': pub_id,
            'bib_key': bib_key,
            'bib_ref_id': bib_entry.get('ref_id', ''),
            'bib_title_clean': bib_entry.get('normalized_title', ''),
            'bib_authors_clean': ', '.join(bib_entry.get('normalized_authors', [])),
            'bib_author_tokens': bib_entry.get('author_tokens', []),
            'bib_year': bib_entry.get('normalized_year', ''),
            'source': 'manual'
        }
        
        # 1. CREATE POSITIVE PAIR (Label = 1)
        positive_arxiv = next((e for e in arxiv_pool if e.get('arxiv_id') == positive_arxiv_id), None)
        if positive_arxiv:
            manual_pairs.append({
                **bib_base,
                'candidate_arxiv_id': positive_arxiv_id,
                'candidate_title_clean': positive_arxiv.get('normalized_title', ''),
                'candidate_authors_clean': ', '.join(positive_arxiv.get('normalized_authors', [])),
                'candidate_author_tokens': positive_arxiv.get('author_tokens', []),
                'candidate_year': positive_arxiv.get('normalized_year', ''),
                'label': 1,
                'pair_type': 'positive_manual'
            })
        
        # 2. CREATE ALL NEGATIVE PAIRS (Exhaustive)
        negative_arxiv_pool = [a for a in arxiv_pool if a.get('arxiv_id') != positive_arxiv_id]
        
        for neg_arxiv in negative_arxiv_pool:
            neg_scores = calculate_similarity_components(bib_entry, neg_arxiv)
            manual_pairs.append({
                **bib_base,
                'candidate_arxiv_id': neg_arxiv.get('arxiv_id', ''),
                'candidate_title_clean': neg_arxiv.get('normalized_title', ''),
                'candidate_authors_clean': ', '.join(neg_arxiv.get('normalized_authors', [])),
                'candidate_author_tokens': neg_arxiv.get('author_tokens', []),
                'candidate_year': neg_arxiv.get('normalized_year', ''),
                'label': 0,
                'pair_type': 'exhaustive_negative'
            })

# Statistics
manual_pos = sum(1 for p in manual_pairs if p['label'] == 1)
manual_neg = sum(1 for p in manual_pairs if p['label'] == 0)

print(f"\n  Manual Positives: {manual_pos}")
print(f"  Manual Negatives (Exhaustive): {manual_neg}")
print(f"  Negative/Positive Ratio: {manual_neg/manual_pos:.1f}:1")
print(f"  Total Manual Pairs: {len(manual_pairs)}")


STEP 4.1: Processing MANUAL Labels (Exhaustive Negative Sampling)

  Manual Positives: 21
  Manual Negatives (Exhaustive): 1126
  Negative/Positive Ratio: 53.6:1
  Total Manual Pairs: 1147


In [11]:
"""
STEP 4.2: Adaptive Automatic Matching (Exhaustive Negative Sampling)
=====================================================================
Enforce 10% quota using adaptive thresholds.
Strategy: For each matched Reference, create 1 Positive + ALL Negatives.
"""

print(f"{'='*80}")
print("STEP 4.2: Adaptive AUTOMATIC Matching (10% Quota + Exhaustive Negatives)")
print(f"{'='*80}")

random.seed(42)

# 1. Calculate Quota
total_references = sum(len(refs) for refs in cleaned_bibtex.values())
manual_ref_ids = set((m['pub_id'], m['bib_ref_id']) for m in manual_pairs if m['label'] == 1)
target_auto_count = int(total_references * 0.10)

print(f"  Total references: {total_references}")
print(f"  Target automatic pairs (10%): {target_auto_count}")

# 2. Collect ALL potential matches
all_potential_matches = []
for pub_id in tqdm(cleaned_bibtex.keys(), desc="Collecting potential matches", unit="pub"):
    if pub_id not in cleaned_arxiv:
        continue
    
    arxiv_pool = cleaned_arxiv[pub_id]
    
    for bib_entry in cleaned_bibtex[pub_id]:
        if (pub_id, bib_entry.get('ref_id', '')) in manual_ref_ids:
            continue
        
        # Find best match using module function
        best_match = find_best_match(bib_entry, arxiv_pool, threshold=0.0)
        
        if best_match:
            arxiv_id, score, breakdown = best_match
            best_arxiv = next(a for a in arxiv_pool if a['arxiv_id'] == arxiv_id)
            if score > 0.6:  # Minimum sanity threshold
                all_potential_matches.append({
                    'pub_id': pub_id,
                    'bib_entry': bib_entry,
                    'best_arxiv': best_arxiv,
                    'positive_arxiv_id': arxiv_id,
                    'arxiv_pool': arxiv_pool,  # Store for negative generation
                    'score': score,
                    'breakdown': breakdown
                })

# 3. Apply Adaptive Threshold (Select top N matches)
all_potential_matches.sort(key=lambda x: x['score'], reverse=True)
selected_matches = all_potential_matches[:target_auto_count]

print(f"  Selected {len(selected_matches)} automatic matches")
if selected_matches:
    print(f"  Score range: [{selected_matches[-1]['score']:.3f} - {selected_matches[0]['score']:.3f}]")

# 4. Generate Exhaustive Pairs (Positive + ALL Negatives)
automatic_pairs = []

for match in tqdm(selected_matches, desc="Generating exhaustive auto pairs", unit="match"):
    pub_id = match['pub_id']
    bib = match['bib_entry']
    positive_arxiv = match['best_arxiv']
    positive_arxiv_id = match['positive_arxiv_id']
    arxiv_pool = match['arxiv_pool']
    
    # Common BibTeX fields
    bib_base = {
        'pub_id': pub_id,
        'bib_key': bib.get('key'),
        'bib_ref_id': bib.get('ref_id'),
        'bib_title_clean': bib.get('normalized_title'),
        'bib_authors_clean': ', '.join(bib.get('normalized_authors', [])),
        'bib_author_tokens': bib.get('author_tokens', []),
        'bib_year': bib.get('normalized_year'),
        'source': 'automatic'
    }
    
    # 1. CREATE POSITIVE PAIR (Label = 1)
    automatic_pairs.append({
        **bib_base,
        'candidate_arxiv_id': positive_arxiv_id,
        'candidate_title_clean': positive_arxiv.get('normalized_title'),
        'candidate_authors_clean': ', '.join(positive_arxiv.get('normalized_authors', [])),
        'candidate_author_tokens': positive_arxiv.get('author_tokens', []),
        'candidate_year': positive_arxiv.get('normalized_year'),
        'label': 1,
        'pair_type': 'positive_auto'
    })
    
    # 2. CREATE ALL NEGATIVE PAIRS (Exhaustive)
    negative_arxiv_pool = [a for a in arxiv_pool if a.get('arxiv_id') != positive_arxiv_id]
    
    for neg_arxiv in negative_arxiv_pool:
        neg_scores = calculate_similarity_components(bib, neg_arxiv)
        automatic_pairs.append({
            **bib_base,
            'candidate_arxiv_id': neg_arxiv.get('arxiv_id'),
            'candidate_title_clean': neg_arxiv.get('normalized_title'),
            'candidate_authors_clean': ', '.join(neg_arxiv.get('normalized_authors', [])),
            'candidate_author_tokens': neg_arxiv.get('author_tokens', []),
            'candidate_year': neg_arxiv.get('normalized_year'),
            'label': 0,
            'pair_type': 'exhaustive_negative'
        })

# Statistics
auto_pos = sum(1 for p in automatic_pairs if p['label'] == 1)
auto_neg = sum(1 for p in automatic_pairs if p['label'] == 0)

print(f"\n  Automatic Positives: {auto_pos}")
print(f"  Automatic Negatives (Exhaustive): {auto_neg}")
print(f"  Negative/Positive Ratio: {auto_neg/auto_pos:.1f}:1")
print(f"  Total Automatic Pairs: {len(automatic_pairs)}")


STEP 4.2: Adaptive AUTOMATIC Matching (10% Quota + Exhaustive Negatives)
  Total references: 1946
  Target automatic pairs (10%): 194


Collecting potential matches:   0%|          | 0/24 [00:00<?, ?pub/s]

  Selected 194 automatic matches
  Score range: [0.956 - 1.000]


Generating exhaustive auto pairs:   0%|          | 0/194 [00:00<?, ?match/s]


  Automatic Positives: 194
  Automatic Negatives (Exhaustive): 7816
  Negative/Positive Ratio: 40.3:1
  Total Automatic Pairs: 8010


In [12]:
"""
STEP 4.3: Final Dataset Assembly (Exhaustive Negative Strategy)
================================================================
Combine, Shuffle, and Validate Quotas.
"""

print(f"\n{'='*80}")
print("STEP 4.3: Final Dataset Assembly (Exhaustive Negative Strategy)")
print(f"{'='*80}")

# 1. Combine
all_pairs = manual_pairs + automatic_pairs
labeled_data = pd.DataFrame(all_pairs)

# 2. Define Final Schema (Raw Features + Label ONLY)
# Remove all debug score columns (confidence_score, title_score, author_score, year_score)
final_columns = [
    'pub_id', 'bib_key',
    'bib_title_clean', 'bib_authors_clean', 'bib_author_tokens', 'bib_year',
    'candidate_arxiv_id', 'candidate_title_clean', 
    'candidate_authors_clean', 'candidate_author_tokens', 'candidate_year',
    'source', 'pair_type',
    'label'  # Label at the end
]

# 3. Select & Reorder Columns (Keep only final schema)
labeled_data = labeled_data[[c for c in final_columns if c in labeled_data.columns]]

# 4. Shuffle Dataset
labeled_data = labeled_data.sample(frac=1, random_state=42).reset_index(drop=True)

# 5. Validation Stats
total = len(labeled_data)
pos = len(labeled_data[labeled_data['label'] == 1])
neg = len(labeled_data[labeled_data['label'] == 0])
auto_pos = len(labeled_data[(labeled_data['source'] == 'automatic') & (labeled_data['label'] == 1)])
quota_met = auto_pos >= target_auto_count

print(f"\n{'─'*80}")
print(f"  FINAL DATASET STATISTICS (EXHAUSTIVE NEGATIVE SAMPLING)")
print(f"{'─'*80}")
print(f"  Total pairs: {total:,}")
print(f"  Positive samples: {pos:,}")
print(f"  Negative samples: {neg:,}")
print(f"  Negative/Positive Ratio: {neg/pos:.1f}:1")
print(f"\n  Manual source: {len(labeled_data[labeled_data['source'] == 'manual']):,} pairs")
print(f"  Automatic source: {len(labeled_data[labeled_data['source'] == 'automatic']):,} pairs")
print(f"\n  QUOTA CHECK:")
print(f"  Target (10% of references): {target_auto_count}")
print(f"  Actual automatic positives: {auto_pos}")
print(f"  Status: {'✓ QUOTA MET' if quota_met else '✗ QUOTA NOT MET'}")
print(f"\n  SCHEMA INFO:")
print(f"  Total columns: {len(labeled_data.columns)}")
print(f"  Columns: {list(labeled_data.columns)}")
print(f"{'='*80}")
print(f"\n  ⚠️  NOTE: Class Imbalance is EXPECTED with Exhaustive Sampling.")
print(f"  This reflects real-world Citation Matching scenario (1 correct : N incorrect).")
print(f"  ✓ Score columns removed: confidence_score, title_score, author_score, year_score")
print(f"{'='*80}")


STEP 4.3: Final Dataset Assembly (Exhaustive Negative Strategy)

────────────────────────────────────────────────────────────────────────────────
  FINAL DATASET STATISTICS (EXHAUSTIVE NEGATIVE SAMPLING)
────────────────────────────────────────────────────────────────────────────────
  Total pairs: 9,157
  Positive samples: 215
  Negative samples: 8,942
  Negative/Positive Ratio: 41.6:1

  Manual source: 1,147 pairs
  Automatic source: 8,010 pairs

  QUOTA CHECK:
  Target (10% of references): 194
  Actual automatic positives: 194
  Status: ✓ QUOTA MET

  SCHEMA INFO:
  Total columns: 14
  Columns: ['pub_id', 'bib_key', 'bib_title_clean', 'bib_authors_clean', 'bib_author_tokens', 'bib_year', 'candidate_arxiv_id', 'candidate_title_clean', 'candidate_authors_clean', 'candidate_author_tokens', 'candidate_year', 'source', 'pair_type', 'label']

  ⚠️  NOTE: Class Imbalance is EXPECTED with Exhaustive Sampling.
  This reflects real-world Citation Matching scenario (1 correct : N incorrect).


---

## **STEP 5 - Export Results**

Export structured JSON files and labeled dataset for modeling.

In [13]:
"""
STEP 5: Export Results
=======================
Export files following submission folder structure.
"""

def serialize_node(node: Any) -> Dict[str, Any]:
    """
    Recursively serialize node to dict with CLEANED text only.
    Prioritizes node.full_text (cleaned) over node.content (raw LaTeX).
    """
    # Priority: full_text (cleaned) > content (fallback) > empty
    text_content = ""
    if hasattr(node, 'full_text') and node.full_text:
        text_content = node.full_text
    elif hasattr(node, 'content') and node.content:
        # Fallback: clean raw content if full_text not available
        text_content = cleanup_latex(node.content)
    
    # Build base node data
    node_data = {
        "id": getattr(node, 'id', ''),
        "type": getattr(node, 'node_type', 'unknown'),
        "text": text_content,
    }
    
    # Type-specific fields (already cleaned by normalize_node)
    if node.node_type in {"section", "subsection", "subsubsection"}:
        if hasattr(node, "title") and node.title:
            node_data["title"] = node.title
    
    if node.node_type in {"figure", "table"}:
        if hasattr(node, "caption") and node.caption:
            caption = node.caption
            if '\\' in caption:  # Safety check
                caption = cleanup_latex(caption)
            node_data["caption"] = caption
        
        if hasattr(node, "label") and node.label:
            label = node.label
            if '\\' in label:  # Safety check
                label = cleanup_latex(label)
            node_data["label"] = label
    
    # Recursive children
    if hasattr(node, 'children') and node.children:
        node_data["children"] = [serialize_node(child) for child in node.children]
    else:
        node_data["children"] = []
    
    return node_data


# Export structured JSON files (submission format)
json_exported = 0
metadata_exported = 0

for pub_id, root in tqdm(final_trees.items(), desc="STEP 5: Exporting JSON", unit="pub"):
    # Create subfolder for each publication
    pub_folder = Path(OUTPUT_DIR) / pub_id
    pub_folder.mkdir(parents=True, exist_ok=True)
    
    # 1. Export content tree: <pub_id>/<pub_id>.json
    content_json = {
        "publication_id": pub_id,
        "content_tree": serialize_node(root)
    }
    
    output_json = pub_folder / f"{pub_id}.json"
    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(content_json, f, ensure_ascii=False, indent=2)
    json_exported += 1
    
    # 2. Export metadata: <pub_id>/metadata.json
    raw_metadata = Path(RAW_ROOT) / pub_id / "metadata.json"
    if raw_metadata.exists():
        with open(raw_metadata, 'r', encoding='utf-8') as f:
            metadata = json.load(f)
        output_metadata = pub_folder / "metadata.json"
        with open(output_metadata, "w", encoding="utf-8") as f:
            json.dump(metadata, f, ensure_ascii=False, indent=2)
        metadata_exported += 1

# Export labeled dataset for modeling (CSV)
csv_output = Path(INTERMEDIATE_DIR) / "labeled_dataset.csv"
labeled_data.to_csv(csv_output, index=False, encoding='utf-8')

# Save intermediate pickle files
with open(f"{INTERMEDIATE_DIR}/cleaned_bibtex.pkl", "wb") as f:
    pickle.dump(cleaned_bibtex, f)

with open(f"{INTERMEDIATE_DIR}/cleaned_arxiv.pkl", "wb") as f:
    pickle.dump(cleaned_arxiv, f)

with open(f"{INTERMEDIATE_DIR}/final_trees.pkl", "wb") as f:
    pickle.dump(final_trees, f)

with open(f"{INTERMEDIATE_DIR}/final_references.pkl", "wb") as f:
    pickle.dump(final_references, f)

print(f"{'='*80}")
print(f"STEP 5: Export Complete")
print(f"{'='*80}")
print(f"  Submission folder: {OUTPUT_DIR}/")
print(f"  Content trees exported: {json_exported}")
print(f"  Metadata files exported: {metadata_exported}")
print(f"  Labeled dataset: {csv_output} ({len(labeled_data)} rows)")
print(f"  Intermediate pickles: 4 files saved")
print(f"{'='*80}")
print(f"\n  Note: pred.json files will be generated in Notebook 03 (Modeling)")


STEP 5: Exporting JSON:   0%|          | 0/29 [00:00<?, ?pub/s]

STEP 5: Export Complete
  Submission folder: ../22127XXX/
  Content trees exported: 29
  Metadata files exported: 29
  Labeled dataset: intermediate\labeled_dataset.csv (9157 rows)
  Intermediate pickles: 4 files saved

  Note: pred.json files will be generated in Notebook 03 (Modeling)
