## **Configuration & Imports**

In [1]:
"""
Configuration & Imports
========================
Load all required modules and set pipeline configuration.
"""

# Standard Library
import os
import sys
import json
import pickle
from typing import Dict, List

# Third-party
from tqdm.auto import tqdm

# Project Modules
from scanner.dataset_scanner import scan_dataset, save_scan_result
from parser.version_resolver import resolve_version
from parser.hierarchy_parser import parse_tex_files
from parser.reference_extractor import (
    extract_references_from_tex_files,
    deduplicate_references
)

# Pipeline settings
RAW_ROOT = "../30-paper"           # root folder handed to the dataset scanner
INTERMEDIATE_DIR = "intermediate"  # destination for the pickled step outputs

# Ensure the output folder is present; an already-existing folder is fine
os.makedirs(INTERMEDIATE_DIR, exist_ok=True)

# Both status lines are emitted by a single call, separated by a newline
print(
    "All imports loaded successfully",
    f"Configuration: RAW_ROOT='{RAW_ROOT}', INTERMEDIATE_DIR='{INTERMEDIATE_DIR}'",
    sep="\n",
)

All imports loaded successfully
Configuration: RAW_ROOT='../30-paper', INTERMEDIATE_DIR='intermediate'


---

## **STEP 0 - Dataset Scanner & Validator**

Scan and validate all publication folders to ensure data integrity.

In [2]:
"""
STEP 0: Dataset Scanning
=========================
Scan publication folders for metadata and LaTeX sources.
"""

# Scan every publication folder under RAW_ROOT
scan_result = scan_dataset(RAW_ROOT)

# Headline counts derived from the per-publication scan records
scan_records = list(scan_result.values())
total_pubs = len(scan_result)
ready_pubs = len([rec for rec in scan_records if rec["status"] == "READY"])
total_versions = sum(map(len, (rec["versions"] for rec in scan_records)))

# 60-character rule used above and below the report
banner = "=" * 60
print(banner)
print("STEP 0: Dataset Scanning Complete")
print(banner)
print(f"  Total publications: {total_pubs}")
print(f"  Ready publications: {ready_pubs}")
print(f"  Total versions: {total_versions}")
print(banner)

STEP 0: Dataset Scanning Complete
  Total publications: 30
  Ready publications: 29
  Total versions: 42


---

## **STEP 1 - Version-level Multi-file Resolver**

Resolve multi-file LaTeX structure for each version.

In [3]:
"""
STEP 1: Version Resolution
===========================
Resolve LaTeX file dependencies for each version.
"""

# Only publications the scanner marked READY are resolved
ready_items = [
    (pub_id, info)
    for pub_id, info in scan_result.items()
    if info["status"] == "READY"
]

step1_results = []
progress = tqdm(ready_items, desc="STEP 1: Resolving versions", unit="pub")
for pub_id, info in progress:
    # One resolver call per version directory: <RAW_ROOT>/<pub_id>/tex/<version>
    step1_results.extend(
        resolve_version(
            publication_id=pub_id,
            version_name=version,
            version_path=f"{RAW_ROOT}/{pub_id}/tex/{version}",
        )
        for version in info["versions"]
    )

# Summary: how many versions came back with status RESOLVED
resolved = len([r for r in step1_results if r["status"] == "RESOLVED"])
print(f"\nSTEP 1 Complete: {resolved}/{len(step1_results)} versions resolved")

STEP 1: Resolving versions:   0%|          | 0/29 [00:00<?, ?pub/s]


STEP 1 Complete: 42/42 versions resolved


In [4]:
"""
Save STEP 1 Results
====================
"""

# Persist the STEP 1 result list as a pickle inside INTERMEDIATE_DIR
step1_pickle_path = f"{INTERMEDIATE_DIR}/step1_results.pkl"
with open(step1_pickle_path, "wb") as out_file:
    pickle.dump(step1_results, out_file)

print(f"Saved {len(step1_results)} results to {step1_pickle_path}")

Saved 42 results to intermediate/step1_results.pkl


---

## **STEP 2 - LaTeX Hierarchy Parser**

Parse LaTeX files into hierarchical tree structures.

In [5]:
"""
STEP 2: Hierarchy Parsing
==========================
Parse LaTeX content into tree structure.
"""

# Parse each resolved version into a tree.
#   parsed_trees: one dict per successfully parsed version with keys
#                 "publication_id", "version" and "root" (the value returned
#                 by parse_tex_files).
#   errors:       one "<publication_id>/<version>: <message>" string per
#                 version whose parse raised.
parsed_trees = []
errors = []

for version_info in tqdm(step1_results, desc="STEP 2: Parsing LaTeX", unit="version"):
    # Versions that STEP 1 could not resolve are skipped
    if version_info["status"] != "RESOLVED":
        continue

    publication_id = version_info["publication_id"]
    version_name = version_info["version"]
    version_path = f"{RAW_ROOT}/{publication_id}/tex/{version_name}"

    try:
        root_node = parse_tex_files(
            version_path=version_path,
            tex_files=version_info["used_tex_files"]
        )
        parsed_trees.append({
            "publication_id": publication_id,
            "version": version_name,
            "root": root_node
        })
    except Exception as e:
        # Best-effort: one failing version must not abort the whole run,
        # but the failure is kept so it can be reported below.
        errors.append(f"{publication_id}/{version_name}: {str(e)}")

# Summary
print(f"\nSTEP 2 Complete: {len(parsed_trees)} trees parsed")
if errors:
    print(f"{len(errors)} errors encountered")
    # List every failure; the count alone does not say which versions to inspect
    for error_message in errors:
        print(f"  - {error_message}")

STEP 2: Parsing LaTeX:   0%|          | 0/42 [00:00<?, ?version/s]


STEP 2 Complete: 42 trees parsed


In [6]:
"""
Save STEP 2 Results
====================
"""

# Persist the STEP 2 tree list as a pickle inside INTERMEDIATE_DIR
parsed_trees_path = f"{INTERMEDIATE_DIR}/parsed_trees.pkl"
with open(parsed_trees_path, "wb") as out_file:
    pickle.dump(parsed_trees, out_file)

print(f"Saved {len(parsed_trees)} trees to {parsed_trees_path}")

Saved 42 trees to intermediate/parsed_trees.pkl


---

## **STEP 2.5 - Raw Reference Extraction**

Extract bibliography entries from LaTeX sources.

In [7]:
"""
STEP 2.5: Reference Extraction
===============================
Extract BibTeX references from LaTeX files.
"""

# Collect references per publication.
#   raw_references:          publication_id -> references pooled from all of
#                            that publication's resolved versions.
#   reference_errors:        one "<publication_id>/<version>: <type>: <message>"
#                            string per version whose extraction raised.
#   deduplicated_references: publication_id -> deduplicate_references(pooled list).
raw_references = {}
reference_errors = []

for version_info in tqdm(step1_results, desc="STEP 2.5: Extracting references", unit="version"):
    # Versions that STEP 1 could not resolve are skipped
    if version_info["status"] != "RESOLVED":
        continue

    pub_id = version_info["publication_id"]
    version_path = f"{RAW_ROOT}/{pub_id}/tex/{version_info['version']}"

    try:
        references = extract_references_from_tex_files(
            version_path=version_path,
            tex_files=version_info["used_tex_files"]
        )
        # A publication only gets an entry once some version yields references
        if references:
            raw_references.setdefault(pub_id, []).extend(references)
    except Exception as e:
        # Still best-effort (the loop continues), but the failure is recorded
        # instead of being discarded so missing references can be traced.
        reference_errors.append(
            f"{pub_id}/{version_info['version']}: {type(e).__name__}: {e}"
        )

# Deduplicate per publication
deduplicated_references = {
    pub_id: deduplicate_references(refs)
    for pub_id, refs in raw_references.items()
}

# Summary
total_refs = sum(len(refs) for refs in deduplicated_references.values())
print(f"\nSTEP 2.5 Complete:")
print(f"  - Publications with references: {len(deduplicated_references)}")
print(f"  - Total unique references: {total_refs}")
if reference_errors:
    print(f"  - Versions that failed extraction: {len(reference_errors)}")
    for error_message in reference_errors:
        print(f"      {error_message}")

STEP 2.5: Extracting references:   0%|          | 0/42 [00:00<?, ?version/s]

[INFO] Parsing .bib file: ../30-paper/2211-13748/tex/2211-13748v1\mybibliography.bib
[INFO] Parsing .bib file: ../30-paper/2211-13750/tex/2211-13750v1\comparingsinglettestingschemes.bib
[INFO] Parsing .bib file: ../30-paper/2211-13750/tex/2211-13750v2\comparingsinglettestingschemes4.bib
[INFO] Parsing .bib file: ../30-paper/2211-13751/tex/2211-13751v1\bibliography_v3.bib
[INFO] Parsing .bib file: ../30-paper/2211-13752/tex/2211-13752v1\egbib.bib
[INFO] Parsing .bib file: ../30-paper/2211-13754/tex/2211-13754v1\refs.bib
[WARN] Bibliography file not found: ../30-paper/2211-13755/tex/2211-13755v1\egbib.bib
[INFO] Parsing .bib file: ../30-paper/2211-13755/tex/2211-13755v2\egbib.bib
[INFO] Parsing .bib file: ../30-paper/2211-13757/tex/2211-13757v1\bib.bib
[INFO] Parsing .bib file: ../30-paper/2211-13757/tex/2211-13757v2\bib.bib
[WARN] Bibliography file not found: ../30-paper/2211-13758/tex/2211-13758v1\example.bib
[INFO] Parsing .bib file: ../30-paper/2211-13758/tex/2211-13758v1\bibliograph

In [8]:
"""
Save STEP 2.5 Results
======================
"""

# Persist the STEP 2.5 output as a pickle inside INTERMEDIATE_DIR.
# Note: the file is named raw_references.pkl but holds the deduplicated mapping.
references_pickle_path = f"{INTERMEDIATE_DIR}/raw_references.pkl"
with open(references_pickle_path, "wb") as out_file:
    pickle.dump(deduplicated_references, out_file)

print(f"Saved {len(deduplicated_references)} reference sets to {references_pickle_path}")

Saved 24 reference sets to intermediate/raw_references.pkl
