## Imports & Configuration

In [1]:
import os
import sys
import json
import pickle
import gc
from tqdm import tqdm
from typing import Dict, List
from collections import defaultdict

# Put the parent of the current working directory at the front of sys.path
# (only if it is not already listed) so the `src.*` imports below resolve.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(project_root) == 0:
    sys.path.insert(0, project_root)

# Import pipeline modules
from src.scanner.dataset_scanner import scan_dataset
from src.parser.version_resolver import resolve_version
from src.parser.hierarchy_parser import parse_tex_files, save_tree_to_cache, load_tree_from_cache
from src.parser.id_assigner import fast_normalize_and_id, build_content_index, deduplicate_tree

# Pipeline settings (paths are relative to the working directory)
RAW_ROOT = "../30-paper"      # raw dataset root passed to the scanner
OUTPUT_DIR = "23127453"       # folder that receives the exported files
CACHE_DIR = ".cache"          # cache folder created below
USING_SHA256_HASH = False     # flag forwarded to the deduplication helpers

# Make sure both folders exist; nothing happens when they are already there
for _folder in (OUTPUT_DIR, CACHE_DIR):
    os.makedirs(_folder, exist_ok=True)

## STEP 0 - Dataset Scanning

Scan the raw dataset to identify valid publications with LaTeX sources.

In [2]:
# Scan the dataset. The result maps each publication id to an info dict that
# is read below through its "status" and "versions" keys.
scan_result = scan_dataset(RAW_ROOT)

# Tally publications per status in a single pass over the scan result.
status_tally = defaultdict(int)
for pub_info in scan_result.values():
    status_tally[pub_info["status"]] += 1

total_pubs = len(scan_result)
ready_pubs = status_tally["READY"]
no_tex_pubs = status_tally["NO_TEX"]
invalid_pubs = status_tally["INVALID"]

# Print statistics
print(f"\n{'='*60}")
print("DATASET SCAN RESULTS")
print(f"{'='*60}")
print(f"Total publications:     {total_pubs}")
print(f"  ├─ READY:             {ready_pubs}")
print(f"  ├─ NO_TEX:            {no_tex_pubs}")
print(f"  └─ INVALID:           {invalid_pubs}")
print(f"{'='*60}\n")

# Preview the first five entries; no positional index is needed here.
print("Sample publications:")
for pub_id, info in list(scan_result.items())[:5]:
    print(f"  {pub_id}: {info['status']} - {len(info['versions'])} version(s)")


DATASET SCAN RESULTS
Total publications:     30
  ├─ READY:             29
  ├─ NO_TEX:            1
  └─ INVALID:           0

Sample publications:
  2211-13747: NO_TEX - 0 version(s)
  2211-13748: READY - 1 version(s)
  2211-13749: READY - 1 version(s)
  2211-13750: READY - 2 version(s)
  2211-13751: READY - 1 version(s)


## STEP 1 - Version Resolution

Resolve LaTeX file structure for each version of each publication.

In [3]:
# Keep only the publications the scan marked as READY.
ready_items = [
    entry for entry in scan_result.items()
    if entry[1]["status"] == "READY"
]

# Resolve every version of every READY publication, in scan order.
step1_results = []
for pub_id, info in tqdm(ready_items, desc="STEP 1: Version Resolution"):
    tex_root = os.path.join(RAW_ROOT, pub_id, "tex")
    step1_results.extend(
        resolve_version(
            publication_id=pub_id,
            version_name=version,
            version_path=os.path.join(tex_root, version),
        )
        for version in info["versions"]
    )

# Report how many versions came back with status RESOLVED.
resolved_count = sum(r["status"] == "RESOLVED" for r in step1_results)
print(f"\nResolved {resolved_count}/{len(step1_results)} versions")

STEP 1: Version Resolution: 100%|██████████| 29/29 [00:00<00:00, 437.63it/s]


Resolved 42/42 versions





## STEP 2 - LaTeX Hierarchy Parsing

Parse each LaTeX version into a hierarchical tree structure and cache to disk.

In [4]:
step2_results = []

for version_info in tqdm(step1_results, desc="STEP 2: Parsing LaTeX"):
    # Only versions that Step 1 marked RESOLVED are parsed.
    if version_info["status"] == "RESOLVED":
        publication_id = version_info["publication_id"]
        version_name = version_info["version"]
        version_path = os.path.join(RAW_ROOT, publication_id, "tex", version_name)

        try:
            # Build the hierarchy tree from the files Step 1 selected.
            root_node = parse_tex_files(
                version_path=version_path,
                tex_files=version_info["used_tex_files"],
            )

            # Persist the tree and remember only where it was stored.
            cache_path = save_tree_to_cache(
                pub_id=publication_id,
                version=version_name,
                root=root_node,
            )
            step2_results.append(
                {
                    "publication_id": publication_id,
                    "version": version_name,
                    "cache_path": cache_path,
                }
            )

            # Drop the in-memory tree before moving to the next version.
            del root_node
            gc.collect()

        except Exception as e:
            # Best effort: report the failing version and carry on.
            print(f"\n[ERROR] {version_info['publication_id']}/{version_info['version']}: {e}")

print(f"\nParsed {len(step2_results)} versions successfully")

STEP 2: Parsing LaTeX: 100%|██████████| 42/42 [00:01<00:00, 24.37it/s]


Parsed 42 versions successfully





## STEP 3 - Deduplication & ID Assignment

For each publication, take its lowest-numbered version as the base tree, then deduplicate every later version against a content index built from that base.

In [5]:
# Initialize hash cache if using SHA256
# NOTE(review): _hash_cache is only created and deleted in this cell; nothing
# visible here reads it — confirm whether it is still needed.
if USING_SHA256_HASH:
    _hash_cache = {}


def _version_rank(entry):
    """Sort key: the integer after the last "v" in entry["version"], or 0 when the name has no "v"."""
    label = entry["version"]
    return int(label.split("v")[-1]) if "v" in label else 0


# Bucket the parsed versions under their publication id
pub_groups = defaultdict(list)
for parsed in step2_results:
    pub_groups[parsed["publication_id"]].append(parsed)

final_trees = {}

for pub_id, versions in tqdm(pub_groups.items(), desc="STEP 3: Deduplication"):
    if not versions:
        continue

    # Order the versions in place by version number; the first one is the base.
    versions.sort(key=_version_rank)
    base_info, *later_infos = versions

    # Load the base tree, assign ids, and index its content.
    base_root = load_tree_from_cache(base_info["cache_path"])
    fast_normalize_and_id(base_root, pub_id, base_info["version"])
    content_index = build_content_index(base_root, USING_SHA256_HASH)

    # Run each later version through deduplication against the base index.
    # NOTE(review): the return value of deduplicate_tree is not used and only
    # base_root is stored, so any merging presumably happens through
    # content_index or in place — confirm in src.parser.id_assigner.
    for later_info in later_infos:
        later_root = load_tree_from_cache(later_info["cache_path"])
        fast_normalize_and_id(later_root, pub_id, later_info["version"])
        deduplicate_tree(later_root, content_index, USING_SHA256_HASH)
        del later_root

    final_trees[pub_id] = base_root

    # Collect garbage after every 50th stored publication
    if len(final_trees) % 50 == 0:
        gc.collect()

# Cleanup
if USING_SHA256_HASH:
    del _hash_cache
gc.collect()

print(f"\nMerged {len(final_trees)} publications with deduplication")

STEP 3: Deduplication: 100%|██████████| 29/29 [00:02<00:00, 12.01it/s]



Merged 29 publications with deduplication


## Validation - Quick Check

Spot-check the merged result for one publication (the first entry in `final_trees`): print its root attributes and a count of nodes per type.

In [6]:
# Inspect the first publication stored in final_trees
if not final_trees:
    print("No publications parsed successfully.")
else:
    sample_pub = next(iter(final_trees))
    root = final_trees[sample_pub]

    # Optional attributes fall back to 'N/A' when the root does not have them.
    print(f"Sample Publication: {sample_pub}")
    print(f"  Root type:        {root.node_type}")
    print(f"  Title:            {getattr(root, 'title', 'N/A')}")
    print(f"  Children:         {len(root.children)}")
    print(f"  ID:               {getattr(root, 'id', 'N/A')}")

    def count_nodes(node, counts=None):
        """Count node_type occurrences over node and all of its descendants.

        Adds to `counts` when one is supplied, otherwise starts a fresh
        defaultdict(int); the mapping that was updated is returned.
        """
        tally = defaultdict(int) if counts is None else counts
        pending = [node]
        while pending:
            current = pending.pop()
            tally[current.node_type] += 1
            # Push children reversed so they are visited in original order.
            pending.extend(reversed(list(current.children)))
        return tally

    node_counts = count_nodes(root)
    print("\n  Node type distribution:")
    for node_type in sorted(node_counts):
        count = node_counts[node_type]
        print(f"    {node_type:15s}: {count:5d}")

Sample Publication: 2211-13748
  Root type:        document
  Title:            None
  Children:         16
  ID:               2211-13748_2211-13748v1_000000

  Node type distribution:
    document       :     1
    enumerate      :     4
    figure         :     4
    section        :     7
    sentence       :   209
    subsection     :    10
    subsubsection  :    14
    table          :     3


## STEP 4 - Export to JSON

Export final trees to structured JSON files for downstream processing.

In [7]:
# Serialization helper
def serialize_node(node):
    """Turn a tree node and its descendants into nested plain dictionaries.

    Each dictionary has the keys "id", "type", "full_text" and "children",
    in that order. "full_text" falls back to an empty string when the node
    has no such attribute; "children" holds the serialized child nodes.
    """
    payload = {}
    payload["id"] = node.id
    payload["type"] = node.node_type
    payload["full_text"] = getattr(node, "full_text", "")
    payload["children"] = list(map(serialize_node, node.children))
    return payload

def _copy_json_if_present(source_path, target_path):
    """Re-serialize the JSON file at source_path into target_path.

    Args:
        source_path: Path of the JSON file to read (UTF-8).
        target_path: Path the parsed content is written to (UTF-8,
            non-ASCII characters kept as-is, 2-space indent).

    Returns:
        True when source_path existed and target_path was written; False
        when source_path does not exist, in which case nothing is written.

    Raises:
        Whatever json.load raises if the source is not valid JSON.
    """
    if not os.path.exists(source_path):
        return False
    with open(source_path, "r", encoding="utf-8") as src:
        payload = json.load(src)
    with open(target_path, "w", encoding="utf-8") as dst:
        json.dump(payload, dst, ensure_ascii=False, indent=2)
    return True


# Export statistics
export_count = 0
missing_metadata = []
missing_references = []

# Per-publication sidecar files copied from the raw dataset, each paired with
# the list that collects ids of publications lacking that file.
sidecar_files = (
    ("metadata.json", missing_metadata),
    ("references.json", missing_references),
)

print("Exporting publications to output directory...")
print(f"Target: {os.path.abspath(OUTPUT_DIR)}\n")

for pub_id, root in tqdm(final_trees.items(), desc="STEP 4: Exporting", unit="pub"):

    # Create publication subdirectory
    pub_output_dir = os.path.join(OUTPUT_DIR, pub_id)
    os.makedirs(pub_output_dir, exist_ok=True)

    # ===== 1. Export Content Tree (Parsed Hierarchy) =====
    content_json = {
        "publication_id": pub_id,
        "content_tree": serialize_node(root)
    }

    content_path = os.path.join(pub_output_dir, f"{pub_id}.json")
    with open(content_path, "w", encoding="utf-8") as f:
        json.dump(content_json, f, ensure_ascii=False, indent=2)

    # ===== 2./3. Copy Metadata and References (originals from the raw dataset) =====
    for sidecar_name, missing_ids in sidecar_files:
        copied = _copy_json_if_present(
            os.path.join(RAW_ROOT, pub_id, sidecar_name),
            os.path.join(pub_output_dir, sidecar_name),
        )
        if not copied:
            missing_ids.append(pub_id)

    export_count += 1

print(f"Exported {export_count} publications to: {OUTPUT_DIR}")

# Warn about absent sidecar files, listing at most five publication ids each.
for sidecar_name, missing_ids in sidecar_files:
    if missing_ids:
        print(f"\nWARNING: {len(missing_ids)} publication(s) missing {sidecar_name}")
        print(f"   {', '.join(missing_ids[:5])}" + (" ..." if len(missing_ids) > 5 else ""))

Exporting publications to output directory...
Target: c:\Users\Feng Wang\OneDrive\Documents\HK7 HCMUS\Introduction to Data\Project\Milestone 2\-IntroDS-_milestone2\src\23127453



STEP 4: Exporting: 100%|██████████| 29/29 [00:00<00:00, 227.05pub/s]

Exported 29 publications to: 23127453



