In [None]:
import os
import sys
import json

# --- path setup ---
# Relative paths are resolved from the directory this notebook is launched from.
# Expected layout: the notebook lives in 'notebooks/' and the source code lives in
# '23127011/src', so the '23127011' folder is what has to be importable.

current_dir = os.getcwd()
project_base = os.path.abspath(os.path.join(current_dir, '..'))
src_container = os.path.join(project_base, '23127011')

# Guarded so that re-running this cell does not add the same entry twice.
if src_container not in sys.path:
    sys.path.append(src_container)

# Echo the resolved locations so a wrong launch directory is easy to spot.
for label, location in (
    ("Current Directory", current_dir),
    ("Project Base", project_base),
    ("Added to sys.path", src_container),
):
    print(f"{label}: {location}")

In [None]:
from src.pipeline import run_processing_pipeline

In [None]:
# --- Configuration ---
# Input and output directories, both located directly under the project base.
DATA_RAW_PATH = os.path.join(project_base, 'data_raw')
DATA_OUTPUT_PATH = os.path.join(project_base, 'data_output')

# Show both resolved paths, one per line.
print(f"Input: {DATA_RAW_PATH}", f"Output: {DATA_OUTPUT_PATH}", sep="\n")

In [None]:
# --- Execution ---
# Call the imported pipeline entry point, passing the raw-data directory first and
# the output directory second. The verification cells that follow look for
# 'hierarchy.json' and 'refs.bib' inside DATA_OUTPUT_PATH.
# NOTE(review): presumably the pipeline creates the output directory and writes
# those two files itself — confirm against src/pipeline.py.
run_processing_pipeline(DATA_RAW_PATH, DATA_OUTPUT_PATH)

## Verification
Inspect the generated files (`hierarchy.json` and `refs.bib`) in the output directory to confirm that the pipeline produced them and that their contents look reasonable.

In [None]:
hierarchy_path = os.path.join(DATA_OUTPUT_PATH, 'hierarchy.json')
refs_path = os.path.join(DATA_OUTPUT_PATH, 'refs.bib')

# 1. Check Output Files
# Report, for each expected output, whether it exists on disk.
for output_file in (hierarchy_path, refs_path):
    status = 'Found' if os.path.exists(output_file) else 'Missing'
    print(f"Checking {output_file}: {status}")

In [None]:
# 2. Inspect Hierarchy Snippet
# Nothing is printed when hierarchy.json does not exist.
if os.path.exists(hierarchy_path):
    with open(hierarchy_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    print(f"\nHIERARCHY INFO:")
    # Keys of the top-level 'hierarchy' mapping (empty if the key is absent).
    versions = list(data.get('hierarchy', {}).keys())
    print(f"  Versions: {versions}")
    elements = data.get('elements', {})
    print(f"  Total Elements: {len(elements)}")

    print("\n--- Sample Elements ---")
    # Show the first five entries, each as a repr cut to 80 characters.
    for element_key, element_value in list(elements.items())[:5]:
        preview = repr(element_value)[:80]
        print(f"[{element_key}]: {preview}...")

In [None]:
# 3. Inspect References Snippet
# Preview the head of refs.bib. Nothing is printed when the file does not exist.
if os.path.exists(refs_path):
    print("\n--- refs.bib (First 15 lines) ---")
    with open(refs_path, 'r', encoding='utf-8') as f:
        # Iterate the file object so the loop stops at end-of-file. Calling
        # readline() a fixed 15 times returns '' past EOF, which padded the
        # preview of a short file with blank lines that are not in the file.
        for i, line in enumerate(f):
            if i >= 15:
                break
            print(line.strip())