# End-to-End Pipeline Runner
This notebook sequentially executes the ingestion, enhanced modeling, and term decay analysis scripts located in `final_submission/src`.

Order of execution:
1. Verify environment & script presence
2. Run data ingestion (`ingest_provided_data.py`)
3. Run enhanced modeling (`run_enhanced_modeling.py`), which replaces the previous full pipeline modeling step (`full_pipeline.py`)
4. Run term decay analysis (`term_decay_analysis.py`)
5. Summarize produced artifacts
6. (Optional) Parameterized execution template

In [None]:
# Section 1: Set Working Directory & Verify Scripts
# Locates final_submission/src, makes it the working directory (the later
# sections launch scripts and read artifacts through paths relative to it),
# and fails fast when a script the notebook runs is missing.
import os, pathlib, sys, json, datetime
from pprint import pprint


def _locate_src(start):
    """Return the nearest `final_submission/src` at or above *start*, or None."""
    for base in (start, *start.parents):
        nested = base / 'final_submission' / 'src'
        if nested.is_dir():
            return nested
        # After a previous run of this cell the working directory is already
        # inside src, so accept `base` itself when it is that directory.
        if base.name == 'src' and base.parent.name == 'final_submission':
            return base
    return None


PROJECT_ROOT = pathlib.Path.cwd()
candidate = _locate_src(PROJECT_ROOT)
if candidate is None:
    raise FileNotFoundError(
        f"Could not find final_submission/src at or above {PROJECT_ROOT}"
    )

SRC = candidate.resolve()
os.chdir(SRC)
print(f"Working directory set to: {SRC}")

# run_enhanced_modeling.py is launched by Section 3, so it is verified here too.
required_scripts = [
    'ingest_provided_data.py',
    'full_pipeline.py',
    'run_enhanced_modeling.py',
    'term_decay_analysis.py',
]
missing = [s for s in required_scripts if not (SRC / s).is_file()]
if missing:
    # Raise explicitly: an `assert` would be skipped when Python runs with -O.
    raise FileNotFoundError(f"Missing scripts: {missing}")
print("All required scripts present.")

print("Listing src contents (truncated):")
# Sorted so the truncated listing is the same on every run.
for p in sorted(SRC.iterdir())[:25]:
    print('-', p.name)

# Append src to sys.path if not present
if str(SRC) not in sys.path:
    sys.path.insert(0, str(SRC))
print("sys.path updated.")

In [None]:
# Section 2: Run Data Ingestion Script
# Runs ingest_provided_data.py in a child interpreter, shows truncated output,
# and lists parquet files in the data directories it is expected to populate.
# Relative paths below assume the working directory is final_submission/src.
import subprocess, time, pathlib, os, json, sys
from pprint import pprint

start = time.perf_counter()  # monotonic clock: safe for measuring elapsed time
print("Running ingestion script: ingest_provided_data.py")
result = subprocess.run([sys.executable, 'ingest_provided_data.py'], capture_output=True, text=True)
elapsed = time.perf_counter() - start
print(f"Ingestion completed in {elapsed:.2f}s with return code {result.returncode}\n")
print("--- STDOUT (first 3000 chars) ---")
print(result.stdout[:3000])
print("--- STDERR (first 1000 chars) ---")
print(result.stderr[:1000])

if result.returncode != 0:
    # Best-effort runner: keep going, but make the failure impossible to miss.
    print("WARNING: ingest_provided_data.py exited with a non-zero return code; "
          "later sections may run on missing or stale data.")
    if len(result.stderr) > 1000:
        # A Python traceback ends with the exception, which the head truncation hides.
        print("--- STDERR (last 1000 chars) ---")
        print(result.stderr[-1000:])

# Show key data directories if produced
expected_dirs = [pathlib.Path('../../data/raw'), pathlib.Path('../../data/processed'), pathlib.Path('../../data/interim')]
for d in expected_dirs:
    if d.is_dir():
        files = list(d.glob('*.parquet'))[:10]
        print(f"Directory: {d} (showing up to 10 parquet files)")
        for f in files:
            print('  -', f.name)
    else:
        print(f"(Not found) {d}")

In [None]:
# Section 3: Run Enhanced Modeling Script (replaces previous full pipeline modeling step)
# Runs run_enhanced_modeling.py in a child interpreter, shows truncated output,
# then previews the results JSON and lists model artifacts if they exist.
import subprocess, time, json, pathlib, os, sys

start = time.perf_counter()  # monotonic clock: safe for measuring elapsed time
print("Running enhanced modeling script: run_enhanced_modeling.py")
result = subprocess.run([sys.executable, 'run_enhanced_modeling.py'], capture_output=True, text=True)
elapsed = time.perf_counter() - start
print(f"Enhanced modeling completed in {elapsed:.2f}s with return code {result.returncode}\n")
print("--- STDOUT (first 3000 chars) ---")
print(result.stdout[:3000])
print("--- STDERR (first 1000 chars) ---")
print(result.stderr[:1000])

if result.returncode != 0:
    # Best-effort runner: keep going, but make the failure impossible to miss.
    print("WARNING: run_enhanced_modeling.py exited with a non-zero return code; "
          "artifacts shown below may be missing or stale.")
    if len(result.stderr) > 1000:
        # A Python traceback ends with the exception, which the head truncation hides.
        print("--- STDERR (last 1000 chars) ---")
        print(result.stderr[-1000:])

# Inspect enhanced modeling artifacts
# NOTE(review): these paths are relative to the working directory, whereas the
# other sections look under ../../data and ../../models - confirm where
# run_enhanced_modeling.py actually writes its outputs.
interim_dir = pathlib.Path('data/interim')
models_dir = pathlib.Path('models')

if interim_dir.is_dir():
    results_path = interim_dir / 'enhanced_modeling_results.json'
    if results_path.is_file():
        try:
            with open(results_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers both malformed JSON and undecodable bytes.
            print(f"Failed to read enhanced results: {e}\n")
        else:
            print(f"\nLoaded enhanced modeling results summary: {results_path}")
            if isinstance(data, dict):
                # Preview only the first few top-level keys to keep output short.
                keys_preview = {k: data[k] for k in list(data)[:8]}
            else:
                # Top-level value is not an object; preview it as-is.
                keys_preview = data
            print(json.dumps(keys_preview, indent=2)[:1500])
    else:
        print("No enhanced_modeling_results.json found yet.")

if models_dir.is_dir():
    print("\nModel artifacts (showing up to 15):")
    for p in list(models_dir.glob('*.json'))[:10] + list(models_dir.glob('*.pkl'))[:5]:
        print('  -', p.name)
else:
    print("Models directory not found.")

In [None]:
# Section 4: Run Term Decay Analysis Script
# Runs term_decay_analysis.py in a child interpreter, shows truncated output,
# then lists any files whose name mentions "decay" in the likely output folders.
# Relative paths below assume the working directory is final_submission/src.
import subprocess, sys, time, pathlib

start = time.perf_counter()  # monotonic clock: safe for measuring elapsed time
print("Running term decay analysis script: term_decay_analysis.py")
result = subprocess.run([sys.executable, 'term_decay_analysis.py'], capture_output=True, text=True)
elapsed = time.perf_counter() - start
print(f"Term decay analysis completed in {elapsed:.2f}s with return code {result.returncode}\n")
print("--- STDOUT (first 3000 chars) ---")
print(result.stdout[:3000])
print("--- STDERR (first 800 chars) ---")
print(result.stderr[:800])

if result.returncode != 0:
    # Best-effort runner: keep going, but make the failure impossible to miss.
    print("WARNING: term_decay_analysis.py exited with a non-zero return code; "
          "decay artifacts listed below may be missing or stale.")
    if len(result.stderr) > 800:
        # A Python traceback ends with the exception, which the head truncation hides.
        print("--- STDERR (last 800 chars) ---")
        print(result.stderr[-800:])

# Attempt to display any generated decay related JSON/CSV/PNG artifacts
artifact_suffixes = {'.json', '.csv', '.png'}
possible_dirs = [pathlib.Path('../../data/interim'), pathlib.Path('../../data/processed'), pathlib.Path('../../outputs'), pathlib.Path('../../reports')]
for d in possible_dirs:
    if not d.is_dir():
        continue
    decay_files = [
        p for p in d.rglob('*')
        if p.is_file()  # skip directories that merely match the name pattern
        and p.suffix.lower() in artifact_suffixes
        and 'decay' in p.name.lower()
    ][:10]
    if decay_files:
        print(f"Found decay artifacts in {d}:")
        for f in decay_files:
            print('  -', f.relative_to(d))

In [None]:
# Section 5: Capture and Display Key Outputs
# Walks the project-level data/models/outputs folders, previews the first
# summary-like JSON that loads, and shows a few rows of processed parquet files.
import pathlib, json, pandas as pd
from IPython.display import display

root_candidates = [pathlib.Path('../../data'), pathlib.Path('../../models'), pathlib.Path('../../outputs')]

# Print the files found among the first 40 entries under each existing root.
for root in root_candidates:
    if not root.is_dir():
        continue
    print(f"\nScanning: {root}")
    first_entries = list(root.rglob('*'))[:40]
    for entry in first_entries:
        if entry.is_file():
            print('-', entry.relative_to(root))

# Attempt to load a metrics or summary JSON if present
summary_tags = ('performance', 'summary', 'decay')
candidate_jsons = [
    json_path
    for root in root_candidates
    if root.is_dir()
    for json_path in root.rglob('*.json')
    if any(tag in json_path.name.lower() for tag in summary_tags)
]

# Try at most five candidates and stop at the first one that loads.
loaded = False
for cj in candidate_jsons[:5]:
    try:
        with cj.open('r', encoding='utf-8') as handle:
            data = json.load(handle)
        print(f"\nLoaded JSON summary: {cj}")
        if isinstance(data, dict):
            # Preview only the first 15 top-level keys, capped at 1500 chars.
            preview = dict(list(data.items())[:15])
            print(json.dumps(preview, indent=2)[:1500])
        loaded = True
        break
    except Exception as err:
        print(f"Failed to load {cj}: {err}")

# Display head of a processed parquet if exists
processed_dir = pathlib.Path('../../data/processed')
processed_parquets = list(processed_dir.glob('*.parquet'))[:3]
for parquet_path in processed_parquets:
    try:
        df = pd.read_parquet(parquet_path)
        print(f"\nParquet sample: {parquet_path}")
        display(df.head(3))
    except Exception as err:
        print(f"Could not read {parquet_path}: {err}")