# DBT Source Handling Demo

This notebook demonstrates that the DAG execution logic now correctly handles sources.

## Problem Fixed

Previously, nodes that depended on sources would never execute because:
1. Sources were not marked as "completed"
2. The DAG executor would wait forever for source dependencies to complete
3. This blocked the entire execution chain

## Solution

1. **Sources are pre-marked as completed** - They are assumed to exist
2. **Sources are included in the dependency graph** - But with no dependencies of their own
3. **Sources are not executed** - They are skipped during execution
4. **Nodes dependent on sources can now execute** - Their dependencies are satisfied

In [None]:
from pathlib import Path
from pprint import pprint

from ingen_fab.packages.dbt.runtime.dynamic import (
    DynamicModelLoader,
    DynamicDAGExecutor,
    DAGAnalyzer
)

In [None]:
# Project setup: the sample dbt project, given relative to the current working directory
project_path = Path("./sample_project/dbt_project")

# Report the configured path and whether it is present on disk (one line per fact)
print(
    f"Using project: {project_path}",
    f"Project exists: {project_path.exists()}",
    sep="\n",
)

## 1. Examine Sources in Manifest

In [None]:
# Build the loader for the project and take its manifest
loader = DynamicModelLoader(project_path)
manifest = loader.manifest

# Top-level manifest collections; a missing key is treated as an empty mapping
nodes = manifest.get("nodes", {})
sources = manifest.get("sources", {})

print(f"Total sources: {len(sources)}")
print(f"Total nodes: {len(nodes)}")

# Preview at most three sources, numbered from 1
print("\nSource examples:")
for i, source_id in enumerate(list(sources)[:3], 1):
    source_data = sources[source_id]
    print(f"{i}. {source_id}")
    print(f"   Database: {source_data.get('database')}")
    print(f"   Schema: {source_data.get('schema')}")
    print(f"   Name: {source_data.get('name')}")

## 2. Demonstrate Source Dependencies

In [None]:
# Collect every model that has at least one dependency listed in `sources`.
# Each entry is a (model_id, [source dependency ids]) pair.
models = loader.get_nodes_by_type("model")

source_dependent_models = []
for model_id in models:
    source_deps = [dep for dep in loader.get_dependencies(model_id) if dep in sources]
    if not source_deps:
        continue  # model has no source dependencies
    source_dependent_models.append((model_id, source_deps))

print(f"Models that depend on sources: {len(source_dependent_models)}")

# Show up to five such models, with up to two source dependencies each
print("\nExamples:")
for i, (model, deps) in enumerate(source_dependent_models[:5], 1):
    print(f"{i}. {model}", f"   Source dependencies: {len(deps)}", sep="\n")
    for dep in deps[:2]:
        print(f"     - {dep}")

## 3. Test DAG Executor Initialization

In [None]:
# Create mock Spark session for testing
class MockSparkSession:
    """Stand-in for a Spark session that reports SQL instead of running it."""

    def sql(self, query):
        """Print the first 50 characters of ``query`` and return ``None``."""
        print(f"Would execute: {query[:50]}...")


mock_spark = MockSparkSession()

# Build the DAG executor on top of the mock session
executor = DynamicDAGExecutor(mock_spark, project_path)

print(f"Total nodes in execution tracking: {len(executor.execution_status)}")

# Look up the tracked status of the first five sources;
# "NOT_FOUND" marks a source the executor is not tracking at all
source_statuses = {}
for source_id in list(sources)[:5]:
    source_statuses[source_id] = executor.execution_status.get(source_id, "NOT_FOUND")

print("\nSource execution status:")
for source_id, status in source_statuses.items():
    print(f"  {source_id}: {status}")

# True when no source has a status other than "completed"
all_sources_completed = not any(
    executor.execution_status.get(source_id) != "completed"
    for source_id in sources
)
print(f"\n✅ All sources marked as completed: {all_sources_completed}")

## 4. Test Ready Nodes Detection

In [None]:
# Ask the executor which nodes can run right now
ready_nodes = executor.get_ready_nodes()
print(f"Nodes ready to execute: {len(ready_nodes)}")

# Tallies filled in while walking the ready nodes
ready_by_type = {}      # resource_type -> number of ready nodes
source_only_deps = []   # ready nodes whose dependencies are all sources
no_deps = []            # ready nodes without any dependency

for node_id in ready_nodes:
    node = loader.get_node(node_id)
    if not node:
        continue  # nothing to classify when the loader returns no node data

    resource_type = node.get("resource_type", "unknown")
    ready_by_type.setdefault(resource_type, 0)
    ready_by_type[resource_type] += 1

    # Classify by dependencies: non-empty and entirely sources, or empty
    deps = loader.get_dependencies(node_id)
    non_source_deps = [dep for dep in deps if dep not in sources]
    if deps and not non_source_deps:
        source_only_deps.append(node_id)
    elif not deps:
        no_deps.append(node_id)

print("\nReady nodes by type:")
for rtype, count in ready_by_type.items():
    print(f"  {rtype}: {count}")

print(f"\nNodes with only source dependencies: {len(source_only_deps)}")
print(f"Nodes with no dependencies: {len(no_deps)}")

# Show up to three of the source-only nodes with their source dependency counts
print("\nExample nodes ready to execute (depend only on sources):")
for i, node_id in enumerate(source_only_deps[:3], 1):
    source_deps = [dep for dep in loader.get_dependencies(node_id) if dep in sources]
    print(f"{i}. {node_id}", f"   Source dependencies: {len(source_deps)}", sep="\n")

## 5. Test Source Skipping During Execution

In [None]:
# Test that sources are skipped during execution.
# next(iter(...), None) yields the first source id, or None when the manifest
# declares no sources; indexing an empty list here would raise IndexError.
sample_source = next(iter(sources), None)

if sample_source is None:
    print("⚠️ No sources found in the manifest; nothing to execute.")
else:
    print(f"Testing execution of source: {sample_source}")
    try:
        result = executor.execute_node(sample_source)
    except Exception as e:
        # Only a failure of execute_node itself is reported as an execution error
        print(f"❌ Error executing source: {e}")
    else:
        print("✅ Source execution handled correctly")
        print(f"   Result: {result}")
        # .get keeps a missing status entry from being misreported as an execution error
        print(f"   Status: {executor.execution_status.get(sample_source, 'NOT_FOUND')}")

## 6. Demonstrate Working Dependency Chain

In [None]:
# Walk one source-only model through the readiness checks (skipped when none were found)
if source_only_deps:
    sample_model = source_only_deps[0]
    print(f"Testing model that depends on sources: {sample_model}")

    # List what the model depends on
    deps = loader.get_dependencies(sample_model)
    print(f"Dependencies: {deps}")

    # Satisfied means no dependency has a status other than "completed"
    deps_satisfied = not any(
        executor.execution_status.get(dep) != "completed"
        for dep in deps
    )
    print(f"All dependencies satisfied: {deps_satisfied}")

    # The model should now appear among the executor's ready nodes
    is_ready = sample_model in executor.get_ready_nodes()
    print(f"Model is ready to execute: {is_ready}")

    print(
        "\n✅ SUCCESS: Models dependent on sources can now execute!"
        if is_ready
        else "\n❌ ISSUE: Model still not ready despite sources being completed"
    )

## 7. Validation Summary

In [None]:
# Comprehensive validation.
# Each entry of validation_results is a (test name, passed, details) tuple.
validation_results = []

# Test 1: Sources in dependency graph
# NOTE: `deps` holds the whole dependency graph here, not one node's dependency list.
deps = loader.build_dependency_graph()
sources_in_graph = sum(1 for node in deps if node in sources)
validation_results.append((
    "Sources included in dependency graph",
    sources_in_graph == len(sources),
    f"{sources_in_graph}/{len(sources)}"
))

# Test 2: Sources marked as completed
sources_completed = sum(
    1 for source_id in sources
    if executor.execution_status.get(source_id) == "completed"
)
validation_results.append((
    "Sources marked as completed",
    sources_completed == len(sources),
    f"{sources_completed}/{len(sources)}"
))

# Test 3: Models dependent on sources are ready
# (uses `ready_nodes` and `source_dependent_models` computed in earlier cells)
ready_source_models = sum(
    1 for model_id, _ in source_dependent_models
    if model_id in ready_nodes
)
validation_results.append((
    "Source-dependent models ready",
    ready_source_models > 0,
    f"{ready_source_models}/{len(source_dependent_models)}"
))

# Test 4: Source execution is handled
# next(iter(...), None) avoids an IndexError when the manifest has no sources.
sample_source = next(iter(sources), None)
if sample_source is None:
    source_exec_handled = False
    source_exec_details = "no sources available to test"
else:
    try:
        executor.execute_node(sample_source)
    except Exception as exc:
        # Catch Exception rather than using a bare except, so KeyboardInterrupt and
        # SystemExit still propagate and the failure reason reaches the summary.
        source_exec_handled = False
        source_exec_details = f"failed: {exc}"
    else:
        source_status = executor.execution_status.get(sample_source)
        source_exec_handled = source_status == "completed"
        source_exec_details = (
            "skipped as expected"
            if source_exec_handled
            else f"failed (status: {source_status!r})"
        )

validation_results.append((
    "Source execution handled correctly",
    source_exec_handled,
    source_exec_details
))

print("\n" + "=" * 60)
print("VALIDATION SUMMARY")
print("=" * 60)

for test_name, passed, details in validation_results:
    status = "✅ PASS" if passed else "❌ FAIL"
    print(f"{status} {test_name}: {details}")

# The overall verdict passes only if every individual check passed
all_passed = all(passed for _, passed, _ in validation_results)

print("\n" + "=" * 60)
if all_passed:
    print("🎉 ALL TESTS PASSED! Source handling is working correctly.")
    print("\nNodes that depend on sources can now execute properly!")
else:
    print("❌ Some validations failed. Source handling needs more work.")
print("=" * 60)