# dbt DAG Utilities Demo

This notebook demonstrates the comprehensive DAG analysis and visualization utilities
available in the `ingen_fab.packages.dbt.runtime.dynamic` package.

## Features

- DAG structure analysis
- Node information retrieval
- Dependency and lineage tracking
- Impact analysis
- Visualization tools
- Execution planning

In [None]:
# Import required libraries
from pathlib import Path
import json
from pprint import pprint

# Import DAG utilities
from ingen_fab.packages.dbt.runtime.dynamic import (
    DAGAnalyzer,
    DAGVisualizer,
    get_dag_info,
    get_node_details,
    find_nodes,
    analyze_impact,
    create_dag_report
)

In [None]:
# Location of the dbt project to analyse; edit this to point at your own project
dbt_project_path = Path("./sample_project")

# Report whether the configured location is present before any analysis runs
if dbt_project_path.exists():
    print(f"Using dbt project at: {dbt_project_path.absolute()}")
else:
    print(f"Project path {dbt_project_path} does not exist!")

## 1. Quick DAG Information

Use the convenience functions (`get_dag_info`, `find_nodes`) for quick access to DAG information.

In [None]:
# Fetch the project-wide summary through the convenience function
dag_summary = get_dag_info(dbt_project_path)

# (printed label, summary key) pairs, listed in the order they are reported
summary_fields = (
    ("Total Nodes", "total_nodes"),
    ("Node Types", "node_types"),
    ("DAG Depth", "dag_depth"),
    ("Has Cycles", "has_cycles"),
    ("Root Nodes", "root_nodes"),
    ("Leaf Nodes", "leaf_nodes"),
)

print("DAG Summary:")
for label, key in summary_fields:
    print(f"{label}: {dag_summary[key]}")

In [None]:
# Search term passed to find_nodes (here: dimension tables)
pattern = "dim"
matching_nodes = find_nodes(dbt_project_path, pattern)

print(f"\nNodes matching '{pattern}':")
# List at most the first ten matches
for match in matching_nodes[:10]:
    print(f"  - {match}")

# Summarise whatever was left out of the listing above
overflow = len(matching_nodes) - 10
if overflow > 0:
    print(f"  ... and {overflow} more")

## 2. Detailed Node Analysis

Get comprehensive information about specific nodes.

In [None]:
# Build the analyzer; `analyzer` and `models` are reused by the cells that follow
analyzer = DAGAnalyzer(dbt_project_path)

# Fetch every node, plus the subset whose type is "model"
all_nodes = analyzer.loader.get_all_nodes()
models = analyzer.loader.get_nodes_by_type("model")

if models:
    # The first model serves as the worked example
    sample_node = models[0]
    node_info = analyzer.get_node_info(sample_node)

    print(f"Detailed information for node: {sample_node}")
    print("=" * 60)

    # Identity fields are printed as "<label>: <value>" in a fixed order
    for label, field in (
        ("Name", "name"),
        ("Type", "resource_type"),
        ("Database", "database"),
        ("Schema", "schema"),
        ("Path", "path"),
    ):
        print(f"{label}: {node_info[field]}")

    # Direct neighbours are reported by collection size, totals as stored values
    print("\nDependencies:")
    print(f"  Direct upstream: {len(node_info['direct_dependencies'])} nodes")
    print(f"  Direct downstream: {len(node_info['direct_dependents'])} nodes")
    print(f"  Total upstream: {node_info['total_upstream']} nodes")
    print(f"  Total downstream: {node_info['total_downstream']} nodes")

    print("\nNode Properties:")
    print(f"  Is Root: {node_info['is_root']}")
    print(f"  Is Leaf: {node_info['is_leaf']}")
    print(f"  SQL Statements: {node_info['sql_count']}")

## 3. Lineage Analysis

Trace the complete lineage of data through the DAG.

In [None]:
if models:
    # Trace lineage for the first model
    lineage_target = models[0]
    lineage = analyzer.get_node_lineage(lineage_target)

    print(f"Lineage for {lineage_target}:", "=" * 60, sep="\n")

    # Both directions share one report layout, so they are printed from one loop
    for heading, direction in (
        ("\nUpstream (dependencies):", "upstream"),
        ("\nDownstream (dependents):", "downstream"),
    ):
        print(heading)
        section = lineage[direction]
        print(f"  Total: {section['total']} nodes")
        print("  By type:")
        for node_type, members in section['by_type'].items():
            print(f"    - {node_type}: {len(members)} nodes")

## 4. Impact Analysis

Understand the impact of changes to specific nodes.

In [None]:
if models:
    # Assess what a change to the first model would touch
    impact_target = models[0]
    impact = analyzer.get_impact_analysis(impact_target)

    print(f"Impact Analysis for {impact_target}:", "=" * 60, sep="\n")
    print(f"Total nodes impacted: {impact['total_impact']}")
    print(f"Immediate impact: {impact['immediate_impact']} nodes")

    print("\nImpact by resource type:")
    for resource_type, affected in impact['impacted_by_type'].items():
        print(f"  - {resource_type}: {len(affected)} nodes")

    # The critical-node section is skipped entirely when the list is empty
    critical_nodes = impact['critical_downstream']
    if critical_nodes:
        print("\nCritical downstream nodes (high further impact):")
        for entry in critical_nodes[:5]:
            print(f"  - {entry['node_id']}: impacts {entry['further_impact']} more nodes")

## 5. Execution Planning

Understand the execution order and parallelization opportunities.

In [None]:
# Ordered list of execution stages; each stage is a collection of node ids
execution_order = analyzer.get_execution_order()

print(f"Execution Plan: {len(execution_order)} stages")
print("=" * 60)

# Only the first five stages are printed in detail
for stage_number, stage_nodes in enumerate(execution_order[:5], 1):
    print(f"\nStage {stage_number}: {len(stage_nodes)} nodes can run in parallel")

    # Tally the resource types present in this stage, skipping unresolved ids
    type_counts = {}
    for member_id in stage_nodes:
        member = analyzer.loader.get_node(member_id)
        if not member:
            continue
        kind = member.get('resource_type', 'unknown')
        type_counts[kind] = type_counts.get(kind, 0) + 1

    print("  Node types:")
    for kind, tally in type_counts.items():
        print(f"    - {kind}: {tally}")

    # Name up to three nodes from the stage as examples
    print("  Sample nodes:")
    for member_id in stage_nodes[:3]:
        member = analyzer.loader.get_node(member_id)
        if member:
            print(f"    - {member.get('name', member_id)}")

# Mention how many stages were not shown
hidden_stages = len(execution_order) - 5
if hidden_stages > 0:
    print(f"\n... and {hidden_stages} more stages")

In [None]:
# Ask the analyzer for the critical path (reported below as the longest path)
critical_path = analyzer.get_critical_path()

print(f"Critical Path: {len(critical_path)} nodes")
print("=" * 60)
print("The longest path through the DAG:")

# Show the first ten steps; ids that cannot be resolved are skipped
for position, step_id in enumerate(critical_path[:10], 1):
    step = analyzer.loader.get_node(step_id)
    if not step:
        continue
    print(f"{position}. {step.get('name', step_id)} ({step.get('resource_type', 'unknown')})")

# Mention how many steps were not shown
remaining_steps = len(critical_path) - 10
if remaining_steps > 0:
    print(f"... and {remaining_steps} more nodes")

## 6. DAG Visualization

Generate various visualizations of the DAG structure.

In [None]:
# Create visualizer
# Built from the same project path that was passed to DAGAnalyzer; the
# `visualizer` instance is reused by the tree, Mermaid, and matrix cells below.
visualizer = DAGVisualizer(dbt_project_path)

# Print DAG statistics
visualizer.print_dag_stats()

In [None]:
# Render what sits downstream of the first model as an ASCII tree
if models:
    # Header goes out first, so it is visible even if tree generation fails
    print(f"\nDownstream tree for {models[0]}:", "=" * 60, sep="\n")
    tree = visualizer.generate_ascii_tree(
        models[0],
        max_depth=3,
        direction="downstream",
    )
    print(tree)

In [None]:
# Render what sits upstream of the second model; needs at least two models
if models and len(models) > 1:
    # Header goes out first, so it is visible even if tree generation fails
    print(f"\nUpstream tree for {models[1]}:", "=" * 60, sep="\n")
    tree = visualizer.generate_ascii_tree(
        models[1],
        max_depth=3,
        direction="upstream",
    )
    print(tree)

In [None]:
# Produce Mermaid source suitable for pasting into documentation
if models:
    # Target the first three models; max_nodes / include_tests are passed as 20 / False
    mermaid = visualizer.generate_mermaid_diagram(
        include_tests=False,
        max_nodes=20,
        target_nodes=models[:3],
    )

    print(
        "Mermaid Diagram (copy to Mermaid Live Editor):",
        "=" * 60,
        mermaid,
        sep="\n",
    )

In [None]:
# Show a dependency matrix for the first five models
if models:
    # Header goes out first, so it is visible even if matrix generation fails
    print("\nDependency Matrix:", "=" * 60, sep="\n")
    matrix = visualizer.generate_dependency_matrix(models[:5])
    print(matrix)

## 7. Cycle Detection

Check for and identify cycles in the DAG.

In [None]:
# detect_cycles yields a flag plus the collection of cycles it found
has_cycles, cycles = analyzer.detect_cycles()

print("Cycle Detection:", "=" * 60, sep="\n")

if not has_cycles:
    print("✅ No cycles detected - DAG is valid!")
else:
    print(f"⚠️ Found {len(cycles)} cycle(s) in the DAG!")
    # List each cycle's members, one per line, under a numbered heading
    for cycle_number, cycle_nodes in enumerate(cycles, 1):
        print(f"\nCycle {cycle_number}:")
        for member in cycle_nodes:
            print(f"  → {member}")

## 8. Export and Reporting

Export DAG information for further analysis or documentation.

In [None]:
# Export DAG to JSON
# export_dag_to_json receives the destination path; its return value is kept
# as `dag_json` and parsed with json.loads below, so it is treated as JSON text.
output_path = Path("dag_export.json")
dag_json = analyzer.export_dag_to_json(output_path)

print(f"DAG exported to {output_path}")
# NOTE(review): this reports len() of the returned value, not the size of the
# file on disk - confirm the two are meant to be the same thing.
print(f"File size: {len(dag_json)} characters")

# Show sample of exported data
# The parsed export is indexed by 'nodes' and 'edges'; only their lengths are shown.
dag_data = json.loads(dag_json)
print(f"\nExported {len(dag_data['nodes'])} nodes")
print(f"Exported {len(dag_data['edges'])} edges")

In [None]:
# Generate the full DAG report; create_dag_report is given the destination path
# and its return value is previewed below
report_path = Path("dag_report.md")
report = create_dag_report(dbt_project_path, report_path)

print(f"Report generated and saved to {report_path}")
print("\nReport Preview:")
print("=" * 60)

# Preview at most this many characters of the report
preview_limit = 1000
print(report[:preview_limit])

# Only claim truncation when part of the report was actually cut off; a short
# report is shown in full and must not be labelled as truncated
if len(report) > preview_limit:
    print("\n... (truncated)")

## 9. Advanced Queries

Examples of more complex DAG queries and analysis.

In [None]:
# List the test nodes whose dependencies include one chosen model
if models:
    target_model = models[0]
    all_tests = analyzer.loader.get_nodes_by_type("test")

    # Keep each test whose dependencies contain the target model
    dependent_tests = [
        candidate
        for candidate in all_tests
        if target_model in analyzer.loader.get_dependencies(candidate)
    ]

    print(f"Tests that depend on {target_model}:")
    for test_id in dependent_tests:
        test_info = analyzer.loader.get_node(test_id)
        # Tests that cannot be resolved to a node record are not listed
        if test_info:
            print(f"  - {test_info.get('name', test_id)}")

In [None]:
# Find models with no tests: collect every "model."-prefixed dependency of any
# test node, then subtract that set from the set of all model nodes
all_tests = analyzer.loader.get_nodes_by_type("test")

models_with_tests = {
    dep
    for test_node in all_tests
    for dep in analyzer.loader.get_dependencies(test_node)
    if dep.startswith("model.")
}

all_models = set(analyzer.loader.get_nodes_by_type("model"))
models_without_tests = all_models - models_with_tests

print(f"Models without tests: {len(models_without_tests)} of {len(all_models)}")
if models_without_tests:
    print("\nSample models without tests:")
    # Sort before slicing: a set has no defined iteration order, so slicing it
    # directly would show an arbitrary sample that can differ between runs
    for model in sorted(models_without_tests)[:10]:
        model_info = analyzer.loader.get_node(model)
        # Ids that cannot be resolved to a node record are not listed
        if model_info:
            print(f"  - {model_info.get('name', model)}")

In [None]:
# Collect nodes that have neither dependencies nor dependents
isolated_nodes = []
for candidate in analyzer.loader.get_all_nodes():
    upstream = analyzer.loader.get_dependencies(candidate)
    downstream = analyzer.loader.get_dependents(candidate)

    # Any connection in either direction rules the node out
    if upstream or downstream:
        continue
    isolated_nodes.append(candidate)

if not isolated_nodes:
    print("No isolated nodes found!")
else:
    print(f"Found {len(isolated_nodes)} isolated nodes:")
    # Show at most ten, skipping ids that cannot be resolved
    for isolated_id in isolated_nodes[:10]:
        node_info = analyzer.loader.get_node(isolated_id)
        if not node_info:
            continue
        rtype = node_info.get('resource_type', 'unknown')
        name = node_info.get('name', isolated_id)
        print(f"  - {name} ({rtype})")

## 10. Performance Analysis

Analyze DAG characteristics that might impact performance.

In [None]:
# Pair every node id with its number of direct dependents (one lookup per node)
dependent_counts = (
    (node_id, len(analyzer.loader.get_dependents(node_id)))
    for node_id in analyzer.loader.get_all_nodes()
)

# Keep nodes with more than three direct dependents, largest count first
bottlenecks = sorted(
    (entry for entry in dependent_counts if entry[1] > 3),
    key=lambda entry: entry[1],
    reverse=True,
)

print(
    "Potential Bottlenecks (nodes with many direct dependents):",
    "=" * 60,
    sep="\n",
)
# Report the ten largest, skipping ids that cannot be resolved
for node_id, dep_count in bottlenecks[:10]:
    node_info = analyzer.loader.get_node(node_id)
    if not node_info:
        continue
    name = node_info.get('name', node_id)
    rtype = node_info.get('resource_type', 'unknown')
    print(f"{name} ({rtype}): {dep_count} direct dependents")

In [None]:
# Measure how evenly nodes are spread across the execution stages
execution_order = analyzer.get_execution_order()

# Node count of each stage, computed once and reused below
stage_sizes = [len(stage) for stage in execution_order]
total_nodes = sum(stage_sizes)
stages = len(execution_order)

if stages > 0:
    avg_parallelism = total_nodes / stages
    max_parallelism = max(stage_sizes)

    print("Parallelization Analysis:", "=" * 60, sep="\n")
    print(f"Total execution stages: {stages}")
    print(f"Average nodes per stage: {avg_parallelism:.2f}")
    print(f"Maximum parallel nodes: {max_parallelism}")
    # Efficiency is the average stage size as a percentage of the largest stage
    print(f"\nParallelization efficiency: {(avg_parallelism / max_parallelism * 100):.1f}%")

    # One bar per stage for the first ten stages; bars are capped at 50 blocks
    print("\nStage size distribution:")
    for i, size in enumerate(stage_sizes[:10], 1):
        bar = "█" * min(size, 50)
        print(f"  Stage {i:2d}: {bar} ({size} nodes)")

## Summary

This notebook demonstrated the comprehensive DAG utilities available for:

1. **Analysis**: Understanding DAG structure, dependencies, and characteristics
2. **Navigation**: Finding nodes, tracing lineage, and exploring relationships
3. **Impact Assessment**: Understanding the effects of changes
4. **Visualization**: Multiple ways to visualize DAG structure
5. **Optimization**: Identifying bottlenecks and parallelization opportunities
6. **Validation**: Detecting cycles and structural issues
7. **Reporting**: Generating comprehensive reports and exports

These utilities make it easy to understand and work with complex dbt DAGs dynamically.