In [10]:
import requests
import json
import time
from datetime import datetime

# Airflow configuration
AIRFLOW_URL = "http://localhost:6969"
AIRFLOW_USER = "admin"
AIRFLOW_PASSWORD = "admin"

# DAG configuration
DAG_ID = "osm_collection"

# OSM Collection DAG Test

This notebook triggers the `osm_collection` DAG via webhook with a test polygon.

In [11]:
# Test polygon (lat, lon pairs)
test_polygon = [
    [43.73421, -80.41607], 
    [43.69177, -80.46434], 
    [43.63798, -80.42520], 
    [43.65827, -80.33387], 
    [43.71535, -80.32865], 
    [43.74788, -80.38736], 
    [43.73421, -80.41607]
]

# Convert to GeoJSON format (swap to lon, lat for GeoJSON)
geojson_polygon = {
    "type": "Polygon",
    "coordinates": [
        [[lon, lat] for lat, lon in test_polygon]
    ]
}

print("GeoJSON Polygon:")
print(json.dumps(geojson_polygon, indent=2))

GeoJSON Polygon:
{
  "type": "Polygon",
  "coordinates": [
    [
      [
        -80.41607,
        43.73421
      ],
      [
        -80.46434,
        43.69177
      ],
      [
        -80.4252,
        43.63798
      ],
      [
        -80.33387,
        43.65827
      ],
      [
        -80.32865,
        43.71535
      ],
      [
        -80.38736,
        43.74788
      ],
      [
        -80.41607,
        43.73421
      ]
    ]
  ]
}


In [12]:
import requests

BASE_URL = "http://localhost:6969"
USERNAME = "airflow"
PASSWORD = "airflow"

resp = requests.post(
    f"{BASE_URL}/auth/token",
    json={"username": USERNAME, "password": PASSWORD},
    headers={"Content-Type": "application/json"},
    timeout=10,
)
resp.raise_for_status()

access_token = resp.json()["access_token"]
print(access_token)

# Example authenticated call (Public API example uses /api/v2/...)
api_resp = requests.get(
    f"{BASE_URL}/api/v2/dags",
    headers={"Authorization": f"Bearer {access_token}"},
    timeout=10,
)
api_resp.raise_for_status()
print(api_resp.json())


eyJhbGciOiJIUzUxMiIsInR5cCI6IkpXVCJ9.eyJzdWIiOiIxIiwianRpIjoiMzE3MWZlNjQxNThmNDY1ZTk2YjZmNzkyMDY3NjlkMTIiLCJpc3MiOltdLCJhdWQiOiJhcGFjaGUtYWlyZmxvdyIsIm5iZiI6MTc3MDg1ODAxOCwiZXhwIjoxNzcwOTQ0NDE4LCJpYXQiOjE3NzA4NTgwMTh9.zDWXn1Ob63G0Y_M5cW883L5lgL2uvcNdTjud1U3sj46cxmpGZslJ-69Nw-rz_-CSVu3lifwAPXg4HvLHR7TACQ
{'dags': [{'dag_id': 'asset1_producer', 'dag_display_name': 'asset1_producer', 'is_paused': True, 'is_stale': False, 'last_parsed_time': '2026-02-12T01:00:06.057896Z', 'last_parse_duration': 0.05803735999870696, 'last_expired': None, 'bundle_name': 'example_dags', 'bundle_version': None, 'relative_fileloc': 'example_asset_decorator.py', 'fileloc': '/home/airflow/.local/lib/python3.12/site-packages/airflow/example_dags/example_asset_decorator.py', 'description': None, 'timetable_summary': None, 'timetable_description': 'Never, external triggers only', 'tags': [], 'max_active_tasks': 16, 'max_active_runs': 16, 'max_consecutive_failed_dag_runs': 0, 'has_task_concurrency_limits': False, 'ha

In [13]:
jwt_token = access_token

In [14]:
def check_dag_exists_jwt(dag_id, token):
    """Check if a specific DAG exists using JWT authentication."""
    url = f"{AIRFLOW_URL}/api/v2/dags/{dag_id}"
    
    headers = {
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json"
    }
    
    try:
        response = requests.get(url, headers=headers, timeout=10)
        
        if response.status_code == 200:
            data = response.json()
            is_paused = data.get("is_paused", True)
            print(f"✓ DAG '{dag_id}' exists")
            print(f"  Paused: {is_paused}")
            if is_paused:
                print("  ⚠ Warning: DAG is paused")
            return True, is_paused
        elif response.status_code == 404:
            print(f"✗ DAG '{dag_id}' not found")
            return False, None
        else:
            print(f"✗ Error checking DAG: {response.status_code}")
            print(f"  Response: {response.text}")
            return False, None
            
    except Exception as e:
        print(f"✗ Error checking DAG: {e}")
        return False, None


def unpause_dag_jwt(dag_id, token):
    """Unpause a DAG using JWT authentication."""
    url = f"{AIRFLOW_URL}/api/v2/dags/{dag_id}"
    
    headers = {
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json"
    }
    
    payload = {"is_paused": False}
    
    try:
        response = requests.patch(url, headers=headers, json=payload, timeout=10)
        
        if response.status_code == 200:
            print(f"✓ Successfully unpaused DAG: {dag_id}")
            return True
        else:
            print(f"✗ Failed to unpause DAG: {response.status_code}")
            print(f"  Response: {response.text}")
            return False
            
    except Exception as e:
        print(f"✗ Error unpausing DAG: {e}")
        return False


# Check DAG and unpause if needed
jwt_token = access_token
if jwt_token:
    print("\n" + "="*60)
    print("Checking DAG Status with JWT")
    print("="*60)
    
    dag_exists, is_paused = check_dag_exists_jwt(DAG_ID, jwt_token)
    
    if dag_exists and is_paused:
        print("\nAttempting to unpause DAG...")
        unpause_dag_jwt(DAG_ID, jwt_token)
    
    print("="*60)


Checking DAG Status with JWT
✓ DAG 'osm_collection' exists
  Paused: False


In [15]:
import json, time
from datetime import datetime, timezone
import requests

def _utc_logical_date_now_z() -> str:
    # Airflow REST examples commonly use UTC timestamps like "YYYY-MM-DDTHH:MM:SSZ"
    # (seconds precision is plenty for manual triggers)
    return datetime.now(timezone.utc).replace(microsecond=0).strftime("%Y-%m-%dT%H:%M:%SZ")

def trigger_dag_jwt(dag_id, polygon_data, token, logical_date: str | None = None, dag_run_id: str | None = None):
    """Trigger DAG using JWT authentication."""
    url = f"{AIRFLOW_URL}/api/v2/dags/{dag_id}/dagRuns"

    headers = {
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json",
    }

    payload = {
        # REQUIRED on your server (422 shows it's required by the API validation)
        "logical_date": logical_date or _utc_logical_date_now_z(),
        # Optional but useful: set your own run id so you can reference it later
        **({"dag_run_id": dag_run_id} if dag_run_id else {}),
        "conf": {
            "polygon": {
                "type": "geojson",
                "data": polygon_data,
            }
        },
    }

    print("Payload being sent:")
    print(json.dumps(payload, indent=2))
    print("\n" + "=" * 60)

    try:
        response = requests.post(url, headers=headers, json=payload, timeout=10)

        if response.status_code in (200, 201):
            data = response.json()
            run_id = data.get("dag_run_id") or data.get("run_id")  # version-tolerant
            print(f"✓ Successfully triggered DAG: {dag_id}")
            print(f"  DAG Run ID: {run_id}")
            print(f"  State: {data.get('state')}")
            print(f"  Logical Date: {data.get('logical_date')}")
            return True, run_id, data

        print(f"✗ Failed to trigger DAG: {response.status_code}")
        print(f"  Response: {response.text}")
        return False, None, None

    except Exception as e:
        print(f"✗ Error triggering DAG: {e}")
        return False, None, None


def get_dag_run_status_jwt(dag_id, dag_run_id, token):
    url = f"{AIRFLOW_URL}/api/v2/dags/{dag_id}/dagRuns/{dag_run_id}"
    headers = {"Authorization": f"Bearer {token}"}
    try:
        r = requests.get(url, headers=headers, timeout=10)
        if r.status_code == 200:
            return r.json()
        print(f"Failed to get DAG run status: {r.status_code} {r.text}")
        return None
    except Exception as e:
        print(f"Error getting DAG run status: {e}")
        return None


def get_task_instances_jwt(dag_id, dag_run_id, token):
    url = f"{AIRFLOW_URL}/api/v2/dags/{dag_id}/dagRuns/{dag_run_id}/taskInstances"
    headers = {"Authorization": f"Bearer {token}"}
    try:
        r = requests.get(url, headers=headers, timeout=10)
        if r.status_code == 200:
            return r.json().get("task_instances", [])
        print(f"Failed to get task instances: {r.status_code} {r.text}")
        return []
    except Exception as e:
        print(f"Error getting task instances: {e}")
        return []


def monitor_dag_jwt(dag_id, dag_run_id, token, max_wait_seconds=900, poll_interval=5):
    start_time = time.time()
    previous_state = None

    print("\n" + "=" * 60)
    print("Monitoring DAG Execution")
    print("=" * 60)

    while time.time() - start_time < max_wait_seconds:
        status = get_dag_run_status_jwt(dag_id, dag_run_id, token)
        if not status:
            print("Failed to get status")
            break

        current_state = status.get("state")
        if current_state != previous_state:
            elapsed = int(time.time() - start_time)
            print(f"\n[{elapsed}s] State: {current_state}")

            tasks = get_task_instances_jwt(dag_id, dag_run_id, token)
            for t in tasks:
                tid = t.get("task_id")
                st = t.get("state")
                if st:
                    print(f"  → {tid}: {st}")

            previous_state = current_state

        if current_state in ("success", "failed"):
            print("\n" + "=" * 60)
            print(f"DAG Execution Complete: {current_state.upper()}")
            print("=" * 60)
            return current_state

        time.sleep(poll_interval)

    print("\n" + "=" * 60)
    print("Monitoring timeout reached")
    print("=" * 60)
    return previous_state


# Trigger DAG if we have a token
if jwt_token and dag_exists:
    print("\n" + "=" * 60)
    print(f"Triggering DAG: {DAG_ID} (using JWT)")
    print("=" * 60)

    success_jwt, dag_run_id_jwt, response_data_jwt = trigger_dag_jwt(
        DAG_ID, geojson_polygon, jwt_token
    )

    if success_jwt:
        final_state = monitor_dag_jwt(DAG_ID, dag_run_id_jwt, jwt_token)
        if final_state == "success":
            print("\n✓ OSM Collection pipeline completed successfully!")
        elif final_state == "failed":
            print("\n✗ Pipeline failed. Check Airflow logs for details.")
    else:
        print("\nFailed to trigger DAG using JWT")
elif not jwt_token:
    print("\n⚠ No JWT token available. Use Airflow CLI to trigger:")
    print(f"\n  docker compose exec -T airflow-scheduler airflow dags trigger {DAG_ID} --conf '{{...}}'")
elif not dag_exists:
    print("\n✗ DAG does not exist")



Triggering DAG: osm_collection (using JWT)
Payload being sent:
{
  "logical_date": "2026-02-12T01:00:18Z",
  "conf": {
    "polygon": {
      "type": "geojson",
      "data": {
        "type": "Polygon",
        "coordinates": [
          [
            [
              -80.41607,
              43.73421
            ],
            [
              -80.46434,
              43.69177
            ],
            [
              -80.4252,
              43.63798
            ],
            [
              -80.33387,
              43.65827
            ],
            [
              -80.32865,
              43.71535
            ],
            [
              -80.38736,
              43.74788
            ],
            [
              -80.41607,
              43.73421
            ]
          ]
        ]
      }
    }
  }
}

✓ Successfully triggered DAG: osm_collection
  DAG Run ID: manual__2026-02-12T01:00:18+00:00
  State: queued
  Logical Date: 2026-02-12T01:00:18Z

Monitoring DAG Execution

[0s] 

## Verify Airflow Connection

Check if Airflow is accessible and the DAG exists.

In [16]:
def trigger_dag(dag_id, polygon_data):
    """
    Trigger the OSM Collection DAG via REST API.
    
    Args:
        dag_id: The DAG ID to trigger
        polygon_data: GeoJSON polygon object
    
    Returns:
        tuple: (success: bool, dag_run_id: str, response_data: dict)
    """
    url = f"{AIRFLOW_URL}/api/v2/dags/{dag_id}/dagRuns"
    
    headers = {
        "Content-Type": "application/json",
    }
    
    # Prepare the configuration payload
    payload = {
        "conf": {
            "polygon": {
                "type": "geojson",
                "data": polygon_data
            }
        }
    }
    
    print("Payload being sent:")
    print(json.dumps(payload, indent=2))
    print("\n" + "="*60)
    
    try:
        response = requests.post(
            url,
            auth=(AIRFLOW_USER, AIRFLOW_PASSWORD),
            headers=headers,
            json=payload,
            timeout=10
        )
        
        if response.status_code == 200:
            data = response.json()
            dag_run_id = data.get("dag_run_id")
            print(f"✓ Successfully triggered DAG: {dag_id}")
            print(f"  DAG Run ID: {dag_run_id}")
            print(f"  State: {data.get('state')}")
            return True, dag_run_id, data
        elif response.status_code == 405:
            print(f"✗ Method Not Allowed (405)")
            print("  This usually means:")
            print("  1. The DAG doesn't exist or hasn't been loaded by Airflow")
            print("  2. The Airflow REST API might not be properly enabled")
            print("  3. Check if the DAG file is in the dags/ folder")
            print(f"\n  Response: {response.text}")
            return False, None, None
        else:
            print(f"✗ Failed to trigger DAG: {response.status_code}")
            print(f"  Response: {response.text}")
            return False, None, None
            
    except Exception as e:
        print(f"✗ Error triggering DAG: {e}")
        return False, None, None


# Only trigger if DAG exists
if dag_exists:
    print("\n" + "="*60)
    print(f"Triggering DAG: {DAG_ID}")
    print("="*60)
    
    success, dag_run_id, response_data = trigger_dag(DAG_ID, geojson_polygon)
    
    if success:
        print(f"\nDAG Run ID: {dag_run_id}")
        print(f"Execution Date: {response_data.get('execution_date')}")
        print(f"Logical Date: {response_data.get('logical_date')}")
    else:
        print("\nFailed to trigger DAG")
else:
    print("\n✗ Skipping trigger - DAG does not exist or is not accessible")
    success = False
    dag_run_id = None


Triggering DAG: osm_collection
Payload being sent:
{
  "conf": {
    "polygon": {
      "type": "geojson",
      "data": {
        "type": "Polygon",
        "coordinates": [
          [
            [
              -80.41607,
              43.73421
            ],
            [
              -80.46434,
              43.69177
            ],
            [
              -80.4252,
              43.63798
            ],
            [
              -80.33387,
              43.65827
            ],
            [
              -80.32865,
              43.71535
            ],
            [
              -80.38736,
              43.74788
            ],
            [
              -80.41607,
              43.73421
            ]
          ]
        ]
      }
    }
  }
}

✗ Failed to trigger DAG: 401
  Response: {"detail":"Not authenticated"}

Failed to trigger DAG


In [17]:
def get_dag_run_status(dag_id, dag_run_id):
    """Get the status of a specific DAG run."""
    url = f"{AIRFLOW_URL}/api/v2/dags/{dag_id}/dagRuns/{dag_run_id}"
    
    try:
        response = requests.get(
            url,
            auth=(AIRFLOW_USER, AIRFLOW_PASSWORD),
            timeout=10
        )
        
        if response.status_code == 200:
            return response.json()
        else:
            print(f"Failed to get DAG run status: {response.status_code}")
            return None
            
    except Exception as e:
        print(f"Error getting DAG run status: {e}")
        return None


def get_task_instances(dag_id, dag_run_id):
    """Get all task instances for a DAG run."""
    url = f"{AIRFLOW_URL}/api/v2/dags/{dag_id}/dagRuns/{dag_run_id}/taskInstances"
    
    try:
        response = requests.get(
            url,
            auth=(AIRFLOW_USER, AIRFLOW_PASSWORD),
            timeout=10
        )
        
        if response.status_code == 200:
            data = response.json()
            return data.get("task_instances", [])
        else:
            print(f"Failed to get task instances: {response.status_code}")
            return []
            
    except Exception as e:
        print(f"Error getting task instances: {e}")
        return []


def monitor_until_complete(dag_id, dag_run_id, max_wait_seconds=900, poll_interval=5):
    """
    Monitor a DAG run until it completes or times out.
    
    Args:
        max_wait_seconds: Default 900s = 15 minutes (OSM pipeline can be slow)
    """
    start_time = time.time()
    previous_state = None
    
    print("\n" + "="*60)
    print("Continuous Monitoring (press Ctrl+C to stop)")
    print("="*60)
    
    while time.time() - start_time < max_wait_seconds:
        status = get_dag_run_status(dag_id, dag_run_id)
        
        if not status:
            print("Failed to get status")
            break
        
        current_state = status.get('state')
        
        # Print update if state changed
        if current_state != previous_state:
            elapsed = int(time.time() - start_time)
            print(f"\n[{elapsed}s] State: {current_state}")
            
            # Get task details
            tasks = get_task_instances(dag_id, dag_run_id)
            if tasks:
                for task in tasks:
                    task_id = task.get('task_id')
                    task_state = task.get('state')
                    if task_state:
                        print(f"  → {task_id}: {task_state}")
            
            previous_state = current_state
        
        # Check if DAG is finished
        if current_state in ['success', 'failed']:
            print("\n" + "="*60)
            print(f"DAG Execution Complete: {current_state.upper()}")
            print("="*60)
            
            # Get final task details
            tasks = get_task_instances(dag_id, dag_run_id)
            if tasks:
                print("\nFinal Task States:")
                print("-" * 60)
                for task in tasks:
                    task_id = task.get('task_id')
                    task_state = task.get('state')
                    duration = task.get('duration')
                    if duration:
                        duration_str = f"{duration:.2f}s"
                    else:
                        duration_str = "N/A"
                    print(f"  {task_id:<40} | {task_state:<10} | {duration_str}")
            
            return current_state
        
        time.sleep(poll_interval)
    
    print("\n" + "="*60)
    print("Monitoring timeout reached")
    print("="*60)
    return previous_state


# Start continuous monitoring if DAG was triggered
if success and dag_run_id:
    final_state = monitor_until_complete(DAG_ID, dag_run_id)
    
    if final_state == 'success':
        print("\n✓ OSM Collection pipeline completed successfully!")
    elif final_state == 'failed':
        print("\n✗ Pipeline failed. Check Airflow logs for details.")
    else:
        print(f"\nPipeline incomplete. Final state: {final_state}")
else:
    print("\nSkipping monitoring - DAG was not triggered")


Skipping monitoring - DAG was not triggered


In [18]:
if success and dag_run_id:
    # Construct Airflow UI URLs
    dag_url = f"{AIRFLOW_URL}/dags/{DAG_ID}/grid"
    dag_run_url = f"{AIRFLOW_URL}/dags/{DAG_ID}/grid?dag_run_id={dag_run_id}"
    
    print("\n" + "="*60)
    print("Airflow Web UI Links")
    print("="*60)
    print(f"\nDAG Overview:")
    print(f"  {dag_url}")
    print(f"\nThis DAG Run:")
    print(f"  {dag_run_url}")
    print("\n" + "="*60)
else:
    print("\nNo DAG run to view")


No DAG run to view


## View in Airflow UI

Links to view the pipeline execution in Airflow web interface.

## Monitor DAG Execution

Check the status and progress of the OSM collection pipeline.

## Trigger OSM Collection DAG

Send the polygon to Airflow via webhook to trigger the OSM collection pipeline.