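# 01 API Ingestion (Bronze)

Fetch incident records from the source API and store the raw JSON pages, plus a run manifest, in the bronze layer. Steps: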

1. Load config.
2. Build endpoint URL.
3. Test connectivity.
4. Fetch paginated raw JSON pages.
5. Save raw JSON pages + manifest to bronze.


In [27]:
# Library imports
from datetime import datetime, timezone
import json
from pathlib import Path

import requests
import yaml


## Step 1: Load config from YAML file

In [28]:
# Read the YAML configuration into `cfg`; later cells reuse it
config_path = Path('../config/config.yaml')
with config_path.open('r', encoding='utf-8') as f:
    cfg = yaml.safe_load(f)

# Echo the settings this notebook relies on so they can be verified at a glance
source_cfg = cfg['source']
for setting in ('api_base_url', 'incident_path', 'page_size', 'max_records'):
    print(f'{setting}:', source_cfg[setting])
# Storage is optional in the config, so fall back to the default bronze location
print('bronze:', cfg.get('storage', {}).get('bronze', '.local/data/bronze'))


api_base_url: https://incidents.j-hollands.workers.dev/
incident_path: api/now/table/incident
page_size: 1000
max_records: 10000
bronze: .local/data/bronze


## Step 2: Build endpoint URL

In [29]:
# Compose the incident endpoint URL from the configured base URL and path
def build_endpoint_url(cfg: dict) -> str:
    """Return the full incident endpoint URL described by ``cfg['source']``.

    Trailing slashes on ``api_base_url`` and leading slashes on
    ``incident_path`` are stripped so the two parts are always joined by
    exactly one ``/``.

    Args:
        cfg: Configuration mapping with a ``source`` section containing
            ``api_base_url`` and ``incident_path``.

    Returns:
        The joined endpoint URL.

    Raises:
        KeyError: If ``source``, ``api_base_url`` or ``incident_path`` is missing.
    """
    source = cfg['source']
    # Read both settings before normalising either one
    base, path = source['api_base_url'], source['incident_path']
    return base.rstrip('/') + '/' + path.lstrip('/')

# Build the endpoint once from the loaded config; later cells reuse `endpoint_url`
endpoint_url = build_endpoint_url(cfg)
endpoint_url  # bare last expression so the notebook displays the resulting URL


'https://incidents.j-hollands.workers.dev/api/now/table/incident'

## Step 3: Connectivity test

In [30]:
# Smoke-test the endpoint with a single-record request before the full fetch
test_params = dict(
    sysparm_limit=1,
    sysparm_offset=0,
    sysparm_display_value='true',
)
test_response = requests.get(endpoint_url, params=test_params, timeout=30)
print(f'Status code: {test_response.status_code}')
test_response.raise_for_status()  # stop here on any HTTP error status
test_payload = test_response.json()
# Show the response envelope and how many records came back
print(f'Top-level keys: {list(test_payload.keys())}')
print(f"Records returned: {len(test_payload.get('result', []))}")


Status code: 200
Top-level keys: ['result']
Records returned: 1


## Step 4: Production functions

In [None]:
# Function to fetch all incident pages with pagination
def fetch_incident_pages(cfg: dict, timeout: int = 30) -> tuple[list[dict], int]:
    """Fetch incident records page by page until a stop condition is met.

    Requests are issued with ``sysparm_limit`` / ``sysparm_offset`` paging.
    Fetching stops when ``max_records`` records have been collected, when a
    page comes back empty, or when a page holds fewer records than requested.

    Args:
        cfg: Configuration mapping. ``cfg['source']`` must provide
            ``api_base_url``, ``incident_path``, ``page_size`` and
            ``max_records``. The optional ``cfg['runtime']['use_env_proxy']``
            flag sets the session's ``trust_env`` attribute (default False).
        timeout: Per-request timeout in seconds passed to ``Session.get``.

    Returns:
        A tuple ``(pages, total_records)``: the raw JSON payload of every
        response received (including a final empty page, if one was returned)
        and the number of records counted across the ``result`` lists.

    Raises:
        KeyError: If a required ``source`` setting is missing.
        ValueError: If ``page_size`` is not a positive integer.
        requests.HTTPError: If any response has an error status.
    """
    source_cfg = cfg['source']
    runtime_cfg = cfg.get('runtime', {})
    page_size = int(source_cfg['page_size'])
    max_records = int(source_cfg['max_records'])
    if page_size <= 0:
        # A non-positive page size never advances the offset correctly and
        # would re-request overlapping data until max_records is reached.
        raise ValueError(f'page_size must be a positive integer, got {page_size}')
    use_env_proxy = bool(runtime_cfg.get('use_env_proxy', False))
    endpoint_url = build_endpoint_url(cfg)

    pages = []
    offset = 0
    total_records = 0

    # Context manager closes the session (and its pooled connections) on exit,
    # including when a request raises.
    with requests.Session() as session:
        session.trust_env = use_env_proxy

        while total_records < max_records:
            # Never ask for more than what is still allowed by max_records
            remaining = max_records - total_records
            request_limit = min(page_size, remaining)
            params = {
                'sysparm_limit': request_limit,
                'sysparm_offset': offset,
                'sysparm_display_value': 'true',
            }

            response = session.get(endpoint_url, params=params, timeout=timeout)
            response.raise_for_status()
            payload = response.json()
            # Treat a missing or null `result` as an empty page
            batch = payload.get('result') or []

            # The raw payload is kept as received, even when it has no records
            pages.append(payload)
            batch_count = len(batch)

            if batch_count == 0:
                break

            total_records += batch_count

            # A short page means the source has no further records
            if batch_count < request_limit:
                break

            offset += request_limit

    return pages, total_records

# Persist each fetched page plus a run manifest under a timestamped bronze folder
def save_raw_pages_to_bronze(pages: list[dict], cfg: dict, endpoint_url: str, total_records: int) -> Path:
    """Write raw JSON pages and a manifest into a new run directory.

    The run directory is ``<bronze>/incidents_raw_<UTC timestamp>``, where
    ``<bronze>`` comes from ``cfg['storage']['bronze']`` (default
    ``.local/data/bronze``). Pages are written as ``page_0001.json``,
    ``page_0002.json``, ... followed by ``manifest.json``.

    Args:
        pages: Raw JSON payloads, one per fetched page.
        cfg: Configuration mapping; only the optional ``storage.bronze`` is read.
        endpoint_url: Source endpoint recorded in the manifest.
        total_records: Record count recorded in the manifest.

    Returns:
        Path of the run directory that was created.

    Raises:
        FileExistsError: If the run directory for this timestamp already exists.
    """
    storage_cfg = cfg.get('storage', {})
    bronze_root = Path(storage_cfg.get('bronze', '.local/data/bronze'))
    bronze_root.mkdir(parents=True, exist_ok=True)

    # Second-resolution UTC stamp identifies the run and names its folder
    run_id = datetime.now(timezone.utc).strftime('%Y%m%dT%H%M%SZ')
    run_dir = bronze_root / ('incidents_raw_' + run_id)
    run_dir.mkdir(parents=True, exist_ok=False)

    # Page files are numbered from 1 and zero-padded to four digits
    for position, page in enumerate(pages):
        serialized = json.dumps(page, ensure_ascii=False)
        target = run_dir / f'page_{position + 1:04d}.json'
        target.write_text(serialized, encoding='utf-8')

    manifest = dict(
        endpoint_url=endpoint_url,
        run_id=run_id,
        page_count=len(pages),
        record_count=total_records,
        saved_at_utc=datetime.now(timezone.utc).isoformat(),
        format='raw_json_pages',
    )
    manifest_file = run_dir / 'manifest.json'
    manifest_file.write_text(json.dumps(manifest, indent=2), encoding='utf-8')

    return run_dir

# Entry point: load the config, fetch every page, and persist the run to bronze
def run_ingestion(config_path: str = '../config/config.yaml') -> tuple[Path, int, str]:
    """Run the full ingestion using the YAML config at ``config_path``.

    Args:
        config_path: Path to the YAML configuration file.

    Returns:
        A tuple ``(output_path, record_count, endpoint)``: the run directory
        returned by ``save_raw_pages_to_bronze``, the record count returned by
        ``fetch_incident_pages``, and the endpoint URL that was queried.
    """
    with Path(config_path).open('r', encoding='utf-8') as config_file:
        settings = yaml.safe_load(config_file)

    endpoint = build_endpoint_url(settings)
    fetched_pages, record_count = fetch_incident_pages(cfg=settings)
    # The manifest records the same endpoint and record count that are returned
    output_path = save_raw_pages_to_bronze(
        pages=fetched_pages,
        cfg=settings,
        endpoint_url=endpoint,
        total_records=record_count,
    )
    return output_path, record_count, endpoint


## Step 5: Demonstrate the raw JSON bronze output

In [32]:
# Fetch every page using the config loaded above, then write the run to bronze
pages, record_count = fetch_incident_pages(cfg=cfg)
output_path = save_raw_pages_to_bronze(pages=pages, cfg=cfg, endpoint_url=endpoint_url, total_records=record_count)

# Summarise the run and preview the first few files it produced
print(f'Successfully ingested {record_count} records from endpoint {endpoint_url}')
print(f'Bronze raw JSON output path: {output_path}')
print('Files:', sorted(entry.name for entry in output_path.iterdir())[:5], '...')


Successfully ingested 10000 records from endpoint https://incidents.j-hollands.workers.dev/api/now/table/incident
Bronze raw JSON output path: .local\data\bronze\incidents_raw_20260210T205120Z
Files: ['manifest.json', 'page_0001.json', 'page_0002.json', 'page_0003.json', 'page_0004.json'] ...


In [33]:
# Output an example record from the first saved page
sample_page = sorted(output_path.glob('page_*.json'))[0]
sample_payload = json.loads(sample_page.read_text(encoding='utf-8'))
# A missing, null, or empty `result` is treated as "no records"
sample_records = sample_payload.get('result') or []
print('Sample page:', sample_page.name)
print('Sample keys:', list(sample_payload.keys()))
print('Sample record count:', len(sample_records))
# An empty page has no first record; display None instead of raising IndexError
display(sample_records[0] if sample_records else None)

Sample page: page_0001.json
Sample keys: ['result']
Sample record count: 1000


{'sys_id': 'd98abc2af1c9ef3b3071043ad7526b01',
 'number': 'INC1200000',
 'task_effective_number': 'INC1200000',
 'sys_class_name': 'Incident',
 'state': 'Closed',
 'incident_state': 'Closed',
 'priority': '3 - Moderate',
 'impact': '3 - Low',
 'urgency': '3 - Low',
 'severity': '3 - Low',
 'approval': 'Not Requested',
 'escalation': 'Normal',
 'notify': 'Notify',
 'opened_at': '30-09-2024 13:37:24',
 'resolved_at': '01-10-2024 09:49:07',
 'closed_at': '01-10-2024 13:34:27',
 'activity_due': '',
 'due_date': '',
 'sys_created_on': '30-09-2024 13:37:43',
 'sys_updated_on': '01-10-2024 13:34:27',
 'sys_created_by': 'user.100270',
 'sys_updated_by': 'user.100337',
 'opened_by': {'display_value': 'User 100018',
  'link': 'https://example.service-now.com/api/now/table/sys_user/73931bdb2a0df3dbe4d58fed8a728e7e'},
 'caller_id': {'display_value': 'User 100121',
  'link': 'https://example.service-now.com/api/now/table/sys_user/754e554fb17f728b716bcfe11a3885cc'},
 'resolved_by': {'display_value':