# Fabric Dataset Refresh Monitoring

Collects **dataset refresh operations** and **metadata** from Fabric REST APIs and sends to Azure Log Analytics.

In [8]:
# === One-time installs per session Or Use Fabric Environment ===
%pip install --quiet msal requests azure-identity azure-keyvault-secrets python-dotenv

Note: you may need to restart the kernel to use updated packages.


In [9]:
# === Parameters (mark this as a parameter cell in Fabric) ===
import os
from dotenv import load_dotenv

load_dotenv()

# Multi-workspace monitoring for security and compliance
# Empty list = monitor ALL accessible workspaces (recommended for security)
# Specific workspaces = ["workspace-id-1", "workspace-id-2"]
workspace_ids = (
    [os.getenv("FABRIC_WORKSPACE_ID")] if os.getenv("FABRIC_WORKSPACE_ID") else []
)

# Dataset filtering - leave empty to monitor all datasets in workspaces
dataset_ids = []  # Add specific dataset IDs here, or leave empty for all datasets
# Example: dataset_ids = ["8c4003da-3254-4a1a-b07e-ac62cb86b5cf", "94a8ba89-ac1b-4a6c-b310-775f3967f29f"]

lookback_hours = 42000  # Number of hours to look back for refresh history

dcr_endpoint_host = os.getenv("DCR_ENDPOINT_HOST")
dcr_immutable_id = os.getenv("DCR_IMMUTABLE_ID")
stream_dataset_refresh = "Custom-FabricDatasetRefresh_CL"
stream_dataset_metadata = "Custom-FabricDatasetMetadata_CL"

tenant_id = os.getenv("FABRIC_TENANT_ID")
client_id = os.getenv("FABRIC_APP_ID")
client_secret_env = os.getenv("FABRIC_APP_SECRET")

use_key_vault = False
use_managed_identity = False
key_vault_uri = os.getenv(
    "AZURE_KEY_VAULT_URI", "https://kaydemokeyvault.vault.azure.net/"
)
key_vault_secret_name = os.getenv(
    "AZURE_KEY_VAULT_SECRET_NAME", "FabricServicePrincipal"
)

if not all([tenant_id, client_id, dcr_endpoint_host, dcr_immutable_id]):
    missing = []
    if not tenant_id:
        missing.append("FABRIC_TENANT_ID")
    if not client_id:
        missing.append("FABRIC_APP_ID")
    if not dcr_endpoint_host:
        missing.append("DCR_ENDPOINT_HOST")
    if not dcr_immutable_id:
        missing.append("DCR_IMMUTABLE_ID")
    print(f"❌ Missing: {', '.join(missing)}")
else:
    print("✅ Environment variables loaded")

print(f"Workspace mode: {'Specific workspaces' if workspace_ids else 'Auto-discovery'}")
print(f"Dataset mode: {'Specific datasets' if dataset_ids else 'All datasets'}")
print(f"Lookback: {lookback_hours} hours")

✅ Environment variables loaded
Workspace mode: Specific workspaces
Dataset mode: All datasets
Lookback: 42000 hours


In [10]:
# === Define main functions ===
import os, json, time, datetime as dt
import requests
from typing import List, Dict, Any

def get_secret_from_kv(vault_uri: str, secret_name: str, tenant_id: str = None, client_id: str = None, client_secret: str = None, use_managed_identity: bool = False) -> str:
    try:
        from azure.keyvault.secrets import SecretClient
        if use_managed_identity:
            from azure.identity import ManagedIdentityCredential
            credential = ManagedIdentityCredential()
        else:
            from azure.identity import ClientSecretCredential
            credential = ClientSecretCredential(tenant_id=tenant_id, client_id=client_id, client_secret=client_secret)
        client = SecretClient(vault_url=vault_uri, credential=credential)
        return client.get_secret(secret_name).value
    except Exception as e:
        print(f"[KeyVault] Failed: {e}")
        return None

FABRIC_SCOPE = "https://api.fabric.microsoft.com/.default"
MONITOR_SCOPE = "https://monitor.azure.com/.default"
FABRIC_API = "https://api.fabric.microsoft.com/v1"

def acquire_token_client_credentials(tenant: str, client_id: str, client_secret: str, scope: str) -> str:
    import msal
    authority = f"https://login.microsoftonline.com/{tenant}"
    app = msal.ConfidentialClientApplication(client_id, authority=authority, client_credential=client_secret)
    result = app.acquire_token_for_client(scopes=[scope])
    if "access_token" not in result:
        raise RuntimeError(f"Failed to get token for {scope}: {result}")
    token = result["access_token"]
    print(f"✅ Token acquired for {scope}")
    return token

def iso_now() -> str:
    return dt.datetime.utcnow().replace(tzinfo=dt.timezone.utc).isoformat().replace("+00:00", "Z")

def parse_iso(s: str) -> dt.datetime:
    if not s:
        return None
    if s.endswith("Z"):
        s = s[:-1] + "+00:00"
    parsed = dt.datetime.fromisoformat(s)
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=dt.timezone.utc)
    return parsed

def list_workspace_datasets(workspace_id: str, token: str) -> List[Dict[str, Any]]:
    url = f"{FABRIC_API}/workspaces/{workspace_id}/datasets"
    headers = {"Authorization": f"Bearer {token}"}
    try:
        r = requests.get(url, headers=headers, timeout=60)
        r.raise_for_status()
        data = r.json()
        return data.get("value", [])
    except Exception as e:
        print(f"❌ Failed to get datasets for workspace {workspace_id}: {e}")
        return []

def get_dataset_refresh_history(dataset_id: str, token: str, top: int = 200) -> List[Dict[str, Any]]:
    url = f"{FABRIC_API}/datasets/{dataset_id}/refreshes?$top={top}"
    headers = {"Authorization": f"Bearer {token}"}
    try:
        r = requests.get(url, headers=headers, timeout=60)
        r.raise_for_status()
        data = r.json()
        return data.get("value", [])
    except Exception as e:
        print(f"❌ Failed to get refresh history for dataset {dataset_id}: {e}")
        return []

def get_dataset_metadata(dataset_id: str, token: str) -> Dict[str, Any]:
    url = f"{FABRIC_API}/datasets/{dataset_id}"
    headers = {"Authorization": f"Bearer {token}"}
    try:
        r = requests.get(url, headers=headers, timeout=60)
        r.raise_for_status()
        return r.json()
    except Exception as e:
        print(f"❌ Failed to get metadata for dataset {dataset_id}: {e}")
        return {}

def map_dataset_refresh(workspace_id: str, dataset_id: str, dataset_name: str, refresh: Dict[str, Any]) -> Dict[str, Any]:
    start_time = refresh.get("startTime")
    end_time = refresh.get("endTime")
    
    duration_ms = None
    if start_time and end_time:
        try:
            start_dt = parse_iso(start_time)
            end_dt = parse_iso(end_time)
            if start_dt and end_dt:
                duration_ms = int((end_dt - start_dt).total_seconds() * 1000)
        except Exception:
            pass
    
    return {
        "TimeGenerated": end_time or start_time or iso_now(),
        "WorkspaceId": workspace_id,
        "DatasetId": dataset_id,
        "DatasetName": dataset_name,
        "RefreshId": refresh.get("id"),
        "RefreshType": refresh.get("refreshType"),
        "Status": refresh.get("status"),
        "StartTime": start_time,
        "EndTime": end_time,
        "DurationMs": duration_ms,
        "ServicePrincipalId": refresh.get("servicePrincipalId"),
        "ErrorCode": refresh.get("errorCode"),
        "ErrorMessage": refresh.get("errorMessage"),
        "RequestId": refresh.get("requestId"),
    }

def map_dataset_metadata(workspace_id: str, dataset: Dict[str, Any]) -> Dict[str, Any]:
    return {
        "TimeGenerated": iso_now(),
        "WorkspaceId": workspace_id,
        "DatasetId": dataset.get("id"),
        "DatasetName": dataset.get("name"),
        "Description": dataset.get("description"),
        "ConfiguredBy": dataset.get("configuredBy"),
        "CreatedDate": dataset.get("createdDate"),
        "ModifiedDate": dataset.get("modifiedDate"),
        "ContentProviderType": dataset.get("contentProviderType"),
        "DatasourceType": dataset.get("datasourceType"),
        "IsOnPremGatewayRequired": dataset.get("isOnPremGatewayRequired", False),
        "IsRefreshable": dataset.get("isRefreshable", False),
        "AddRowsAPIEnabled": dataset.get("addRowsAPIEnabled", False),
    }

def list_accessible_workspaces(token: str) -> List[Dict[str, Any]]:
    """Get all workspaces accessible to the service principal"""
    url = f"{FABRIC_API}/workspaces"
    headers = {"Authorization": f"Bearer {token}"}
    try:
        r = requests.get(url, headers=headers, timeout=60)
        r.raise_for_status()
        data = r.json()
        return data.get("value", [])
    except Exception as e:
        print(f"❌ Failed to list workspaces: {e}")
        return []

def get_workspace_details(workspace_id: str, token: str) -> Dict[str, Any]:
    """Get detailed workspace information"""
    url = f"{FABRIC_API}/workspaces/{workspace_id}"
    headers = {"Authorization": f"Bearer {token}"}
    try:
        r = requests.get(url, headers=headers, timeout=60)
        r.raise_for_status()
        return r.json()
    except Exception as e:
        print(f"❌ Failed to get workspace details for {workspace_id}: {e}")
        return {}

def list_workspace_items(workspace_id: str, token: str) -> List[Dict[str, Any]]:
    """Get all items in a workspace (alternative to datasets endpoint)"""
    url = f"{FABRIC_API}/workspaces/{workspace_id}/items"
    headers = {"Authorization": f"Bearer {token}"}
    try:
        r = requests.get(url, headers=headers, timeout=60)
        r.raise_for_status()
        data = r.json()
        items = data.get("value", [])
        # Only get Dataset and SemanticModel types, and convert to dataset-like format
        datasets = []
        for item in items:
            if item.get("type") in ["Dataset", "SemanticModel"]:
                # Convert item format to dataset format for compatibility
                dataset = {
                    "id": item.get("id"),
                    "name": item.get("displayName", item.get("name", "Unknown")),
                    "type": item.get("type"),
                    "description": item.get("description"),
                    # Note: Items API doesn't provide refresh metadata
                }
                datasets.append(dataset)
        return datasets
    except Exception as e:
        print(f"❌ Failed to get items for workspace {workspace_id}: {e}")
        return []

def post_rows_to_dcr(endpoint_host: str, dcr_id: str, stream_name: str, rows: List[Dict[str, Any]], monitor_token: str):
    if not rows:
        return {"sent": 0, "batches": 0}
    
    MAX_BYTES = 950_000
    batch, batches, sent, size = [], 0, 0, 2
    
    def flush():
        nonlocal batches, sent, batch, size
        if not batch:
            return
        url = f"https://{endpoint_host}/dataCollectionRules/{dcr_id}/streams/{stream_name}?api-version=2023-01-01"
        headers = {"Authorization": f"Bearer {monitor_token}", "Content-Type": "application/json"}
        resp = requests.post(url, headers=headers, data=json.dumps(batch), timeout=60)
        if resp.status_code >= 400:
            raise RuntimeError(f"Ingestion failed ({resp.status_code}): {resp.text[:500]}")
        batches += 1
        sent += len(batch)
        batch, size = [], 2
    
    for row in rows:
        s = len(json.dumps(row, separators=(",", ":")))
        if size + s + (1 if batch else 0) > MAX_BYTES:
            flush()
        batch.append(row)
        size += s + (1 if batch else 0)
    flush()
    return {"sent": sent, "batches": batches}

print("✅ Functions loaded")

✅ Functions loaded


In [11]:
# === Main ===
client_secret = None

if client_secret_env:
    client_secret = client_secret_env
    print("✅ Using environment variable")
elif use_key_vault:
    if use_managed_identity:
        client_secret = get_secret_from_kv(key_vault_uri, key_vault_secret_name, use_managed_identity=True)
    else:
        temp_secret = os.getenv("FABRIC_APP_SECRET")
        if not temp_secret:
            raise RuntimeError("FABRIC_APP_SECRET required for Key Vault access")
        client_secret = get_secret_from_kv(key_vault_uri, key_vault_secret_name, tenant_id, client_id, temp_secret, use_managed_identity=False)

if not client_secret:
    raise RuntimeError("Client secret not found")

print("✅ Client secret resolved")

fabric_token = acquire_token_client_credentials(tenant_id, client_id, client_secret, FABRIC_SCOPE)
monitor_token = acquire_token_client_credentials(tenant_id, client_id, client_secret, MONITOR_SCOPE)

# First, let's check what workspaces are accessible
print("🔍 Checking accessible workspaces...")
accessible_workspaces = list_accessible_workspaces(fabric_token)
print(f"Found {len(accessible_workspaces)} accessible workspaces:")

for ws in accessible_workspaces[:10]:  # Show first 10
    print(f"  - {ws.get('name', 'Unknown')} ({ws.get('id', 'No ID')})")

if len(accessible_workspaces) > 10:
    print(f"  ... and {len(accessible_workspaces) - 10} more")

# Check if specified workspace IDs are accessible
if workspace_ids:
    accessible_ids = {ws.get('id') for ws in accessible_workspaces}
    print(f"\n🔍 Checking specified workspace IDs:")
    for ws_id in workspace_ids:
        if ws_id in accessible_ids:
            ws_name = next((ws.get('name') for ws in accessible_workspaces if ws.get('id') == ws_id), 'Unknown')
            print(f"  ✅ {ws_id} - {ws_name}")
        else:
            print(f"  ❌ {ws_id} - NOT ACCESSIBLE")

now = dt.datetime.utcnow().replace(tzinfo=dt.timezone.utc)
cutoff_time = now - dt.timedelta(hours=lookback_hours)

print(f"\nCollecting dataset refresh data from last {lookback_hours} hours...")

refresh_rows = []
metadata_rows = []
total_datasets = 0
total_refreshes = 0

# Use accessible workspaces if no specific ones provided
workspaces_to_process = workspace_ids if workspace_ids else [ws.get('id') for ws in accessible_workspaces]

for workspace_id in workspaces_to_process:
    print(f"\nProcessing workspace: {workspace_id}")
    
    # Get workspace details
    workspace_details = get_workspace_details(workspace_id, fabric_token)
    if workspace_details:
        print(f"  Workspace name: {workspace_details.get('displayName', 'Unknown')}")
        print(f"  Workspace type: {workspace_details.get('type', 'Unknown')}")
    
    # Try the datasets endpoint first
    datasets = list_workspace_datasets(workspace_id, fabric_token)
    using_items_api = False
    
    # If datasets endpoint fails, try items endpoint as fallback
    if not datasets:
        print("  Trying alternative items endpoint...")
        datasets = list_workspace_items(workspace_id, fabric_token)
        using_items_api = True
        if datasets:
            print(f"  Found {len(datasets)} datasets via items endpoint")
        else:
            print("  No datasets found via either endpoint")
            continue
    else:
        print(f"  Found {len(datasets)} datasets via datasets endpoint")
    
    # Process each dataset once
    for dataset in datasets:
        dataset_id = dataset.get("id")
        dataset_name = dataset.get("name", dataset.get("displayName", "Unknown"))
        
        if dataset_ids and dataset_id not in dataset_ids:
            continue
        
        print(f"  Processing dataset: {dataset_name} ({dataset_id})")
        total_datasets += 1
        
        # Skip refresh history collection if using items API since those IDs might not work with dataset APIs
        if using_items_api:
            print(f"    Skipping refresh history (using items API)")
            # Create a basic metadata record from items data
            metadata_row = map_dataset_metadata(workspace_id, dataset)
            metadata_rows.append(metadata_row)
        else:
            # Normal processing for datasets API
            detailed_metadata = get_dataset_metadata(dataset_id, fabric_token)
            if detailed_metadata:
                metadata_row = map_dataset_metadata(workspace_id, {**dataset, **detailed_metadata})
                metadata_rows.append(metadata_row)
            
            refreshes = get_dataset_refresh_history(dataset_id, fabric_token)
            
            recent_refreshes = []
            for refresh in refreshes:
                refresh_time = parse_iso(refresh.get("endTime") or refresh.get("startTime"))
                if refresh_time and refresh_time >= cutoff_time:
                    recent_refreshes.append(refresh)
            
            print(f"    Found {len(recent_refreshes)} recent refreshes")
            total_refreshes += len(recent_refreshes)
            
            for refresh in recent_refreshes:
                refresh_row = map_dataset_refresh(workspace_id, dataset_id, dataset_name, refresh)
                refresh_rows.append(refresh_row)

print(f"Collection Summary:")
print(f"Total Datasets: {total_datasets}")
print(f"Total Refreshes: {total_refreshes}")
print(f"Metadata Records: {len(metadata_rows)}")
print(f"Refresh Records: {len(refresh_rows)}")

summary = {}

if metadata_rows:
    print("Sending dataset metadata...")
    result = post_rows_to_dcr(dcr_endpoint_host, dcr_immutable_id, stream_dataset_metadata, metadata_rows, monitor_token)
    summary["dataset_metadata"] = result

if refresh_rows:
    print("Sending refresh history...")
    result = post_rows_to_dcr(dcr_endpoint_host, dcr_immutable_id, stream_dataset_refresh, refresh_rows, monitor_token)
    summary["dataset_refreshes"] = result

print("✅ Done!")
print(json.dumps(summary, indent=2))

✅ Using environment variable
✅ Client secret resolved
✅ Token acquired for https://api.fabric.microsoft.com/.default
✅ Token acquired for https://monitor.azure.com/.default
🔍 Checking accessible workspaces...
Found 1 accessible workspaces:
  - Unknown (8457f746-f2d9-4d27-8221-5714601e40c6)

🔍 Checking specified workspace IDs:
  ✅ 8457f746-f2d9-4d27-8221-5714601e40c6 - None

Collecting dataset refresh data from last 42000 hours...

Processing workspace: 8457f746-f2d9-4d27-8221-5714601e40c6
  Workspace name: WWI_Samples
  Workspace type: Workspace
❌ Failed to get datasets for workspace 8457f746-f2d9-4d27-8221-5714601e40c6: 404 Client Error: Not Found for url: https://api.fabric.microsoft.com/v1/workspaces/8457f746-f2d9-4d27-8221-5714601e40c6/datasets
  Trying alternative items endpoint...
  Found 14 datasets via items endpoint
  Processing dataset: LH_WWI (292521d7-2a1d-475a-b534-ea66d422d925)
    Skipping refresh history (using items API)
  Processing dataset: DataflowsStagingLakehouse 