# Databricks Workspace Inventory

This notebook clones the `dbx_workspace_inventory` repository and runs its `workspace_inventory.py` script. It is designed to run on Databricks.

Important: You do NOT need to provide a personal access token when running this notebook on Databricks clusters — the notebook will prefer the cluster's environment/authentication. If you do want to run against an external workspace or provide an explicit token, you may fill the Token widget.

In [None]:
# Declare the notebook's input widgets from a single table of specs.
# The token widget may be left empty, in which case notebook/cluster auth is used.
_WIDGET_SPECS = (
    # (name, default value, label, choices) -- choices is None for free-text widgets.
    ("workspace_url", "", "Workspace URL (optional)", None),
    ("cluster_id", "auto", "Cluster ID (set to 'auto' for serverless or leave blank)", None),
    ("profile", "DEFAULT", "Databricks CLI profile (optional)", None),
    ("token", "", "Personal Access Token (optional; leave empty to use notebook auth)", None),
    ("output_format", "csv", "Output format", ["csv", "parquet", "delta"]),
    ("output_path", "/tmp/workspace_inventory", "Output path", None),
)

try:
    # Widgets are created in table order; `dbutils` only exists inside a
    # Databricks notebook, so the first lookup raises NameError elsewhere.
    for _name, _default, _label, _choices in _WIDGET_SPECS:
        if _choices is None:
            dbutils.widgets.text(_name, _default, _label)
        else:
            dbutils.widgets.dropdown(_name, _default, _choices, _label)
except NameError:
    # Outside Databricks: report it and carry on. The following cell reads its
    # settings from environment variables when `dbutils` is unavailable.
    print("dbutils not found: not running in a Databricks notebook.")
else:
    print("Widgets created (Databricks notebook). Provide 'token' only if necessary.")

In [None]:
# Resolve the run configuration.
# In a Databricks notebook the values come from the widgets; anywhere else
# (`dbutils` undefined) they come from environment variables with defaults.
# Both sources are stripped of surrounding whitespace so that a value such as a
# token with a trailing newline is not passed on verbatim.
import os


def _env_setting(name, default=""):
    """Return environment variable *name* (or *default*) without surrounding whitespace."""
    return os.environ.get(name, default).strip()


try:
    workspace_url = dbutils.widgets.get("workspace_url").strip()
    cluster_id = dbutils.widgets.get("cluster_id").strip()
    profile = dbutils.widgets.get("profile").strip()
    token = dbutils.widgets.get("token").strip()
    output_format = dbutils.widgets.get("output_format").strip()
    output_path = dbutils.widgets.get("output_path").strip()
except NameError:
    # Not a Databricks notebook; fall back to environment variables or defaults.
    workspace_url = _env_setting("DATABRICKS_WORKSPACE_URL")
    # Strip before `or` so a whitespace-only cluster id still falls through to
    # the serverless compute id.
    cluster_id = _env_setting("DATABRICKS_CLUSTER_ID") or _env_setting("DATABRICKS_SERVERLESS_COMPUTE_ID")
    profile = _env_setting("DATABRICKS_CONFIG_PROFILE", "DEFAULT")
    token = _env_setting("DATABRICKS_TOKEN")
    output_format = _env_setting("OUTPUT_FORMAT", "csv")
    output_path = _env_setting("OUTPUT_PATH", "/tmp/workspace_inventory")

# Echo the effective configuration. The token itself is never printed, only
# whether one was supplied.
print("Configuration:")
print("  workspace_url:", workspace_url if workspace_url else '(empty, will use profile or environment)')
print("  cluster_id:", cluster_id if cluster_id else '(empty)')
print("  profile:", profile)
print("  token provided?:", 'yes' if token else 'no')
print("  output_format:", output_format)
print("  output_path:", output_path)

In [None]:
# Clone the repository (idempotent), install its requirements, then run the
# inventory script and stream its output into the notebook.
# Reads workspace_url, profile, cluster_id, token, output_format and
# output_path, which the configuration cell defines.
import os
import shlex
import subprocess
import sys

repo_dir = '/tmp/dbx_workspace_inventory'
if not os.path.exists(repo_dir):
    subprocess.run(
        ['git', 'clone', 'https://github.com/LaurentPRAT-DB/dbx_workspace_inventory.git', repo_dir],
        check=True,
    )
else:
    print('Repository already present at', repo_dir)

# Install requirements into the notebook driver (Databricks will isolate jobs, but installing here helps when running interactively)
req_file = os.path.join(repo_dir, 'requirements.txt')
if os.path.exists(req_file):
    print('Installing requirements from', req_file)
    try:
        subprocess.run([sys.executable, '-m', 'pip', 'install', '-r', req_file], check=True)
    except subprocess.CalledProcessError as e:
        # Deliberately best-effort: report the failure and still try to run the script.
        print('Warning: pip install returned non-zero exit code:', e)
else:
    print('No requirements.txt found; continuing')

# Build the command to run the script from inside the cloned repository.
cmd = [sys.executable, 'workspace_inventory.py']
# Working directory for running the script
cwd = repo_dir

# A second copy of the command is kept for display so the token never reaches
# the notebook output.
printable_cmd = list(cmd)
_cli_options = (
    # (flag, value, is_secret) -- an option is passed only when its value is non-empty.
    ('--workspace-url', workspace_url, False),
    ('--profile', profile, False),
    ('--cluster-id', cluster_id, False),
    ('--token', token, True),
    ('--format', output_format, False),
    ('--output', output_path, False),
)
for _flag, _value, _is_secret in _cli_options:
    if not _value:
        continue
    cmd.extend([_flag, _value])
    printable_cmd.extend([_flag, '***REDACTED***' if _is_secret else _value])

# NOTE(review): the token is still handed to the child process as a command-line
# argument; if the script can read it from the environment instead, prefer that.
print('Running command:')
print(' '.join(shlex.quote(p) for p in printable_cmd))

# Execute and stream output live. stderr is merged into stdout so both appear
# in order; the `with` block closes the pipe once the process has finished.
with subprocess.Popen(
    cmd,
    cwd=cwd,
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    text=True,
    bufsize=1,
) as proc:
    for line in proc.stdout:
        print(line, end='')
    ret = proc.wait()
print('\nProcess exited with code', ret)
# Diagnostic: report the interpreter, Spark and platform versions in use
# (useful for Spark Connect alignment).
import platform
import sys

try:
    py_exec = sys.executable
    # major.minor.micro are the first three fields of sys.version_info.
    py_version = ".".join(str(_part) for _part in sys.version_info[:3])
    print("Python executable:", py_exec)
    print("Python version:", py_version)
    try:
        # Use the session the notebook already exposes as `spark`, if any.
        spark_version = spark.version
    except NameError:
        # No `spark` global: obtain (or create) a session ourselves.
        from pyspark.sql import SparkSession
        spark = SparkSession.builder.getOrCreate()
        spark_version = spark.version
    print("Spark version:", spark_version)
    # Platform details allow more precise matching.
    print("Platform:", platform.platform())
except Exception as _err:
    # Diagnostics only: any failure is reported instead of stopping the notebook.
    print("Could not determine Python/Spark versions:", str(_err))

Notes:
- When run on Databricks, the notebook will prefer notebook/cluster authentication and **you don't need to provide a personal access token**.
- If you need to run against a remote workspace, provide a token in the Token widget (when running on Databricks) or set the `DATABRICKS_TOKEN` environment variable (when running from your local environment).
- The notebook installs `requirements.txt` into the driver environment; cluster/executor package setup may differ depending on your Databricks configuration.