# DQX

In [0]:
https://adb-5943863453538272.12.azuredatabricks.net/editor/notebooks/4441860942440814?o=5943863453538272

In [0]:
databricks labs dqx open-dashboards

## Reference

### DQX Reference

#### DQEngine Methods

| Method                                 | Purpose                                                        | Local?    | Notes                                                   |
|-----------------------------------------|----------------------------------------------------------------|-----------|---------------------------------------------------------|
| apply_checks                           | Apply DQRule checks to DataFrame, adds _warning/_error cols    | Yes       | Use for reporting-only; does not split DF               |
| apply_checks_and_split                  | Apply checks, returns valid/quarantine DFs                     | Yes       | Use if you want to split valid/invalid rows             |
| apply_checks_and_save_in_table          | Apply checks, save valid/quarantine DFs to Delta tables        | No        | Needs OutputConfig and QuarantineConfig                 |
| apply_checks_by_metadata                | Apply YAML/Delta table checks to DF, adds _warning/_error      | Yes       | Ideal for rules from YAML/Delta; no split               |
| apply_checks_by_metadata_and_split      | Same, but returns valid/quarantine DFs                         | Yes       | Only use if you want to split output                    |
| apply_checks_by_metadata_and_save_in_table | Metadata checks, save valid/quarantine to Delta tables        | No        | Needs OutputConfig and QuarantineConfig                 |
| validate_checks                        | Validates structure/types of check config                      | Yes       | Use in CI/CD for YAML validation                        |
| get_invalid                            | Extract rows with warnings/errors                              | Yes       |                                                        |
| get_valid                              | Extract only valid rows                                        | Yes       |                                                        |
| load_checks_from_local_file             | Read checks from YAML/JSON file                                | Yes       | For CI/CD/local dev                                     |
| save_checks_in_local_file               | Write checks to YAML/JSON                                      | Yes       |                                                        |
| load_checks_from_workspace_file         | Load checks from Databricks workspace file                     | No        | Only in DBX context                                     |
| save_checks_in_workspace_file           | Save checks to workspace file                                  | No        |                                                        |
| load_checks_from_installation           | Load checks from install dir                                   | No        |                                                        |
| save_checks_in_installation             | Save checks to install dir                                     | No        |                                                        |
| save_results_in_table                   | Save (output_df, quarantine_df) to Delta tables                | No        | Needs OutputConfig (and optional QuarantineConfig)      |
| load_run_config                        | Load run config from install dir                               | No        |                                                        |

#### Pre-defined Functions

In [0]:
# Quick-reference catalogue of the row-level check functions listed in this
# notebook.  Each entry maps a function name to:
#   args     - argument names listed as required
#   optional - argument names listed as optional
ROW_LEVEL_FUNCTIONS = {
    # Null / emptiness checks
    "is_not_null": dict(args=["column"], optional=[]),
    "is_not_empty": dict(args=["column"], optional=[]),
    "is_not_null_and_not_empty": dict(args=["column"], optional=["trim_strings"]),
    # Membership checks
    "is_in_list": dict(args=["column", "allowed"], optional=[]),
    "is_not_null_and_is_in_list": dict(args=["column", "allowed"], optional=[]),
    "is_not_null_and_not_empty_array": dict(args=["column"], optional=[]),
    # Range / limit checks
    "is_in_range": dict(args=["column", "min_limit", "max_limit"], optional=[]),
    "is_not_in_range": dict(args=["column", "min_limit", "max_limit"], optional=[]),
    "is_not_less_than": dict(args=["column", "limit"], optional=[]),
    "is_not_greater_than": dict(args=["column", "limit"], optional=[]),
    # Date / timestamp checks
    "is_valid_date": dict(args=["column"], optional=["date_format"]),
    "is_valid_timestamp": dict(args=["column"], optional=["timestamp_format"]),
    "is_not_in_future": dict(args=["column", "offset"], optional=["curr_timestamp"]),
    "is_not_in_near_future": dict(args=["column", "offset"], optional=["curr_timestamp"]),
    "is_older_than_n_days": dict(args=["column", "days"], optional=["curr_date", "negate"]),
    "is_older_than_col2_for_n_days": dict(
        args=["column1", "column2", "days"],
        optional=["negate"],
    ),
    # Pattern / network checks
    "regex_match": dict(args=["column", "regex"], optional=["negate"]),
    "is_valid_ipv4_address": dict(args=["column"], optional=[]),
    "is_ipv4_address_in_cidr": dict(args=["column", "cidr_block"], optional=[]),
    # Free-form SQL predicate
    "sql_expression": dict(
        args=["expression"],
        optional=["msg", "name", "negate", "columns"],
    ),
}

In [0]:
# Quick-reference catalogue of the dataset-level check functions listed in this
# notebook.  Same layout as the row-level catalogue: `args` holds the argument
# names listed as required, `optional` the ones listed as optional.
DATASET_LEVEL_FUNCTIONS = {
    # Uniqueness
    "is_unique": dict(args=["columns"], optional=["nulls_distinct"]),
    # Aggregate comparisons (all four share the same argument lists)
    "is_aggr_not_greater_than": dict(args=["limit"], optional=["column", "aggr_type", "group_by"]),
    "is_aggr_not_less_than": dict(args=["limit"], optional=["column", "aggr_type", "group_by"]),
    "is_aggr_equal": dict(args=["limit"], optional=["column", "aggr_type", "group_by"]),
    "is_aggr_not_equal": dict(args=["limit"], optional=["column", "aggr_type", "group_by"]),
    # Checks against a reference dataset
    "foreign_key": dict(
        args=["columns", "ref_columns"],
        optional=["ref_df_name", "ref_table", "negate"],
    ),
    # Free-form SQL query
    "sql_query": dict(
        args=["query", "merge_columns", "condition_column"],
        optional=["input_placeholder", "msg", "name", "negate"],
    ),
    # Dataset comparison
    "compare_datasets": dict(
        args=["columns", "ref_columns"],
        optional=["exclude_columns", "ref_df_name", "ref_table", "check_missing_records"],
    ),
}

####  [Output Column Structure](https://databrickslabs.github.io/dqx/docs/guide/quality_checks/#quality-check-results)

| **Field**         | **Description**                                                                 |
|-------------------|---------------------------------------------------------------------------------|
| `name`            | Name of the check (string type).                                                |
| `message`         | Message describing the quality issue (string type).                             |
| `columns`         | Name of the column(s) where the quality issue was found (array of strings).     |
| `filter`          | Filter applied if any (string type).                                            |
| `function`        | Rule/check function applied (string type).                                      |
| `run_time`        | Timestamp when the check was executed (timestamp type).                         |
| `user_metadata`   | Optional key-value custom metadata provided by the user (dictionary type).      |

In [0]:
from pyspark.sql.types import StructType, StructField, ArrayType, StringType, TimestampType, MapType

# Schema of a single quality-check result item.  Fields are appended in the
# same order as the table above; the trailing True marks each one nullable.
dq_result_item_schema = (
    StructType()
    .add("name", StringType(), True)
    .add("message", StringType(), True)
    .add("columns", ArrayType(StringType()), True)
    .add("filter", StringType(), True)
    .add("function", StringType(), True)
    .add("run_time", TimestampType(), True)
    .add("user_metadata", MapType(StringType(), StringType()), True)
)

# A result column holds an array of such items.
dq_result_schema = ArrayType(dq_result_item_schema)

In [0]:
[
  {
    "name": "col_city_is_null",
    "message": "Column 'city' is null",
    "columns": ["city"],
    "filter": "country = 'Poland'",
    "function": "is_not_null",
    "run_time": "2025-01-01 14:31:21",
    "user_metadata": {"key1": "value1", "key2": "value2"},
  },
]

## Custom Rules

### Examples

In [0]:
################################################################################
# Section 1 – Simple row‑level checks
#
# These rules operate on a single column at a time for each row.  Only the
# `column` argument is mandatory.  Additional arguments apply only to certain
# functions and are shown commented out with explanations.
################################################################################

- table_name: catalog.schema.table       # added table name placeholder
  criticality: error                     # 'error' => quarantine row on failure; 'warn' => flag but keep row
  run_config_name: default               # added run configuration name
  check:
    function: is_not_null                # any row-level built-in function
    arguments:
      column: col1                       # name or expression of a column in the table being checked
      # allowed: [1, 2, 3]               # for is_in_list / is_not_null_and_is_in_list: values the column must belong to
      # min_limit: 0                     # for is_in_range / is_not_in_range: lower bound (inclusive)
      # max_limit: 10                    # for is_in_range / is_not_in_range: upper bound (inclusive)
      # limit: 5                         # for is_not_less_than / is_not_greater_than: numeric/date/timestamp limit
      # date_format: yyyy-MM-dd          # for is_valid_date: expected date pattern
      # timestamp_format: yyyy-MM-dd HH:mm:ss   # for is_valid_timestamp: expected timestamp pattern
      # offset: 86400                    # for is_not_in_future / is_not_in_near_future: offset in seconds
      # curr_timestamp: '2025-08-05T00:00:00'   # override current timestamp; string or timestamp literal
      # days: 7                          # is_older_than_n_days / is_older_than_col2_for_n_days
      # column2: other_col               # required for is_older_than_col2_for_n_days
      # regex: '[A-Z]{3}[0-9]{4}'        # regex_match
      # negate: false                    # set true to invert the match
      # cidr_block: '192.168.1.0/24'     # is_ipv4_address_in_cidr
      # trim_strings: true               # is_not_null_and_not_empty

################################################################################
# Section 2 – Row‑level checks applied to multiple columns individually
#
# Use `for_each_column` when the same single‑column rule should be executed on
# several columns.  Each entry must be a valid column name or expression from
# the table being checked.  There is no hard limit on the number of columns.
################################################################################

- table_name: catalog.schema.table       # added table name placeholder
  criticality: error
  run_config_name: default               # added run configuration name
  check:
    function: is_not_null
    for_each_column:
      - col1                             # first column name in the dataset
      - col2                             # second column name in the dataset
      # - col3                          # add additional column names as needed; each must exist in the dataset

################################################################################
# Section 3 – Row‑level checks on complex types (struct, map, array)
#
# These templates target elements within complex columns.  Use dot notation
# (`struct_field`) for struct fields, and `try_element_at` for map or array
# elements.  For array aggregations (e.g. max/min), include a numeric limit.
################################################################################

# Single element from a complex type
- table_name: catalog.schema.table       # added table name placeholder
  criticality: error
  run_config_name: default               # added run configuration name
  check:
    function: is_not_null
    arguments:
      column: col8.field1                # struct field; must refer to an existing field
      # column: try_element_at(col7, 'key1')  # map element lookup; key must exist
      # column: try_element_at(col4, 1)       # array element lookup; index is 1-based (index 0 raises an error)
      # Any optional parameters from Section 1 can be included here (e.g. regex)

# Aggregating an array (e.g. ensure max element ≤ 10)
- table_name: catalog.schema.table       # added table name placeholder
  criticality: error
  run_config_name: default               # added run configuration name
  check:
    function: is_not_greater_than        # or is_not_less_than
    arguments:
      column: array_max(col4)            # aggregate function applied to an array column
      limit: 10                          # required for array aggregation; numeric threshold for comparison

################################################################################
# Section 4 – Row‑level SQL expression
#
# For complex row‑level logic, use `sql_expression`.  This rule does not take a
# `column` field; instead provide an SQL predicate.  Optional fields allow you to
# customise messages, naming and inversion behaviour.
################################################################################

- table_name: catalog.schema.table       # added table name placeholder
  criticality: error
  run_config_name: default               # added run configuration name
  check:
    function: sql_expression
    arguments:
      expression: "col3 >= col2 AND col3 <= 10"  # SQL predicate each row must satisfy; check fails when it evaluates to False
      # msg: "col3 out of range"         # optional message to include in result rows
      # name: "col3_range_check"         # optional name for the resulting check column (appears in result DataFrame)
      # negate: false                    # boolean: set true to invert the logic (fail when expression is True)
      # columns: [col2, col3]            # optional list of one or more column names used only for reporting; does not affect logic

################################################################################
# Section 5 – Dataset‑level uniqueness check
#
# Ensures that the specified columns are unique across all rows.  A failure is
# logged for every duplicate row found.  Use `nulls_distinct` to control how
# NULL values are treated.
################################################################################

- table_name: catalog.schema.table       # added table name placeholder
  criticality: error
  run_config_name: default               # added run configuration name
  check:
    function: is_unique
    arguments:
      columns:
        - col1                            # key columns (one or more) used to test uniqueness
        # - col2                         # add additional columns for composite key
      # nulls_distinct: true              # default true: treat NULLs as distinct; set false to consider NULLs equal

################################################################################
# Section 6 – Dataset‑level aggregation checks
#
# Compare aggregated values against a threshold.  Specify `aggr_type` (count,
# sum, avg, min, max).  For count, omit the column; otherwise provide the
# column being aggregated.  `group_by` can list one or more columns to group
# rows before aggregation.
################################################################################

# Count of all rows must not exceed 10
- table_name: catalog.schema.table       # added table name placeholder
  criticality: error
  run_config_name: default               # added run configuration name
  check:
    function: is_aggr_not_greater_than
    arguments:
      aggr_type: count                   # aggregation type (count, sum, avg, min, max)
      limit: 10                          # numeric or literal threshold for comparison

# Count of non-null values in col2 must not exceed 10
- table_name: catalog.schema.table       # added table name placeholder
  criticality: error
  run_config_name: default               # added run configuration name
  check:
    function: is_aggr_not_greater_than
    arguments:
      column: col2                       # column to aggregate (omit for count of all rows)
      aggr_type: count
      limit: 10

# Count of col2 grouped by col3 must not exceed 10
- table_name: catalog.schema.table       # added table name placeholder
  criticality: error
  run_config_name: default               # added run configuration name
  check:
    function: is_aggr_not_greater_than
    arguments:
      column: col2
      aggr_type: count
      group_by:
        - col3                          # optional grouping; one or more columns; must exist in the table
      limit: 10

################################################################################
# Section 7 – Dataset‑level foreign key check
#
# Validates that values in the source dataset exist in a reference dataset.
# Provide either a reference DataFrame name (`ref_df_name`) or a fully qualified
# table name (`ref_table`), not both.  `negate` flips the logic to flag rows
# that do exist in the reference.
################################################################################

# Single-column foreign key using a reference DataFrame
- table_name: catalog.schema.table       # added table name placeholder
  criticality: error
  run_config_name: default               # added run configuration name
  check:
    function: foreign_key
    arguments:
      columns:
        - col1                           # column(s) in source dataset
      ref_columns:
        - ref_col1                       # matching column(s) in reference dataset
      ref_df_name: ref_df_key            # key to locate the reference DataFrame in ref_dfs
      # negate: false                    # optional; set true to fail rows that exist in the reference

# Composite-key foreign key using a table
- table_name: catalog.schema.table       # added table name placeholder
  criticality: error
  run_config_name: default               # added run configuration name
  check:
    function: foreign_key
    arguments:
      columns:
        - col1
        - col2
      ref_columns:
        - ref_col1
        - ref_col2
      ref_table: catalog.schema.ref_table   # fully qualified table name used as the reference

################################################################################
# Section 8 – Dataset‑level SQL query
#
# Executes an arbitrary SQL query across the dataset (and optional reference data).
# The query must return all `merge_columns` plus a boolean `condition_column`.
# The check fails when the condition column is True (unless `negate` is true).
# Use `input_placeholder` to name the input DataFrame within the SQL query.
################################################################################

- table_name: catalog.schema.table       # added table name placeholder
  criticality: error
  run_config_name: default               # added run configuration name
  check:
    function: sql_query
    arguments:
      query: |
        SELECT col1, col2, SUM(col3) = 0 AS condition
        FROM {{ input_view }}                  -- placeholder referencing the input dataset
        GROUP BY col1, col2
      merge_columns:                           # must exist in input DataFrame and query result
        - col1
        - col2
      condition_column: condition              # boolean; True means failure
      input_placeholder: input_view            # alias used inside double braces in the SQL
      # msg: "Aggregated col3 is zero"       # optional message added to result rows
      # name: "check_sum_col3_zero"         # optional name of the resulting check column
      # negate: false                        # optional; set true to invert the meaning (fail when condition is False)

################################################################################
# Section 9 – Dataset‑level compare datasets
#
# Compares two datasets at row and column level.  Provide matching key columns
# for both source and reference.  You can exclude columns from the comparison
# and enable detection of missing records via full outer join.
################################################################################

- table_name: catalog.schema.table       # added table name placeholder
  criticality: error
  run_config_name: default               # added run configuration name
  check:
    function: compare_datasets
    arguments:
      columns:
        - col1                           # key columns in the source dataset
        - col2
      ref_columns:
        - ref_col1                       # corresponding key columns in the reference dataset
        - ref_col2
      # ref_df_name: ref_df_key         # OR use ref_table: catalog.schema.ref_table
      # exclude_columns:
      #   - col7                        # columns to ignore differences for
      # check_missing_records: true     # if true, perform FULL OUTER JOIN to detect missing rows; increases result size

################################################################################
# Section 10 – Dataset‑level multi‑key application (`for_each_column`)
#
# Use this construct for functions that accept a `columns` argument (e.g.
# `is_unique`).  Each inner list defines a distinct key on which the check will
# run independently.
################################################################################

- table_name: catalog.schema.table       # added table name placeholder
  criticality: error
  run_config_name: default               # added run configuration name
  check:
    function: is_unique
    for_each_column:
      - [col1]                           # first uniqueness key: single column in the dataset; run uniqueness check on col1 alone
      - [col2, col3]                     # second key: composite of col2 and col3; run uniqueness check on (col2, col3)
      # - [col4, col5, col6]             # additional keys; each inner list must contain existing column names

################################################################################
# End of template with placeholders
################################################################################


In [0]:
# apply check to multiple columns (simple col, struct, map and array)
- table_name: catalog.schema.table       # added table name placeholder
  name: rule_name
  criticality: error
  check:
    function: is_not_null
    for_each_column:
    - col1 # col
    - col8.field1 # struct col
    - try_element_at(col7, 'key1') # map col
    - try_element_at(col4, 1) # array col 

| employee_id | age |  salary | hire_date  |
|------------:|----:|--------:|-----------|
| 1           | 30 |  50000  | 2020-01-15 |
| 2           | 45 |  80000  | 2018-05-30 |
| 3           | 25 |  45000  | 2022-07-10 |
| 4           | 40 |  70000  | 2019-09-20 |
| 5           | 29 |      0  | 2021-03-12 |
| 6           | 60 | 120000  | 2010-11-05 |

In [0]:
# Section 1 – Simple row-level check
# Ensure salary is within a reasonable range [0, 200000].
- table_name: dq_dev.company.employees
  name: salary_range
  criticality: error
  run_config_name: default
  check:
    function: is_in_range
    arguments:
      column: salary
      min_limit: 0
      max_limit: 200000

# Section 2 – Row-level check applied individually
# Ensure both age and salary are at least 1.
- table_name: dq_dev.company.employees
  name: age_salary_min
  criticality: warn
  run_config_name: default
  check:
    function: is_not_less_than
    for_each_column:
      - age
      - salary
    arguments:
      limit: 1

# Section 3 – Complex types
# (a) Validate that hire_date is a valid date (format yyyy-MM-dd).
- table_name: dq_dev.company.employees
  name: valid_hire_date
  criticality: error
  run_config_name: default
  check:
    function: is_valid_date
    arguments:
      column: hire_date
      date_format: yyyy-MM-dd

# (b) Ensure salary is all numeric digits.
- table_name: dq_dev.company.employees
  name: salary_numeric
  criticality: warn
  run_config_name: default
  check:
    function: regex_match
    arguments:
      column: salary
      regex: '^[0-9]+$'
      negate: false

# (c) Ensure employee_id is present.
- table_name: dq_dev.company.employees
  name: employee_id_exists
  criticality: error
  run_config_name: default
  check:
    function: is_not_null
    arguments:
      column: employee_id

# (d) Ensure salary does not exceed 200000.
- table_name: dq_dev.company.employees
  name: max_salary_le_200k
  criticality: warn
  run_config_name: default
  check:
    function: is_not_greater_than
    arguments:
      column: salary
      limit: 200000

# Section 4 – Row-level SQL expression
# Ensure salary is at least 1000 times age.
- table_name: dq_dev.company.employees
  name: salary_vs_age_rule
  criticality: error
  run_config_name: default
  check:
    function: sql_expression
    arguments:
      expression: "salary >= age * 1000"
      msg: "Salary is unexpectedly low for age"
      name: "salary_vs_age"
      columns: [salary, age]

# Section 5 – Dataset-level uniqueness
# Ensure employee_id is unique.
- table_name: dq_dev.company.employees
  name: unique_employee_id
  criticality: error
  run_config_name: default
  check:
    function: is_unique
    arguments:
      columns: [employee_id]
      nulls_distinct: false

# Section 6 – Dataset-level aggregation
# Ensure the average salary is at least 30000.
- table_name: dq_dev.company.employees
  name: avg_salary_min
  criticality: warn
  run_config_name: default
  check:
    function: is_aggr_not_less_than
    arguments:
      column: salary
      aggr_type: avg
      limit: 30000

# Section 7 – Foreign key
# Ensure employee_id exists in a master employee table.
- table_name: dq_dev.company.employees
  name: employee_fk_check
  criticality: error
  run_config_name: default
  check:
    function: foreign_key
    arguments:
      columns: [employee_id]
      ref_columns: [employee_id]
      ref_table: catalog.hr.employees_master
      negate: false

# Section 8 – Dataset-level SQL query
# Ensure there are at least two employees in each age group.
- table_name: dq_dev.company.employees
  name: age_group_count_check
  criticality: error
  run_config_name: default
  check:
    function: sql_query
    arguments:
      query: |
        SELECT age,
               COUNT(*) < 2 AS condition      -- condition = True when there are < 2 employees in the group
        FROM {{ input_view }}
        GROUP BY age
      merge_columns: [age]
      condition_column: condition
      input_placeholder: input_view
      msg: "Age group has fewer than two employees"
      name: "min_two_employees_per_age"

# Section 9 – Compare datasets
# Compare current employees against last month's snapshot by employee_id, ignoring salary.
- table_name: dq_dev.company.employees
  name: compare_employees
  criticality: warn
  run_config_name: default
  check:
    function: compare_datasets
    arguments:
      columns: [employee_id]
      ref_columns: [employee_id]
      ref_df_name: last_month                 # Reference DataFrame name
      exclude_columns: [salary]               # Ignore salary differences
      check_missing_records: true             # Identify missing or new employee_ids

# Section 10 – Multi-key uniqueness
# Ensure both employee_id and (age, salary) keys are unique.
- table_name: dq_dev.company.employees
  name: multi_key_uniqueness
  criticality: error
  run_config_name: default
  check:
    function: is_unique
    for_each_column:
      - [employee_id]                          # First key
      - [age, salary]                          # Second key (composite)

### CLA Rules

#### wkdy_dim_project.yaml

In [0]:
# Quality rules for wkdy_dim_project.
# Every rule is one list item with three keys at the same indentation level:
#   check        - the function to run and its arguments
#   criticality  - error | warn
#   name         - identifier reported when the rule fails

# Dataset level
- check:
    arguments:
      columns:                # is_unique takes a list of key columns
        - project_key
    function: is_unique
  criticality: error
  name: project_key_is_not_unique

# Row level
- check:
    arguments:
      column: project_key
    function: is_not_null
  criticality: error
  name: project_key_is_null

- check:
    arguments:
      column: project_status
    function: is_not_null
  criticality: error
  name: project_status_is_null

- check:
    arguments:
      allowed:
        - Active
        - Closed
        - Pending Close
        - Schedule Pending
        - Suspended
      column: project_status
    function: is_in_list
  criticality: warn
  name: project_status_is_new_value

# project_type_name must be neither null nor 'Unknown'
- check:
    arguments:
      expression: "project_type_name is not null AND project_type_name != 'Unknown'"
    function: sql_expression
  criticality: warn
  name: project_type_is_not_null_or_unknown

In [0]:
# Enforce uniqueness of project_key across the table (dataset-level)
- table_name: de_prd.gold.wkdy_dim_project
  name: project_key_is_not_unique
  criticality: warn
  run_config_name: default
  check:
    function: is_unique
    arguments:
      columns: [project_key]

# project_key must not be null (row-level)
- table_name: de_prd.gold.wkdy_dim_project
  name: project_key_is_null
  criticality: warn
  run_config_name: default
  check:
    function: is_not_null
    arguments:
      column: project_key

# project_status must not be null (row-level)
- table_name: de_prd.gold.wkdy_dim_project
  name: project_status_is_null
  criticality: warn
  run_config_name: default
  check:
    function: is_not_null
    arguments:
      column: project_status

# project_status must be from allowed set (row-level)
- table_name: de_prd.gold.wkdy_dim_project
  name: project_status_is_new_value
  criticality: warn
  run_config_name: default
  check:
    function: is_in_list
    arguments:
      column: project_status
      allowed:
        - Active
        - Closed
        - Pending Close
        - Schedule Pending
        - Suspended

# project_type_name must not be null or 'Unknown'
- table_name: de_prd.gold.wkdy_dim_project
  name: project_type_is_not_null_or_unknown
  criticality: warn
  run_config_name: default
  check:
    function: sql_expression
    arguments:
      expression: "project_type_name is not null AND project_type_name != 'Unknown'"

# email should match pattern (basic email format)
- table_name: de_prd.gold.wkdy_dim_project
  name: email_is_not_valid
  criticality: warn
  run_config_name: default
  check:
    function: regex_match
    arguments:
      column: email
      regex: "^(.+)@(.+)$"

# project_start_date must be <= project_end_date
- table_name: de_prd.gold.wkdy_dim_project
  name: project_start_after_end_date
  criticality: warn
  run_config_name: default
  check:
    function: sql_expression
    arguments:
      expression: "coalesce(project_start_date, '1900-01-01') <= coalesce(project_end_date, '9999-12-31')"

# first_activity_date must be <= last_activity_date
- table_name: de_prd.gold.wkdy_dim_project
  name: first_activity_date_after_last_activity_date
  criticality: warn
  run_config_name: default
  check:
    function: sql_expression
    arguments:
      expression: "coalesce(first_activity_date, '1900-01-01') <= coalesce(last_activity_date, '9999-12-31')"

# _created_date must be <= _last_updated_date
- table_name: de_prd.gold.wkdy_dim_project
  name: created_date_after_last_updated_date
  criticality: warn
  run_config_name: default
  check:
    function: sql_expression
    arguments:
      expression: "coalesce(_created_date, '1900-01-01') <= coalesce(_last_updated_date, '9999-12-31')"

#### dq_prd.monitoring.job_run_audit

## Working

### Load DQX Config

In [0]:
"""
START: update_dqx_rules_table
|
|-- 1. Check if Delta config table exists:
|     |-- If table does NOT exist:
|     |     |-- Create empty Delta table using TABLE_SCHEMA.
|     |
|     |-- If table DOES exist:
|           |-- Proceed to next step.
|
|-- 2. For each YAML file in rules_dir:
|     |-- Parse file (YAML load).
|     |-- FILE-LEVEL validation (all rules in file target same table; no dup rule names; filename matches table name).
|     |-- For each rule:
|           |-- RULE-LEVEL validation (required fields, format, criticality, etc).
|     |-- DQX syntax validation (DQEngine.validate_checks).
|     |-- For each rule:
|           |-- Extract run_config_name, look up valid_target_table and quarantine_target_table from output config.
|           |-- Flatten and collect rule as dict (with hash_id, audit fields, etc).
|
|-- 3. Combine all flattened rules into one list.
|
|-- 4. Upsert all rules into Delta table:
|     |-- If entry exists (yaml_path, table_name, name): UPDATE all fields except created_at/created_by.
|     |-- If entry missing: INSERT, set created_at=now(UTC), created_by='admin', updated_at/updated_by=None.
|
|-- END: update_dqx_rules_table
"""

In [0]:
%pip install databricks-labs-dqx

In [0]:
dbutils.library.restartPython()

In [0]:
import os
import json
import hashlib
import yaml
from typing import Dict, Any, Optional

from pyspark.sql import SparkSession, types as T
from delta.tables import DeltaTable
from databricks.labs.dqx.engine import DQEngine
from pyspark.sql.functions import to_timestamp, col

from utils.print import print_notebook_env
from utils.timezone import current_time_iso


# --- Unified rules-table schema ---
# Column order: identity columns, then the DQX columns, then ops/audit columns.
# The DQX core columns must match the docs:
# name (STRING), criticality (STRING), check (STRUCT), filter (STRING),
# run_config_name (STRING), user_metadata (MAP<STRING,STRING>)
TABLE_SCHEMA = (
    T.StructType()
    # identity columns
    .add("hash_id", T.StringType(), nullable=False)
    .add("table_name", T.StringType(), nullable=False)
    # DQX fields begin here
    .add("name", T.StringType(), nullable=False)
    .add("criticality", T.StringType(), nullable=False)
    .add(
        "check",
        T.StructType()
        .add("function", T.StringType(), nullable=False)
        .add("for_each_column", T.ArrayType(T.StringType()), nullable=True)
        .add("arguments", T.MapType(T.StringType(), T.StringType()), nullable=True),
        nullable=False,
    )
    .add("filter", T.StringType(), nullable=True)
    .add("run_config_name", T.StringType(), nullable=False)
    .add("user_metadata", T.MapType(T.StringType(), T.StringType()), nullable=True)
    # ops / audit fields
    .add("yaml_path", T.StringType(), nullable=False)
    .add("active", T.BooleanType(), nullable=False)
    .add("created_by", T.StringType(), nullable=False)
    # audit timestamps are carried as ISO strings here; they may be cast on write
    .add("created_at", T.StringType(), nullable=False)
    .add("updated_by", T.StringType(), nullable=True)
    .add("updated_at", T.StringType(), nullable=True)
)


def compute_hash(rule_dict: Dict[str, Any]) -> str:
    """Return a stable MD5 hex digest over the identifying fields of a rule.

    Only ``table_name``, ``name``, ``criticality``, ``run_config_name`` and
    ``check`` contribute to the digest; keys that are absent are skipped and
    every other key is ignored.

    Args:
        rule_dict: One rule as loaded from YAML.

    Returns:
        The 32-character hexadecimal MD5 digest of the JSON-serialised
        identifying fields (keys sorted, so dict ordering does not matter).
    """
    identifying_keys = ("table_name", "name", "criticality", "run_config_name", "check")
    relevant = {key: rule_dict[key] for key in identifying_keys if key in rule_dict}
    # default=str is deliberate: YAML scalars such as unquoted dates load as
    # date/datetime objects, which json cannot serialise on its own. Values that
    # are already JSON-native are unaffected, so existing hashes do not change.
    payload = json.dumps(relevant, sort_keys=True, default=str)
    return hashlib.md5(payload.encode()).hexdigest()


def _stringify_map_values(d: Dict[str, Any]) -> Dict[str, str]:
    """
    Convert a dict of arbitrary JSON-serializable values to map<string,string>
    required by DQX (lists/dicts -> JSON, bool -> 'true'/'false', else str()).
    """
    out: Dict[str, str] = {}
    for k, v in (d or {}).items():
        if isinstance(v, (list, dict)):
            out[k] = json.dumps(v)
        elif isinstance(v, bool):
            out[k] = "true" if v else "false"
        elif v is None:
            out[k] = "null"
        else:
            out[k] = str(v)
    return out


def process_yaml_file(path: str, output_config: Dict[str, Any], time_zone: str = "UTC"):
    """Read one YAML rules file, validate it, and flatten it into rows for the table.

    Args:
        path: Path to the YAML file. Its file stem is handed to the file-level
            validation, which compares it with the rules' table name.
        output_config: Parsed output config. Accepted for interface
            compatibility; this function does not read it.
        time_zone: Passed to ``current_time_iso`` to produce ``created_at``.

    Returns:
        A list with one dict per rule, using the ``TABLE_SCHEMA`` column names
        as keys.

    Raises:
        ValueError: If the file does not contain a rule mapping or a list of
            rule mappings, or if file-level, rule-level, typing, or DQX
            validation fails.
    """
    file_base = os.path.splitext(os.path.basename(path))[0]
    with open(path, "r", encoding="utf-8") as fh:
        docs = yaml.safe_load(fh)
    if isinstance(docs, dict):
        docs = [docs]

    # An empty file loads as None and a scalar file as str/int; fail with a clear
    # message instead of a TypeError/AttributeError further down.
    if not isinstance(docs, list):
        raise ValueError(
            f"{path}: expected a YAML list of rule mappings, found {type(docs).__name__}."
        )
    bad_positions = [idx for idx, doc in enumerate(docs) if not isinstance(doc, dict)]
    if bad_positions:
        raise ValueError(f"{path}: rule entries at positions {bad_positions} are not mappings.")

    validate_rules_file(docs, file_base, path)

    now = current_time_iso(time_zone)
    flat_rules = []

    for rule in docs:
        validate_rule_fields(rule, path)
        rule_name = rule.get("name")

        h = compute_hash(rule)
        check_dict = rule["check"]
        if not isinstance(check_dict, dict):
            raise ValueError(f"{path}: check must be a map (rule '{rule_name}').")

        # Strong typing for DQX struct:
        function = check_dict.get("function")
        if not isinstance(function, str) or not function:
            raise ValueError(f"{path}: check.function must be a non-empty string (rule '{rule_name}').")

        for_each = check_dict.get("for_each_column")
        if for_each is not None and not isinstance(for_each, list):
            raise ValueError(f"{path}: check.for_each_column must be an array of strings (rule '{rule_name}').")
        if isinstance(for_each, list):
            try:
                for_each = [str(x) for x in for_each]
            except Exception as exc:
                raise ValueError(
                    f"{path}: unable to cast for_each_column items to strings (rule '{rule_name}')."
                ) from exc

        arguments = check_dict.get("arguments", {}) or {}
        if not isinstance(arguments, dict):
            raise ValueError(f"{path}: check.arguments must be a map (rule '{rule_name}').")
        arguments = _stringify_map_values(arguments)  # enforce map<string,string>

        user_metadata = rule.get("user_metadata")
        if user_metadata is not None:
            if not isinstance(user_metadata, dict):
                raise ValueError(f"{path}: user_metadata must be a map<string,string> (rule '{rule_name}').")
            user_metadata = _stringify_map_values(user_metadata)

        flat_rules.append({
            "hash_id": h,
            "table_name": rule["table_name"],

            "name": rule["name"],
            "criticality": rule["criticality"],
            "check": {
                "function": function,
                # empty collections are stored as NULL rather than as empty values
                "for_each_column": for_each if for_each else None,
                "arguments": arguments if arguments else None,
            },
            "filter": rule.get("filter"),
            "run_config_name": rule["run_config_name"],
            "user_metadata": user_metadata if user_metadata else None,

            "yaml_path": path,
            "active": rule.get("active", True),
            "created_by": "AdminUser",
            "created_at": now,
            "updated_by": None,
            "updated_at": None,
        })

    # Validate with DQX engine (semantic)
    validate_with_dqx(docs, path)
    return flat_rules


def parse_output_config(config_path: str) -> Dict[str, Any]:
    """Load the YAML config file and check that the required keys are present.

    Args:
        config_path: Path to the YAML config file.

    Returns:
        The parsed config mapping.

    Raises:
        ValueError: If the file does not contain a YAML mapping (for example it
            is empty), or if any of ``dqx_checks_config_table_name``,
            ``dqx_yaml_checks`` or ``run_config_name`` is missing.
    """
    with open(config_path, "r", encoding="utf-8") as fh:
        config = yaml.safe_load(fh)
    # An empty file loads as None; report that instead of failing on `in`.
    if not isinstance(config, dict):
        raise ValueError(
            f"Config at {config_path} must be a YAML mapping, found {type(config).__name__}"
        )
    # Expect the new keys
    required = ["dqx_checks_config_table_name", "dqx_yaml_checks", "run_config_name"]
    missing = [k for k in required if k not in config]
    if missing:
        raise ValueError(f"Config missing required keys: {missing}")
    return config


def validate_rules_file(rules, file_base: str, file_path: str):
    """Run the file-level checks on the rules loaded from one YAML file.

    Checks that every entry is a mapping, that all rules target the same
    ``table_name``, that the last dot-separated part of that table name equals
    ``file_base``, and that every rule has a unique, non-empty ``name``.

    Args:
        rules: List of rule mappings loaded from the file.
        file_base: File name without directory or extension.
        file_path: Path used in error messages.

    Raises:
        ValueError: If any check fails; the message lists every problem found.
    """
    problems = []
    if not isinstance(rules, list):
        problems.append(f"Expected a list of rules in {file_path}, got {type(rules).__name__}")
        raise ValueError(f"File-level validation failed in {file_path}: {problems}")

    # Only mappings can be inspected; anything else is reported, not crashed on.
    mappings = [r for r in rules if isinstance(r, dict)]
    if len(mappings) != len(rules):
        problems.append(f"Non-mapping rule entries found in {file_path}")

    table_names = {r.get("table_name") for r in mappings}
    if len(table_names) != 1:
        problems.append(f"Inconsistent table_name values in {file_path}: {table_names}")
        if not table_names:
            problems.append(f"No valid table_name found in {file_path}")
    else:
        # Compare against the filename only when the table name is unambiguous;
        # picking an arbitrary element of a multi-valued set gave unstable messages.
        (tn,) = table_names
        if not isinstance(tn, str) or not tn:
            problems.append(f"No valid table_name found in {file_path}")
        elif tn.split(".")[-1] != file_base:
            problems.append(
                f"Table name in rules ({tn}) does not match filename ({file_base}) in {file_path}"
            )

    seen_names = set()
    for rule in mappings:
        name = rule.get("name")
        if not name:
            problems.append(f"Missing rule name in {file_path}")
            # Unnamed rules are not tracked, so they are never reported as duplicates.
            continue
        if name in seen_names:
            problems.append(f"Duplicate rule name '{name}' in {file_path}")
        seen_names.add(name)

    if problems:
        raise ValueError(f"File-level validation failed in {file_path}: {problems}")


def validate_rule_fields(rule, file_path: str):
    """Run the rule-level checks on a single rule mapping.

    Checks that the required fields are present and non-empty, that
    ``table_name`` is a string with exactly two dots, that ``criticality`` is
    one of ``error``/``warn``/``warning``, and that ``check`` is a mapping with
    a non-empty ``function``.

    Args:
        rule: One rule mapping loaded from YAML.
        file_path: Path used in error messages.

    Raises:
        ValueError: If any check fails; the message lists every problem found.
    """
    if not isinstance(rule, dict):
        raise ValueError(
            f"Rule-level validation failed: ['Rule entry is not a mapping ({file_path})']"
        )

    problems = []
    rule_name = rule.get("name")
    required_fields = ["table_name", "name", "criticality", "run_config_name", "check"]
    for field in required_fields:
        if not rule.get(field):
            problems.append(
                f"Missing required field '{field}' in rule '{rule_name}' ({file_path})"
            )

    # A null or non-string table_name is a validation problem, not a crash.
    table_name = rule.get("table_name")
    if not isinstance(table_name, str) or table_name.count(".") != 2:
        problems.append(
            f"table_name '{table_name}' not fully qualified in rule '{rule_name}' ({file_path})"
        )

    # Tuple membership compares by equality, so unhashable YAML values cannot raise.
    criticality = rule.get("criticality")
    if criticality not in ("error", "warn", "warning"):
        problems.append(
            f"Invalid criticality '{criticality}' in rule '{rule_name}' ({file_path})"
        )

    # `check: null` or a scalar check has no `.get`; treat both as a missing function.
    check = rule.get("check")
    if not isinstance(check, dict) or not check.get("function"):
        problems.append(
            f"Missing check.function in rule '{rule_name}' ({file_path})"
        )

    if problems:
        raise ValueError(f"Rule-level validation failed: {problems}")


def validate_with_dqx(rules, file_path: str):
    """Validate the rules with ``DQEngine.validate_checks``; raise ValueError on errors.

    ``file_path`` is only used to label the error message.
    """
    validation = DQEngine.validate_checks(rules)
    if not validation.has_errors:
        return
    report = validation.to_string()
    raise ValueError(f"DQX validation failed in {file_path}:\n{report}")


def ensure_delta_table(spark: SparkSession, delta_table_name: str):
    """Create an empty Delta table with ``TABLE_SCHEMA`` unless the table already exists."""
    if spark.catalog.tableExists(delta_table_name):
        print(f"Delta table already exists at {delta_table_name}")
        return

    print(f"Creating new Delta table at {delta_table_name}")
    delta_writer = spark.createDataFrame([], TABLE_SCHEMA).write.format("delta")
    delta_writer.saveAsTable(delta_table_name)


def upsert_rules_into_delta(spark: SparkSession, rules, delta_table_name: str):
    """Upsert the flattened rules into the Delta rules table.

    Rows are matched on ``(yaml_path, table_name, name)``. Matched rows have
    every column updated except ``created_at``/``created_by``, which keep their
    stored values; unmatched rows are inserted as-is. If the table does not
    exist yet, the rules are written as a new Delta table instead.

    Args:
        spark: Active Spark session.
        rules: List of rule dicts shaped like ``TABLE_SCHEMA``; an empty list
            is a no-op.
        delta_table_name: Name of the target Delta table.
    """
    if not rules:
        print("No rules to write, skipping upsert.")
        return

    rule_count = len(rules)
    print(f"\nWriting rules to Delta table '{delta_table_name}'...")
    print(f"Number of rules to write: {rule_count}")

    df = spark.createDataFrame(rules, schema=TABLE_SCHEMA)

    # Cast audit timestamps to actual TIMESTAMP in the sink
    df = df.withColumn("created_at", to_timestamp(col("created_at"))) \
           .withColumn("updated_at", to_timestamp(col("updated_at")))

    # Decide between first write and merge explicitly. Catching every merge
    # exception and falling back to a plain write hid the real failure behind a
    # second error whenever the table already existed.
    if not spark.catalog.tableExists(delta_table_name):
        print(f"Delta table '{delta_table_name}' not found. Writing full table.")
        df.write.format("delta").saveAsTable(delta_table_name)
    else:
        # Keep the original creation audit values on update.
        # NOTE(review): updated_at/updated_by are copied from the source rows as
        # given; this function does not stamp them itself.
        preserved = ("created_at", "created_by")
        update_set = {
            name: col(f"source.{name}") for name in df.columns if name not in preserved
        }
        delta_table = DeltaTable.forName(spark, delta_table_name)
        delta_table.alias("target").merge(
            df.alias("source"),
            "target.yaml_path = source.yaml_path AND target.table_name = source.table_name AND target.name = source.name"
        ).whenMatchedUpdate(set=update_set).whenNotMatchedInsertAll().execute()

    # len(rules) equals the DataFrame row count without re-running the plan.
    print(f"Successfully wrote {rule_count} rules to '{delta_table_name}'.")


def print_rules_df(spark: SparkSession, rules):
    """Dry-run helper: build the rules DataFrame, print it, and return it.

    Prints a notice and returns ``None`` when ``rules`` is empty. Otherwise
    shows up to 50 rows untruncated, prints the row count, and returns the
    DataFrame with both audit columns cast to timestamps.
    """
    if not rules:
        print("No rules to show.")
        return

    preview = spark.createDataFrame(rules, schema=TABLE_SCHEMA)
    for audit_column in ("created_at", "updated_at"):
        preview = preview.withColumn(audit_column, to_timestamp(col(audit_column)))

    print("\n==== Dry Run: Rules DataFrame to be uploaded ====")
    preview.show(n=50, truncate=False)
    total = preview.count()
    print(f"Total rules: {total}")
    return preview


def validate_all_rules(rules_dir: str, output_config: Dict[str, Any], fail_fast: bool = True):
    """Validate every ``.yaml``/``.yml`` file in ``rules_dir`` and report the outcome.

    Each file goes through file-level, rule-level and DQX validation, with
    progress printed along the way. Any exception raised for a file is recorded
    as that file's error; with ``fail_fast`` the scan stops at the first failing
    file. ``output_config`` is accepted but not read here.

    Returns:
        The list of error messages (empty when every file is valid).
    """
    def _check_one(yaml_file: str) -> None:
        # Same three stages as the loader: file-level, per-rule, then DQX.
        stem = os.path.splitext(os.path.basename(yaml_file))[0]
        with open(yaml_file, "r") as handle:
            loaded = yaml.safe_load(handle)
        if isinstance(loaded, dict):
            loaded = [loaded]
        validate_rules_file(loaded, stem, yaml_file)
        print(f"  File-level validation passed for {yaml_file}")
        for entry in loaded:
            validate_rule_fields(entry, yaml_file)
            print(f"    Rule-level validation passed for rule '{entry.get('name')}'")
        validate_with_dqx(loaded, yaml_file)
        print(f"  DQX validation PASSED for {yaml_file}")

    failures = []
    print(f"Starting validation for all YAML rule files in '{rules_dir}'")
    yaml_names = [entry for entry in os.listdir(rules_dir) if entry.endswith((".yaml", ".yml"))]
    for yaml_name in yaml_names:
        full_path = os.path.join(rules_dir, yaml_name)
        print(f"\nValidating file: {full_path}")
        try:
            _check_one(full_path)
        except Exception as ex:
            print(f"  Validation FAILED for file {full_path}\n  Reason: {ex}")
            failures.append(str(ex))
            if fail_fast:
                break

    if failures:
        print("\nRule validation errors found:")
        for message in failures:
            print(message)
    else:
        print("\nAll YAML rule files are valid!")
    return failures


def main(
    output_config_path: str = "resources/dqx_config.yaml",
    rules_dir: Optional[str] = None,
    time_zone: str = "America/Chicago",
    dry_run: bool = False,
    validate_only: bool = False
):
    """Load the DQX rules from YAML and upsert them into the Delta rules table.

    Args:
        output_config_path: Path to the YAML config read by ``parse_output_config``.
        rules_dir: Directory with the YAML rule files; falls back to the
            config's ``dqx_yaml_checks`` entry when not given.
        time_zone: Passed to the environment banner and to the rule loader.
        dry_run: Build and print the rules DataFrame, then stop without writing.
        validate_only: Validate the YAML files and stop. Takes precedence over
            ``dry_run``.

    Raises:
        ValueError: If the config or any rule file is invalid.
    """
    spark = SparkSession.builder.getOrCreate()

    print_notebook_env(spark, local_timezone=time_zone)

    output_config = parse_output_config(output_config_path)

    # pick up rules_dir from config if not provided explicitly
    rules_dir = rules_dir or output_config["dqx_yaml_checks"]

    delta_table_name = output_config["dqx_checks_config_table_name"]

    # Validate before loading: the loader raises on the first invalid file, so
    # running it first meant this report was never reached for a broken file.
    if validate_only:
        print("\nValidation only: not writing any rules.")
        errors = validate_all_rules(rules_dir, output_config)
        if errors:
            # Still fail loudly so callers and CI see invalid rules as an exception.
            raise ValueError(f"Rule validation failed for '{rules_dir}': {errors}")
        return

    all_rules = []
    for fname in os.listdir(rules_dir):
        if not fname.endswith((".yaml", ".yml")):
            continue
        full_path = os.path.join(rules_dir, fname)
        file_rules = process_yaml_file(full_path, output_config, time_zone=time_zone)
        all_rules.extend(file_rules)

    if dry_run:
        print_rules_df(spark, all_rules)
        return

    ensure_delta_table(spark, delta_table_name)
    upsert_rules_into_delta(spark, all_rules, delta_table_name)
    print(f"Finished writing rules to '{delta_table_name}'.")


if __name__ == "__main__":
    # Alternative invocations for manual runs:
    #   main(dry_run=True)        - print the rules DataFrame without writing it
    #   main(validate_only=True)  - validate the YAML rule files and stop
    main()