# DQX

In [0]:
https://adb-5943863453538272.12.azuredatabricks.net/editor/notebooks/4441860942440814?o=5943863453538272

In [0]:
databricks labs dqx open-dashboards

## Reference

In [0]:
# --------------------------------------------------------------------------
# Built‑in DQX quality rule definitions
# --------------------------------------------------------------------------

# Argument reference for the built-in row-level check functions.
# Each entry maps a function name to:
#   "args"     - argument names that must always be supplied
#   "optional" - argument names that may be left out
ROW_LEVEL_FUNCTIONS = {
    "is_not_null": {"args": ["column"], "optional": []},
    "is_not_empty": {"args": ["column"], "optional": []},
    "is_not_null_and_not_empty": {"args": ["column"], "optional": ["trim_strings"]},
    "is_in_list": {"args": ["column", "allowed"], "optional": []},
    "is_not_null_and_is_in_list": {"args": ["column", "allowed"], "optional": []},
    "is_not_null_and_not_empty_array": {"args": ["column"], "optional": []},
    "is_in_range": {"args": ["column", "min_limit", "max_limit"], "optional": []},
    "is_not_in_range": {"args": ["column", "min_limit", "max_limit"], "optional": []},
    "is_not_less_than": {"args": ["column", "limit"], "optional": []},
    "is_not_greater_than": {"args": ["column", "limit"], "optional": []},
    "is_valid_date": {"args": ["column"], "optional": ["date_format"]},
    "is_valid_timestamp": {"args": ["column"], "optional": ["timestamp_format"]},
    # `offset` has a default (0) in DQX, so only `column` is mandatory for the
    # two "future" checks.
    "is_not_in_future": {"args": ["column"], "optional": ["offset", "curr_timestamp"]},
    "is_not_in_near_future": {"args": ["column"], "optional": ["offset", "curr_timestamp"]},
    "is_older_than_n_days": {"args": ["column", "days"], "optional": ["curr_date", "negate"]},
    "is_older_than_col2_for_n_days": {"args": ["column1", "column2", "days"], "optional": ["negate"]},
    "regex_match": {"args": ["column", "regex"], "optional": ["negate"]},
    "is_valid_ipv4_address": {"args": ["column"], "optional": []},
    "is_ipv4_address_in_cidr": {"args": ["column", "cidr_block"], "optional": []},
    # sql_expression takes a SQL predicate instead of a `column`.
    "sql_expression": {
        "args": ["expression"],
        "optional": ["msg", "name", "negate", "columns"],
    },
}

# Argument reference for the built-in dataset-level check functions:
# function name -> {"args": [required names], "optional": [optional names]}.
DATASET_LEVEL_FUNCTIONS = {
    "is_unique": {"args": ["columns"], "optional": ["nulls_distinct"]},
    # The four aggregate comparisons share a single argument layout; each entry
    # still gets its own fresh lists.
    **{
        aggr_check: {"args": ["limit"], "optional": ["column", "aggr_type", "group_by"]}
        for aggr_check in (
            "is_aggr_not_greater_than",
            "is_aggr_not_less_than",
            "is_aggr_equal",
            "is_aggr_not_equal",
        )
    },
    "foreign_key": {
        "args": ["columns", "ref_columns"],
        "optional": ["ref_df_name", "ref_table", "negate"],
    },
    "sql_query": {
        "args": ["query", "merge_columns", "condition_column"],
        "optional": ["input_placeholder", "msg", "name", "negate"],
    },
    "compare_datasets": {
        "args": ["columns", "ref_columns"],
        "optional": ["exclude_columns", "ref_df_name", "ref_table", "check_missing_records"],
    },
}

# Allowed `criticality` settings, each with a note on where a failing record goes.
CRITICALITY_VALUES = dict(
    error="record goes only into the bad/quarantine DataFrame",
    warn="record goes into both good and bad DataFrames",
)

# The two settings a `negate` argument can take.
NEGATE_VALUES = [True, False]

# Description of the `regex` argument and the function that uses it.
REGEX_PARAMETER = dict(
    key="regex",
    type="str",
    description="Regular expression used by regex_match to evaluate values",
    used_in=["regex_match"],
)

RULE_LEVEL = ["row", "aggregate"]
FUNCTION_TYPE = ["built_in", "sql_expression", "custom"]

# Layout of a single check definition; entries carrying "optional": True may
# be left out.
CHECK_METADATA_FIELDS = dict(
    criticality=dict(type="str", allowed_values=[*CRITICALITY_VALUES]),
    check=dict(
        function=dict(type="str"),
        arguments=dict(type="dict"),
        for_each_column=dict(type="list[str]", optional=True),
    ),
    name=dict(type="str", optional=True),
    filter=dict(type="str", optional=True),
    user_metadata=dict(type="dict[str, any]", optional=True),
)

In [0]:
# Keywords that appear in DQX results, each with a short description.
DQX_RESULT_KEYWORDS = dict(
    criticality=dict(
        values=["error", "warn"],
        description="Severity level for the rule—shows up in dashboards/results.",
    ),
    _errors=dict(
        description="Column auto-added by DQX to result DF, contains error violations",
    ),
    _warnings=dict(
        description="Column auto-added by DQX to result DF, contains warning violations",
    ),
)

### YAML-defined Rule Templates

In [0]:
################################################################################
# Section 1 – Simple row‑level checks
#
# These rules operate on a single column at a time for each row.  Only the
# `column` argument is mandatory.  Additional arguments apply only to certain
# functions and are shown commented out with explanations.
################################################################################

- criticality: error                 # 'error' => quarantine row on failure; 'warn' => flag but keep row
  check:
    function: is_not_null            # any row-level built-in function
    arguments:
      column: col1                   # name or expression of a column in the table being checked
      # Function-specific arguments: uncomment only those that belong to the chosen function.
      # trim_strings: true           # for is_not_null_and_not_empty (optional)
      # allowed: [1, 2, 3]           # for is_in_list / is_not_null_and_is_in_list: values the column must belong to
      # min_limit: 0                 # for is_in_range / is_not_in_range: lower bound (inclusive)
      # max_limit: 10                # for is_in_range / is_not_in_range: upper bound (inclusive)
      # limit: 5                     # for is_not_less_than / is_not_greater_than: numeric/date/timestamp limit
      # date_format: yyyy-MM-dd      # for is_valid_date: expected date pattern
      # timestamp_format: yyyy-MM-dd HH:mm:ss   # for is_valid_timestamp: expected timestamp pattern
      # offset: 86400                # for is_not_in_future / is_not_in_near_future: offset in seconds
      # curr_timestamp: '2025-08-05T00:00:00'   # for is_not_in_future / is_not_in_near_future: override current timestamp; string or timestamp literal
      # days: 7                      # for is_older_than_n_days / is_older_than_col2_for_n_days: number of days
      # curr_date: '2025-08-05'      # for is_older_than_n_days (optional)
      # cidr_block: '192.168.1.0/24' # for is_ipv4_address_in_cidr
      # regex: '[A-Z]{3}[0-9]{4}'    # for regex_match: regular expression pattern
      # negate: false                # boolean: set true to invert the check (fail when regex matches, pass otherwise)
      # NOTE: is_older_than_col2_for_n_days takes `column1` and `column2` in place of `column`.

################################################################################
# Section 2 – Row‑level checks applied to multiple columns individually
#
# Use `for_each_column` when the same single‑column rule should be executed on
# several columns.  Each entry must be a valid column name or expression from
# the table being checked.  There is no hard limit on the number of columns.
################################################################################

- check:
    function: is_not_null
    # One single-column check is generated per listed name.  Extend the list
    # (e.g. with col3) as needed; every name must exist in the dataset.
    for_each_column: [col1, col2]
  criticality: error

################################################################################
# Section 3 – Row‑level checks on complex types (struct, map, array)
#
# These templates target elements within complex columns.  Use dot notation
# (`struct_field`) for struct fields, and `try_element_at` for map or array
# elements.  For array aggregations (e.g. max/min), include a numeric limit.
################################################################################

# Single element from a complex type
- criticality: error
  check:
    function: is_not_null
    arguments:
      column: col8.field1            # struct field; must refer to an existing field
      # column: try_element_at(col7, 'key1')  # map element lookup; key must exist
      # column: try_element_at(col4, 1)       # array element lookup; Spark array positions are 1-based (index 0 raises an error)
      # Any optional parameters from Section 1 can be included here (e.g. regex)

# Aggregating an array (e.g. ensure max element ≤ 10)
- criticality: error
  check:
    function: is_not_greater_than    # or is_not_less_than
    arguments:
      column: array_max(col4)        # aggregate function applied to an array column
      limit: 10                      # required for array aggregation; numeric threshold for comparison

################################################################################
# Section 4 – Row‑level SQL expression
#
# For complex row‑level logic, use `sql_expression`.  This rule does not take a
# `column` field; instead provide an SQL predicate.  Optional fields allow you to
# customise messages, naming and inversion behaviour.
################################################################################

- criticality: error
  check:
    function: sql_expression
    arguments:
      expression: "col3 >= col2 AND col3 <= 10"  # SQL predicate that valid rows satisfy; a row fails when it evaluates to False
      # msg: "col3 out of range"       # optional message to include in result rows
      # name: "col3_range_check"       # optional name for the resulting check column (appears in result DataFrame)
      # negate: false                  # boolean: set true to invert the logic (fail when expression is True)
      # columns: [col2, col3]          # optional list of one or more column names used only for reporting; does not affect logic

################################################################################
# Section 5 – Dataset‑level uniqueness check
#
# Ensures that the specified columns are unique across all rows.  A failure is
# logged for every duplicate row found.  Use `nulls_distinct` to control how
# NULL values are treated.
################################################################################

- check:
    function: is_unique
    arguments:
      # Key column(s) tested for uniqueness; list more names (e.g. col2) to
      # form a composite key.
      columns: [col1]
      # nulls_distinct: true          # true (default) treats NULLs as distinct; false treats NULLs as equal
  criticality: error

################################################################################
# Section 6 – Dataset‑level aggregation checks
#
# Compare aggregated values against a threshold.  Specify `aggr_type` (count,
# sum, avg, min, max).  For count, omit the column; otherwise provide the
# column being aggregated.  `group_by` can list one or more columns to group
# rows before aggregation.
################################################################################

# Whole-table row count: at most 10
- criticality: error
  check:
    function: is_aggr_not_greater_than
    arguments:
      limit: 10                      # threshold the aggregate is compared against
      aggr_type: count               # one of count, sum, avg, min, max

# Number of non-null col2 values: at most 10
- criticality: error
  check:
    function: is_aggr_not_greater_than
    arguments:
      limit: 10
      aggr_type: count
      column: col2                   # aggregated column; leave out to count all rows

# Count of col2 within each col3 group: at most 10
- criticality: error
  check:
    function: is_aggr_not_greater_than
    arguments:
      limit: 10
      aggr_type: count
      column: col2
      group_by: [col3]               # grouping column(s); each must exist in the table

################################################################################
# Section 7 – Dataset‑level foreign key check
#
# Validates that values in the source dataset exist in a reference dataset.
# Provide either a reference DataFrame name (`ref_df_name`) or a fully qualified
# table name (`ref_table`), not both.  `negate` flips the logic to flag rows
# that do exist in the reference.
################################################################################

# Foreign key on one column, resolved against a reference DataFrame
- criticality: error
  check:
    function: foreign_key
    arguments:
      columns: [col1]                # source-side column(s)
      ref_columns: [ref_col1]        # reference-side column(s), paired with `columns`
      ref_df_name: ref_df_key        # key under which the reference DataFrame is found in ref_dfs
      # negate: false                # optional; true makes rows that DO exist in the reference fail

# Foreign key on a composite key, resolved against a table
- criticality: error
  check:
    function: foreign_key
    arguments:
      columns: [col1, col2]
      ref_columns: [ref_col1, ref_col2]
      ref_table: catalog.schema.ref_table   # reference table, fully qualified

################################################################################
# Section 8 – Dataset‑level SQL query
#
# Executes an arbitrary SQL query across the dataset (and optional reference data).
# The query must return all `merge_columns` plus a boolean `condition_column`.
# The check fails when the condition column is True (unless `negate` is true).
# Use `input_placeholder` to name the input DataFrame within the SQL query.
################################################################################

# `{{ input_view }}` in the query stands for the input dataset; the name inside
# the braces must match `input_placeholder`.  Do not put YAML `#` comments
# inside the `query: |` block: every character there is part of the SQL text
# (use `--` if a SQL comment is needed).
- criticality: error
  check:
    function: sql_query
    arguments:
      query: |
        SELECT col1, col2, SUM(col3) = 0 AS condition
        FROM {{ input_view }}
        GROUP BY col1, col2
      merge_columns:
        - col1
        - col2
      condition_column: condition              # name of the boolean column returned by the query
      input_placeholder: input_view            # alias used inside double braces in the SQL
      # msg: "Aggregated col3 is zero"         # optional message added to result rows
      # name: "check_sum_col3_zero"            # optional name of the resulting check column
      # negate: false                          # optional; set true to invert the meaning (fail when condition is False)

################################################################################
# Section 9 – Dataset‑level compare datasets
#
# Compares two datasets at row and column level.  Provide matching key columns
# for both source and reference.  You can exclude columns from the comparison
# and enable detection of missing records via full outer join.
################################################################################

# A reference dataset is mandatory: set exactly one of `ref_df_name` or
# `ref_table`.
- criticality: error
  check:
    function: compare_datasets
    arguments:
      columns:
        - col1                       # key columns in the source dataset
        - col2
      ref_columns:
        - ref_col1                   # corresponding key columns in the reference dataset
        - ref_col2
      ref_df_name: ref_df_key                 # name of the reference DataFrame
      # ref_table: catalog.schema.ref_table   # alternative to ref_df_name: fully qualified reference table
      # exclude_columns:
      #   - col7                              # columns to ignore differences for
      # check_missing_records: true           # if true, perform FULL OUTER JOIN to detect missing rows; increases result size

################################################################################
# Section 10 – Dataset‑level multi‑key application (`for_each_column`)
#
# Use this construct for functions that accept a `columns` argument (e.g.
# `is_unique`).  Each inner list defines a distinct key on which the check will
# run independently.
################################################################################

- check:
    function: is_unique
    # Outer list: one entry per uniqueness key, each checked independently.
    # Inner list: the column(s) making up that key.  Append further keys such
    # as [col4, col5, col6]; every column named must exist in the dataset.
    for_each_column: [[col1], [col2, col3]]
  criticality: error

################################################################################
# End of document
################################################################################

#### Example Rules

| animal_id | species | age_years | weight_kg | feeding_schedule | care_info                                      | attributes                          |
|-----------|---------|-----------|-----------|------------------|------------------------------------------------|-------------------------------------|
| 101       | lion    | 5         | 190.5     | [8, 16]          | {"keeper":"Alice", "last_check":"2025-07-31"}  | {"color":"gold", "region":"Africa"} |
| 102       | tiger   | 3         | 220.0     | [7, 15, 22]      | {"keeper":"Bob", "last_check":"2025-07-30"}    | {"color":"orange", "pattern":"striped"} |
| 103       | giraffe | 8         | 800.0     | [9]              | {"keeper":"Carol", "last_check":"2025-07-29"}  | {"height":"tall"}                   |
| 104       | zebra   | 4         | 350.0     | [10, 18]         | {"keeper":"Dave", "last_check":"2025-07-28"}   | {"color":"black-white", "pattern":"striped"} |

In [0]:
# Section 1 – Simple row-level check
# weight_kg has to fall inside the closed interval [0, 1000].
- name: weight_range
  criticality: error
  check:
    function: is_in_range
    arguments:
      column: weight_kg       # dataset column under test
      min_limit: 0            # inclusive lower bound
      max_limit: 1000         # inclusive upper bound

# Section 2 – One row-level check applied to several columns
# age_years and weight_kg must each be no smaller than 1.
- name: age_weight_min
  criticality: warn
  check:
    function: is_not_less_than
    for_each_column: [age_years, weight_kg]   # the check runs once per listed column
    arguments:
      limit: 1                                # smallest value accepted

# Section 3 – Complex types
# (a) care_info.last_check must parse as a date written as yyyy-MM-dd.
- name: valid_last_check
  criticality: error
  check:
    function: is_valid_date
    arguments:
      column: care_info.last_check
      date_format: yyyy-MM-dd

# (b) attributes['pattern'] must be exactly "striped" or "spotted".
- name: pattern_regex
  criticality: warn
  check:
    function: regex_match
    arguments:
      column: try_element_at(attributes, 'pattern')
      regex: '^(striped|spotted)$'
      negate: false          # not inverted: a value that does not match the regex fails

# (c) Ensure the first element of feeding_schedule is present.
# Spark SQL array positions are 1-based and index 0 is rejected with an error,
# so the first element is addressed with index 1.
- criticality: error
  name: first_feed_exists
  check:
    function: is_not_null
    arguments:
      column: try_element_at(feeding_schedule, 1)

# (d) The largest value in feeding_schedule must not be above 24.
- name: max_feed_le_24
  criticality: warn
  check:
    function: is_not_greater_than
    arguments:
      column: array_max(feeding_schedule)   # highest element of the array
      limit: 24                             # numeric ceiling the array maximum is compared with

# Section 4 – Row-level SQL expression
# weight_kg should be no less than 20 times age_years.
- name: weight_vs_age_rule
  criticality: error
  check:
    function: sql_expression
    arguments:
      expression: "weight_kg >= age_years * 20"
      msg: "Animal weight is unexpectedly low"
      name: "weight_vs_age"
      # Columns listed for reporting; both must be present in the table.
      columns:
        - weight_kg
        - age_years

# Section 5 – Dataset-level uniqueness (reference: https://databrickslabs.github.io/dqx/docs/reference/quality_rules/)
# (animal_id, species) pairs must not repeat; NULLs are compared as equal.
- name: unique_animal_species
  criticality: error
  check:
    function: is_unique
    arguments:
      columns:
        - animal_id
        - species
      nulls_distinct: false  # NULL keys count as equal, so repeated NULLs are duplicates

# Section 6 – Dataset-level aggregation (reference: https://databrickslabs.github.io/dqx/docs/reference/quality_rules/)
# Within each species, the mean of age_years must be 2 or more.
- name: avg_age_per_species
  criticality: warn
  check:
    function: is_aggr_not_less_than
    arguments:
      aggr_type: avg         # aggregate with the average
      column: age_years      # value being averaged
      group_by:
        - species            # one aggregate per species
      limit: 2               # lowest acceptable average

# Section 7 – Foreign key
# Every keeper must appear in the external staff table.
- name: keeper_fk_check
  criticality: error
  check:
    function: foreign_key
    arguments:
      columns:
        - care_info.keeper
      ref_columns:
        - keeper_name
      ref_table: catalog.hr.allowed_keepers
      negate: false          # not inverted: a keeper missing from the reference table fails

# Section 8 – Dataset-level SQL query (reference: https://databrickslabs.github.io/dqx/docs/reference/quality_rules/)
# Every species must be represented by two or more animals.
- name: species_count_check
  criticality: error
  check:
    function: sql_query
    arguments:
      input_placeholder: input_view           # name used inside the double braces in the query
      query: |
        SELECT species,
               COUNT(*) < 2 AS condition      -- condition = True when there are < 2 animals
        FROM {{ input_view }}
        GROUP BY species
      merge_columns:
        - species                             # key for joining the result back onto the input
      condition_column: condition             # boolean result column; True marks a failure
      msg: "Species has fewer than two animals"
      name: "min_two_animals_per_species"

# Section 9 – Compare datasets (reference: https://databrickslabs.github.io/dqx/docs/reference/quality_rules/)
# Diff the current animals against last week's snapshot, matched on animal_id;
# weight_kg is left out of the comparison.
- name: compare_animals
  criticality: warn
  check:
    function: compare_datasets
    arguments:
      columns:
        - animal_id
      ref_columns:
        - animal_id
      ref_df_name: last_week                 # name of the reference DataFrame
      exclude_columns:
        - weight_kg                          # differences in weight are ignored
      check_missing_records: true            # also report animal_ids that are missing or new

# Section 10 – Multi-key uniqueness
# Two keys must each be unique: animal_id, and (species, care_info.keeper).
- name: multi_key_uniqueness
  criticality: error
  check:
    function: is_unique
    # One inner list per key: a single-column key, then a composite key.
    for_each_column: [[animal_id], [species, care_info.keeper]]

## Custom Rules

In [0]:
# Custom DQX rules.  Every rule is one list item with three top-level keys:
#   check        - the function to run and its arguments
#   criticality  - error or warn
#   name         - name under which a violation is reported

# Dataset level
- check:
    function: is_unique
    arguments:
      columns:
        - project_key
  criticality: error
  name: project_key_is_not_unique

# Row level
- check:
    function: is_not_null
    arguments:
      column: project_key
  criticality: error
  name: project_key_is_null

- check:
    function: is_not_null
    arguments:
      column: project_status
  criticality: error
  name: project_status_is_null

# project_status must be one of the known values
- check:
    function: is_in_list
    arguments:
      column: project_status
      allowed:
        - Active
        - Closed
        - Pending Close
        - Schedule Pending
        - Suspended
  criticality: warn
  name: project_status_is_new_value

# project type must be neither null nor 'Unknown'
- check:
    function: sql_expression
    arguments:
      expression: "project_type_name is not null AND project_type_name != 'Unknown'"
  criticality: warn
  name: project_type_is_not_null_or_unknown

# email must contain an '@' with text on both sides
- check:
    function: regex_match
    arguments:
      column: email
      regex: '^(.+)@(.+)$'
  criticality: warn
  name: email_is_not_valid

# Date-order checks: the first date must not be later than the second.  A
# missing first date defaults to 1900-01-01 and a missing second date to
# 9999-12-31, so a NULL on either side never fails the check.
- check:
    function: sql_expression
    arguments:
      expression: "coalesce(project_start_date, '1900-01-01') <= (coalesce(project_end_date, '9999-12-31'))"
  criticality: warn
  name: project_start_after_end_date

- check:
    function: sql_expression
    arguments:
      expression: "coalesce(first_activity_date, '1900-01-01') <= (coalesce(last_activity_date, '9999-12-31'))"
  criticality: warn
  name: first_activity_date_after_last_activity_date

- check:
    function: sql_expression
    arguments:
      expression: "coalesce(_created_date, '1900-01-01') <= (coalesce(_last_updated_date, '9999-12-31'))"
  criticality: warn
  name: created_date_after_last_updated_date

## Working

In [0]:
# Install the databricks-labs-dqx package with the notebook's %pip magic.
%pip install databricks-labs-dqx

# Restart the Python process after the install (presumably so the freshly
# installed package can be imported in the cells that follow).
dbutils.library.restartPython()