In [1]:
import great_expectations as gx
from great_expectations.core import RunIdentifier
from great_expectations.checkpoint.actions import UpdateDataDocsAction
from datetime import datetime
import pandas as pd
import os
import numpy as np

In [2]:
#getting the data and context
context = gx.get_context() #not specified the mode name as file as we have already did that in our test file and get_context() will automatically look for the context in the current directory
#context = gx.get_context(mode="file", project_root_dir=os.getcwd()) #if you want to specify the mode and project root directory
df = pd.read_csv("Test_data.csv")

In [3]:
df.head()


Unnamed: 0,CERTNUM,ORGNUM,ABBRNAME,BLTV,RATEIND,CLTV,SECMFLAG,REGION,Sum_TUBUREAU,Sum_EQFAXBR,...,12 Months Ago- First,24 Months Ago,12 Months Ago -Last,1-12 Flag,13-24 Flag,NUM_BOR,URBAN_RURAL,COMP TDSR >40 (#),COMP GDSR >35 (#),GIFT (#)
0,2043570463,89H7,ATB,89.99,A,89.99,F,ALBERTA,1519.0,1487.0,...,2024-07-01 00:00:00,2023-07-01 00:00:00,2024-06-30 00:00:00,,,MULTIPLE,URBAN,0,0,0
1,2034712641,A389,B2B,94.9,F,94.9,F,ONTARIO,1619.0,1503.0,...,2024-07-01 00:00:00,2023-07-01 00:00:00,2024-06-30 00:00:00,,,MULTIPLE,URBAN,0,0,0
2,2040637737,5PVL,CPDQ,95.0,F,95.0,F,QUEBEC,1613.0,1552.0,...,2024-07-01 00:00:00,2023-07-01 00:00:00,2024-06-30 00:00:00,,,MULTIPLE,URBAN,0,0,0
3,2032083442,58DL,BNS,29.14,F,0.0,F,PACIFIC,0.0,0.0,...,2024-07-01 00:00:00,2023-07-01 00:00:00,2024-06-30 00:00:00,,,SINGLE,URBAN,0,0,0
4,2045249808,F936,BPCU,90.0,F,90.0,F,PRAIRIES,1520.0,1476.0,...,2024-07-01 00:00:00,2023-07-01 00:00:00,2024-06-30 00:00:00,,13-24 MONTH,MULTIPLE,RURAL,0,0,0


In [14]:
# Specifying Names - Data Source, Data Asset, Batch, Expecation Suite, Validation, Checkpoint
data_source_name = "risk_analytics"
data_asset_name = "delq_history"
batch_name = "delq_history_batch"
expectation_suite_name = "delq_history_quality_suite" 
batch_parameters = {"dataframe": df}
checkpoint_name = "delq_history_checkpoint"
validation_definition_name = "delq_history_validation"
run_id = RunIdentifier(run_name=f"Delq History Validation")

In [None]:
# adding data source,  data asset, batch to the context
data_source = context.data_sources.add_pandas(name=data_source_name)
data_asset = data_source.add_dataframe_asset(name=data_asset_name)
batch_definition = data_asset.add_batch_definition_whole_dataframe(name=batch_name)

In [11]:
# Creating an expectation suite and adding it to the context
delq_suite = gx.ExpectationSuite(name=expectation_suite_name)
context.suites.add(delq_suite)

{
  "name": "delq_history_quality_suite",
  "id": "72ea19b8-992e-4ede-9ef9-dd327ee9b6a0",
  "expectations": [],
  "meta": {
    "great_expectations_version": "1.5.5"
  },
  "notes": null
}

In [12]:
# Creating Expectations and add them to the expectation suite
# Expecation 1: Column 'Region' Should be in a set of values 
    # We will use the value counts of the 'REGION' column to create a set of expected values
    # and then use that set to create an expectation that the 'REGION' column should only contain those values.
    # This is useful for ensuring that the 'REGION' column does not contain unexpected values.
region_set = set(df['REGION'].value_counts().index)
region_set.remove("OTHER")
region_expectation = gx.expectations.ExpectColumnValuesToBeInSet( 
        column="REGION",
        value_set=region_set,
        result_format="COMPLETE",
        mostly=0.99
    )
delq_suite.add_expectation(region_expectation)

# Expecation 2: Columns which should not be null 
not_null_columns = ["REGION", "CERTNUM","DTAPPREC"]
for column in not_null_columns:
    expectation_name = f"not_null_expectation_{column}"
    expectation_name = gx.expectations.ExpectColumnValuesToNotBeNull(
        column=column,
        result_format="COMPLETE",
        mostly=0.99
    )
    delq_suite.add_expectation(expectation_name)

# Expecation 3: Columns which should be not null with a condition 
condition_columns = ["CURRENT_OUTSTANDING_BAL_AMT", "CPA_REMAINING_AMORTIZATION"]
for column in condition_columns:
    expectation_name = f"not_null_condition_expectation_{column}"
    expectation_name = gx.expectations.ExpectColumnValuesToNotBeNull(
        column=column,
        result_format="COMPLETE",
        mostly=0.99,
        row_condition= "CURRENT_OUTSTANDING_BAL_AMT > 0",
        condition_parser="pandas"
    )
    delq_suite.add_expectation(expectation_name)

# Expecation 4: Column 'CURRENT_OUTSTANDING_BAL_AMT' should be greater than 0 when 'CPA_REMAINING_AMORTIZATION' is greater than 0
outstanding_condition_expectation = gx.expectations.ExpectColumnValuesToNotBeNull(
        column="CURRENT_OUTSTANDING_BAL_AMT",
        result_format="COMPLETE",
        mostly=0.99,
        row_condition="CPA_REMAINING_AMORTIZATION > 0",
        condition_parser="pandas"
    )
delq_suite.add_expectation(outstanding_condition_expectation)

# Expecation 5: Outstanding balance should be less than or equal to 1500000
outstanding_balance_expectation = gx.expectations.ExpectColumnValuesToBeBetween(
        column="CURRENT_OUTSTANDING_BAL_AMT",
        min_value=  0,
        strict_min = True,
        max_value=1500000,
        result_format="COMPLETE",
        mostly=0.99
    )

In [13]:
delq_suite.expectations

[ExpectColumnValuesToBeInSet(id='028de4d7-b367-445b-844b-470545b01b24', meta={}, notes=None, result_format=<ResultFormat.COMPLETE: 'COMPLETE'>, description=None, catch_exceptions=True, rendered_content=None, windows=None, batch_id=None, column='REGION', mostly=0.99, row_condition=None, condition_parser='pandas', value_set=['GTA', 'PACIFIC', 'ATLANTIC', 'PRAIRIES', 'QUEBEC', 'ONTARIO', 'ALBERTA']),
 ExpectColumnValuesToNotBeNull(id='bf9fffdb-2abc-42ef-9158-253490f6900b', meta={}, notes=None, result_format=<ResultFormat.COMPLETE: 'COMPLETE'>, description=None, catch_exceptions=True, rendered_content=None, windows=None, batch_id=None, column='REGION', mostly=0.99, row_condition=None, condition_parser='pandas'),
 ExpectColumnValuesToNotBeNull(id='8235255c-4aff-4671-8c84-3c8ff55c4180', meta={}, notes=None, result_format=<ResultFormat.COMPLETE: 'COMPLETE'>, description=None, catch_exceptions=True, rendered_content=None, windows=None, batch_id=None, column='CERTNUM', mostly=0.99, row_conditio

In [23]:
validation_definition = gx.ValidationDefinition(
    name=validation_definition_name,
    suite=delq_suite,
    data =batch_definition)

context.validation_definitions.add(validation_definition)

ValidationDefinition(name='delq_history_validation', data=BatchDefinition(id=UUID('8a63d1e7-333a-41ed-811c-e0a568265ea4'), name='delq_history_batch', partitioner=None), suite={
  "name": "delq_history_quality_suite",
  "id": "72ea19b8-992e-4ede-9ef9-dd327ee9b6a0",
  "expectations": [
    {
      "type": "expect_column_values_to_be_in_set",
      "kwargs": {
        "result_format": "COMPLETE",
        "column": "REGION",
        "mostly": 0.99,
        "value_set": [
          "GTA",
          "PACIFIC",
          "ATLANTIC",
          "PRAIRIES",
          "QUEBEC",
          "ONTARIO",
          "ALBERTA"
        ]
      },
      "meta": {},
      "id": "028de4d7-b367-445b-844b-470545b01b24"
    },
    {
      "type": "expect_column_values_to_not_be_null",
      "kwargs": {
        "result_format": "COMPLETE",
        "column": "REGION",
        "mostly": 0.99
      },
      "meta": {},
      "id": "bf9fffdb-2abc-42ef-9158-253490f6900b"
    },
    {
      "type": "expect_column_value

In [24]:
# Specifying the action to take when the validation fails
action_list = [UpdateDataDocsAction(
    name="Update Data Docs for delq history",
)]
checkpoint = gx.Checkpoint(
    name=checkpoint_name,
    validation_definitions=[validation_definition],
    actions=action_list,
    result_format= {"result_format": "COMPLETE"})

context.checkpoints.add(checkpoint)


Checkpoint(name='delq_history_checkpoint', validation_definitions=[ValidationDefinition(name='delq_history_validation', data=BatchDefinition(id=UUID('8a63d1e7-333a-41ed-811c-e0a568265ea4'), name='delq_history_batch', partitioner=None), suite={
  "name": "delq_history_quality_suite",
  "id": "72ea19b8-992e-4ede-9ef9-dd327ee9b6a0",
  "expectations": [
    {
      "type": "expect_column_values_to_be_in_set",
      "kwargs": {
        "result_format": "COMPLETE",
        "column": "REGION",
        "mostly": 0.99,
        "value_set": [
          "GTA",
          "PACIFIC",
          "ATLANTIC",
          "PRAIRIES",
          "QUEBEC",
          "ONTARIO",
          "ALBERTA"
        ]
      },
      "meta": {},
      "id": "028de4d7-b367-445b-844b-470545b01b24"
    },
    {
      "type": "expect_column_values_to_not_be_null",
      "kwargs": {
        "result_format": "COMPLETE",
        "column": "REGION",
        "mostly": 0.99
      },
      "meta": {},
      "id": "bf9fffdb-2abc-42ef

In [17]:
# Running the checkpoint
checkpoint_result = checkpoint.run(run_id=run_id, batch_parameters=batch_parameters)

# Displaying the result of the checkpoint run
print(f"Result of the run: {checkpoint_result.success}")

CheckpointRelatedResourcesFreshnessError: 
	ExpectationSuite 'delq_history_quality_suite' has changed since it has last been saved. Please update with `<SUITE_OBJECT>.save()`, then try your action again.

In [15]:
for exp in delq_suite.expectations:
    exp.condition_parser= "pandas"
delq_suite.save()

In [16]:
delq_suite.save()

running after adding everything

In [3]:
# Specifying Names - Data Source, Data Asset, Batch, Expecation Suite, Validation, Checkpoint
data_source_name = "risk_analytics"
data_asset_name = "delq_history"
batch_name = "delq_history_batch"
expectation_suite_name = "delq_history_quality_suite" 
batch_parameters = {"dataframe": df}
checkpoint_name = "delq_history_checkpoint"
validation_definition_name = "delq_history_validation"
run_id = RunIdentifier(run_name=f"Delq History Validation")

In [4]:
#data_definition = context.data_sources.get(data_source_name).get_asset(data_asset_name).get_batch_definition(batch_name)
#alidation_definition = context.validation_definitions.get(validation_definition_name)
checkpoint = context.checkpoints.get(checkpoint_name)

In [5]:
# Running the checkpoint
checkpoint_result = checkpoint.run(run_id=run_id, batch_parameters=batch_parameters)

# Displaying the result of the checkpoint run
print(f"Result of the run: {checkpoint_result.success}")

Calculating Metrics:   0%|          | 0/54 [00:00<?, ?it/s]

Result of the run: True


In [18]:
delq_suite = context.suites.get(expectation_suite_name)
for exp in delq_suite.expectations:
    if exp.row_condition is not None and exp.column in ('CURRENT_OUTSTANDING_BAL_AMT', 'CPA_REMAINING_AMORTIZATION'):
        exp.condition_parser = "pandas"
        exp.row_condition = 'CURRENT_OUTSTANDING_BAL_AMT'>0
    elif exp.column == 'CURRENT_OUTSTANDING_BAL_AMT':
        exp.condition_parser = "pandas"
        exp.row_condition = 'CPA_REMAINING_AMORTIZATION'>0
delq_suite.save()


  right=ast.Str(s=sentinel),
  return Constant(*args, **kwargs)


TypeError: '>' not supported between instances of 'str' and 'int'

In [21]:
delq_suite.expectations

[ExpectColumnValuesToBeInSet(id='028de4d7-b367-445b-844b-470545b01b24', meta={}, notes=None, result_format=<ResultFormat.COMPLETE: 'COMPLETE'>, description=None, catch_exceptions=True, rendered_content=None, windows=None, batch_id=None, column='REGION', mostly=0.99, row_condition=None, condition_parser='pandas', value_set=['GTA', 'PACIFIC', 'ATLANTIC', 'PRAIRIES', 'QUEBEC', 'ONTARIO', 'ALBERTA']),
 ExpectColumnValuesToNotBeNull(id='bf9fffdb-2abc-42ef-9158-253490f6900b', meta={}, notes=None, result_format=<ResultFormat.COMPLETE: 'COMPLETE'>, description=None, catch_exceptions=True, rendered_content=None, windows=None, batch_id=None, column='REGION', mostly=0.99, row_condition=None, condition_parser='pandas'),
 ExpectColumnValuesToNotBeNull(id='8235255c-4aff-4671-8c84-3c8ff55c4180', meta={}, notes=None, result_format=<ResultFormat.COMPLETE: 'COMPLETE'>, description=None, catch_exceptions=True, rendered_content=None, windows=None, batch_id=None, column='CERTNUM', mostly=0.99, row_conditio

In [6]:
checkpoint_result

CheckpointResult(run_id={"run_name": "Delq History Validation", "run_time": "2025-07-28T22:05:46.449645-04:00"}, run_results={ValidationResultIdentifier::delq_history_quality_suite/Delq History Validation/20250729T020546.449645Z/risk_analytics-delq_history: {
  "success": true,
  "results": [
    {
      "success": true,
      "expectation_config": {
        "type": "expect_column_values_to_be_in_set",
        "kwargs": {
          "result_format": "COMPLETE",
          "batch_id": "risk_analytics-delq_history",
          "column": "REGION",
          "mostly": 0.99,
          "condition_parser": "pandas",
          "value_set": [
            "GTA",
            "PACIFIC",
            "ATLANTIC",
            "PRAIRIES",
            "QUEBEC",
            "ONTARIO",
            "ALBERTA"
          ]
        },
        "meta": {},
        "id": "028de4d7-b367-445b-844b-470545b01b24"
      },
      "result": {
        "element_count": 744337,
        "unexpected_count": 1,
        "unexpect