In [1]:
import pandas as pd
import numpy as np
from great_expectations.core import RunIdentifier
import great_expectations as gx
from great_expectations.checkpoint.actions import UpdateDataDocsAction

In [2]:
df =  pd.read_csv("Test_data.csv")

In [3]:
context = gx.get_context(mode="file")

In [4]:
data_asset_name = "my_test_asset"
data_source_name = "my_test_source"
batch_definition_name = "my_batch_definition"
definition_name = "my_validation_definition"
checkpoint_name = "my_checkpoint"

In [28]:

data_source = context.data_sources.add_pandas(name=data_source_name)

data_asset = data_source.add_dataframe_asset(name=data_asset_name)

batch_definition = data_asset.add_batch_definition_whole_dataframe(
    batch_definition_name
)


error uploading: HTTPSConnectionPool(host='posthog.greatexpectations.io', port=443): Max retries exceeded with url: /batch/ (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1000)')))


In [15]:
batch_definition = (
    context.data_sources.get(data_source_name)
    .get_asset(data_asset_name)
    .get_batch_definition(batch_definition_name)
)

In [6]:
batch_parameters = {"dataframe": df}

In [None]:
print(type(context).__name__)

In [31]:
print(f"\nConfigured Data Source: {data_source.name}")
print(f"Configured Data Asset: {data_asset.name}")
print(f"Configured Batch Definition: {batch_definition.name}")


Configured Data Source: my_test_source
Configured Data Asset: my_test_asset
Configured Batch Definition: my_batch_definition


In [None]:
print(context)

In [32]:
# Create an Expectation Suite
suite_name = "daily_delq-quality_suite"
delq_suite = gx.ExpectationSuite(name=suite_name)

In [33]:
context.suites.add(delq_suite)

{
  "name": "daily_delq-quality_suite",
  "id": "92aacbee-22dc-47be-a51f-5323bf8edea3",
  "expectations": [],
  "meta": {
    "great_expectations_version": "1.5.5"
  },
  "notes": null
}

In [34]:
region_set = set(df['REGION'].value_counts().index)

In [35]:
region_set

{'ALBERTA',
 'ATLANTIC',
 'GTA',
 'ONTARIO',
 'OTHER',
 'PACIFIC',
 'PRAIRIES',
 'QUEBEC'}

In [36]:
region_set.remove("OTHER")

In [37]:
df.columns

Index(['CERTNUM', 'ORGNUM', 'ABBRNAME', 'BLTV', 'RATEIND', 'CLTV', 'SECMFLAG',
       'REGION', 'Sum_TUBUREAU', 'Sum_EQFAXBR', 'Sum_TUBUREAU_COUNT',
       'Sum_TUEQFAXBR_COUNT', 'AVERAGE SCORE', 'SCORE BUCKETS', 'LOB',
       'MTGTERM', 'DTPRINT', 'LENDERSC', 'INSURAMT', 'DTEFFECT', 'DTAPPRS',
       'DTAPPREC', 'RRMCITY', 'SRCOFBUS', 'BORINCOM', 'AMORMTHS', 'SALESPR',
       'LOANPURP', 'GDS', 'TDS', 'GDSR', 'TDSR', 'CLOANINT', 'PRD_COMP_GDS',
       'PRD_COMP_TDS', 'PRD_COMP_GDSR', 'PRD_COMP_TDSR',
       'PRD_COMP_QUAL_INT_RATE', 'PROVINCE', 'PROPTYPE', 'BORSELF', 'BORAGE',
       'PREMORIG', 'LOANAMT', 'NEWEXIST', 'PROP_AVM_YEAR_BUILD', 'PROPAGE',
       'AVMVAR', 'FRGNBORR', 'Sum_BOR_COUNT', 'FSACODE', 'POD_OMNI_RURAL_IND',
       'PRD_GIFT', 'OTHREQTY', 'TIME_MTH_SID', 'D_ind',
       'FIRST_PAYMENT_MISSED_DATE', 'GNW_APPLICATION_NUMBER',
       'CURRENT_OUTSTANDING_BAL_AMT', 'CURRENT_LOAN_STAGE', 'INT_RATE_TYPE',
       'DETAIL_LENDER_PTY_ALIAS', 'CURRENT_TERM_EXPIRY_DATE',
   

In [None]:
region_expectation = gx.expectations.ExpectColumnValuesToBeInSet( 
        column="REGION",
        value_set=region_set,
        result_format="COMPLETE"
    )
delq_suite.add_expectation(region_expectation
    
)

ExpectColumnValuesToBeInSet(id='90004e20-2e55-4893-81ac-34f1be4c5da5', meta=None, notes=None, result_format=<ResultFormat.COMPLETE: 'COMPLETE'>, description=None, catch_exceptions=True, rendered_content=None, windows=None, batch_id=None, column='REGION', mostly=1, row_condition=None, condition_parser=None, value_set=['GTA', 'QUEBEC', 'PRAIRIES', 'PACIFIC', 'ALBERTA', 'ONTARIO', 'ATLANTIC'])

error uploading: HTTPSConnectionPool(host='posthog.greatexpectations.io', port=443): Max retries exceeded with url: /batch/ (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1000)')))
error uploading: HTTPSConnectionPool(host='posthog.greatexpectations.io', port=443): Max retries exceeded with url: /batch/ (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1000)')))


In [55]:
region_expectation.mostly = 0.99
region_expectation.save()

error uploading: HTTPSConnectionPool(host='posthog.greatexpectations.io', port=443): Max retries exceeded with url: /batch/ (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1000)')))


In [39]:
delq_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(
        column="REGION",  # Correct: single string for the column name
        result_format="COMPLETE"
    )
)

# Add expectation for 'CERTNUM' column
delq_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(
        column="CERTNUM",  # Correct: single string for the column name
        result_format="COMPLETE"
    )
)

ExpectColumnValuesToNotBeNull(id='6c5ea5ad-227f-4de1-8cb8-1b953b938ae1', meta=None, notes=None, result_format=<ResultFormat.COMPLETE: 'COMPLETE'>, description=None, catch_exceptions=True, rendered_content=None, windows=None, batch_id=None, column='CERTNUM', mostly=1, row_condition=None, condition_parser=None)

In [40]:
delq_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeBetween(
        column="CURRENT_OUTSTANDING_BAL_AMT",
        min_value=None, # This indicates no lower bound check
        max_value=1500000,
        result_format="COMPLETE"
    )
)

ExpectColumnValuesToBeBetween(id='93ca2687-a5ec-443e-b316-d625bdbaa9b0', meta=None, notes=None, result_format=<ResultFormat.COMPLETE: 'COMPLETE'>, description=None, catch_exceptions=True, rendered_content=None, windows=None, batch_id=None, column='CURRENT_OUTSTANDING_BAL_AMT', mostly=1, row_condition=None, condition_parser=None, min_value=None, max_value=1500000.0, strict_min=False, strict_max=False)

error uploading: HTTPSConnectionPool(host='posthog.greatexpectations.io', port=443): Max retries exceeded with url: /batch/ (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1000)')))


In [41]:
delq_validation_definition = gx.ValidationDefinition(
    data= batch_definition,
    suite=delq_suite,
    name="delq_validation_definition")

In [16]:
batch = batch_definition.get_batch(batch_parameters=batch_parameters)

In [43]:
expectation_suite = context.suites.get(name="daily_delq-quality_suite")

In [44]:
batch_definition = (
    context.data_sources.get(data_source_name)
    .get_asset(data_asset_name)
    .get_batch_definition(batch_definition_name)
)

In [45]:

validation_definition = gx.ValidationDefinition(
    data=batch_definition, suite=expectation_suite, name=definition_name
)

In [46]:



validation_definition = context.validation_definitions.add(validation_definition)

In [17]:
#getting validation definition
validation_definition = context.validation_definitions.get(name=definition_name)

In [18]:
validation_results = validation_definition.run(batch_parameters=batch_parameters)

Calculating Metrics: 100%|██████████| 30/30 [00:02<00:00, 10.86it/s] 


In [19]:
print(validation_results)

{
  "success": true,
  "results": [
    {
      "success": true,
      "expectation_config": {
        "type": "expect_column_values_to_be_in_set",
        "kwargs": {
          "result_format": "COMPLETE",
          "batch_id": "my_test_source-my_test_asset",
          "column": "REGION",
          "mostly": 0.99,
          "value_set": [
            "GTA",
            "QUEBEC",
            "PRAIRIES",
            "PACIFIC",
            "ALBERTA",
            "ONTARIO",
            "ATLANTIC"
          ]
        },
        "meta": {},
        "id": "90004e20-2e55-4893-81ac-34f1be4c5da5"
      },
      "result": {
        "element_count": 744337,
        "unexpected_count": 1,
        "unexpected_percent": 0.0001343477483989107,
        "partial_unexpected_list": [
          "OTHER"
        ],
        "missing_count": 0,
        "missing_percent": 0.0,
        "unexpected_percent_total": 0.0001343477483989107,
        "unexpected_percent_nonmissing": 0.0001343477483989107,
        "par

In [60]:
context.build_data_docs()

{'local_site': 'file://c:\\Users\\650000337\\Documents\\Lax_dev\\Data_Quality_Great_Expectations\\gx\\uncommitted/data_docs/local_site/index.html'}

error uploading: HTTPSConnectionPool(host='posthog.greatexpectations.io', port=443): Max retries exceeded with url: /batch/ (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1000)')))


In [51]:
df[df['REGION'] == 'OTHER'][['CERTNUM','PROVINCE']]

Unnamed: 0,CERTNUM,PROVINCE
395380,2040210350,QC


In [5]:
suite = context.suites.get(name="daily_delq-quality_suite")

In [61]:
action_list = [UpdateDataDocsAction(
    name="Update Data Docs"
)]

In [65]:
checkpoint = gx.Checkpoint(
    name=checkpoint_name,
    validation_definitions=[validation_definition],
    actions=action_list,
    result_format={"result_format": "COMPLETE"},
)

In [66]:
context.checkpoints.add(checkpoint)

Checkpoint(name='my_checkpoint', validation_definitions=[ValidationDefinition(name='my_validation_definition', data=BatchDefinition(id=UUID('4d4e1c7a-0f6d-475a-ac2c-0c2d1be809c5'), name='my_batch_definition', partitioner=None), suite={
  "name": "daily_delq-quality_suite",
  "id": "92aacbee-22dc-47be-a51f-5323bf8edea3",
  "expectations": [
    {
      "type": "expect_column_values_to_be_in_set",
      "kwargs": {
        "result_format": "COMPLETE",
        "column": "REGION",
        "mostly": 0.99,
        "value_set": [
          "GTA",
          "QUEBEC",
          "PRAIRIES",
          "PACIFIC",
          "ALBERTA",
          "ONTARIO",
          "ATLANTIC"
        ]
      },
      "meta": {},
      "id": "90004e20-2e55-4893-81ac-34f1be4c5da5"
    },
    {
      "type": "expect_column_values_to_not_be_null",
      "kwargs": {
        "result_format": "COMPLETE",
        "column": "REGION"
      },
      "meta": {},
      "id": "7f4802f2-7ef8-471b-8858-e4f8e0e9c679"
    },
    {
 

error uploading: HTTPSConnectionPool(host='posthog.greatexpectations.io', port=443): Max retries exceeded with url: /batch/ (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1000)')))


In [5]:
from great_expectations.core import RunIdentifier
from datetime import datetime

run_id = RunIdentifier(run_name=f"my_run_{datetime.now().strftime('%Y%m%d_%H%M%S')}")


In [30]:
checkpoint = context.checkpoints.get(name=checkpoint_name)
validation_results = checkpoint.run(
    run_id= run_id,
    batch_parameters=batch_parameters
)

Calculating Metrics: 100%|██████████| 30/30 [00:03<00:00,  9.70it/s] 


In [24]:
print(validation_results.success)

True


In [5]:
delq_suite = context.suites.get(name="daily_delq-quality_suite")
delq_suite.expectations

[ExpectColumnValuesToBeInSet(id='90004e20-2e55-4893-81ac-34f1be4c5da5', meta={}, notes=None, result_format=<ResultFormat.COMPLETE: 'COMPLETE'>, description=None, catch_exceptions=True, rendered_content=None, windows=None, batch_id=None, column='REGION', mostly=0.99, row_condition=None, condition_parser=None, value_set=['GTA', 'QUEBEC', 'PRAIRIES', 'PACIFIC', 'ALBERTA', 'ONTARIO', 'ATLANTIC']),
 ExpectColumnValuesToNotBeNull(id='7f4802f2-7ef8-471b-8858-e4f8e0e9c679', meta={}, notes=None, result_format=<ResultFormat.COMPLETE: 'COMPLETE'>, description=None, catch_exceptions=True, rendered_content=None, windows=None, batch_id=None, column='REGION', mostly=1, row_condition=None, condition_parser=None),
 ExpectColumnValuesToNotBeNull(id='6c5ea5ad-227f-4de1-8cb8-1b953b938ae1', meta={}, notes=None, result_format=<ResultFormat.COMPLETE: 'COMPLETE'>, description=None, catch_exceptions=True, rendered_content=None, windows=None, batch_id=None, column='CERTNUM', mostly=1, row_condition=None, condit

In [12]:
for exp in delq_suite.expectations:
    if exp.expectation_type == "expect_column_values_to_be_in_set" and exp.column == 'REGION':
        exp.mostly = 0.99
delq_suite.save()

In [7]:
# add expecation for 'Outstanding Balance' column where it should be not null if CPA_REMAINING_AMORTIZATION is greater than 0 or is not null
delq_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(
        column="CURRENT_OUTSTANDING_BAL_AMT",
        row_condition="CPA_REMAINING_AMORTIZATION > 0 or CPA_REMAINING_AMORTIZATION is not null",
        result_format="COMPLETE",
        condition_parser= "pandas"
    )
)
delq_suite.save()

In [20]:
#modify the expecation with row condition
for exp in delq_suite.expectations:
    if isinstance(exp, gx.expectations.ExpectColumnValuesToNotBeNull) and exp.row_condition:
        exp.row_condition = "CPA_REMAINING_AMORTIZATION > 0 or CPA_REMAINING_AMORTIZATION.notnull()" 
        exp.description = f"Column '{exp.column}' should not be null when {exp.row_condition}"

delq_suite.save()

In [21]:
delq_suite.expectations

[ExpectColumnValuesToBeInSet(id='90004e20-2e55-4893-81ac-34f1be4c5da5', meta={}, notes=None, result_format=<ResultFormat.COMPLETE: 'COMPLETE'>, description=None, catch_exceptions=True, rendered_content=None, windows=None, batch_id=None, column='REGION', mostly=0.99, row_condition=None, condition_parser=None, value_set=['GTA', 'QUEBEC', 'PRAIRIES', 'PACIFIC', 'ALBERTA', 'ONTARIO', 'ATLANTIC']),
 ExpectColumnValuesToNotBeNull(id='7f4802f2-7ef8-471b-8858-e4f8e0e9c679', meta={}, notes=None, result_format=<ResultFormat.COMPLETE: 'COMPLETE'>, description=None, catch_exceptions=True, rendered_content=None, windows=None, batch_id=None, column='REGION', mostly=1, row_condition=None, condition_parser=None),
 ExpectColumnValuesToNotBeNull(id='6c5ea5ad-227f-4de1-8cb8-1b953b938ae1', meta={}, notes=None, result_format=<ResultFormat.COMPLETE: 'COMPLETE'>, description=None, catch_exceptions=True, rendered_content=None, windows=None, batch_id=None, column='CERTNUM', mostly=1, row_condition=None, condit

In [6]:
run_id = RunIdentifier(run_name=f"Test_run")
batch_parameters = {"dataframe": df}

In [7]:
checkpoint = context.checkpoints.get(checkpoint_name)
checkpoint_result = checkpoint.run(run_id=run_id, batch_parameters=batch_parameters)
print(f"Checkpoint run result: {checkpoint_result.success}")

Calculating Metrics: 100%|██████████| 38/38 [00:04<00:00,  8.80it/s]


Checkpoint run result: True


In [19]:
checkpoint_result

CheckpointResult(run_id={"run_name": "Test_run", "run_time": "2025-07-29T12:19:09.283310-04:00"}, run_results={ValidationResultIdentifier::daily_delq-quality_suite/Test_run/20250729T161909.283310Z/my_test_source-my_test_asset: {
  "success": true,
  "results": [
    {
      "success": true,
      "expectation_config": {
        "type": "expect_column_values_to_be_in_set",
        "kwargs": {
          "result_format": "COMPLETE",
          "batch_id": "my_test_source-my_test_asset",
          "column": "REGION",
          "mostly": 0.99,
          "value_set": [
            "GTA",
            "QUEBEC",
            "PRAIRIES",
            "PACIFIC",
            "ALBERTA",
            "ONTARIO",
            "ATLANTIC"
          ]
        },
        "meta": {},
        "id": "90004e20-2e55-4893-81ac-34f1be4c5da5"
      },
      "result": {
        "element_count": 744337,
        "unexpected_count": 1,
        "unexpected_percent": 0.0001343477483989107,
        "partial_unexpected_list":

## Move data docs to desired folder

In [9]:
import shutil
import os

source_file = r"C:\Users\650000337\Documents\Lax_dev\Data_Quality_Great_Expectations\gx\uncommitted\data_docs\local_site\index.html"
destination_path = r"C:\Users\650000337\Documents\Lax_dev\index.html"

os.makedirs(os.path.dirname(destination_path), exist_ok=True)
shutil.copy2(source_file, destination_path)  # copy2 preserves metadata
print(f"File copied from {source_file} to {destination_path}")

File copied from C:\Users\650000337\Documents\Lax_dev\Data_Quality_Great_Expectations\gx\uncommitted\data_docs\local_site\index.html to C:\Users\650000337\Documents\Lax_dev\index.html


### Giving a custom directory for context file 

In [4]:
context =  gx.get_context(mode="file",context_root_dir=r"C:\Users\650000337\Documents\Lax_dev\gx")

In [5]:
suite = context.suites.get(name="daily_delq-quality_suite")
suite.expectations

[ExpectColumnValuesToBeInSet(id='90004e20-2e55-4893-81ac-34f1be4c5da5', meta={}, notes=None, result_format=<ResultFormat.COMPLETE: 'COMPLETE'>, description=None, catch_exceptions=True, rendered_content=None, windows=None, batch_id=None, column='REGION', mostly=0.99, row_condition=None, condition_parser=None, value_set=['GTA', 'QUEBEC', 'PRAIRIES', 'PACIFIC', 'ALBERTA', 'ONTARIO', 'ATLANTIC']),
 ExpectColumnValuesToNotBeNull(id='7f4802f2-7ef8-471b-8858-e4f8e0e9c679', meta={}, notes=None, result_format=<ResultFormat.COMPLETE: 'COMPLETE'>, description=None, catch_exceptions=True, rendered_content=None, windows=None, batch_id=None, column='REGION', mostly=1, row_condition=None, condition_parser=None),
 ExpectColumnValuesToNotBeNull(id='6c5ea5ad-227f-4de1-8cb8-1b953b938ae1', meta={}, notes=None, result_format=<ResultFormat.COMPLETE: 'COMPLETE'>, description=None, catch_exceptions=True, rendered_content=None, windows=None, batch_id=None, column='CERTNUM', mostly=1, row_condition=None, condit

In [6]:
context.root_directory

'C:\\Users\\650000337\\Documents\\Lax_dev\\gx'

### Copying folder to another location

In [8]:
import shutil
import os


source_folder = r"C:\Users\650000337\Documents\Lax_dev\Data_Quality_Great_Expectations\gx"
destination_folder = r"C:\Users\650000337\Documents\Lax_dev\gx"
# Remove destination folder if it exists
if os.path.exists(destination_folder):
    print(f"Removing existing destination folder: {destination_folder}")
    shutil.rmtree(destination_folder)

# Copy the source folder to destination
shutil.copytree(source_folder, destination_folder)

Removing existing destination folder: C:\Users\650000337\Documents\Lax_dev\gx


'C:\\Users\\650000337\\Documents\\Lax_dev\\gx'

### Data Profiling 

In [9]:
batch_definition = (
    context.data_sources.get(data_source_name)
    .get_asset(data_asset_name)
    .get_batch_definition(batch_definition_name)
)

In [10]:
batch = batch_definition.get_batch(batch_parameters=batch_parameters)

In [4]:
from ydata_profiling import ProfileReport

# Create a profile report for the DataFrame
profile = ProfileReport(df, title="Test Data Profiling Report")

# Save the report to HTML
profile.to_file("data_profile_report.html")

ModuleNotFoundError: No module named 'pkg_resources'