In [3]:
# Import GX and instantiate a Data Context
import great_expectations as gx

# `context` is the handle the rest of the notebook works through: the
# datasource, expectation suite, validator, checkpoint and Data Docs calls
# below are all made on this object.
context = gx.get_context()

2023-05-23 19:02:07,058 - great_expectations.data_context.data_context.file_data_context - INFO - FileDataContext loading fluent config
2023-05-23 19:02:07,061 - great_expectations.datasource.fluent.config - INFO - Loading 'datasources' ->
[{'assets': [...],
  'boto3_options': {},
  'bucket': 'processed-data-7374046',
  'name': 'processed_datasource',
  'type': 'pandas_s3'}]
2023-05-23 19:02:07,106 - great_expectations.data_context.data_context.abstract_data_context - INFO - Loaded 'processed_datasource' from fluent config
2023-05-23 19:02:07,108 - great_expectations.datasource.fluent.fluent_base_model - INFO - PandasS3Datasource.dict() - substituting config values
2023-05-23 19:02:07,119 - botocore.credentials - INFO - Found credentials in shared credentials file: ~/.aws/credentials
2023-05-23 19:02:07,278 - great_expectations.datasource.fluent.pandas_s3_datasource - INFO - S3DataConnector created for 'my_taxi_data_asset'
2023-05-23 19:02:07,279 - great_expectations.datasource.fluent.

In [4]:
# Register the S3 bucket that holds the processed data as a pandas-backed Datasource
datasource_name = "processed_datasource"
bucket_name = "processed-data-7374046"
boto3_options = {}  # empty: no extra boto3 options are passed through

datasource = context.sources.add_pandas_s3(
    name=datasource_name,
    bucket=bucket_name,
    boto3_options=boto3_options,
)

2023-05-23 19:02:09,366 - great_expectations.datasource.fluent.fluent_base_model - INFO - PandasS3Datasource.dict() - substituting config values


In [23]:
# Expose the CSV objects under the "03_processed/" key prefix as a Data Asset
# on the Datasource created above.
s3_prefix = "03_processed/"
data_asset_name = "processed_data_asset"

data_asset = datasource.add_csv_asset(
    name=data_asset_name,
    s3_prefix=s3_prefix,
)

In [6]:
# Look the Data Asset up again through the context:
# first the Datasource by name, then the asset by name.
processed_datasource = context.get_datasource(datasource_name)
asset = processed_datasource.get_asset("processed_data_asset")

In [7]:
# Build a Batch Request
# Called without arguments, so no batching options are supplied here; the
# same `batch_request` object is reused by the validator and the checkpoint.
batch_request = asset.build_batch_request()


In [8]:
# Resolve the Batch Request and print each Batch's spec to confirm that the
# expected files were picked up.
# Reference:
# https://docs.greatexpectations.io/docs/guides/expectations/data_assistants/how_to_create_an_expectation_suite_with_the_onboarding_data_assistant
batches = asset.get_batch_list_from_batch_request(batch_request)

for resolved_batch in batches:
    print(resolved_batch.batch_spec)


2023-05-23 19:02:13,495 - great_expectations.datasource.fluent.fluent_base_model - INFO - PandasS3Datasource.dict() - substituting config values
2023-05-23 19:02:13,874 - great_expectations.datasource.data_connector.batch_filter - INFO - batch_slice: None was parsed to: slice(0, None, None)
2023-05-23 19:02:13,876 - great_expectations.datasource.data_connector.batch_filter - INFO - batch_slice: slice(0, None, None) was parsed to: slice(0, None, None)
2023-05-23 19:02:13,877 - great_expectations.datasource.fluent.fluent_base_model - INFO - CSVAsset.dict() - substituting config values
{'path': 's3a://processed-data-7374046/03_processed/processed_p18_19_5.csv', 'reader_method': 'read_csv', 'reader_options': {}}


In [9]:
# Create an ExpectationSuite
#
# The suite name is bound to a variable so the same value is used wherever
# the suite is referenced (validator, checkpoint config).
expectation_suite_name = "processed_data_suite"

# add_or_update_expectation_suite keeps this cell re-runnable: the suite is
# created on the first run and replaced on later runs, whereas
# add_expectation_suite is expected to reject a name that is already stored.
suite = context.add_or_update_expectation_suite(
    expectation_suite_name=expectation_suite_name
)

In [10]:
# Create Expectation Configurations

from great_expectations.core.expectation_configuration import ExpectationConfiguration

# Expectation 1: the table's columns must appear in exactly this order.
expectation_configuration_1 = ExpectationConfiguration(
    # Name of expectation type being added
    expectation_type="expect_table_columns_to_match_ordered_list",
    # These are the arguments of the expectation.
    # The keys allowed in the dictionary are Parameters and
    # Keyword Arguments of this Expectation Type.
    kwargs={
        # `column_list` must be a flat list of column-name strings. Wrapping
        # the names in an extra inner list makes the expectation compare the
        # table's columns against a single list object, which never matches.
        # NOTE(review): the validator.head() output further down shows
        # lower-case snake_case headers (e.g. `other_entities`); confirm which
        # naming the processed CSV actually uses before relying on this list.
        "column_list": [
            "Other entities",
            "Headcount_Female",
            "Headcount_Male",
            "Headcount_Total",
            "Percentage_Female",
            "Percentage_Male",
        ]
    },
    # This is how you can optionally add a comment about this expectation.
    # It will be rendered in Data Docs.
    # See this guide for details:
    # `How to add comments to Expectations and display them in Data Docs`.
    meta={
        "notes": {
            "format": "markdown",
            "content": "Expectation that the columns of the actual table will appear in the order specified above",
        }
    },
)
# Add the Expectation to the suite
suite.add_expectation(expectation_configuration=expectation_configuration_1)

In [11]:
# Expectation 2: restrict a column to a fixed set of allowed values.
# (Optional `meta` comments are omitted for this one.)
# NOTE(review): "transaction_type" and these values do not appear in the
# column list used by expectation 1 or in any data preview in this notebook.
# This looks carried over from an example; confirm the intended column and
# value set, otherwise this expectation cannot pass.
expectation_2_kwargs = {
    "column": "transaction_type",
    "value_set": ["purchase", "refund", "upgrade"],
}
expectation_configuration_2 = ExpectationConfiguration(
    expectation_type="expect_column_values_to_be_in_set",
    kwargs=expectation_2_kwargs,
)
suite.add_expectation(expectation_configuration=expectation_configuration_2)

In [12]:
# Expectation 3: the "Other entities" column must not contain null values.
not_null_kwargs = {"column": "Other entities"}
expectation_configuration_3 = ExpectationConfiguration(
    expectation_type="expect_column_values_to_not_be_null",
    kwargs=not_null_kwargs,
)
suite.add_expectation(expectation_configuration=expectation_configuration_3)

In [18]:
# Suite name as a variable; later cells pass it to context.get_validator()
# and put it in the checkpoint's validation config.
expectation_suite_name = "processed_data_suite"
# Save your Expectations for future use
# (the call is the cell's last expression, so its return value is displayed).
context.save_expectation_suite(expectation_suite=suite)




'C:\\Users\\gdbt0\\Projects\\Data-preparation-pipeline\\data-processing-pipeline\\great_expectations\\expectations/processed_data_suite.json'

In [41]:
# Get a Validator that pairs the batch request with the named suite,
# then preview the first rows of the underlying data.
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=expectation_suite_name,
)
validator.head()

2023-05-23 21:01:06,327 - great_expectations.datasource.fluent.fluent_base_model - INFO - PandasS3Datasource.dict() - substituting config values
2023-05-23 21:01:06,328 - great_expectations.datasource.data_connector.batch_filter - INFO - batch_slice: None was parsed to: slice(0, None, None)
2023-05-23 21:01:06,329 - great_expectations.datasource.data_connector.batch_filter - INFO - batch_slice: slice(0, None, None) was parsed to: slice(0, None, None)
2023-05-23 21:01:06,331 - great_expectations.datasource.fluent.fluent_base_model - INFO - CSVAsset.dict() - substituting config values


Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,other_entities,headcount_female,headcount_male,headcount_total,percentage_female,percentage_male
0,Legal Aid Queensland,480,143,623,0.770465,0.229535
1,Office of the Health Ombudsman,100,42,142,0.704225,0.295775
2,Queensland Art Gallery,234,131,365,0.641096,0.358904
3,Queensland Family and Child Commission,56,17,73,0.767123,0.232877
4,Queensland Human Rights Commission,35,9,44,0.795455,0.204545


In [15]:
# Hand the Validator (and the data behind it) to a UserConfigurableProfiler
from great_expectations.profile.user_configurable_profiler import UserConfigurableProfiler

profiler = UserConfigurableProfiler(profile_dataset=validator)

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

In [16]:
# Use the profiler to build a suite
# NOTE: this rebinds `suite`. From this cell on, `suite` refers to the suite
# returned by the profiler, not the one assembled by hand in the earlier cells.
suite = profiler.build_suite()

2023-05-23 19:04:17,867 - great_expectations.validator.validator - INFO - 	3 expectation(s) included in expectation_suite.



Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Profiling:   0%|          | 0/6 [00:00<?, ?it/s, Column=Other entities]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]





Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]





Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]





Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]





Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]





Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]




Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

2023-05-23 19:04:19,405 - great_expectations.validator.validator - INFO - 	45 expectation(s) included in expectation_suite.
Creating an expectation suite with the following expectations:

Table-Level Expectations
expect_table_columns_to_match_ordered_list
expect_table_row_count_to_be_between

Expectations by Column
Column Name: Headcount_Female | Column Data Type: INT | Cardinality: UNIQUE
expect_column_max_to_be_between
expect_column_mean_to_be_between
expect_column_median_to_be_between
expect_column_min_to_be_between
expect_column_proportion_of_unique_values_to_be_between
expect_column_quantile_values_to_be_between
expect_column_values_to_be_in_type_list
expect_column_values_to_not_be_null


Column Name: Headcount_Male | Column Data Type: INT | Cardinality: UNIQUE
expect_column_max_to_be_between
expect_column_mean_to_be_between
expect_column_median_to_be_between
expect_column_min_to_be_between
expect_column_proportion_of_unique_values_to_be_between
expect_column_quantile_values_to_be

In [42]:
from great_expectations.checkpoint.checkpoint import SimpleCheckpoint

# Show the suite currently held by the Validator, then persist it.
# Failed expectations are kept in both calls.
print(validator.get_expectation_suite(discard_failed_expectations=False))
validator.save_expectation_suite(discard_failed_expectations=False)

# Settings for a SimpleCheckpoint used for ad hoc validation: a single
# validation that checks `batch_request` against the named suite.
checkpoint_config = dict(
    class_name="SimpleCheckpoint",
    validations=[
        dict(
            batch_request=batch_request,
            expectation_suite_name=expectation_suite_name,
        )
    ],
)

2023-05-23 21:01:35,482 - great_expectations.validator.validator - INFO - 	8 expectation(s) included in expectation_suite.
{
  "expectation_suite_name": "processed_data_suite",
  "ge_cloud_id": null,
  "data_asset_type": null,
  "meta": {
    "great_expectations_version": "0.16.13"
  },
  "expectations": [
    {
      "meta": {},
      "expectation_type": "expect_column_values_to_not_be_null",
      "kwargs": {
        "column": "Other entities"
      }
    },
    {
      "meta": {},
      "expectation_type": "expect_column_values_to_not_be_null",
      "kwargs": {
        "column": "Other entities"
      }
    },
    {
      "meta": {},
      "expectation_type": "expect_column_values_to_not_be_null",
      "kwargs": {
        "column": "Other entities"
      }
    },
    {
      "meta": {},
      "expectation_type": "expect_column_values_to_not_be_null",
      "kwargs": {
        "column": "Other entities"
      }
    },
    {
      "meta": {},
      "expectation_type": "expect_column

In [43]:
# Instantiate the checkpoint from `checkpoint_config` and run it once,
# keeping the result for the Data Docs lookup in a later cell.
checkpoint_name = "processed_data_checkpoint"
checkpoint = gx.checkpoint.SimpleCheckpoint(
    name=checkpoint_name, data_context=context, **checkpoint_config
)
checkpoint_result = checkpoint.run()

# Build Data Docs (last expression, so the returned site mapping is displayed)
context.build_data_docs()

2023-05-23 21:01:35,975 - great_expectations.datasource.fluent.fluent_base_model - INFO - PandasS3Datasource.dict() - substituting config values
2023-05-23 21:01:35,976 - great_expectations.datasource.data_connector.batch_filter - INFO - batch_slice: None was parsed to: slice(0, None, None)
2023-05-23 21:01:35,977 - great_expectations.datasource.data_connector.batch_filter - INFO - batch_slice: slice(0, None, None) was parsed to: slice(0, None, None)
2023-05-23 21:01:35,979 - great_expectations.datasource.fluent.fluent_base_model - INFO - CSVAsset.dict() - substituting config values
2023-05-23 21:01:36,267 - great_expectations.validator.validator - INFO - 	8 expectation(s) included in expectation_suite.


Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{'local_site': 'file://C:\\Users\\gdbt0\\Projects\\Data-preparation-pipeline\\data-processing-pipeline\\great_expectations\\uncommitted/data_docs/local_site/index.html'}

In [44]:
# add (save) the checkpoint to the data context
context.add_checkpoint(checkpoint=checkpoint)
# Read the checkpoint back by name and assert on it as a sanity check that
# it can be retrieved under `checkpoint_name`.
cp = context.get_checkpoint(name=checkpoint_name)
assert cp.name == checkpoint_name




In [45]:
# The SimpleCheckpoint run is expected to have produced a single validation
# result; take its identifier and open Data Docs directly on that page.
result_identifiers = checkpoint_result.list_validation_result_identifiers()
validation_result_identifier = result_identifiers[0]
context.open_data_docs(resource_identifier=validation_result_identifier)


In [46]:
# Print the expectations held in `suite`, organised by expectation type.
suite.show_expectations_by_expectation_type()

[ { 'expect_table_columns_to_match_ordered_list': { 'column_list': [ 'Other '
                                                                     'entities',
                                                                     'Headcount_Female',
                                                                     'Headcount_Male',
                                                                     'Headcount_Total',
                                                                     'Percentage_Female',
                                                                     'Percentage_Male'],
                                                    'domain': 'table'}},
  { 'expect_table_row_count_to_be_between': { 'domain': 'table',
                                              'max_value': 11,
                                              'min_value': 11}},
  { 'expect_column_max_to_be_between': { 'column': 'Headcount_Female',
                                         'domain': 'column',


In [47]:
import sys

# Run the stored checkpoint by name and turn its outcome into a pass/fail
# signal: a message either way, plus a non-zero exit status on failure.
result = context.run_checkpoint(checkpoint_name=checkpoint_name)

if result["success"]:
    print("Validation succeeded!")
else:
    print("Validation failed!")
    sys.exit(1)

2023-05-23 21:01:38,014 - great_expectations.datasource.fluent.fluent_base_model - INFO - PandasS3Datasource.dict() - substituting config values
2023-05-23 21:01:38,015 - great_expectations.datasource.data_connector.batch_filter - INFO - batch_slice: None was parsed to: slice(0, None, None)
2023-05-23 21:01:38,017 - great_expectations.datasource.data_connector.batch_filter - INFO - batch_slice: slice(0, None, None) was parsed to: slice(0, None, None)
2023-05-23 21:01:38,018 - great_expectations.datasource.fluent.fluent_base_model - INFO - CSVAsset.dict() - substituting config values
2023-05-23 21:01:38,092 - great_expectations.validator.validator - INFO - 	8 expectation(s) included in expectation_suite.


Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Validation failed!


  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)

