## Automated Data Quality Monitoring
**Objective**: Use Great Expectations to perform data profiling and write validation rules.

1. Data Profiling with Great Expectations

### Profile a JSON dataset with product sales data to check for null values in the 'ProductID' and 'Price' fields.
- Create an expectation suite and connect it to the data context.
- Use the `expect_column_values_to_not_be_null` expectation to profile these fields.
- Review the summary to identify any unexpected null values.

In [2]:
import pandas as pd
import great_expectations as gx
from great_expectations.core.batch import RuntimeBatchRequest

# Sample JSON-like product sales data (list of dicts).
# Two records are intentionally incomplete (one missing ProductID, one missing
# Price) so the not-null expectations below have something to report.
data = [
    {"ProductID": 101, "Price": 9.99, "Quantity": 5},
    {"ProductID": 102, "Price": 14.99, "Quantity": 3},
    {"ProductID": None, "Price": 7.50, "Quantity": 10},
    {"ProductID": 104, "Price": None, "Quantity": 2},
    {"ProductID": 105, "Price": 19.99, "Quantity": 1},
]

# Load data into DataFrame
df = pd.DataFrame(data)

# Initialize Great Expectations context
context = gx.get_context()

# Register a Pandas datasource with a runtime data connector so the in-memory
# DataFrame can be handed to Great Expectations as a batch.
try:
    context.add_datasource(
        name="pandas_datasource",
        class_name="Datasource",
        execution_engine={"class_name": "PandasExecutionEngine"},
        data_connectors={
            "default_runtime_data_connector_name": {
                "class_name": "RuntimeDataConnector",
                "batch_identifiers": ["default_identifier_name"],
            }
        },
    )
except Exception as exc:
    # Best-effort: the intent is to tolerate a datasource that already exists
    # (e.g. when the cell is re-run). Report the reason instead of hiding it so
    # a genuine configuration error is visible before get_validator() fails.
    print(f"Datasource 'pandas_datasource' was not added: {exc!r}")

# Build the batch request that points the runtime connector at the DataFrame.
# The batch identifier key must match the one declared on the data connector.
batch_request = RuntimeBatchRequest(
    datasource_name="pandas_datasource",
    data_connector_name="default_runtime_data_connector_name",
    data_asset_name="product_sales_data",
    runtime_parameters={"batch_data": df},
    batch_identifiers={"default_identifier_name": "default_identifier"},
)

# Create or overwrite expectation suite
suite_name = "product_sales_suite"
try:
    context.create_expectation_suite(suite_name, overwrite_existing=True)
except Exception as exc:
    # Best-effort, as in the original cell, but surface the failure: if the
    # suite could not be created the get_validator() call below is where the
    # problem would otherwise first (and less clearly) appear.
    print(f"Expectation suite '{suite_name}' was not created: {exc!r}")

# Get validator bound to the batch and the suite
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=suite_name,
)

# Add expectations to check 'ProductID' and 'Price' are not null
validator.expect_column_values_to_not_be_null("ProductID")
validator.expect_column_values_to_not_be_null("Price")

# Validate dataset against every expectation recorded on the validator
validation_results = validator.validate()

# Print validation summary
print(validation_results)

# Print unexpected counts for each expectation
for result in validation_results["results"]:
    exp = result["expectation_config"]["expectation_type"]
    col = result["expectation_config"]["kwargs"]["column"]
    unexpected_count = result["result"]["unexpected_count"]
    print(f"Expectation: {exp} on column '{col}' has {unexpected_count} unexpected nulls.")

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/9 [00:00<?, ?it/s]

{
  "success": false,
  "results": [
    {
      "success": false,
      "expectation_config": {
        "expectation_type": "expect_column_values_to_not_be_null",
        "kwargs": {
          "column": "ProductID",
          "batch_id": "090b3342eb4770223b66f65655fb080a"
        },
        "meta": {}
      },
      "result": {
        "element_count": 5,
        "unexpected_count": 1,
        "unexpected_percent": 20.0,
        "partial_unexpected_list": [
          null
        ]
      },
      "meta": {},
      "exception_info": {
        "raised_exception": false,
        "exception_traceback": null,
        "exception_message": null
      }
    },
    {
      "success": false,
      "expectation_config": {
        "expectation_type": "expect_column_values_to_not_be_null",
        "kwargs": {
          "column": "Price",
          "batch_id": "090b3342eb4770223b66f65655fb080a"
        },
        "meta": {}
      },
      "result": {
        "element_count": 5,
        "unexpected_

2. Writing Validation Rules for Data Ingestion

### Define validation rules for an API data source to confirm that the 'Status' field contains only predefined statuses ('Active', 'Inactive').

- Apply `expect_column_values_to_be_in_set` to check field values during data ingestion.
- Execute the validation and review any mismatches.

In [3]:
# write your code from here
import pandas as pd
import great_expectations as gx
from great_expectations.core.batch import RuntimeBatchRequest

# Sample API data (list of dicts) with 'Status' field.
# Two records carry statuses outside the allowed set so the in-set expectation
# below has mismatches to report.
data = [
    {"UserID": 1, "Status": "Active"},
    {"UserID": 2, "Status": "Inactive"},
    {"UserID": 3, "Status": "Pending"},    # Invalid status
    {"UserID": 4, "Status": "Active"},
    {"UserID": 5, "Status": "Deleted"},    # Invalid status
]

# Load data into DataFrame
df = pd.DataFrame(data)

# Initialize Great Expectations context
context = gx.get_context()

# Register a Pandas datasource with a runtime data connector so the in-memory
# DataFrame can be handed to Great Expectations as a batch.
try:
    context.add_datasource(
        name="pandas_api_datasource",
        class_name="Datasource",
        execution_engine={"class_name": "PandasExecutionEngine"},
        data_connectors={
            "default_runtime_data_connector_name": {
                "class_name": "RuntimeDataConnector",
                "batch_identifiers": ["default_identifier_name"],
            }
        },
    )
except Exception as exc:
    # Best-effort: the intent is to tolerate a datasource that already exists
    # (e.g. when the cell is re-run). Report the reason instead of hiding it so
    # a genuine configuration error is visible before get_validator() fails.
    print(f"Datasource 'pandas_api_datasource' was not added: {exc!r}")

# Build the batch request that points the runtime connector at the DataFrame.
# The batch identifier key must match the one declared on the data connector.
batch_request = RuntimeBatchRequest(
    datasource_name="pandas_api_datasource",
    data_connector_name="default_runtime_data_connector_name",
    data_asset_name="api_user_status",
    runtime_parameters={"batch_data": df},
    batch_identifiers={"default_identifier_name": "default_identifier"},
)

# Create or overwrite expectation suite
suite_name = "api_status_validation_suite"
try:
    context.create_expectation_suite(suite_name, overwrite_existing=True)
except Exception as exc:
    # Best-effort, as in the original cell, but surface the failure: if the
    # suite could not be created the get_validator() call below is where the
    # problem would otherwise first (and less clearly) appear.
    print(f"Expectation suite '{suite_name}' was not created: {exc!r}")

# Get validator bound to the batch and the suite
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=suite_name,
)

# Define valid statuses
valid_statuses = ["Active", "Inactive"]

# Add expectation that 'Status' column values must be in valid_statuses
validator.expect_column_values_to_be_in_set(column="Status", value_set=valid_statuses)

# Validate dataset against every expectation recorded on the validator
validation_results = validator.validate()

# Print validation summary
print(validation_results)

# Print details of unexpected values
for result in validation_results["results"]:
    if result["success"]:
        continue
    col = result["expectation_config"]["kwargs"]["column"]
    details = result["result"]
    # The validation result printed above carries 'partial_unexpected_list'
    # and no 'unexpected_list', so reading only 'unexpected_list' always
    # reported an empty list. Prefer the full list when it is present and fall
    # back to the partial one otherwise.
    # NOTE(review): the partial list may be a truncated sample for large
    # batches; confirm against the configured result_format if completeness
    # matters.
    unexpected_values = (
        details.get("unexpected_list")
        or details.get("partial_unexpected_list", [])
    )
    print(f"Unexpected values found in column '{col}': {unexpected_values}")


Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": false,
  "results": [
    {
      "success": false,
      "expectation_config": {
        "expectation_type": "expect_column_values_to_be_in_set",
        "kwargs": {
          "column": "Status",
          "value_set": [
            "Active",
            "Inactive"
          ],
          "batch_id": "81e80fef31dfa50db49d3228b8dd7cd5"
        },
        "meta": {}
      },
      "result": {
        "element_count": 5,
        "unexpected_count": 2,
        "unexpected_percent": 40.0,
        "partial_unexpected_list": [
          "Pending",
          "Deleted"
        ],
        "missing_count": 0,
        "missing_percent": 0.0,
        "unexpected_percent_total": 40.0,
        "unexpected_percent_nonmissing": 40.0
      },
      "meta": {},
      "exception_info": {
        "raised_exception": false,
        "exception_traceback": null,
        "exception_message": null
      }
    }
  ],
  "evaluation_parameters": {},
  "statistics": {
    "evaluated_expectations": 1,