In [1]:
!pip install --upgrade pip



In [6]:
!pip install pyspark great_expectations==1.1.3

Collecting great_expectations==1.1.3
  Downloading great_expectations-1.1.3-py3-none-any.whl.metadata (8.5 kB)
Downloading great_expectations-1.1.3-py3-none-any.whl (5.0 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 5.0/5.0 MB 2.9 MB/s eta 0:00:00
Installing collected packages: great_expectations
  Attempting uninstall: great_expectations
    Found existing installation: great-expectations 1.2.0
    Uninstalling great-expectations-1.2.0:
      Successfully uninstalled great-expectations-1.2.0
Successfully installed great_expectations-1.1.3


In [2]:
# Cell 2: Import packages and verify installation
import great_expectations as gx
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, StringType, IntegerType, TimestampType
from datetime import datetime

In [12]:
# Report the version of the great_expectations module that is loaded in this kernel.
# NOTE(review): the recorded output shows 1.0.3 although the install cell pins 1.1.3 —
# presumably the kernel was not restarted after installing; confirm on a fresh run.
print(f'Great Expectations version {gx.__version__} is installed')

# Cell 3: Define ExpectationGenerator class (from dataquality.py)
class ExpectationGenerator:
    """Build Great Expectations expectation objects for one column from a rule dict.

    The rule must provide the keys ``field_name``, ``expectation``,
    ``expectation_action`` and ``operator``. The last three are copied into
    the ``meta`` dict that is attached to every expectation created here.
    """

    def __init__(self, rule: dict) -> None:
        """Store the rule, the target column name and the shared ``meta`` dict."""
        self.rule = rule
        self.fieldName = rule['field_name']
        # Passed verbatim as ``meta`` to each generated expectation.
        self.meta = {
            'expectation': rule['expectation'],
            'action': rule['expectation_action'],
            'operator': rule['operator'],
        }

    def required(self):
        """Return an ``ExpectColumnValuesToNotBeNull`` expectation for the column."""
        not_null_cls = gx.expectations.ExpectColumnValuesToNotBeNull
        return not_null_cls(column=self.fieldName, meta=self.meta)

    def choice(self, *value_set):
        """Return an ``ExpectColumnValuesToBeInSet`` expectation.

        The positional arguments are collected into the list of allowed values.
        """
        allowed_values = [*value_set]
        in_set_cls = gx.expectations.ExpectColumnValuesToBeInSet
        return in_set_cls(
            column=self.fieldName,
            value_set=allowed_values,
            meta=self.meta,
        )

    def string_format(self, regex: str):
        """Return an ``ExpectColumnValuesToMatchRegex`` expectation using ``regex``."""
        match_regex_cls = gx.expectations.ExpectColumnValuesToMatchRegex
        return match_regex_cls(column=self.fieldName, regex=regex, meta=self.meta)

# Cell 4: Create test data and run validation
# Get (or create) the Spark session used to build the sample DataFrame
spark = (
    SparkSession.builder
    .appName("GE_Test")
    .getOrCreate()
)

def create_sample_data():
    """Return a five-row Spark DataFrame of sample user records.

    All columns are nullable. Several rows contain nulls, an empty name,
    a malformed email and a negative age so that validation has failures to report.
    """
    # (column name, Spark type) pairs; every field is declared nullable below.
    column_specs = [
        ("user_id", IntegerType()),
        ("name", StringType()),
        ("email", StringType()),
        ("age", IntegerType()),
        ("signup_date", TimestampType()),
    ]
    schema = StructType(
        [StructField(col_name, col_type, True) for col_name, col_type in column_specs]
    )

    rows = [
        (1, "John Doe", "john@email.com", 25, datetime(2023, 1, 1)),
        (2, None, "jane@email.com", 30, datetime(2023, 2, 1)),
        (3, "Bob Smith", "invalid-email", -5, datetime(2024, 1, 1)),
        (4, "Alice Brown", "alice@email.com", 40, datetime(2023, 3, 1)),
        (5, "", None, None, None),
    ]
    return spark.createDataFrame(rows, schema=schema)

# Build the sample Spark DataFrame that will be validated
df = create_sample_data()

# Obtain the Great Expectations data context used for all suites, data sources and checkpoints below
# NOTE(review): called without arguments — presumably this yields an ephemeral (in-memory) context; confirm
context = gx.get_context()

# Validation rules for the test run.
# Each rule names the column ('field_name'), the expectation to build ('expectation',
# optionally carrying its argument in parentheses) and metadata that is stored on the
# generated expectation ('expectation_action', 'operator').
rules = [
    {
        'field_name': 'name',
        'expectation': 'required',
        'expectation_action': 'warn',
        'operator': 'not_null'
    },
    {
        'field_name': 'email',
        # Raw string: the regex contains '\.', which is an invalid escape sequence in a
        # normal string literal and triggers a SyntaxWarning/DeprecationWarning.
        'expectation': r'string_format(^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$)',
        'expectation_action': 'warn',
        'operator': 'regex'
    }
]

# Create the expectation suite, then translate every rule into an expectation and add it
suite = context.suites.add(
    gx.core.expectation_suite.ExpectationSuite(
        name="my_test_suite"
    )
)

for rule in rules:
    generator = ExpectationGenerator(rule)
    spec = rule['expectation']
    # Dispatch on the name before the first '(' rather than on a substring match,
    # so a regex that happens to contain e.g. "required" is not mis-routed.
    kind, open_paren, argument = spec.partition('(')
    kind = kind.strip()
    if kind == 'required':
        expectation = generator.required()
    elif kind == 'string_format':
        # Everything after the first '(' is the pattern; drop exactly ONE closing ')'
        # so patterns that contain groups or end with ')' are preserved intact.
        if not open_paren or not argument.endswith(')'):
            raise ValueError(
                f"Malformed string_format expectation for field "
                f"{rule['field_name']!r}: {spec!r}"
            )
        expectation = generator.string_format(argument[:-1])
    else:
        # Fail loudly instead of re-adding the previous iteration's expectation
        # (or hitting a NameError on the first iteration).
        raise ValueError(
            f"Unsupported expectation for field {rule['field_name']!r}: {spec!r}"
        )
    suite.add_expectation(expectation)

# Register a Spark data source, a dataframe asset on it, and a whole-dataframe batch definition
data_source = context.data_sources.add_spark(name="my_spark_datasource")
data_asset = data_source.add_dataframe_asset(name="my_data_asset")
batch_definition = data_asset.add_batch_definition_whole_dataframe(name="my_batch")

# Tie the batch definition to the expectation suite
validation_spec = gx.core.validation_definition.ValidationDefinition(
    name="my_validation",
    data=batch_definition,
    suite=suite,
)
validation_definition = context.validation_definitions.add(validation_spec)

# Wrap the validation definition in a checkpoint
checkpoint_spec = gx.checkpoint.checkpoint.Checkpoint(
    name="my_checkpoint",
    validation_definitions=[validation_definition],
)
checkpoint = context.checkpoints.add(checkpoint_spec)

# Run the checkpoint, supplying the DataFrame to validate at run time
results = checkpoint.run(batch_parameters={"dataframe": df})

# Report header
print("\nValidation Results:")
print("==================")

# Only one validation definition was run, so take the first entry of run_results
validation_result = list(results.run_results.values())[0]
print("\nDetailed Results:")
for result in validation_result['results']:
    config = result['expectation_config']
    print(f"\nExpectation: {config['type']}")
    print(f"Column: {config['kwargs']['column']}")
    succeeded = result['success']
    print(f"Success: {succeeded}")
    details = result['result']
    print(f"Unexpected count: {details.get('unexpected_count', 0)}")
    # Show the sample of offending values only for failed expectations
    if not succeeded:
        print(f"Unexpected values: {details.get('partial_unexpected_list', [])}")

Great Expectations version 1.0.3 is installed


  'expectation': 'string_format(^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$)',


Calculating Metrics:   0%|          | 0/19 [00:00<?, ?it/s]


Validation Results:

Detailed Results:

Expectation: expect_column_values_to_not_be_null
Column: name
Success: False
Unexpected count: 1
Unexpected values: [None]

Expectation: expect_column_values_to_match_regex
Column: email
Success: False
Unexpected count: 1
Unexpected values: ['invalid-email']
