### Task 1: Validate Data with a Custom Expectation in Great Expectations
**Description**: Create a custom expectation and validate data with Great Expectations.

**Load a sample DataFrame**

```python
data = {
    'age': [25, 30, 35, 40, 45],
    'income': [50000, 60000, 75000, None, 100000]
}
```

In [5]:
import pandas as pd
from great_expectations.core.expectation_configuration import ExpectationConfiguration
from great_expectations.expectations.expectation import ColumnMapExpectation
from great_expectations.execution_engine import PandasExecutionEngine
from great_expectations.expectations.metrics import ColumnMapMetricProvider, column_condition_partial
from great_expectations.validator.validator import Validator
from great_expectations.core.batch import Batch, BatchRequest, RuntimeBatchRequest
from great_expectations.data_context import BaseDataContext

# Step 1: Build the sample DataFrame
# The fourth `income` entry is None on purpose, so the frame contains one
# missing value for the checks below to catch.
data = dict(
    age=[25, 30, 35, 40, 45],
    income=[50000, 60000, 75000, None, 100000],
)
df = pd.DataFrame(data=data)

# Step 2: Define custom metric
class ColumnValuesNonNullAndGreaterThan(ColumnMapMetricProvider):
    """Row-level condition metric: the value is present and strictly greater than `value`.

    Registered under the name ``column_values.non_null_and_greater_than`` and
    parameterised by a single key, ``value`` (the exclusive lower bound).
    """

    condition_metric_name = "column_values.non_null_and_greater_than"
    condition_value_keys = ("value",)

    # Column map metrics drop null rows before the condition is evaluated
    # unless told otherwise. Without this flag the `notnull()` check below
    # never sees a null, so missing values would silently pass.
    filter_column_isnull = False

    @column_condition_partial(engine=PandasExecutionEngine)
    def _pandas(cls, column, value, **kwargs):
        """Return a boolean Series that is True where the row satisfies the condition.

        Args:
            column: The pandas Series holding the column under test.
            value: Exclusive lower bound each entry must exceed.
            **kwargs: Extra arguments supplied by the framework; unused here.
        """
        return column.notnull() & (column > value)

# Step 3: Define custom expectation
class ExpectColumnValuesToBeNonNullAndGreaterThan(ColumnMapExpectation):
    """Expect column values to be non-null and strictly greater than `value`.

    Backed by the ``column_values.non_null_and_greater_than`` map metric.
    """

    # Name of the row-level condition metric this expectation evaluates.
    map_metric = "column_values.non_null_and_greater_than"
    # "mostly" must be listed here; otherwise a caller-supplied `mostly` is not
    # forwarded to the success computation and the default is always used.
    success_keys = ("value", "mostly")
    default_kwarg_values = {
        "value": 0,
        "mostly": 1.0,
    }

    def validate_configuration(self, configuration: ExpectationConfiguration):
        """Check that the configuration supplies the required `value` kwarg.

        Args:
            configuration: The expectation configuration to validate.

        Returns:
            True when the configuration is acceptable.

        Raises:
            AssertionError: If `value` is missing from the configuration kwargs.
                The exception type is kept so existing handlers continue to
                work, but it is raised explicitly so the check is not stripped
                when Python runs with ``-O``.
        """
        if "value" not in configuration.kwargs:
            raise AssertionError("'value' is required")
        super().validate_configuration(configuration)
        return True

# Step 4: Set up Great Expectations validator manually
# Imported here so this step is self-contained: BaseDataContext cannot be
# constructed without a project configuration, and an in-memory store backend
# avoids needing a great_expectations.yml project on disk.
from great_expectations.data_context.types.base import (
    DataContextConfig,
    InMemoryStoreBackendDefaults,
)

context = BaseDataContext(
    project_config=DataContextConfig(
        store_backend_defaults=InMemoryStoreBackendDefaults()
    )
)

# Register an in-memory pandas datasource with a runtime data connector so a
# DataFrame can be handed over directly at validation time.
context.add_datasource(
    name="my_datasource",
    class_name="Datasource",
    execution_engine={"class_name": "PandasExecutionEngine"},
    data_connectors={
        "my_data_connector": {
            "class_name": "RuntimeDataConnector",
            "batch_identifiers": ["default_identifier_name"],
        }
    },
)

# Create RuntimeBatchRequest pointing at the datasource/connector defined above
batch_request = RuntimeBatchRequest(
    datasource_name="my_datasource",
    data_connector_name="my_data_connector",
    data_asset_name="my_asset",
    runtime_parameters={"batch_data": df},
    batch_identifiers={"default_identifier_name": "default_id"},
)

# Create expectation suite (the keyword is `expectation_suite_name`)
suite_name = "custom_suite"
context.create_expectation_suite(
    expectation_suite_name=suite_name,
    overwrite_existing=True,
)

# Create validator
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=suite_name,
)

# No manual registration step: the custom metric and expectation classes are
# registered with Great Expectations when they are defined, and the validator
# resolves `expect_*` methods from that registry.

# Step 5: Apply custom expectation
result = validator.expect_column_values_to_be_non_null_and_greater_than("income", value=30000)

# Step 6: Print result
print(result)


ModuleNotFoundError: No module named 'great_expectations.core.expectation_configuration'

### Task 2: Implement a Basic Alert System for Data Quality Drops
**Description**: Set up a basic alert system that triggers when data quality drops.

In [7]:
# Write your code from here
def calculate_dqi(total_entries, valid_entries):
    """Return the Data Quality Index: the percentage of entries that are valid.

    Args:
        total_entries: Number of entries in the dataset.
        valid_entries: Number of those entries that passed validation.

    Returns:
        ``valid_entries / total_entries`` expressed as a percentage, or 0.0
        when `total_entries` is zero (avoids dividing by zero).
    """
    has_entries = total_entries != 0
    return (valid_entries / total_entries) * 100 if has_entries else 0.0

def check_data_quality_alert(dqi, threshold=90.0):
    """Print an alert if the DQI is below the threshold, otherwise an all-clear.

    Args:
        dqi: Data Quality Index as a percentage.
        threshold: Minimum acceptable DQI; only scores strictly below it alert.
    """
    message = (
        f"⚠️ Alert: DQI dropped to {dqi:.1f}%, which is below the threshold of {threshold}%!"
        if dqi < threshold
        else f"✅ DQI is {dqi:.1f}% — All good."
    )
    print(message)

# Example: Dataset (1000 entries, 880 of them valid)
total_entries, valid_entries = 1000, 880

# Score the dataset, then alert if the score falls under the 90% threshold
dqi_score = calculate_dqi(total_entries, valid_entries)
check_data_quality_alert(dqi_score, threshold=90.0)


⚠️ Alert: DQI dropped to 88.0%, which is below the threshold of 90.0%!


### Task 3: Real-time Data Quality Monitoring with Python and Great Expectations
**Description**: Implement a system that monitors data quality in real-time.

In [6]:
# Write your code from here
import great_expectations as ge
import pandas as pd
import time

# Sample batches simulating streaming data.
# Each pair is (ages, incomes) for one batch; the second batch has a missing
# income so that it fails the not-null check.
data_batches = [
    pd.DataFrame({'age': ages, 'income': incomes})
    for ages, incomes in (
        ([25, 30, 35], [50000, 60000, 70000]),
        ([22, 45, 33], [45000, None, 80000]),  # Missing income
        ([29, 31, 50], [52000, 62000, 90000]),
    )
]

# Define validation function
def validate_data(df, batch_number):
    """Run the data quality checks on one batch and print the outcome.

    The checks are: `age` and `income` contain no nulls, and `age` lies
    between 18 and 65.

    NOTE: this relies on `ge.from_pandas`, so it needs a Great Expectations
    release that still provides that helper.

    Args:
        df: The pandas DataFrame holding the batch.
        batch_number: Label for the batch, used only in the printed message.
    """
    dataset = ge.from_pandas(df)

    # Register the expectations on the wrapped dataset
    for required_column in ("age", "income"):
        dataset.expect_column_values_to_not_be_null(required_column)
    dataset.expect_column_values_to_be_between("age", min_value=18, max_value=65)

    # Evaluate everything registered above in one pass
    outcome = dataset.validate()

    if outcome["success"]:
        print(f"✅ Batch {batch_number} passed data quality checks.")
        return
    print(f"❌ Alert: Batch {batch_number} failed data quality checks.")

# Simulate real-time streaming: validate the batches one by one, numbered from 1
for i, batch in enumerate(data_batches, 1):
    print(f"\n📦 Validating Batch {i}")
    validate_data(batch, i)
    # Pause between batches to mimic data arriving over time
    time.sleep(2)



📦 Validating Batch 1


AttributeError: module 'great_expectations' has no attribute 'from_pandas'