### Task 1: Validate Data with a Custom Expectation in Great Expectations
**Description**: Create a custom expectation and validate data with Great Expectations.

**Load a sample DataFrame**

data = {
'age': [25, 30, 35, 40, 45],
'income': [50000, 60000, 75000, None, 100000]
}

In [1]:
!pip install great_expectations
import pandas as pd
from great_expectations.data_context import DataContext
from great_expectations.core.batch import BatchRequest

# Sample DataFrame
df = pd.DataFrame({
    "age": [25, 30, 35, 40, 45],
    "income": [50000, 60000, 75000, None, 100000]
})

# Create a minimal in-memory DataContext config
context_config = {
    "datasources": {
        "my_pandas_datasource": {
            "class_name": "Datasource",
            "execution_engine": {
                "class_name": "PandasExecutionEngine"
            },
            "data_connectors": {
                "default_runtime_data_connector": {
                    "class_name": "RuntimeDataConnector",
                    "batch_identifiers": ["default_identifier_name"]
                }
            }
        }
    },
    "stores": {
        "expectations_store": {
            "class_name": "InMemoryStore"
        },
        "validations_store": {
            "class_name": "InMemoryStore"
        },
        "evaluation_parameter_store": {
            "class_name": "InMemoryStore"
        }
    },
    "expectations_store_name": "expectations_store",
    "validations_store_name": "validations_store",
    "evaluation_parameter_store_name": "evaluation_parameter_store",
    "data_docs_sites": {},
    "anonymous_usage_statistics": {
        "enabled": False
    }
}

# Initialize DataContext with the config
context = DataContext(project_config=context_config)

# Create a BatchRequest with your DataFrame
batch_request = BatchRequest(
    datasource_name="my_pandas_datasource",
    data_connector_name="default_runtime_data_connector",
    data_asset_name="my_data_asset",
    runtime_parameters={"batch_data": df},
    batch_identifiers={"default_identifier_name": "default_id"},
)

# Get a Validator
validator = context.get_validator(batch_request=batch_request)

# Add expectation: income values >= 50000 (None values allowed)
result = validator.expect_column_values_to_be_between(
    column="income",
    min_value=50000,
    mostly=1.0,
)

print(result)

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


TypeError: DataContext() got an unexpected keyword argument 'project_config'

### Task 2: Implement a Basic Alert System for Data Quality Drops
**Description**: Set up a basic alert system that triggers when data quality drops.

In [2]:
import pandas as pd

def check_data_quality(df, completeness_threshold=0.9):
    completeness = df.notnull().mean().mean()
    if completeness < completeness_threshold:
        print(f"ALERT: Data quality dropped! Completeness: {completeness:.2f}")
    else:
        print(f"Data quality is good. Completeness: {completeness:.2f}")

data_good = {
    "A": [1, 2, 3, None],
    "B": [4, 5, None, 7]
}
df_good = pd.DataFrame(data_good)

data_bad = {
    "A": [None, None, None, None],
    "B": [None, None, None, None]
}
df_bad = pd.DataFrame(data_bad)

check_data_quality(df_good)
check_data_quality(df_bad)


ALERT: Data quality dropped! Completeness: 0.75
ALERT: Data quality dropped! Completeness: 0.00


### Task 3: Real-time Data Quality Monitoring with Python and Great Expectations
**Description**: Implement a system that monitors data quality in real-time.

In [3]:
import great_expectations as ge
import pandas as pd
import time

data_samples = [
    {"id": 1, "value": 10},
    {"id": 2, "value": None},
    {"id": 3, "value": 15},
    {"id": 4, "value": 20},
]

context = ge.get_context()

suite_name = "realtime_data_quality_suite"
try:
    suite = context.get_expectation_suite(suite_name)
except Exception:
    suite = context.create_expectation_suite(suite_name, overwrite_existing=True)

def create_or_update_expectations(df):
    batch = ge.from_pandas(df)
    batch.expect_column_values_to_not_be_null("value")
    return batch

def monitor_data_stream():
    for sample in data_samples:
        df = pd.DataFrame([sample])
        batch = create_or_update_expectations(df)
        results = batch.validate(expectation_suite=suite)
        print(f"Data sample id {sample['id']} validation results: {results['success']}")
        time.sleep(1)

monitor_data_stream()


Data sample id 1 validation results: True
Data sample id 2 validation results: True
Data sample id 3 validation results: True
Data sample id 4 validation results: True
