In [1]:
# Question: Data Quality Automation Tools - Introduction to Great Expectations
# Description: Set up a simple Great Expectations check for missing values in a numeric column.
import pandas as pd
import great_expectations as ge
from great_expectations.dataset import PandasDataset

# 1. Sample data creation (you would use your own data)
data = {
    'id': [1, 2, 3, 4, 5],
    'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'age': [25, 30, None, 40, 35],
    'salary': [50000, 60000, 70000, None, 90000]
}

# 2. Create a DataFrame
df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)
print("\n")

# 3. Convert to Great Expectations DataFrame
ge_df = ge.from_pandas(df)

# 4. Define and run expectations for missing values in numeric columns

# Check for missing values in 'age' column
age_missing_result = ge_df.expect_column_values_to_not_be_null('age')
print("Age column missing values check:")
print(f"Success: {age_missing_result['success']}")
print(f"Result: {age_missing_result['result']}")
print(f"Missing count: {age_missing_result['result']['unexpected_count']}")
print(f"Missing percentage: {age_missing_result['result']['unexpected_percent']}%")
print("\n")

# Check for missing values in 'salary' column
salary_missing_result = ge_df.expect_column_values_to_not_be_null('salary')
print("Salary column missing values check:")
print(f"Success: {salary_missing_result['success']}")
print(f"Result: {salary_missing_result['result']}")
print(f"Missing count: {salary_missing_result['result']['unexpected_count']}")
print(f"Missing percentage: {salary_missing_result['result']['unexpected_percent']}%")
print("\n")

# 5. Set a threshold for missing values (e.g., allow up to 10% missing)
age_threshold_result = ge_df.expect_column_values_to_not_be_null('age', mostly=0.9)
print("Age column with 10% missing threshold:")
print(f"Success: {age_threshold_result['success']}")
print(f"Missing percentage: {age_threshold_result['result']['unexpected_percent']}%")
print("\n")

# 6. Save expectations to a suite
expectation_suite = ge_df.get_expectation_suite()
print("Saved expectations in suite:")
for expectation in expectation_suite.expectations:
    print(f"- {expectation.expectation_type} on {expectation.kwargs.get('column', 'N/A')}")

ModuleNotFoundError: No module named 'great_expectations.dataset'