## Using AI for Anomalies Detection in Data Quality
**Description**: Implement an AI-based approach to detect anomalies in data quality.

**Steps**:
1. Use an Anomaly Detection Algorithm:
    - Use sklearn's Isolation Forest for anomaly detection.

**Example data:**

data = np.array([[25, 50000], [30, 60000], [35, 75000], [40, None], [45, 100000]])

2. Integrate with Great Expectations:
    - Generate alerts if anomalies are detected:

In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
import great_expectations as ge
from great_expectations.core.batch import RuntimeBatchRequest

# Step 1: Sample data
data = [[25, 50000], [30, 60000], [35, 75000], [40, None], [45, 100000]]
df = pd.DataFrame(data, columns=["age", "income"])

# Step 2: Prepare data for Isolation Forest
df_filled = df.fillna(-999)  # Handle missing income

# Step 3: Apply Isolation Forest
iso = IsolationForest(contamination=0.2, random_state=42)
df['anomaly'] = iso.fit_predict(df_filled)

# Step 4: Print anomalies
anomalies = df[df["anomaly"] == -1]
print("🔎 Detected Anomalies:\n", anomalies)

# Step 5: Load or create a Great Expectations context
context = ge.get_context()

# Step 6: Add a temporary in-memory datasource
datasource_name = "my_pandas_datasource"
if datasource_name not in context.list_datasources():
    context.add_datasource(
        name=datasource_name,
        class_name="Datasource",
        execution_engine={"class_name": "PandasExecutionEngine"},
        data_connectors={
            "runtime_data_connector": {
                "class_name": "RuntimeDataConnector",
                "batch_identifiers": ["default_identifier_name"]
            }
        }
    )

# Step 7: Create a batch request from the DataFrame
batch_request = RuntimeBatchRequest(
    datasource_name=datasource_name,
    data_connector_name="runtime_data_connector",
    data_asset_name="anomaly_data",
    runtime_parameters={"batch_data": df},
    batch_identifiers={"default_identifier_name": "default_id"},
)

# Step 8: Create expectation suite
suite_name = "anomaly_suite"
if suite_name not in context.list_expectation_suites():
    context.create_expectation_suite(suite_name)

# Step 9: Get validator
validator = context.get_validator(batch_request=batch_request, expectation_suite_name=suite_name)

# Step 10: Optional: Add some sanity checks
validator.expect_column_values_to_not_be_null("income")

# Step 11: Generate alert
if (df["anomaly"] == -1).any():
    print(f"\n⚠️ ALERT: {sum(df['anomaly'] == -1)} anomaly/anomalies detected!")
else:
    print("\n✅ No anomalies detected.")

# Step 12: Run validation
results = validator.validate()
print("\nValidation Results:")
print(results)


🔎 Detected Anomalies:
    age  income  anomaly
3   40     NaN       -1


DataContextError: Datasource is not a FluentDatasource