### Task 1: Automated Data Profiling

**Steps**:
1. Using Pandas-Profiling
    - Generate a profile report for an existing CSV file.
    - Customize the profile report to include correlations.
    - Profile a specific subset of columns.
2. Using Great Expectations
    - Create a basic expectation suite for your data.
    - Validate data against an expectation suite.
    - Add multiple expectations to a suite.

In [4]:
import pandas as pd
import great_expectations as ge
from great_expectations.dataset import PandasDataset
from great_expectations.expectations.core import ColumnMapExpectation
from great_expectations.execution_engine import PandasExecutionEngine

# -----------------------------
# Step 1: Define a custom expectation class
# -----------------------------
class ExpectColumnValuesToBeEven(ColumnMapExpectation):
    """Expect column values to be even numbers."""

    # Use PandasExecutionEngine
    map_metric = "column_values.even"

    def validate_configuration(self, configuration):
        # Optional: add configuration validation if needed
        return super().validate_configuration(configuration)

# -----------------------------
# Step 2: Register custom metric for "even" check
# -----------------------------
from great_expectations.expectations.metrics import column_map_metric
from great_expectations.execution_engine import PandasExecutionEngine

@column_map_metric(engine=PandasExecutionEngine)
def column_values_even(series, **kwargs):
    return series.apply(lambda x: (x % 2 == 0) if pd.notnull(x) else True)

# -----------------------------
# Step 3: Create sample DataFrame and wrap in GE dataset
# -----------------------------
data = {
    'age': [25, 30, 35, 40, 45],
    'income': [50000, 60000, 75000, None, 100000]
}

# Create GE dataset from pandas DataFrame
ge_df = ge.from_pandas(pd.DataFrame(data))

# Add the custom expectation method to the GE dataset instance
ge_df.add_expectation(ExpectColumnValuesToBeEven)

# -----------------------------
# Step 4: Use the custom expectation on 'age' and 'income' columns
# -----------------------------
print("Validate 'age' column for even numbers:")
result_age = ge_df.expect_column_values_to_be_even("age")
print(result_age)

print("\nValidate 'income' column for even numbers:")
result_income = ge_df.expect_column_values_to_be_even("income")
print(result_income)


ModuleNotFoundError: No module named 'great_expectations.dataset'

### Task 2: Real-time Monitoring of Data Quality

**Steps**:
1. Setting up Alerts for Quality Drops
    - Use the logging library to set up a basic alert on failed expectations.
    - Implementing alerts using email notifications.
    - Using a dashboard like Grafana for visual alerts.
        - Note: Example assumes integration with a monitoring system
        - Alert setup would involve creating a data source and alert rule in Grafana

In [None]:
import pandas as pd
import logging
import great_expectations as ge
from great_expectations.core.batch import RuntimeBatchRequest
import smtplib
from email.mime.text import MIMEText

# --------------------------
# Step 1: Setup logging alerts
# --------------------------
logging.basicConfig(
    filename="data_quality_alerts.log",
    level=logging.WARNING,
    format="%(asctime)s - %(levelname)s - %(message)s"
)

# --------------------------
# Step 2: Load data and setup Great Expectations
# --------------------------
df = pd.read_csv("sample_data.csv")  # Replace with your CSV file path

context = ge.get_context()

datasource_name = "my_pandas_datasource"
if datasource_name not in context.list_datasources():
    context.add_datasource(
        name=datasource_name,
        class_name="Datasource",
        execution_engine={"class_name": "PandasExecutionEngine"},
        data_connectors={
            "runtime_data_connector": {
                "class_name": "RuntimeDataConnector",
                "batch_identifiers": ["default_identifier_name"]
            }
        }
    )

batch_request = RuntimeBatchRequest(
    datasource_name=datasource_name,
    data_connector_name="runtime_data_connector",
    data_asset_name="realtime_data",
    runtime_parameters={"batch_data": df},
    batch_identifiers={"default_identifier_name": "default_id"},
)

suite_name = "realtime_suite"
if suite_name not in context.list_expectation_suites():
    context.create_expectation_suite(suite_name)

validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=suite_name
)

# --------------------------
# Step 3: Define expectations
# --------------------------
validator.expect_column_values_to_not_be_null("age")
validator.expect_column_values_to_be_between("age", min_value=18, max_value=90)
validator.expect_column_values_to_not_be_null("income")

validator.save_expectation_suite()

# --------------------------
# Step 4: Run validation and log failures
# --------------------------
results = validator.validate()

if not results.success:
    logging.warning("⚠️ Data validation failed!")
    for result in results["results"]:
        if not result["success"]:
            expectation = result['expectation_config']['expectation_type']
            kwargs = result['expectation_config']['kwargs']
            logging.warning(f"❌ Failed Expectation: {expectation} on {kwargs}")
else:
    logging.info("✅ Data validation passed.")

# --------------------------
# Step 5: Email alert function
# --------------------------
def send_email_alert(subject, body, to_email):
    from_email = "your_email@gmail.com"      # Replace with your email
    password = "your_app_password"            # Use app password if 2FA enabled

    msg = MIMEText(body)
    msg["Subject"] = subject
    msg["From"] = from_email
    msg["To"] = to_email

    try:
        server = smtplib.SMTP_SSL("smtp.gmail.com", 465)
        server.login(from_email, password)
        server.sendmail(from_email, to_email, msg.as_string())
        server.quit()
        print("📧 Email sent!")
    except Exception as e:
        print("Email failed:", e)

# --------------------------
# Step 6: Send email if validation fails
# --------------------------
if not results.success:
    send_email_alert(
        subject="Data Quality Alert 🚨",
        body="One or more data quality expectations failed. Please check logs for details.",
        to_email="recipient@example.com"   # Replace with recipient email
    )

# --------------------------
# Step 7: Notes for Grafana integration
# --------------------------
"""
To integrate with Grafana for visual alerts:

- Use Promtail to ship the 'data_quality_alerts.log' file to Loki (Grafana’s log backend).
- Create a Loki datasource in Grafana.
- Create alerts in Grafana based on log queries matching "Data validation failed" or specific expectation failures.

Example Promtail config snippet:

server:
  http_listen_port: 9080
  grpc_listen_port: 0

positions:
  filename: /tmp/positions.yaml

clients:
  - url: http://localhost:3100/loki/api/v1/push

scrape_configs:
  - job_name: data_quality_logs
    static_configs:
      - targets:
          - localhost
        labels:
          job: data_quality
          __path__: /path/to/data_quality_alerts.log

"""


DataContextError: Datasource is not a FluentDatasource

### Task 3: Using AI for Data Quality Monitoring
**Steps**:
1. Basic AI Models for Monitoring
    - Train a simple anomaly detection model using Isolation Forest.
    - Use a simple custom function based AI logic for outlier detection.
    - Creating a monitoring function that utilizes a pre-trained machine learning model.

In [None]:
# Write your code from here
import pandas as pd
from sklearn.ensemble import IsolationForest
import joblib

# -------------------------------
# Step 1: Train Isolation Forest
# -------------------------------
data = {
    "age": [25, 30, 35, 40, 45, 1000],  # 1000 is an outlier
    "income": [50000, 60000, 75000, 80000, 90000, -5000]  # -5000 is an outlier
}
df = pd.DataFrame(data)

clf = IsolationForest(contamination=0.2, random_state=42)
clf.fit(df)

# Save the trained model for future use
joblib.dump(clf, "isolation_forest_model.pkl")

# Predict anomalies on training data
df['anomaly_iforest'] = clf.predict(df)  # -1 = anomaly, 1 = normal

print("🔍 Anomalies detected by Isolation Forest:")
print(df[df['anomaly_iforest'] == -1])

# -----------------------------------------
# Step 2: Simple rule-based outlier function
# -----------------------------------------
def rule_based_outlier_detection(df):
    flags = []
    for _, row in df.iterrows():
        age_outlier = row['age'] < 18 or row['age'] > 100
        income_outlier = row['income'] < 10000 or row['income'] > 200000
        flags.append(age_outlier or income_outlier)
    return flags

df['anomaly_rule'] = rule_based_outlier_detection(df)

print("\n⚠️ Rule-based outliers:")
print(df[df['anomaly_rule']])

# ---------------------------------------
# Step 3: Monitoring function with model
# ---------------------------------------
def monitor_data_quality(new_data_path: str, model_path: str = "isolation_forest_model.pkl"):
    new_df = pd.read_csv(new_data_path)
    model = joblib.load(model_path)
    new_df['anomaly'] = model.predict(new_df)
    anomalies = new_df[new_df['anomaly'] == -1]
    
    if not anomalies.empty:
        print(f"\n🚨 ALERT: {len(anomalies)} anomalies detected in new data!")
        print(anomalies)
    else:
        print("\n✅ No anomalies detected in new data.")

# ---------------------------------------
# Example: Save data and run monitoring
# ---------------------------------------
df.to_csv("new_data.csv", index=False)
monitor_data_quality("new_data.csv")


🔍 Anomalies detected by Isolation Forest:
    age  income  anomaly_iforest
5  1000   -5000               -1

⚠️ Rule-based outliers:
    age  income  anomaly_iforest  anomaly_rule
5  1000   -5000               -1          True


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- anomaly_iforest
- anomaly_rule
