In [None]:
import great_expectations as gx
import pandas as pd
from pathlib import Path
import warnings
import json

# Hide the notice about `result_format` being configured at the Validator
# level (presumably raised by Great Expectations when validate() is given a
# result_format -- confirm). `message` is a regular expression matched against
# the start of the warning text.
warnings.filterwarnings(
    action="ignore",
    message="`result_format` configured at the Validator-level*",
)


# --- Utility to identify the transaction file ---
def identify_transaction_csv(data_dir: Path) -> Path:
    """Return the first CSV in *data_dir* that looks like a transaction file.

    A file qualifies when its header contains the columns ``transaction_id``,
    ``amount`` and ``sender_account`` (compared case-insensitively).
    Candidates are visited in sorted path order so the selected file is
    reproducible when more than one CSV qualifies.

    Args:
        data_dir: Directory whose ``*.csv`` files are inspected
            (non-recursive).

    Returns:
        Path of the first matching CSV file.

    Raises:
        FileNotFoundError: If no CSV in ``data_dir`` has the required columns.
    """
    required_columns = {"transaction_id", "amount", "sender_account"}
    # glob() yields entries in filesystem order, which is not stable across
    # machines; sorting makes the choice deterministic.
    for file in sorted(data_dir.glob("*.csv")):
        try:
            # Only the header matters, so a single data row is enough.
            columns = set(pd.read_csv(file, nrows=1).columns.str.lower())
        except Exception as e:
            # Best-effort scan: one unreadable file must not abort the search.
            print(f"Failed to read {file}: {e}")
            continue
        if required_columns.issubset(columns):
            return file
    # Report the directory that was actually searched, not a fixed "data/".
    raise FileNotFoundError(f"No transaction CSV file found in {data_dir}")


# --- Load transaction CSV ---
# The data folder is looked up relative to the current working directory, and
# the transaction file inside it is detected from its column names.
DATA_DIR = Path().resolve().joinpath("data")
transaction_csv = identify_transaction_csv(data_dir=DATA_DIR)
df_trans = pd.read_csv(filepath_or_buffer=transaction_csv)
print(
    f"Loaded transaction file: {transaction_csv.name} with {len(df_trans)} rows"
)

# --- Set up Great Expectations context ---
# The objects are created in dependency order: context -> pandas data source
# -> dataframe asset -> whole-dataframe batch definition -> batch holding
# df_trans. Each call uses the object returned by the previous one, so the
# order must not change.
# NOTE(review): the data source name is fixed; if the context is persistent
# (file-backed), re-running this cell may collide with the existing
# "pandas_source" -- confirm against the GX version in use.
context = gx.get_context()
datasource = context.data_sources.add_pandas(name="pandas_source")
data_asset = datasource.add_dataframe_asset(name="transactions_data")
batch_def = data_asset.add_batch_definition_whole_dataframe(name="batch_def")
# The in-memory DataFrame is supplied at batch time via batch_parameters.
batch = batch_def.get_batch(batch_parameters={"dataframe": df_trans})

# --- Create expectation suite and validator ---
# The suite is created with only a name; the validator ties it to the batch.
# NOTE(review): the validator.expect_* calls that follow are presumably what
# populate this suite -- confirm.
suite = gx.core.ExpectationSuite(name="transactions_suite")
validator = context.get_validator(batch=batch, expectation_suite=suite)

# --- Add validation rules ---
# Transaction identifier: must be present and must not repeat.
validator.expect_column_values_to_not_be_null("transaction_id")
validator.expect_column_values_to_be_unique("transaction_id")

# Timestamps are expected in "YYYY-MM-DD HH:MM:SS" form.
timestamp_format = "%Y-%m-%d %H:%M:%S"
validator.expect_column_values_to_match_strftime_format("timestamp", timestamp_format)

# Amounts are expected between 0.01 and 100000.
validator.expect_column_values_to_be_between(
    "amount",
    min_value=0.01,
    max_value=100000,
)

# Closed vocabularies for currency code and transaction direction.
allowed_currencies = [
    "SEK", "USD", "EUR", "DKK", "JPY", "ZMW", "NOK", "ZAR", "RMB", "GBP",
]
validator.expect_column_values_to_be_in_set("currency", allowed_currencies)

allowed_transaction_types = ["incoming", "outgoing"]
validator.expect_column_values_to_be_in_set("transaction_type", allowed_transaction_types)

# Both account columns share one format: "SE" + 4 digits + 4 uppercase letters
# + 14 digits, or "GB" + 2 digits + 4 uppercase letters + 14 digits.
account_pattern = r"^(?:SE\d{4}[A-Z]{4}\d{14}|GB\d{2}[A-Z]{4}\d{14})$"
for account_column in ("receiver_account", "sender_account"):
    validator.expect_column_values_to_match_regex(account_column, account_pattern)

# Location columns must always be filled in.
for location_column in (
    "sender_country",
    "receiver_country",
    "sender_municipality",
    "receiver_municipality",
):
    validator.expect_column_values_to_not_be_null(location_column)

# --- Print rows with missing receiver_country / receiver_municipality ---
# Select the rows where each receiver location column is null, then show a
# short preview (first rows only) for every non-empty selection.
missing_country = df_trans.loc[df_trans["receiver_country"].isna()]
missing_municipality = df_trans.loc[df_trans["receiver_municipality"].isna()]

for heading, missing_rows in (
    ("Rows with missing receiver_country:", missing_country),
    ("Rows with missing receiver_municipality:", missing_municipality),
):
    if missing_rows.empty:
        continue
    print(heading)
    print(missing_rows.head())

# --- Fill null values with placeholder for output copy ---
# Work on a copy so the validated DataFrame keeps its original nulls.
df_trans_copy = df_trans.copy()
# Assign the filled column back rather than calling fillna(inplace=True) on a
# column selection: that chained form triggers a FutureWarning on pandas 2.x
# and leaves the DataFrame unchanged once Copy-on-Write is active.
for column in ("receiver_country", "receiver_municipality"):
    df_trans_copy[column] = df_trans_copy[column].fillna("Unknown")

# --- Flag transactions by criteria ---
# A transaction is flagged when its amount is above 100000 or when its note
# contains "gift", "cash" or "transfer" (case-insensitive). Missing notes
# count as "no match".
amount_over_limit = df_trans["amount"] > 100000
notes_lowercase = df_trans["notes"].str.lower()
note_has_keyword = notes_lowercase.str.contains("gift|cash|transfer", na=False)
df_trans["flagged"] = amount_over_limit | note_has_keyword

flagged_transactions = df_trans.loc[df_trans["flagged"]]
print("Flagged transactions:")
print(flagged_transactions)

# --- Save cleaned copy to CSV ---
# Output goes to a "report" folder relative to the working directory; the
# folder is created on first use and reused afterwards.
report_dir = Path("report")
report_dir.mkdir(exist_ok=True)
cleaned_csv_path = report_dir.joinpath("transactions_cleaned.csv")
df_trans_copy.to_csv(cleaned_csv_path, index=False)

# --- Run validation and export results ---
# Validate the batch, requesting the "COMPLETE" result format for this run.
complete_result_format = {"result_format": "COMPLETE"}
results = validator.validate(result_format=complete_result_format)

print("Validation results:")
print(results)

# --- Save validation results as JSON ---
# The result object is converted to a plain dict before being serialized.
output_path = report_dir / "validation_results_transactions.json"
with output_path.open("w") as f:
    json.dump(results.to_json_dict(), f, indent=4)
