### Init Context

In [None]:
from thetaray.api.context import init_context
from datetime import datetime
import yaml

import logging
# Emit every log record (DEBUG and up) as the bare message text.
logging.basicConfig(level=logging.DEBUG, format='%(message)s')

# Load the Spark settings for this notebook from the solution's YAML config.
# The encoding is pinned so the read does not depend on the platform locale
# (YAML files are conventionally UTF-8).
# NOTE(review): yaml.load with FullLoader is fine for a trusted, repo-owned
# file; switch to yaml.safe_load if this path can ever be user-supplied.
with open(
    '/thetaray/git/solutions/domains/demo_merchant/config/spark_config.yaml',
    encoding='utf-8',
) as spark_config_file:
    spark_config = yaml.load(spark_config_file, yaml.FullLoader)['spark_config_a']

# Build the context used by every later cell, fixed to this execution date.
context = init_context(
    execution_date=datetime(1970, 2, 1),
    spark_conf=spark_config,
    # spark_master='local[*]',
)

### Imports

In [None]:
from domains.demo_merchant.datasets.customer_monthly import customer_monthly_dataset
from domains.demo_merchant.datasets.customers import customers_dataset
from domains.demo_merchant.datasets.customer_insights import customer_insights_dataset
from domains.demo_merchant.datasets.transactions import transactions_dataset
from domains.demo_merchant.evaluation_flows.ef import evaluation_flow as ef
from domains.demo_merchant.graphs.graph import graph

from thetaray.api.evaluation import unblock_evaluation_for_execution_date
from thetaray.common.data_environment import DataEnvironment

from sqlalchemy import text
import os
from sqlalchemy import create_engine
import pandas as pd

### Minio

In [None]:
# Drop this execution date's partition for each project dataset and for the
# evaluation flow, always in the PUBLIC data environment. Each factory is
# called right before its own drop, in the listed order.
for _make_target in (
    customer_monthly_dataset,
    customers_dataset,
    transactions_dataset,
    customer_insights_dataset,
    ef,
):
    context.drop_spark_execution_partition(
        _make_target().identifier,
        context.execution_date,
        data_environment=DataEnvironment.PUBLIC,
    )

# Then call the unblock helper for the evaluation flow in the same environment.
unblock_evaluation_for_execution_date(
    context,
    ef().identifier,
    data_environment=DataEnvironment.PUBLIC,
)

In [None]:
# Make sure a global `spark` exists. Preference order: an already-defined
# global, context.spark, context.spark_session, and as a last resort a session
# created (or fetched) through SparkSession.builder.
if 'spark' not in globals():
    for _attr in ('spark', 'spark_session'):
        try:
            spark = getattr(context, _attr)
        except AttributeError:
            continue
        break
    else:
        # Neither attribute exists on the context.
        from pyspark.sql import SparkSession
        spark = SparkSession.builder.getOrCreate()

In [None]:
# --- CLEAR TOTAL (sin depender de execution_date/particiones) ---

from pyspark.sql.utils import AnalysisException

def ensure_spark(context):
    """Return a Spark session that answers a trivial query.

    Candidates are probed in this order: the global ``spark`` (if defined),
    ``context.spark``, ``context.spark_session``, then the result of
    ``context.get_spark_session()``. A candidate is accepted only when
    ``range(1).count()`` runs on it without raising. If none qualifies,
    ``SparkSession.builder.getOrCreate()`` is returned without being probed.

    Args:
        context: Object that may expose ``spark``, ``spark_session`` and/or
            ``get_spark_session``.

    Returns:
        The first candidate that passes the probe, or the builder's session.
    """

    def _responds(session):
        # One tiny job; any failure at all marks the session as unusable.
        try:
            session.range(1).count()
        except Exception:
            return False
        return True

    if 'spark' in globals() and _responds(spark):
        return spark

    for attr in ("spark", "spark_session"):
        candidate = getattr(context, attr, None)
        if candidate is not None and _responds(candidate):
            return candidate

    if hasattr(context, "get_spark_session"):
        try:
            session = context.get_spark_session()
        except Exception:
            session = None
        if session is not None and _responds(session):
            return session

    from pyspark.sql import SparkSession
    return SparkSession.builder.getOrCreate()

# Resolve a working session once for the rest of this cell.
spark = ensure_spark(context)

# Identifiers of the four project datasets, used below as table short names.
short_names = [
    dataset_factory().identifier
    for dataset_factory in (
        customer_monthly_dataset,
        customers_dataset,
        transactions_dataset,
        customer_insights_dataset,
    )
]

def find_tables(short_name: str):
    """Return ``"<db>.<short_name>"`` for every database holding that table.

    Databases come from ``SHOW DATABASES``; if that statement fails, only the
    current database is searched. A database whose ``SHOW TABLES`` lookup
    raises is skipped, so the search is best-effort and never raises for a
    single unreadable database.

    Args:
        short_name: Unqualified table name, used as the ``LIKE`` pattern.

    Returns:
        List of qualified names, in the order the databases were listed.
    """
    try:
        # Read the single result column by position. Its name is not stable
        # across Spark releases (`databaseName` vs `namespace`); attribute
        # access on the wrong name raised, was swallowed below, and silently
        # shrank the search to the current database.
        dbs = [row[0] for row in spark.sql("SHOW DATABASES").collect()]
    except Exception:
        dbs = [spark.catalog.currentDatabase()]

    hits = []
    for db in dbs:
        try:
            if spark.sql(f"SHOW TABLES IN {db} LIKE '{short_name}'").count() > 0:
                hits.append(f"{db}.{short_name}")
        except Exception:
            # Deliberately best-effort: skip databases that cannot be listed.
            pass
    return hits

# Resolve each short name to its qualified tables and report what was found.
full_tables = []
for s in short_names:
    found = find_tables(s)
    if found:
        for f in found:
            print(f"[FOUND] {f}")
        full_tables += found
        continue
    print(f"[WARN] No se encontró la tabla '{s}' en ningún DB")

# Stop active streaming queries and clear Spark's cache before clearing tables.
for q in spark.streams.active:
    try:
        q.stop()
    except Exception as e:
        # Still best-effort, but no longer a bare `except:`, which also
        # trapped KeyboardInterrupt/SystemExit and hid every failure.
        print("[WARN] could not stop streaming query ->", e)
spark.catalog.clearCache()

def drop_all_partitions_if_any(full_name: str):
    """Drop every partition that ``SHOW PARTITIONS`` reports for a table.

    Best-effort: nothing is raised to the caller. A failure to drop one
    partition is printed and the loop continues. An ``AnalysisException``
    while listing is ignored (treated as "not partitioned, or the statement
    is unsupported"); any other listing failure is printed.

    Args:
        full_name: Table name exactly as it should appear in SQL.
    """
    from urllib.parse import unquote

    def _partition_spec(partition_path: str) -> str:
        # Turn "year=2025/month=08" into "year='2025', month='08'".
        pairs = []
        for segment in partition_path.split('/'):
            # Split on the FIRST '=' only, so a value containing '=' stays
            # whole; a segment without '=' yields an empty value, not an error.
            column, _, raw_value = segment.partition('=')
            # NOTE(review): SHOW PARTITIONS appears to return path-escaped
            # values (e.g. ':' as '%3A'), which would never match in DROP
            # PARTITION. Decoding is a no-op for values without '%'. Confirm
            # on this cluster.
            pairs.append(f"{column}='{unquote(raw_value)}'")
        return ", ".join(pairs)

    try:
        parts = spark.sql(f"SHOW PARTITIONS {full_name}").collect()
        if parts:
            print(f" - Dropping {len(parts)} partitions")
            for row in parts:
                # row[0] looks like "job_ts=2025-08-01/..." or "year=2025/month=08/..."
                spec = _partition_spec(row[0])
                try:
                    spark.sql(f"ALTER TABLE {full_name} DROP IF EXISTS PARTITION ({spec})")
                except Exception as e:
                    print("   * DROP PARTITION failed ->", e)
    except AnalysisException:
        # Table is not partitioned, or SHOW PARTITIONS is not supported for it.
        pass
    except Exception as e:
        print(" - SHOW PARTITIONS failed ->", e)

def strong_clear_table(full_name: str):
    """Empty a table, escalating through progressively harsher steps.

    Steps, each reported with a printed line:
      1. ``TRUNCATE TABLE``.
      2. Only if the truncate failed: drop all partitions.
      3. ``DELETE ... WHERE 1=1``, always attempted.
      4. Count the rows; if any remain, ``DROP TABLE IF EXISTS``.

    No Spark failure is raised to the caller, so clearing one table cannot
    stop the caller from moving on to the next.

    Args:
        full_name: Table name exactly as it should appear in SQL.
    """
    print("\nClearing:", full_name)

    # 1) TRUNCATE
    ok = False
    try:
        spark.sql(f"TRUNCATE TABLE {full_name}")
        ok = True
        print(" - TRUNCATE ok")
    except Exception as e:
        print(" - TRUNCATE failed ->", e)

    # 2) If TRUNCATE did not work, try dropping every partition instead.
    if not ok:
        drop_all_partitions_if_any(full_name)

    # 3) Full DELETE in case anything is left.
    try:
        spark.sql(f"DELETE FROM {full_name} WHERE 1=1")
        print(" - DELETE all ok")
    except Exception as e:
        print(" - DELETE failed ->", e)

    # 4) Verify the count; if rows remain, DROP TABLE as the last resort.
    cnt = None
    try:
        cnt = spark.sql(f"SELECT COUNT(*) c FROM {full_name}").collect()[0]["c"]
        print(f" - After clear count = {cnt}")
    except AnalysisException:
        print(" - Table no accesible (posiblemente ya no existe)")
    except Exception as e:
        # Previously only AnalysisException was handled here, so any other
        # Spark error escaped and aborted the caller's loop over tables.
        print(" - COUNT failed ->", e)

    if cnt is not None and cnt > 0:
        try:
            spark.sql(f"DROP TABLE IF EXISTS {full_name}")
            print(" - DROPPED table (último recurso)")
        except Exception as e:
            print(" - DROP failed ->", e)

# Clear every table that was discovered.
for t in full_tables:
    strong_clear_table(t)

# Verification: re-count each table and collect everything that is not empty.
errors = []
for t in full_tables:
    count_sql = f"SELECT COUNT(*) c FROM {t}"
    try:
        first_row = spark.sql(count_sql).collect()[0]
        c = first_row["c"]
        print(f"[CHECK] {t}: {c} filas")
        if c != 0:
            errors += [f"{t} aún tiene {c} filas"]
    except AnalysisException:
        # Not an error: the table may have been dropped as the last resort.
        print(f"[CHECK] {t}: no existe (OK si esperabas dropearla)")
    except Exception as e:
        errors += [f"{t}: error al contar -> {e}"]

# Fail the cell if any table still has rows or could not be counted.
assert not errors, "CLEAR CHECK FAILED -> " + " | ".join(errors)
True


### Postgres

In [None]:
# Namespace used in the Postgres service host name. A missing variable raises
# KeyError here instead of producing a malformed URL.
shared_namespace = os.environ["SHARED_NAMESPACE"]

# Engine for the `cdd` database.
# NOTE(review): user and password are hard-coded in both URLs. Presumably
# acceptable only for a demo cluster; confirm before reuse.
engine_cdd = create_engine(
    f"postgresql+psycopg2://postgres:postgres@postgres.{shared_namespace}.svc.cluster.local:5432/cdd"
)

# Engine for the `apps_tmdemo` database on the same Postgres host.
engine_apps = create_engine(
    f"postgresql+psycopg2://postgres:postgres@postgres.{shared_namespace}.svc.cluster.local:5432/apps_tmdemo"
)

In [None]:
# Skip tables that do not exist instead of failing on them.
# =========================
# CLEAR DATA (IDEMPOTENT)
# =========================

# --- Step 1: Imports and helper try_truncate ---
from sqlalchemy import text
from sqlalchemy.exc import ProgrammingError
from psycopg2.errors import UndefinedTable

def try_truncate(conn, default_schema: str, table_or_qualified: str):
    """Truncate one Postgres table, skipping it if it does not exist.

    Each TRUNCATE is finished as its own unit of work: when the connection
    exposes ``commit``/``rollback`` (SQLAlchemy 2.x, or 1.4 "future"
    connections, which no longer autocommit), a success is committed and a
    missing-table failure is rolled back so the connection stays usable for
    the next call. On legacy connections without those methods the calls are
    skipped and behavior is unchanged. Do not call this inside a transaction
    you intend to roll back as a whole.

    Args:
        conn: Open SQLAlchemy connection.
        default_schema: Schema used when ``table_or_qualified`` has no dot.
        table_or_qualified: ``table`` or ``schema.table`` (split on the first
            dot).

    Raises:
        sqlalchemy.exc.ProgrammingError: For any failure whose underlying
            driver error is not ``UndefinedTable``.
    """
    if "." in table_or_qualified:
        schema, table = table_or_qualified.split(".", 1)
    else:
        schema, table = default_schema, table_or_qualified

    try:
        conn.execute(text(f'TRUNCATE TABLE "{schema}"."{table}"'))
    except ProgrammingError as e:
        if not isinstance(getattr(e, "orig", None), UndefinedTable):
            raise
        # Postgres marks the open transaction as aborted after the error;
        # reset it so later statements on this connection are not rejected.
        rollback = getattr(conn, "rollback", None)
        if callable(rollback):
            rollback()
        print(f"[SKIP] {schema}.{table} does not exist (ok).")
    else:
        # Without autocommit the TRUNCATE would be discarded when the
        # connection closes, so persist it explicitly where possible.
        commit = getattr(conn, "commit", None)
        if callable(commit):
            commit()

# =========================
# 1) solution_sonar
# =========================
# Everything in this section targets the solution_sonar schema.
schema_name = "solution_sonar"
eval_flow_id = ef().identifier

# Each evaluation flow owns one table per prefix, named <prefix><eval_flow_id>.
eval_flow_table_prefixes = [
    "activity_",
    "activity_risk_",
    "test_activity_suppressed_",
    "test_activity_risk_",
    "test_activity_risk_thin_",
    "activity_risk_thin_",
    "test_activity_",
    "activity_suppressed_",
]

# Evaluation-flow tables first, on their own connection.
with engine_cdd.connect() as conn:
    for prefix in eval_flow_table_prefixes:
        table_full = prefix + eval_flow_id
        try_truncate(conn, schema_name, table_full)

# Then the project dataset tables and the graph's node/edge tables, on a
# second connection. Factories are called one at a time, in order.
with engine_cdd.connect() as conn:
    for _dataset_factory in (
        transactions_dataset,
        customer_monthly_dataset,
        customers_dataset,
        customer_insights_dataset,
    ):
        try_truncate(conn, schema_name, _dataset_factory().identifier)
    for _graph_prefix in ("tr_nodes_", "tr_edges_"):
        try_truncate(conn, schema_name, f"{_graph_prefix}{graph().identifier}")

# =========================
# 2) apps_tmdemo + investigation_center
# =========================
schema_name = "apps_tmdemo"
dpv = 'dpv:demo_merchant'

# Look up the alert table mapped to this data permission value.
with engine_cdd.connect() as conn:
    result = conn.execute(
        text(f'SELECT alert_table_name FROM "{schema_name}".rp_mappers WHERE data_permission = :dpv'),
        {"dpv": dpv}
    ).first()

if result and result[0]:
    alert_table_name = result[0]  # e.g. tr_alert_table_1755790351163 (could also be "schema.table")

    # Truncate the alert table in the cdd database (apps_tmdemo schema).
    with engine_cdd.connect() as conn:
        try_truncate(conn, schema_name, alert_table_name)

    # Truncate its counterpart in the apps database; skipped if it is absent.
    with engine_apps.connect() as conn:
        try_truncate(conn, "investigation_center", alert_table_name)

    # The mapper identifier is the last '_'-separated token of the table name.
    mapper = alert_table_name.split('_')[-1]
    with engine_cdd.connect() as conn:
        conn.execute(
            text(f'DELETE FROM "{schema_name}".rp_alerts WHERE alert_mapper_identifier = :m'),
            {"m": mapper}
        )
        # Connections without autocommit (SQLAlchemy 2.x / 1.4 "future")
        # discard the DELETE on close unless it is committed. Legacy
        # connections have no commit() and already autocommit.
        if hasattr(conn, "commit"):
            conn.commit()
else:
    print(f"[SKIP] No row in {schema_name}.rp_mappers for {dpv} (ok).")

# =========================
# 3) Spark checks (unchanged from your original)
# =========================
from pyspark.sql.utils import AnalysisException

# Probe the current session with a trivial job; if `spark` is missing or
# unusable, fall back to the builder's session.
try:
    spark.range(1).count()
except Exception:
    from pyspark.sql import SparkSession
    spark = SparkSession.builder.getOrCreate()

# Identifiers of the four demo merchant datasets, used as table short names.
short_names = [
    dataset_factory().identifier
    for dataset_factory in (
        customer_monthly_dataset,
        customers_dataset,
        transactions_dataset,
        customer_insights_dataset,
    )
]

# Locate tables in all DBs
def find_tables(short_name: str):
    """Return ``"<db>.<short_name>"`` for every database holding that table.

    Databases come from ``SHOW DATABASES``; if that statement fails, only the
    current database is searched. A database whose ``SHOW TABLES`` lookup
    raises is skipped, so the search is best-effort and never raises for a
    single unreadable database.

    Args:
        short_name: Unqualified table name, used as the ``LIKE`` pattern.

    Returns:
        List of qualified names, in the order the databases were listed.
    """
    try:
        # Read the single result column by position. Its name is not stable
        # across Spark releases (`databaseName` vs `namespace`); attribute
        # access on the wrong name raised, was swallowed below, and silently
        # shrank the search to the current database.
        dbs = [row[0] for row in spark.sql("SHOW DATABASES").collect()]
    except Exception:
        dbs = [spark.catalog.currentDatabase()]

    hits = []
    for db in dbs:
        try:
            if spark.sql(f"SHOW TABLES IN {db} LIKE '{short_name}'").count() > 0:
                hits.append(f"{db}.{short_name}")
        except Exception:
            # Deliberately best-effort: skip databases that cannot be listed.
            pass
    return hits

# Re-count every table that still exists; a non-empty table, or one that
# cannot be counted, is recorded as an error.
errors = []
found_any = False
for s in short_names:
    fq_names = find_tables(s)
    if fq_names:
        found_any = True
    else:
        print(f"[CHECK] {s}: not found in any DB (OK if dropped).")
    for fq in fq_names:
        count_sql = f"SELECT COUNT(*) c FROM {fq}"
        try:
            cnt = spark.sql(count_sql).collect()[0]["c"]
            print(f"[CHECK] {fq}: {cnt} rows")
            if cnt != 0:
                errors += [f"{fq} still has {cnt} rows"]
        except AnalysisException as e:
            # Treated as "already gone", not as a failure.
            print(f"[CHECK] {fq}: not accessible (maybe dropped) -> {e}")
        except Exception as e:
            errors += [f"{fq}: error while counting -> {e}"]

# Only assert when at least one table was found.
if found_any:
    assert not errors, "CLEAR CHECK FAILED -> " + " | ".join(errors)
else:
    print("[CHECK] No project tables found (OK if clear dropped everything).")
print("Verification finished.")

# =========================
# 4) API verification (unchanged)
# =========================
from thetaray.api.dataset import dataset_functions
from thetaray.common.data_environment import DataEnvironment
from datetime import datetime

# Cross-check through the dataset API. Only a failure to READ is treated as
# "skipped". The assertion now sits outside the try block: it used to be
# inside, where `except Exception` swallowed the AssertionError and reported
# leftover rows as a skipped check.
try:
    eff_dt = getattr(context, "effective_execution_date", None) or getattr(context, "execution_date", None)
    if eff_dt is None:
        eff_dt = datetime.utcnow()  # safe fallback
    ds = dataset_functions.read(
        context,
        customer_monthly_dataset().identifier,
        from_job_ts=eff_dt,   # do not use 1970-02-01
        data_environment=DataEnvironment.PUBLIC
    )
    c = ds.count()
except Exception as e:
    print(f"[CHECK API] Skipped (table not found or reader empty): {e}")
else:
    print(f"[CHECK API] {customer_monthly_dataset().identifier} @ {eff_dt}: {c} rows")
    assert c == 0, f"Expected 0, found {c}"
    print("API verification OK.")



In [None]:
# # Set schema and eval flow to delete
# schema_name = "solution_sonar"
# eval_flow_id = ef().identifier

# # Table Prefixes to Truncate for Each Eval Flow
# eval_flow_table_prefixes = [
#     "activity_",
#     "activity_risk_",
#     "test_activity_suppressed_",
#     "test_activity_risk_",
#     "test_activity_risk_thin_",
#     "activity_risk_thin_",
#     "test_activity_",
#     "activity_suppressed_",
# ]

# # Truncate
# with engine_cdd.connect() as conn:
#     for prefix in eval_flow_table_prefixes:
#         table_full = prefix + eval_flow_id
#         query = text(f"TRUNCATE TABLE {schema_name}.{table_full}")
#         conn.execute(query)

# with engine_cdd.connect() as conn:
#     query = text(f"TRUNCATE TABLE {schema_name}.{transactions_dataset().identifier}")
#     conn.execute(query)
#     query = text(f"TRUNCATE TABLE {schema_name}.{customer_monthly_dataset().identifier}")
#     conn.execute(query)
#     query = text(f"TRUNCATE TABLE {schema_name}.{customers_dataset().identifier}")
#     conn.execute(query)
#     query = text(f"TRUNCATE TABLE {schema_name}.{customer_insights_dataset().identifier}")
#     conn.execute(query)
#     query = text(f"TRUNCATE TABLE {schema_name}.tr_nodes_{graph().identifier}")
#     conn.execute(query)
#     query = text(f"TRUNCATE TABLE {schema_name}.tr_edges_{graph().identifier}")
#     conn.execute(query)

In [None]:
# # Set schema and eval flow to delete
# schema_name = "apps_tmdemo"
# dpv = 'dpv:demo_merchant'

# with engine_cdd.connect() as conn:
#     query = text(f"SELECT alert_table_name FROM {schema_name}.rp_mappers WHERE data_permission = '{dpv}'")
#     result = conn.execute(query).first()

# with engine_cdd.connect() as conn:
#     query = text(f"TRUNCATE TABLE {schema_name}.{result[0]}")
#     conn.execute(query)

# with engine_apps.connect() as conn:
#     query = text(f"TRUNCATE TABLE investigation_center.{result[0]}")
#     conn.execute(query)

# mapper = result[0].split('_')[-1]

# with engine_cdd.connect() as conn:
#     query = text(f"DELETE FROM {schema_name}.rp_alerts WHERE alert_mapper_identifier = '{mapper}'")
#     conn.execute(query)

### Check if everything was cleared

In [None]:
from pyspark.sql.utils import AnalysisException

# 1) Probe the current session with a trivial job; if `spark` is missing or
#    unusable, fall back to the builder's session.
try:
    spark.range(1).count()
except Exception:
    from pyspark.sql import SparkSession
    spark = SparkSession.builder.getOrCreate()

# 2) Identifiers of the four demo merchant datasets, used as table short names.
short_names = [
    dataset_factory().identifier
    for dataset_factory in (
        customer_monthly_dataset,
        customers_dataset,
        transactions_dataset,
        customer_insights_dataset,
    )
]

# 3) Localizar tablas en todos los DBs
def find_tables(short_name: str):
    """Return ``"<db>.<short_name>"`` for every database holding that table.

    Databases come from ``SHOW DATABASES``; if that statement fails, only the
    current database is searched. A database whose ``SHOW TABLES`` lookup
    raises is skipped, so the search is best-effort and never raises for a
    single unreadable database.

    Args:
        short_name: Unqualified table name, used as the ``LIKE`` pattern.

    Returns:
        List of qualified names, in the order the databases were listed.
    """
    try:
        # Read the single result column by position. Its name is not stable
        # across Spark releases (`databaseName` vs `namespace`); attribute
        # access on the wrong name raised, was swallowed below, and silently
        # shrank the search to the current database.
        dbs = [row[0] for row in spark.sql("SHOW DATABASES").collect()]
    except Exception:
        dbs = [spark.catalog.currentDatabase()]

    hits = []
    for db in dbs:
        try:
            if spark.sql(f"SHOW TABLES IN {db} LIKE '{short_name}'").count() > 0:
                hits.append(f"{db}.{short_name}")
        except Exception:
            # Deliberately best-effort: skip databases that cannot be listed.
            pass
    return hits

# Re-count every table that still exists; a non-empty table, or one that
# cannot be counted, is recorded as an error.
errors = []
found_any = False
for s in short_names:
    fq_names = find_tables(s)
    if fq_names:
        found_any = True
    else:
        print(f"[CHECK] {s}: no existe en ningún DB (OK si fue dropeada).")
    for fq in fq_names:
        count_sql = f"SELECT COUNT(*) c FROM {fq}"
        try:
            cnt = spark.sql(count_sql).collect()[0]["c"]
            print(f"[CHECK] {fq}: {cnt} filas")
            if cnt != 0:
                errors += [f"{fq} aún tiene {cnt} filas"]
        except AnalysisException as e:
            # Treated as "already gone", not as a failure.
            print(f"[CHECK] {fq}: no accesible (posible drop) -> {e}")
        except Exception as e:
            errors += [f"{fq}: error al contar -> {e}"]

# 5) Final assert: only evaluated when at least one table was found.
if found_any:
    assert not errors, "CLEAR CHECK FAILED -> " + " | ".join(errors)
else:
    print("[CHECK] No se encontró ninguna tabla del proyecto (OK si el clear dropeó todo).")
print("Verificación terminada.")

# 6) (Opcional) Verificación con API alineada a la fecha efectiva del DAG
from thetaray.api.dataset import dataset_functions
from thetaray.common.data_environment import DataEnvironment
from datetime import datetime

# Cross-check through the dataset API. Only a failure to READ is treated as
# "skipped". The assertion now sits outside the try block: it used to be
# inside, where `except Exception` swallowed the AssertionError and reported
# leftover rows as a skipped check.
try:
    eff_dt = getattr(context, "effective_execution_date", None) or getattr(context, "execution_date", None)
    if eff_dt is None:
        eff_dt = datetime.utcnow()  # harmless fallback
    ds = dataset_functions.read(
        context,
        customer_monthly_dataset().identifier,
        from_job_ts=eff_dt,   # do not use 1970-02-01
        data_environment=DataEnvironment.PUBLIC
    )
    c = ds.count()
except Exception as e:
    print(f"[CHECK API] Omitido (tabla no existe o lector sin datos): {e}")
else:
    print(f"[CHECK API] {customer_monthly_dataset().identifier} @ {eff_dt}: {c} filas")
    assert c == 0, f"Esperaba 0, encontré {c}"
    print("Verificación API OK.")


In [None]:
# Close the context that was created with init_context in the first cell.
context.close()