In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import re
import datetime as dt

In [2]:
!pip install great_expectations


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
import great_expectations as gx
# Get the Ephemeral Data Context
context = gx.get_context()
assert type(context).__name__ == "EphemeralDataContext"

In [4]:
customer = pd.read_csv("/work/customer_data.csv")
retail = pd.read_csv("/work/retail_data.csv")
fact_sales = pd.read_csv("/work/fact_sales.csv")

In [5]:
retail = retail.rename(columns={"customer_id": "transaction_id"})
retail = retail.rename(columns={"id": "customer_id"})

In [6]:
customer

Unnamed: 0,id,full_name,email,phone,address,signup_date,name,gender,age
0,1,Kara Kim,kara.kim1924@yahoo.com,804-681-7662,"8971 Pine Ave, Greenville, TX 76366",2022-03-13,Elena,Female,60
1,2,Kelly Peterson,,(467) 700-2147,"7619 Cedar Ln, Madison, CA 32004",10/04/2019,Bob,Female,50
2,3,Henry Martin,henry.martin0365@mail.com,(378) 615-9326,"785 Elm St, Centerville, FL 70040",01/11/2019,Carlos,Male,36
3,4,Walter Evans,walter.evans9757@hotmail.com,406-811-1412,"9383 Elm St, Springfield, TX 27467",09/22/2021,Diana,Female,64
4,5,William Anderson,william.anderson8799@gmail.com,260-206-8340,"615 Sunset Blvd, Riverside, IL 02711",11/15/2020,George,Female,29
...,...,...,...,...,...,...,...,...,...
995,996,Kevin White,kevin.white6863@protonmail.com,(742) 990-9163,,12/08/2019,George,Female,54
996,997,Kyle Mitchell,kyle.mitchell7313@protonmail.com,(905) 994-3879,"6248 Oak St, Georgetown, IL 43690",06/09/2019,Carlos,Male,25
997,998,Holly Chavez,holly.chavez9133@gmail.com,(410) 204-3172,"1969 Pine Ave, Springfield, TX 21039",08/31/2020,Carlos,Male,43
998,999,Frank Foster,frank.foster6200@protonmail.com,609-287-1080,"7204 Elm St, Centerville, IL 00057",09/18/2022,Frank,Female,23


In [7]:
retail

Unnamed: 0,customer_id,purchase_date,product_category,amount,transaction_id
0,230,2025-05-11,Clothing,181.73,1034
1,630,2025-06-19,Electronics,442.32,1020
2,867,2025-02-16,Home & Kitchen,263.12,1049
3,763,2025-02-13,Home & Kitchen,434.05,1058
4,758,2025-04-15,Clothing,489.6,1077
...,...,...,...,...,...
815,494,2025-04-25,Clothing,61.89,1065
816,555,2025-03-04,Sports,74.31,1061
817,491,2025-03-14,Toys,33.54,1096
818,13,2025-06-19,Electronics,371.75,1070


In [8]:
def diagnostico_columna(df, col):
    """Devuelve un resumen con métricas básicas de calidad para la columna indicada."""
    n_total = len(df)
    
    summary = pd.DataFrame({
        "Métrica": [
            "Tipo de dato",
            "Valores nulos",
            "% Nulos",
            "Valores únicos",
            "Duplicados"
        ],
        "Valor": [
            str(df[col].dtype),
            df[col].isna().sum(),
            f"{round(df[col].isna().mean() * 100, 2)}%",
            df[col].nunique(dropna=False),
            f"{df[col].duplicated().sum()} ({round(df[col].duplicated().mean() * 100, 2)}%)"
        ]
    })
    
    return summary

# Retail

### GE

In [9]:
data_source = context.data_sources.add_pandas(name="retail_source")
data_asset = data_source.add_dataframe_asset(name="retail_asset")

In [10]:
retail_batch_definition = data_asset.add_batch_definition_whole_dataframe("ventas_batch")
retail_batch = retail_batch_definition.get_batch(batch_parameters={"dataframe": retail})

In [11]:
suite_name = "retail_quality_suite"
suite = gx.ExpectationSuite(name=suite_name)

In [12]:
# ==================================================
# Retail suite
# ==================================================
retail_suite = gx.ExpectationSuite(name="retail_quality_suite")

# transaction_id: no nulos y únicos
retail_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="transaction_id")
)
retail_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeUnique(column="transaction_id")
)

# customer_id: no nulos
retail_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="customer_id")
)

# purchase_date: no nulos y formato válido
retail_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="purchase_date")
)
retail_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToMatchStrftimeFormat(
        column="purchase_date", strftime_format="%Y-%m-%d"
    )
)

# product_category: no nulos y valores válidos
retail_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="product_category")
)
retail_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeInSet(
        column="product_category",
        value_set=["Sports", "Clothing", "Electronics", "Toys", "Home & Kitchen"]
    )
)

# amount: no nulos, numérico y > 0
retail_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="amount")
)
retail_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeOfType(
        column="amount", type_="float"
    )
)
retail_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeBetween(
        column="amount", min_value=0
    )
)

# Registrar retail suite
context.suites.add(retail_suite)

{
  "name": "retail_quality_suite",
  "id": "ba38357e-270e-4b44-a506-722dd605c808",
  "expectations": [
    {
      "type": "expect_column_values_to_not_be_null",
      "kwargs": {
        "column": "transaction_id"
      },
      "meta": {},
      "id": "4aa5105e-d1b7-4408-83a2-da189d6a6b4a",
      "severity": "critical"
    },
    {
      "type": "expect_column_values_to_be_unique",
      "kwargs": {
        "column": "transaction_id"
      },
      "meta": {},
      "id": "2a4f42b8-23f5-4d57-a1d4-3a34d5a11ab4",
      "severity": "critical"
    },
    {
      "type": "expect_column_values_to_not_be_null",
      "kwargs": {
        "column": "customer_id"
      },
      "meta": {},
      "id": "33c902c7-7bd9-454b-8b65-e163e2af4e59",
      "severity": "critical"
    },
    {
      "type": "expect_column_values_to_not_be_null",
      "kwargs": {
        "column": "purchase_date"
      },
      "meta": {},
      "id": "d5cadf1b-696d-4942-9d42-0c282fcb3ebb",
      "severity": "critical"
 

In [13]:
validation_results = retail_batch.validate(retail_suite)
print(validation_results)

Calculating Metrics:  92%|█████████▏| 55/60 [00:00<00:00, 273.63it/s]
{
  "success": false,
  "results": [
    {
      "success": false,
      "expectation_config": {
        "type": "expect_column_values_to_be_between",
        "kwargs": {
          "column": "amount",
          "min_value": 0.0,
          "batch_id": "retail_source-retail_asset"
        },
        "meta": {},
        "id": "e33a6bc7-3862-4ae1-b1bc-741c56581433",
        "severity": "critical"
      },
      "result": {},
      "meta": {},
      "exception_info": {
        "MetricConfigurationID(metric_name='column_values.between.condition', metric_domain_kwargs_id='d9155530eb3bdf670391e28cd8cb9619', metric_value_kwargs_id='9939f13c1eedbd79e1bad2555c54ab5b')": {
          "exception_traceback": "Traceback (most recent call last):\n  File \"/root/venv/lib/python3.10/site-packages/great_expectations/execution_engine/execution_engine.py\", line 534, in _process_direct_and_bundled_metric_computation_configurations\n    me

## Revisión de columnas

In [14]:
# Filtrar NaN y luego detectar letras
mask_letras = retail["amount"].notna() & retail["amount"].astype(str).str.contains(r"[a-zA-Z]")

# Ver las filas con letras
df_con_letras = retail[mask_letras]

print(df_con_letras)

     customer_id purchase_date product_category   amount  transaction_id
76           500    2025-05-23      Electronics  invalid            1043
229          134    2025-06-10      Electronics  invalid            1026
441           95    2025-06-24             Toys  invalid            1056
447          722    2025-05-24      Electronics  invalid            1098
470            8    2025-01-25      Electronics  invalid            1060
545          486    2025-01-04      Electronics  invalid            1011
560          257    2025-07-21   Home & Kitchen  invalid            1057
598          261    2025-03-04              NaN  invalid            1079
649          745    2025-02-20         Clothing  invalid            1019
723          757    2025-01-26         Clothing  invalid            1035


### amount

In [15]:
amount_diagnostic = diagnostico_columna(retail, "amount")
amount_diagnostic

Unnamed: 0,Métrica,Valor
0,Tipo de dato,object
1,Valores nulos,24
2,% Nulos,2.93%
3,Valores únicos,763
4,Duplicados,57 (6.95%)


### purchase_date

In [16]:
purchase_diagnostic = diagnostico_columna(retail, "purchase_date")
purchase_diagnostic

Unnamed: 0,Métrica,Valor
0,Tipo de dato,object
1,Valores nulos,24
2,% Nulos,2.93%
3,Valores únicos,244
4,Duplicados,576 (70.24%)


### product_category

In [17]:
product_category = diagnostico_columna(retail, "product_category")
product_category 

Unnamed: 0,Métrica,Valor
0,Tipo de dato,object
1,Valores nulos,24
2,% Nulos,2.93%
3,Valores únicos,6
4,Duplicados,814 (99.27%)


### customer id

In [18]:
customer_diagnostic = diagnostico_columna(retail, "customer_id")
customer_diagnostic

Unnamed: 0,Métrica,Valor
0,Tipo de dato,int64
1,Valores nulos,0
2,% Nulos,0.0%
3,Valores únicos,800
4,Duplicados,20 (2.44%)


### Transaction id

In [19]:
transaction_diagnostic = diagnostico_columna(retail, "transaction_id")
transaction_diagnostic

Unnamed: 0,Métrica,Valor
0,Tipo de dato,int64
1,Valores nulos,0
2,% Nulos,0.0%
3,Valores únicos,100
4,Duplicados,720 (87.8%)


In [20]:
retail["transaction_id"].value_counts()

transaction_id
1060    16
1078    16
1058    16
1028    15
1061    14
        ..
1043     4
1071     3
1030     3
1069     3
1051     2
Name: count, Length: 100, dtype: int64

# Customers

In [21]:
data_source = context.data_sources.add_pandas(name="customer_source")
data_asset = data_source.add_dataframe_asset(name="customer_asset")

In [22]:
customer_batch_definition = data_asset.add_batch_definition_whole_dataframe("customer_batch")
customer_batch = customer_batch_definition.get_batch(batch_parameters={"dataframe": customer})

In [23]:
suite_name = "customer_quality_suite"
suite = gx.ExpectationSuite(name=suite_name)

In [24]:
# Customer suite
customer_suite = gx.ExpectationSuite(name="customer_quality_suite")

# id: no nulos, únicos y tipo entero
customer_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="id")
)
customer_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeUnique(column="id")
)
customer_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeOfType(column="id", type_="int64")
)

# full_name: no nulos y longitudes razonables
customer_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="full_name")
)
customer_suite.add_expectation(
    gx.expectations.ExpectColumnValueLengthsToBeBetween(
        column="full_name", min_value=2, max_value=100
    )
)

# email: no nulos, formato válido y únicos
customer_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="email")
)
customer_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToMatchRegex(
        column="email", regex=r"^[\w\.-]+@[\w\.-]+\.\w+$"
    )
)
customer_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeUnique(column="email")
)

# phone: no nulos, formato válido y únicos
customer_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="phone")
)
customer_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToMatchRegex(
        column="phone", regex=r"^\+?[0-9\- ]{7,15}$"
    )
)
customer_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeUnique(column="phone")
)

# address: no nulos y longitud razonable
customer_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="address")
)
customer_suite.add_expectation(
    gx.expectations.ExpectColumnValueLengthsToBeBetween(
        column="address", min_value=5, max_value=100
    )
)

# signup_date: no nulos y formato fecha
customer_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="signup_date")
)
customer_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToMatchStrftimeFormat(
        column="signup_date", strftime_format="%Y-%m-%d"
    )
)

# name: no nulos y longitudes razonables
customer_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="name")
)
customer_suite.add_expectation(
    gx.expectations.ExpectColumnValueLengthsToBeBetween(
        column="name", min_value=2, max_value=50
    )
)

# gender: no nulos y valores permitidos
customer_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="gender")
)
customer_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeInSet(
        column="gender", value_set=["Male", "Female"]
    )
)

# age: no nulos y rango válido
customer_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="age")
)
customer_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeBetween(
        column="age", min_value=13, max_value=120
    )
)

# Registrar customer suite
context.suites.add(customer_suite)

{
  "name": "customer_quality_suite",
  "id": "ac17aade-f6c6-4e74-9f6d-0d8b38fc516c",
  "expectations": [
    {
      "type": "expect_column_values_to_not_be_null",
      "kwargs": {
        "column": "id"
      },
      "meta": {},
      "id": "0fb9f451-e144-4f47-a101-c28ca9980903",
      "severity": "critical"
    },
    {
      "type": "expect_column_values_to_be_unique",
      "kwargs": {
        "column": "id"
      },
      "meta": {},
      "id": "65ff8551-0fd3-40af-8592-ebdd394f329f",
      "severity": "critical"
    },
    {
      "type": "expect_column_values_to_be_of_type",
      "kwargs": {
        "column": "id",
        "type_": "int64"
      },
      "meta": {},
      "id": "70526cd7-310e-49b9-bc22-a2d7b1e20e74",
      "severity": "critical"
    },
    {
      "type": "expect_column_values_to_not_be_null",
      "kwargs": {
        "column": "full_name"
      },
      "meta": {},
      "id": "6df7e2cc-be13-46d9-a64d-857e58e270bc",
      "severity": "critical"
    },
    

In [25]:
validation_results_customers = customer_batch.validate(customer_suite)
print(validation_results_customers)

    {
      "success": true,
      "expectation_config": {
        "type": "expect_column_value_lengths_to_be_between",
        "kwargs": {
          "batch_id": "customer_source-customer_asset",
          "column": "full_name",
          "min_value": 2,
          "max_value": 100
        },
        "meta": {},
        "id": "964953d1-4698-4707-bd61-5d57b4707222",
        "severity": "critical"
      },
      "result": {
        "element_count": 1000,
        "unexpected_count": 0,
        "unexpected_percent": 0.0,
        "partial_unexpected_list": [],
        "missing_count": 84,
        "missing_percent": 8.4,
        "unexpected_percent_total": 0.0,
        "unexpected_percent_nonmissing": 0.0,
        "partial_unexpected_counts": [],
        "partial_unexpected_index_list": []
      },
      "meta": {},
      "exception_info": {
        "raised_exception": false,
        "exception_traceback": null,
        "exception_message": null
      }
    },
    {
      "success": false,
  

### email

In [26]:
email_diagnostic = diagnostico_columna(customer, "email")
email_diagnostic

Unnamed: 0,Métrica,Valor
0,Tipo de dato,object
1,Valores nulos,108
2,% Nulos,10.8%
3,Valores únicos,893
4,Duplicados,107 (10.7%)


### full_name

In [27]:
name_diagnostic = diagnostico_columna(customer,"full_name")
name_diagnostic

Unnamed: 0,Métrica,Valor
0,Tipo de dato,object
1,Valores nulos,84
2,% Nulos,8.4%
3,Valores únicos,884
4,Duplicados,116 (11.6%)


### age

In [28]:
age_diagnostic = diagnostico_columna(customer,"age")
age_diagnostic

Unnamed: 0,Métrica,Valor
0,Tipo de dato,int64
1,Valores nulos,0
2,% Nulos,0.0%
3,Valores únicos,47
4,Duplicados,953 (95.3%)


### phone

In [29]:
phone_diagnostic = diagnostico_columna(customer,"phone")
phone_diagnostic

Unnamed: 0,Métrica,Valor
0,Tipo de dato,object
1,Valores nulos,98
2,% Nulos,9.8%
3,Valores únicos,903
4,Duplicados,97 (9.7%)


### address

In [30]:
address_diagnostic = diagnostico_columna(customer,"address")
address_diagnostic

Unnamed: 0,Métrica,Valor
0,Tipo de dato,object
1,Valores nulos,103
2,% Nulos,10.3%
3,Valores únicos,898
4,Duplicados,102 (10.2%)


### gender

In [31]:
gender_diagnostic = diagnostico_columna(customer,"gender")
gender_diagnostic

Unnamed: 0,Métrica,Valor
0,Tipo de dato,object
1,Valores nulos,0
2,% Nulos,0.0%
3,Valores únicos,2
4,Duplicados,998 (99.8%)


### signup_date

In [32]:
signup_date_diagnostic = diagnostico_columna(customer,"signup_date")
signup_date_diagnostic

Unnamed: 0,Métrica,Valor
0,Tipo de dato,object
1,Valores nulos,98
2,% Nulos,9.8%
3,Valores únicos,824
4,Duplicados,176 (17.6%)


### Name

In [33]:
name_diagnostic = diagnostico_columna(customer,"name")
name_diagnostic

Unnamed: 0,Métrica,Valor
0,Tipo de dato,object
1,Valores nulos,0
2,% Nulos,0.0%
3,Valores únicos,8
4,Duplicados,992 (99.2%)


### Reporte de calidad inicial

In [34]:
print("📂 Suites registradas en el contexto:")
for suite in context.suites.all():
    print("-", suite.get)

📂 Suites registradas en el contexto:
- <bound method DictDot.get of {
  "name": "retail_quality_suite",
  "id": "ba38357e-270e-4b44-a506-722dd605c808",
  "expectations": [
    {
      "type": "expect_column_values_to_not_be_null",
      "kwargs": {
        "column": "transaction_id"
      },
      "meta": {},
      "id": "4aa5105e-d1b7-4408-83a2-da189d6a6b4a",
      "severity": "critical"
    },
    {
      "type": "expect_column_values_to_be_unique",
      "kwargs": {
        "column": "transaction_id"
      },
      "meta": {},
      "id": "2a4f42b8-23f5-4d57-a1d4-3a34d5a11ab4",
      "severity": "critical"
    },
    {
      "type": "expect_column_values_to_not_be_null",
      "kwargs": {
        "column": "customer_id"
      },
      "meta": {},
      "id": "33c902c7-7bd9-454b-8b65-e163e2af4e59",
      "severity": "critical"
    },
    {
      "type": "expect_column_values_to_not_be_null",
      "kwargs": {
        "column": "purchase_date"
      },
      "meta": {},
      "id": "d

In [35]:
def run_quality_kpis(context, suite_name, batch):
    suite = context.suites.get(suite_name)
    validator = context.get_validator(batch=batch, expectation_suite=suite)
    results = validator.validate()
    rows = []

    # Intentar obtener el DataFrame real del batch (para calcular unicidad)
    try:
        df_batch = batch.data.dataframe  # PandasBatchData -> DataFrame real
    except AttributeError:
        df_batch = None

    for r in results["results"]:
        exp_config = r["expectation_config"].to_json_dict()
        exp_type = exp_config.get("expectation_type") or exp_config.get("type")
        kwargs = exp_config.get("kwargs", {})
        column = kwargs.get("column")
        res = r["result"]
        success = r["success"]

        # Obtener conteos por defecto
        total = res.get("element_count", 0)
        unexpected = res.get("unexpected_count", 0)

        # ---- Calcular KPI real ----
        if "be_unique" in exp_type and df_batch is not None:
            # Unicidad: % real de valores únicos usando el DataFrame
            total = len(df_batch[column])
            unique_count = df_batch[column].nunique()
            kpi = 100 * unique_count / total if total else 0

        elif "be_of_type" in exp_type:
            # Exactitud: considerar solo filas no nulas
            missing = res.get("missing_count", 0)
            non_null_total = total - missing
            kpi = 100 * (non_null_total - unexpected) / non_null_total if non_null_total else 0

        
        else:
            # Otras expectativas: usar la fórmula estándar
            kpi = 100 * (1 - unexpected / total) if total else 0

        # ---- Mapear dimensión ----
        if "not_be_null" in exp_type:
            dimension = "Completitud"
        elif "be_unique" in exp_type:
            dimension = "Unicidad"
        elif "match_strftime_format" in exp_type:
            dimension = "Consistencia" if column == "purchase_date" else "Exactitud"
        elif "be_of_type" in exp_type:
            dimension = "Exactitud"
        elif "be_between" in exp_type or "be_in_set" in exp_type:
            dimension = "Validez"
        else:
            continue  # ignorar otras expectativas

        rows.append({
            "columna": column,
            "expectativa": exp_type,
            "dimension": dimension,
            "resultado": "✔️" if success else "❌",
            "KPI": round(kpi, 2)
        })

    # Crear DataFrame detallado
    df_report = pd.DataFrame(rows)

    # Resumen por dimensión
    df_summary = df_report.groupby("dimension")["KPI"].mean().reset_index()
    df_summary.rename(columns={"KPI": "KPI_promedio"}, inplace=True)

    return df_report, df_summary


In [36]:
# Ejecutar reporte Retail
retail_report, retail_summary = run_quality_kpis(
    context,
    "retail_quality_suite",   # nombre exacto de la suite
    retail_batch              # tu batch de datos retail
)

# Mostrar reporte detallado
print("=== Reporte detallado Retail ===")
print(retail_report)

# Mostrar resumen por dimensión
print("\n=== Resumen por dimensión (Retail) ===")
print(retail_summary)

Calculating Metrics:  92%|█████████▎| 37/40 [00:00<00:00, 1181.49it/s]=== Reporte detallado Retail ===
            columna                                    expectativa  \
0            amount             expect_column_values_to_be_between   
1    transaction_id            expect_column_values_to_not_be_null   
2    transaction_id              expect_column_values_to_be_unique   
3       customer_id            expect_column_values_to_not_be_null   
4     purchase_date            expect_column_values_to_not_be_null   
5     purchase_date  expect_column_values_to_match_strftime_format   
6  product_category            expect_column_values_to_not_be_null   
7  product_category              expect_column_values_to_be_in_set   
8            amount            expect_column_values_to_not_be_null   
9            amount             expect_column_values_to_be_of_type   

      dimension resultado     KPI  
0       Validez         ❌    0.00  
1   Completitud        ✔️  100.00  
2      Unicidad   

In [37]:
def run_quality_kpis_customers(context, suite_name, batch):
    suite = context.suites.get(suite_name)
    validator = context.get_validator(batch=batch, expectation_suite=suite)
    results = validator.validate()
    rows = []

    # Intentar obtener el DataFrame real del batch
    try:
        df_batch = batch.data.dataframe  # PandasBatchData -> DataFrame real
    except AttributeError:
        df_batch = None

    for r in results["results"]:
        exp_config = r["expectation_config"].to_json_dict()
        exp_type = exp_config.get("expectation_type") or exp_config.get("type")
        kwargs = exp_config.get("kwargs", {})
        column = kwargs.get("column")
        res = r["result"]
        success = r["success"]

        # Conteos por defecto
        total = res.get("element_count", 0)
        unexpected = res.get("unexpected_count", 0)

        # ---- Calcular KPI real ----
        if "be_unique" in exp_type and df_batch is not None:
            # Unicidad: % real de valores únicos sobre valores no nulos
            non_null_series = df_batch[column].dropna()
            total_non_null = len(non_null_series)
            unique_count = non_null_series.nunique()
            kpi = 100 * unique_count / total_non_null if total_non_null else 0

        elif "match_regex" in exp_type:
            # Exactitud: considerar solo filas no nulas
            missing = res.get("missing_count", 0)
            non_null_total = total - missing
            kpi = 100 * (non_null_total - unexpected) / non_null_total if non_null_total else 0

        elif df_batch is not None and ("be_between" in exp_type 
                                       or "value_lengths_to_be_between" in exp_type 
                                       or "be_in_set" in exp_type 
                                       or "match_strftime_format" in exp_type):
            # Validez / Consistencia: calcular % real de valores válidos sobre no nulos
            non_null_series = df_batch[column].dropna()
            total_non_null = len(non_null_series)

            if "be_between" in exp_type:
                min_value = kwargs.get("min_value", float("-inf"))
                max_value = kwargs.get("max_value", float("inf"))
                numeric_col = pd.to_numeric(non_null_series, errors="coerce")
                valid_count = numeric_col.between(min_value, max_value).sum()
            elif "value_lengths_to_be_between" in exp_type:
                min_len = kwargs.get("min_value", 0)
                max_len = kwargs.get("max_value", float("inf"))
                valid_count = non_null_series.astype(str).str.len().between(min_len, max_len).sum()
            elif "be_in_set" in exp_type:
                value_set = kwargs.get("value_set", [])
                valid_count = non_null_series.isin(value_set).sum()
            elif "match_strftime_format" in exp_type:
                str_format = kwargs.get("strftime_format", "%Y-%m-%d")
                valid_count = non_null_series.apply(
                    lambda x: pd.to_datetime(x, format=str_format, errors="coerce")
                ).notna().sum()

            kpi = 100 if success else (100 * valid_count / total_non_null if total_non_null else 0)

        else:
            # Fórmula estándar
            kpi = 100 * (1 - unexpected / total) if total else 0

        # ---- Mapear dimensión según tus reglas ----
        if "not_be_null" in exp_type:
            dimension = "Completitud"
        elif "be_unique" in exp_type:
            dimension = "Unicidad"
        elif "match_regex" in exp_type:
            dimension = "Exactitud"
        elif "match_strftime_format" in exp_type or ("be_in_set" in exp_type and column == "gender"):
            dimension = "Consistencia"
        elif "be_between" in exp_type or "value_lengths_to_be_between" in exp_type:
            dimension = "Validez"
        else:
            continue  # ignorar otras expectativas

        # ---- Agregar fila al reporte
        rows.append({
            "columna": column,
            "expectativa": exp_type,
            "dimension": dimension,
            "resultado": "✔️" if success else "❌",
            "KPI": round(kpi, 2)
        })

    # Crear DataFrame detallado
    df_report = pd.DataFrame(rows)

    #Resumen por dimensión 
    df_summary = df_report.groupby("dimension")["KPI"].mean().reset_index()
    df_summary.rename(columns={"KPI": "KPI_promedio"}, inplace=True)

    return df_report, df_summary

In [38]:
customer_report, customer_summary = run_quality_kpis_customers(
    context,
    "customer_quality_suite",
    customer_batch
)

print("=== Reporte detallado Customer ===")
print(customer_report)

print("\n=== Resumen por dimensión (Customer) ===")
print(customer_summary)

Calculating Metrics: 100%|██████████| 71/71 [00:00<00:00, 1742.15it/s]=== Reporte detallado Customer ===
        columna                                    expectativa     dimension  \
0            id            expect_column_values_to_not_be_null   Completitud   
1            id              expect_column_values_to_be_unique      Unicidad   
2     full_name            expect_column_values_to_not_be_null   Completitud   
3     full_name      expect_column_value_lengths_to_be_between       Validez   
4         email            expect_column_values_to_not_be_null   Completitud   
5         email            expect_column_values_to_match_regex     Exactitud   
6         email              expect_column_values_to_be_unique      Unicidad   
7         phone            expect_column_values_to_not_be_null   Completitud   
8         phone            expect_column_values_to_match_regex     Exactitud   
9         phone              expect_column_values_to_be_unique      Unicidad   
10      address

### Reporte de calidad final

In [39]:
fact_sales

Unnamed: 0,customer_id,purchase_date,product_category,amount,transaction_id,full_name,email,phone,address,signup_date,name,gender,age
0,230,2025-05-11,Clothing,181.73,1034,Adrian Robinson,adrian.robinson3506@aol.com,No especificado,"417 Washington Blvd, Apt 34, Centerville, FL ...",No especificado,Bob,Female,39
1,630,2025-06-19,Electronics,442.32,1020,Caleb Rodriguez,caleb.rodriguez4367@hotmail.com,+1 (572) 662-5444,"7375 Oak St, Greenville, IL 39198",2023-03-13,George,Female,45
2,867,2025-02-16,Home & Kitchen,263.12,1049,Zoey Lopez,zoey.lopez5536@mail.com,+1 (705) 486-9036,"3966 Pine Ave, Apt 18, Greenville, IL 25010",2022-05-07,Elena,Female,61
3,763,2025-02-13,Home & Kitchen,434.05,1058,Queen Clark,queen.clark2238@hotmail.com,+1 (978) 771-6866,"1243 Washington Blvd, Arlington, TX 41545",2019-08-31,Carlos,Male,61
4,758,2025-04-15,Clothing,489.6,1077,Noah Clark,noah.clark7969@gmail.com,+1 (485) 419-3676,"6686 Cedar Ln, Apt 59, Franklin, NY 76676",2021-11-05,Diana,Female,52
...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,494,2025-04-25,Clothing,61.89,10658,Beth Morris,beth.morris1186@gmail.com,+1 (495) 481-6873,"2147 Main St, Centerville, TX 18905",2022-05-30,George,Female,31
796,555,2025-03-04,Sports,74.31,106112,Ximena Sanders,ximena.sanders0156@protonmail.com,+1 (334) 835-5192,No especificado,2020-06-11,Bob,Male,31
797,491,2025-03-14,Toys,33.54,10968,Ethan Walker,ethan.walker7818@outlook.com,No especificado,"1455 Elm St, Arlington, CA 30783",2022-08-26,Elena,Female,46
798,13,2025-06-19,Electronics,371.75,10706,Nina Rivera,nina.rivera6765@protonmail.com,+1 (924) 491-1922,"4270 Lakeview Dr, Greenville, NY 52490",2020-10-31,Carlos,Male,57


In [40]:
try:
    context.delete_datasource("fact_source")
    print("Datasource 'fact_source' eliminado.")
except Exception:
    print("Datasource 'fact_source' no encontrado.")

Datasource 'fact_source' no encontrado.


In [41]:
try:
    context.suites.delete(name=suite_name)
except Exception:
    # fallback para versiones anteriores
    try:
        context.delete_expectation_suite(suite_name)
    except Exception:
        pass

In [42]:
data_source = context.data_sources.add_pandas(name="fact_source")
data_asset = data_source.add_dataframe_asset(name="fact_asset")

In [43]:
fact_batch_definition = data_asset.add_batch_definition_whole_dataframe("fact_batch")
fact_batch = customer_batch_definition.get_batch(batch_parameters={"dataframe": fact_sales})

In [44]:
suite_name = "fact_quality_suite"
suite = gx.ExpectationSuite(name=suite_name)

In [45]:
SENTINEL = "No especificado"
def rc(col: str) -> str:
    needs_backticks = any(not (ch.isalnum() or ch == "_") for ch in col)
    c = f"`{col}`" if needs_backticks else col
    return f'{c} != "{SENTINEL}"'   # ← expresión válida para DataFrame.query()
    
fact_quality_suite = gx.ExpectationSuite(name="fact_quality_suite")

# transaction_id: no nulos y únicos
fact_quality_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="transaction_id")
)
fact_quality_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeUnique(column="transaction_id")
)

# customer_id: no nulos y tipo entero (ver nota de tipo más abajo)
fact_quality_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="customer_id")
)
fact_quality_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeOfType(column="customer_id", type_="int64")
    # o mejor: ExpectColumnValuesToBeInTypeList(type_list=["int64","int32","Int64"])
)

# purchase_date: no nulos y formato correcto (excluye sentinela)
fact_quality_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="purchase_date")
)
fact_quality_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToMatchStrftimeFormat(
        column="purchase_date",
        strftime_format="%Y-%m-%d",
        row_condition=rc("purchase_date"),
        condition_parser="pandas",
    )
)

# product_category: no nulos y valores válidos (puedes incluir el sentinela en el set)
fact_quality_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="product_category")
)
fact_quality_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeInSet(
        column="product_category",
        value_set=["Sports","Clothing","Electronics","Toys","Home & Kitchen","No especificado"]
    )
)

# amount: no nulos y >= 0 (ver nota de tipos)
fact_quality_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="amount")
)
fact_quality_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeBetween(
        column="amount", 
        min_value=0,
        row_condition=rc("amount"), 
        condition_parser="pandas",
    )
)

# full_name: no nulos y longitud razonable
fact_quality_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="full_name")
)
fact_quality_suite.add_expectation(
    gx.expectations.ExpectColumnValueLengthsToBeBetween(column="full_name", min_value=2, max_value=100)
)

# email: no nulos, formato válido y únicos (excluye sentinela)
fact_quality_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="email")
)
fact_quality_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToMatchRegex(
        column="email",
        regex=r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$",
        row_condition=rc("email"), 
        condition_parser="pandas",
    )
)
fact_quality_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeUnique(
        column="email",
        row_condition=rc("email"), 
        condition_parser="pandas",
    )
)

# phone: no nulos, formato válido y únicos (excluye sentinela)
fact_quality_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="phone")
)
fact_quality_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToMatchRegex(
        column="phone",
        regex=r"\+\d{1,3}\s?\(?\d{2,3}\)?[\s.-]?\d{3}[\s.-]?\d{4}",
        row_condition=rc("phone"), 
        condition_parser="pandas",
    )
)
fact_quality_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeUnique(
        column="phone",
        row_condition=rc("phone"), condition_parser="pandas",
    )
)

# address: no nulos y longitud
fact_quality_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="address")
)
fact_quality_suite.add_expectation(
    gx.expectations.ExpectColumnValueLengthsToBeBetween(column="address", min_value=5, max_value=100)
)

# signup_date: no nulos y formato correcto (excluye sentinela)
fact_quality_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="signup_date")
)
fact_quality_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToMatchStrftimeFormat(
        column="signup_date",
        strftime_format="%Y-%m-%d",
        row_condition=rc("signup_date"), 
        condition_parser="pandas",
    )
)

# name: no nulos y longitud
fact_quality_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="name")
)
fact_quality_suite.add_expectation(
    gx.expectations.ExpectColumnValueLengthsToBeBetween(column="name", min_value=2, max_value=50)
)

# gender: no nulos y valores permitidos (decide si incluyes el sentinela)
fact_quality_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="gender")
)
fact_quality_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeInSet(
        column="gender",
        value_set=["Male","Female"]  # o ["Male","Female","No especificado"]
    )
)

# age: no nulos y rango válido
fact_quality_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="age")
)
fact_quality_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeBetween(
        column="age", 
        min_value=13, 
        max_value=120,
        row_condition=rc("age"), 
        condition_parser="pandas",
    )
)

context.suites.add(fact_quality_suite)

{
  "name": "fact_quality_suite",
  "id": "5e6c18bc-4eda-40a3-8c16-372ec2f2df9e",
  "expectations": [
    {
      "type": "expect_column_values_to_not_be_null",
      "kwargs": {
        "column": "transaction_id"
      },
      "meta": {},
      "id": "a8746c41-d6b9-42f0-92a8-edaf2caafb47",
      "severity": "critical"
    },
    {
      "type": "expect_column_values_to_be_unique",
      "kwargs": {
        "column": "transaction_id"
      },
      "meta": {},
      "id": "dee2a48e-25d8-4d01-8dde-6d19d2c23d06",
      "severity": "critical"
    },
    {
      "type": "expect_column_values_to_not_be_null",
      "kwargs": {
        "column": "customer_id"
      },
      "meta": {},
      "id": "6aa00a74-a0e6-44a4-b630-1b75e4f6af1e",
      "severity": "critical"
    },
    {
      "type": "expect_column_values_to_be_of_type",
      "kwargs": {
        "column": "customer_id",
        "type_": "int64"
      },
      "meta": {},
      "id": "ff683738-0774-4886-a9a5-6f971774752e",
      "se

In [46]:
validator = context.get_validator(
    batch=fact_batch,
    expectation_suite=fact_quality_suite
)
validation_results_fact = validator.validate()
print(validation_results_fact)

Calculating Metrics:  95%|█████████▌| 118/124 [00:00<00:00, 335.70it/s]
{
  "success": false,
  "results": [
    {
      "success": false,
      "expectation_config": {
        "type": "expect_column_values_to_be_between",
        "kwargs": {
          "column": "amount",
          "row_condition": "amount != \"No especificado\"",
          "condition_parser": "pandas",
          "min_value": 0.0,
          "batch_id": "customer_source-customer_asset"
        },
        "meta": {},
        "id": "07e97aef-a701-4699-bba4-9088b9bf2bdf",
        "severity": "critical"
      },
      "result": {},
      "meta": {},
      "exception_info": {
        "MetricConfigurationID(metric_name='column_values.between.condition', metric_domain_kwargs_id='bb0bfaafd5d8e73bf9af4a17b51ccc44', metric_value_kwargs_id='9939f13c1eedbd79e1bad2555c54ab5b')": {
          "exception_traceback": "Traceback (most recent call last):\n  File \"/root/venv/lib/python3.10/site-packages/great_expectations/execution_engine

In [47]:
results_dict = validation_results_fact.to_json_dict()

# Exportar a un TXT legible
import json
with open("validation_results_fact.txt", "w", encoding="utf-8") as f:
    f.write(json.dumps(results_dict, indent=4, ensure_ascii=False))

In [48]:
def run_quality_kpis_fact(
    context=None,
    suite_name=None,
    batch=None,
    results_dict=None,
    # --- reglas de negocio ---
    global_ignore_values=("No especificado",),           
    ignore_values_by_col=None,                          
    regex_assume_valid_cols=None,                       
):
   
    if regex_assume_valid_cols is None:
        regex_assume_valid_cols = set()

    # -------------- helpers -----------------
    def _exp_type_kwargs(exp_config):
        if hasattr(exp_config, "to_json_dict"):
            d = exp_config.to_json_dict()
            return d.get("expectation_type") or d.get("type"), d.get("kwargs", {})
        return exp_config.get("expectation_type") or exp_config.get("type"), exp_config.get("kwargs", {})

    def _ignore_set_for_col(col):
        base = set(s.lower() for s in global_ignore_values) if global_ignore_values else set()
        if ignore_values_by_col and col in ignore_values_by_col:
            base |= set(s.lower() for s in ignore_values_by_col[col])
        return base

    def _non_null_non_sentinel_mask(series, col):
        """Filtra nulos y valores sentinela para valoración (case-insensitive, trim)."""
        if series is None:
            return None
        s = series.astype(str).str.strip()
        sent = _ignore_set_for_col(col)
        # casefold para comparaciones robustas en minúsculas
        mask_not_sentinel = ~s.str.casefold().isin(sent) if sent else pd.Series([True]*len(s), index=s.index)
        return series.notna() & mask_not_sentinel

    # -------------- obtener resultados GX -----------------
    if results_dict is None:
        if context is None or suite_name is None or batch is None:
            raise ValueError("Si no pasas 'results_dict', debes pasar (context, suite_name, batch).")
        suite = context.suites.get(suite_name)
        validator = context.get_validator(batch=batch, expectation_suite=suite)
        gx_results = validator.validate()
    else:
        gx_results = results_dict

    # intentar recuperar DataFrame real (recomendado para aplicar exclusión de sentinelas)
    df_batch = None
    if batch is not None:
        try:
            df_batch = batch.data.dataframe
        except AttributeError:
            df_batch = None

    rows = []

    for r in gx_results.get("results", []):
        exp_config = r.get("expectation_config", {})
        exp_type, kwargs = _exp_type_kwargs(exp_config)
        column = kwargs.get("column")
        res = r.get("result", {}) or {}
        success = bool(r.get("success", False))

        # saltamos expectativas sin columna (si no son columna-específicas)
        if not column:
            continue

        total = res.get("element_count", 0)
        unexpected = res.get("unexpected_count", 0)
        missing = res.get("missing_count", 0)

        kpi = None
        note = ""

        # Serie y máscara de valoración (no nulos y no sentinelas)
        series = None
        mask_eff = None
        if df_batch is not None and column in df_batch.columns:
            series = df_batch[column]
            mask_eff = _non_null_non_sentinel_mask(series, column)
            eff_total = int(mask_eff.sum())
        else:
            # sin DF: nos quedamos con totales de GX (NO puede descontar sentinelas con precisión)
            eff_total = (total - missing) if total else 0

        # ---------------- Cálculo de KPI con reglas ----------------

        # 1) Unicidad -> % de valores únicos sobre universo de valoración (excluye sentinelas)
        if "be_unique" in exp_type:
            if series is not None and mask_eff is not None:
                ser_f = series[mask_eff]
                denom = len(ser_f)
                unique_count = ser_f.nunique(dropna=True)
                kpi = 100 * unique_count / denom if denom else 0
            else:
                # sin DF: fallback GX
                kpi = 100 if success else (100 * (1 - unexpected / total) if total else 0)

        # 2) Not null -> completitud sobre universo de valoración (excluye sentinelas)
        elif "not_be_null" in exp_type:
            if series is not None and mask_eff is not None:
                denom = int((~series.astype(str).str.strip().str.casefold().isin(_ignore_set_for_col(column))).sum())
                # numerador: no nulos entre los no-sentinela
                num = int(series[~series.astype(str).str.strip().str.casefold().isin(_ignore_set_for_col(column))].notna().sum())
                kpi = 100 * num / denom if denom else 0
            else:
                # sin DF: fallback (no descuenta sentinelas)
                kpi = 100 * (1 - unexpected / total) if total else (100 if success else 0)

        # 3) Tipo de dato -> validez del dtype (no aplica exclusión de sentinelas)
        elif "be_of_type" in exp_type:
            observed = res.get("observed_value")
            required = kwargs.get("type_") or kwargs.get("type")
            if observed is not None and required is not None:
                kpi = 100 if str(observed) == str(required) else 0
            elif series is not None and required is not None:
                kpi = 100 if str(series.dtype) == str(required) else 0
            else:
                kpi = 100 if success else 0

        # 4) Regex, sets, longitudes, fechas, rangos (sobre universo de valoración)
        elif any(s in exp_type for s in [
            "match_regex", "be_in_set", "value_lengths_to_be_between",
            "match_strftime_format", "be_between"
        ]):
            if series is not None and mask_eff is not None:
                ser_f = series[mask_eff]
                denom = len(ser_f)

                if "match_regex" in exp_type:
                    if column in regex_assume_valid_cols:
                        valid_count = denom  # negocio: todo no-sentinela es válido
                        note = "Regex omitido por regla de negocio (asumido válido)."
                    else:
                        pattern = kwargs.get("regex")
                        s = ser_f.astype(str)
                        valid_count = s.str.fullmatch(pattern).sum() if pattern else denom

                elif "be_in_set" in exp_type:
                    values = set(kwargs.get("value_set", []))
                    valid_count = ser_f.isin(values).sum()

                elif "value_lengths_to_be_between" in exp_type:
                    min_len = kwargs.get("min_value", 0)
                    max_len = kwargs.get("max_value", np.inf)
                    valid_count = ser_f.astype(str).str.len().between(min_len, max_len).sum()

                elif "match_strftime_format" in exp_type:
                    fmt = kwargs.get("strftime_format", "%Y-%m-%d %H:%M:%S")
                    valid_count = pd.to_datetime(ser_f, format=fmt, errors="coerce").notna().sum()

                elif "be_between" in exp_type:
                    min_v = kwargs.get("min_value", -np.inf)
                    max_v = kwargs.get("max_value", np.inf)
                    num = pd.to_numeric(ser_f, errors="coerce")
                    valid = num.ge(min_v)
                    if max_v is not None and max_v != np.inf:
                        valid = valid & num.le(max_v)
                    valid_count = valid.sum()
                    if num.isna().any():
                        note = "Se hallaron valores no numéricos; se excluyeron por conversión con coerción."

                kpi = 100 * valid_count / denom if denom else 0

            else:
                # sin DF: fallback a conteos GX (NO descuenta sentinelas)
                non_null_total = (total - missing) if total else 0
                if "match_regex" in exp_type and column in regex_assume_valid_cols:
                    kpi = 100 if non_null_total else 0
                    note = "Regex omitido por regla de negocio (modo offline)."
                else:
                    kpi = 100 * (non_null_total - unexpected) / non_null_total if non_null_total else 0

        else:
            # Fallback general
            kpi = 100 if success else (100 * (1 - unexpected / total) if total else 0)

        # -------------- Dimensión --------------
        if "not_be_null" in exp_type:
            dimension = "Completitud"
        elif "be_unique" in exp_type:
            dimension = "Unicidad"
        elif "match_regex" in exp_type:
            dimension = "Exactitud"
        elif "be_of_type" in exp_type:
            dimension = "Validez"
        elif "match_strftime_format" in exp_type or "be_in_set" in exp_type:
            dimension = "Consistencia"
        elif "be_between" in exp_type or "value_lengths_to_be_between" in exp_type:
            dimension = "Validez"
        else:
            continue

        rows.append({
            "columna": column,
            "expectativa": exp_type,
            "dimension": dimension,
            "resultado": "✔️" if success else "❌",
            "KPI": round(float(kpi), 2),
            "nota": note
        })

    df_report = pd.DataFrame(rows).sort_values(["dimension", "columna", "expectativa"]).reset_index(drop=True)
    df_summary = (df_report.groupby("dimension", as_index=False)["KPI"]
                           .mean()
                           .rename(columns={"KPI": "KPI_promedio"}))
    return df_report, df_summary

In [49]:
fact_report, fact_summary = run_quality_kpis_fact(
    context=context,
    suite_name="fact_quality_suite",
    batch=fact_batch,
    global_ignore_values=("np.nan", "No especificado",),
    ignore_values_by_col=None,                 
    regex_assume_valid_cols={"phone"}           
)

print("=== Detalle (Fact) ===")
print(fact_report)
print("\n=== Resumen por dimensión (Fact) ===")
print(fact_summary)

Calculating Metrics:  95%|█████████▌| 118/124 [00:00<00:00, 555.08it/s]
=== Detalle (Fact) ===
             columna                                    expectativa  \
0            address            expect_column_values_to_not_be_null   
1                age            expect_column_values_to_not_be_null   
2             amount            expect_column_values_to_not_be_null   
3        customer_id            expect_column_values_to_not_be_null   
4              email            expect_column_values_to_not_be_null   
5          full_name            expect_column_values_to_not_be_null   
6             gender            expect_column_values_to_not_be_null   
7               name            expect_column_values_to_not_be_null   
8              phone            expect_column_values_to_not_be_null   
9   product_category            expect_column_values_to_not_be_null   
10     purchase_date            expect_column_values_to_not_be_null   
11       signup_date            expect_column_values_

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=171bad0a-62a2-42cc-9180-d14c0a1c4c1a' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>