In [3]:
import great_expectations as gx
import pandas as pd
import os

In [4]:
# ===============================================================
# CONFIGURACION
# ===============================================================

# Crear contexto
context = gx.get_context()
print("GX context type:", type(context).__name__)
print("GE version:", gx.__version__)

# Definir fuentes de datos
customer_data_source_name = "customer_data_source_clean"
customer_data_source = context.data_sources.add_pandas(name=customer_data_source_name)
customer_data_asset_name = "customer_data_asset_clean"
customer_data_asset = customer_data_source.add_dataframe_asset(name=customer_data_asset_name)

retail_data_source_name = "retail_data_source_clean"
retail_data_source = context.data_sources.add_pandas(name=retail_data_source_name)
retail_data_asset_name = "retail_data_asset_clean"
retail_data_asset = retail_data_source.add_dataframe_asset(name=retail_data_asset_name)

# Crear batch
customer_batch_definition = customer_data_asset.add_batch_definition_whole_dataframe("customer_data_batch_clean")
retail_batch_definition = retail_data_asset.add_batch_definition_whole_dataframe("retail_data_batch_clean")

# Cargar datos
customer_df_clean = pd.read_csv("../data/clean/customer_data_clean.csv", parse_dates=["signup_date"])
retail_df_clean = pd.read_csv("../data/clean/retail_data_clean.csv", parse_dates=["purchase_date"])

print("Customer DataFrame limpio cargado:", customer_df_clean.shape)
print("Retail DataFrame limpio cargado:", retail_df_clean.shape)

# Crear batches con datos limpios
customer_batch = customer_batch_definition.get_batch(batch_parameters={"dataframe": customer_df_clean})
retail_batch = retail_batch_definition.get_batch(batch_parameters={"dataframe": retail_df_clean})


GX context type: EphemeralDataContext
GE version: 1.6.1
Customer DataFrame limpio cargado: (1000, 9)
Retail DataFrame limpio cargado: (100, 5)


In [5]:
# =============================================================================
# CREATE EXPECTATION SUITES - RETAIL DATA (CORREGIDO)
# =============================================================================

print("\nCreando Expectation Suite para Retail Data...")

# Nombre de la suite
retail_expectation_suite_name = "retail_data_suite"
retail_suite = gx.ExpectationSuite(name=retail_expectation_suite_name)

print("Agregando expectativas específicas para Retail Data:")

# 1. Transaction ID debe ser único y no nulo
retail_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="transaction_id")
)
print("transaction_id: ExpectColumnValuesToNotBeNull")

retail_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeUnique(column="transaction_id")
)
print("transaction_id: ExpectColumnValuesToBeUnique")

# 2. Customer ID debe existir y no ser nulo (FK a customer table)
retail_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="customer_id")
)
print("customer_id: ExpectColumnValuesToNotBeNull")

# 3. Fecha de compra no debe ser nula, debe ser tipo fecha y en rango razonable
retail_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="purchase_date")
)
print("purchase_date: ExpectColumnValuesToNotBeNull")

retail_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeOfType(column="purchase_date", type_="datetime64[ns]")
)
print("purchase_date: ExpectColumnValuesToBeOfType (datetime)")

retail_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeBetween(
        column="purchase_date",
        min_value="2025-01-01",
        max_value=pd.Timestamp.today()
    )
)
print("purchase_date: ExpectColumnValuesToBeBetween (2025-hoy)")

# 4. Categoría de producto no debe ser nula y debe estar en lista predefinida
retail_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="product_category")
)
print("product_category: ExpectColumnValuesToNotBeNull")

retail_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeInSet(
        column="product_category",
        value_set=["Clothing", "Electronics", "Home & Kitchen", "Sports", "Toys", "No specified"]
    )
)
print("product_category: ExpectColumnValuesToBeInSet")

# 5. Monto debe ser numérico y positivo
retail_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="amount")
)
print("amount: ExpectColumnValuesToNotBeNull")

retail_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeOfType(column="amount", type_="float")
)
print("amount: ExpectColumnValuesToBeOfType (float)")

retail_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeBetween(column="amount", min_value=0.01, max_value=10000)
)
print("amount: ExpectColumnValuesToBeBetween (0.01 - 10000)")

# Agregar la suite al contexto
context.suites.add(retail_suite)
print(f"\nRetail suite '{retail_expectation_suite_name}' agregado al contexto correctamente.")



Creando Expectation Suite para Retail Data...
Agregando expectativas específicas para Retail Data:
transaction_id: ExpectColumnValuesToNotBeNull
transaction_id: ExpectColumnValuesToBeUnique
customer_id: ExpectColumnValuesToNotBeNull
purchase_date: ExpectColumnValuesToNotBeNull
purchase_date: ExpectColumnValuesToBeOfType (datetime)
purchase_date: ExpectColumnValuesToBeBetween (2025-hoy)
product_category: ExpectColumnValuesToNotBeNull
product_category: ExpectColumnValuesToBeInSet
amount: ExpectColumnValuesToNotBeNull
amount: ExpectColumnValuesToBeOfType (float)
amount: ExpectColumnValuesToBeBetween (0.01 - 10000)

Retail suite 'retail_data_suite' agregado al contexto correctamente.


In [6]:
# ===============================================================
# VALIDAR CUSTOMER DATA
# ===============================================================
print("\nValidando Customer Data limpia...")

customer_validation_results = customer_batch.validate(customer_suite)

# Resumen de resultados
total_expectations = len(customer_validation_results.results)
passed = sum(1 for r in customer_validation_results.results if r.success)
failed = total_expectations - passed

print("="*70)
print("RESULTADOS DE VALIDACIÓN - CUSTOMER DATA")
print("="*70)
print(f"Total Expectativas: {total_expectations}")
print(f"PASSED: {passed}")
print(f"FAILED: {failed}")
print("="*70)

# Detalle de cada expectativa
for i, result in enumerate(customer_validation_results.results, start=1):
    status = "PASS" if result.success else "FAIL"
    column = result.expectation_config.kwargs.get('column', 'N/A')
    expectation_type = result.expectation_config.type.replace("expect_column_values_to_", "")
    print(f"{i:2d}. {status} - {expectation_type} en columna '{column}'")

    if not result.success:
        unexpected_count = result.result.get('unexpected_count', 'N/A')
        element_count = result.result.get('element_count', 'N/A')
        print(f"    ❌ Unexpected values: {unexpected_count}/{element_count}")


Validando Customer Data limpia...


NameError: name 'customer_suite' is not defined

In [None]:
# Validate the Retail Data Against the Suite  
print("\nValidating Retail Data...")
retail_validation_results = retail_batch.validate(retail_suite)

# Evaluate the Retail Results
print("\nRetail Data Validation Results:")
print("="*70)
print(f"Overall Success: {'PASS' if retail_validation_results.success else 'FAIL'}")
print(f"Total Expectations: {len(retail_validation_results.results)}")
print(f"Successful: {sum(1 for r in retail_validation_results.results if r.success)}")
print(f"Failed: {sum(1 for r in retail_validation_results.results if not r.success)}")

print("\nDetailed Results:")
for i, result in enumerate(retail_validation_results.results, 1):
    status = "PASS" if result.success else "FAIL"
    # expectation_type = result.expectation_config.expectation_type.replace("expect_column_values_to_", "")
    expectation_type = result.expectation_config.type.replace("expect_column_values_to_", "")
    column = result.expectation_config.kwargs.get('column', 'N/A')
    print(f"  {i:2d}. {status} - {expectation_type} on '{column}'")
    
    # Mostrar detalles si falla
    if not result.success:
        unexpected_count = result.result.get('unexpected_count', 'N/A')
        element_count = result.result.get('element_count', 'N/A')
        print(f" Unexpected values: {unexpected_count}/{element_count}")

print("="*70)


Validating Retail Data...


Calculating Metrics:   0%|          | 0/53 [00:00<?, ?it/s]


Retail Data Validation Results:
Overall Success: FAIL
Total Expectations: 11
Successful: 10
Failed: 1

Detailed Results:
   1. FAIL - be_between on 'purchase_date'
 Unexpected values: N/A/N/A
   2. PASS - not_be_null on 'transaction_id'
   3. PASS - be_unique on 'transaction_id'
   4. PASS - not_be_null on 'customer_id'
   5. PASS - not_be_null on 'purchase_date'
   6. PASS - be_of_type on 'purchase_date'
   7. PASS - not_be_null on 'product_category'
   8. PASS - be_in_set on 'product_category'
   9. PASS - not_be_null on 'amount'
  10. PASS - be_of_type on 'amount'
  11. PASS - be_between on 'amount'


In [None]:
# =============================================================================
# RESUMEN FINAL Y RECOMENDACIONES
# =============================================================================

print("\nRESUMEN FINAL DE CALIDAD DE DATOS:")
print("="*70)
print(f"Customer Data Suite: {len(customer_suite.expectations)} expectativas")
print(f"   Success Rate: {sum(1 for r in customer_validation_results.results if r.success)}/{len(customer_validation_results.results)} ({sum(1 for r in customer_validation_results.results if r.success)/len(customer_validation_results.results)*100:.1f}%)")

print(f"\nRetail Data Suite: {len(retail_suite.expectations)} expectativas") 
print(f"   Success Rate: {sum(1 for r in retail_validation_results.results if r.success)}/{len(retail_validation_results.results)} ({sum(1 for r in retail_validation_results.results if r.success)/len(retail_validation_results.results)*100:.1f}%)")

# Función auxiliar para obtener el tipo de expectativa de forma segura
def get_expectation_type(expectation_config):
    if hasattr(expectation_config, 'type'):
        return expectation_config.type.replace("expect_column_values_to_", "").replace("expect_column_", "")
    elif hasattr(expectation_config, 'expectation_type'):
        return expectation_config.expectation_type.replace("expect_column_values_to_", "").replace("expect_column_", "")
    else:
        # Usar el nombre de la clase como fallback
        return expectation_config.__class__.__name__.replace("Expect", "").replace("Column", "")

# Mostrar problemas de calidad encontrados
customer_failures = [r for r in customer_validation_results.results if not r.success]
retail_failures = [r for r in retail_validation_results.results if not r.success]

if customer_failures or retail_failures:
    print(f"\nPROBLEMAS DE CALIDAD DETECTADOS:")
    
    if customer_failures:
        print(f"   Customer Data: {len(customer_failures)} problemas")
        for failure in customer_failures:
            column = failure.expectation_config.kwargs.get('column', 'N/A')
            issue_type = get_expectation_type(failure.expectation_config)
            print(f"     - Columna '{column}': {issue_type}")
    
    if retail_failures:
        print(f"   Retail Data: {len(retail_failures)} problemas")
        for failure in retail_failures:
            column = failure.expectation_config.kwargs.get('column', 'N/A')
            issue_type = get_expectation_type(failure.expectation_config)
            print(f"     - Columna '{column}': {issue_type}")
    
   
else:
    print(f"\n¡EXCELENTE! Toodos los datos pasan las validaciones de calidad")

print("="*70)
print("Validación de calidad de datos completada exitosamente!")


RESUMEN FINAL DE CALIDAD DE DATOS:
Customer Data Suite: 9 expectativas
   Success Rate: 8/9 (88.9%)

Retail Data Suite: 11 expectativas
   Success Rate: 10/11 (90.9%)

PROBLEMAS DE CALIDAD DETECTADOS:
   Customer Data: 1 problemas
     - Columna 'signup_date': be_between
   Retail Data: 1 problemas
     - Columna 'purchase_date': be_between
Validación de calidad de datos completada exitosamente!


In [None]:
customer_results = []
retail_results = []

# Función auxiliar para obtener tipo de expectativa
def get_expectation_type(expectation_config):
    if hasattr(expectation_config, 'type'):
        return expectation_config.type.replace("expect_column_values_to_", "").replace("expect_column_", "")
    elif hasattr(expectation_config, 'expectation_type'):
        return expectation_config.expectation_type.replace("expect_column_values_to_", "").replace("expect_column_", "")
    else:
        return expectation_config.__class__.__name__.replace("Expect", "").replace("Column", "")

# Procesar resultados de Customer Data
for result in customer_validation_results.results:
    column = result.expectation_config.kwargs.get('column', 'N/A')
    expectation = get_expectation_type(result.expectation_config)
    success = "PASS" if result.success else "FAIL"
    unexpected_count = result.result.get('unexpected_count', 'N/A')
    element_count = result.result.get('element_count', 'N/A')
    customer_results.append({
        "Columna": column,
        "Expectativa": expectation,
        "Resultado": success,
        "Valores problemáticos": f"{unexpected_count}/{element_count}"
    })

# Procesar resultados de Retail Data
for result in retail_validation_results.results:
    column = result.expectation_config.kwargs.get('column', 'N/A')
    expectation = get_expectation_type(result.expectation_config)
    success = "PASS" if result.success else "FAIL"
    unexpected_count = result.result.get('unexpected_count', 'N/A')
    element_count = result.result.get('element_count', 'N/A')
    retail_results.append({
        "Columna": column,
        "Expectativa": expectation,
        "Resultado": success,
        "Valores problemáticos": f"{unexpected_count}/{element_count}"
    })

# Convertir resultados a DataFrame
df_customer = pd.DataFrame(customer_results)
df_retail = pd.DataFrame(retail_results)

# ===============================================================
# GUARDAR REPORTE DE CALIDAD EN CARPETA CLEAN
# ===============================================================

# Crear archivo Excel con dos hojas
excel_path = "../data/clean/data_quality_report_CLEAN.xlsx"
with pd.ExcelWriter(excel_path, engine="xlsxwriter") as writer:
    df_customer.to_excel(writer, sheet_name="Customer Data", index=False)
    df_retail.to_excel(writer, sheet_name="Retail Data", index=False)

# ===============================================================
# IMPRIMIR RESULTADOS EN CONSOLA
# ===============================================================
print("="*70)
print("RESUMEN DE CALIDAD DE DATOS (CONSOLA)")
print("="*70)

print("\nCustomer Data:")
print(df_customer.to_string(index=False))

print("\nRetail Data:")
print(df_retail.to_string(index=False))

print("="*70)
print(f"Reporte de calidad de datos exportado exitosamente a: {excel_path}")


RESUMEN DE CALIDAD DE DATOS (CONSOLA)

Customer Data:
    Columna Expectativa Resultado Valores problemáticos
signup_date  be_between      FAIL               N/A/N/A
         id not_be_null      PASS                0/1000
         id   be_unique      PASS                0/1000
      email   be_unique      PASS                0/1000
      email match_regex      PASS                0/1000
signup_date not_be_null      PASS                0/1000
signup_date  be_of_type      PASS               N/A/N/A
        age  be_between      PASS                0/1000
     gender   be_in_set      PASS                0/1000

Retail Data:
         Columna Expectativa Resultado Valores problemáticos
   purchase_date  be_between      FAIL               N/A/N/A
  transaction_id not_be_null      PASS                 0/100
  transaction_id   be_unique      PASS                 0/100
     customer_id not_be_null      PASS                 0/100
   purchase_date not_be_null      PASS                 0/100
   pur

In [None]:
# ============================
# Comprobación
# ============================
min_signup = pd.Timestamp("2020-01-01")
max_signup = pd.Timestamp.today()

# Filtrar nulos o fuera de rango
out_of_range_or_null_signup = customer_df[
    customer_df['signup_date'].isna() |
    (pd.to_datetime(customer_df['signup_date']) < min_signup) |
    (pd.to_datetime(customer_df['signup_date']) > max_signup)
]

print(f"Customer Data - signup_date fuera de rango o nula: {len(out_of_range_or_null_signup)} registros")
if len(out_of_range_or_null_signup) > 0:
    print(out_of_range_or_null_signup[['id', 'signup_date']].head())

min_purchase = pd.Timestamp("2025-01-01")
max_purchase = pd.Timestamp.today()

# Filtrar nulos o fuera de rango
out_of_range_or_null_purchase = retail_df[
    retail_df['purchase_date'].isna() |
    (pd.to_datetime(retail_df['purchase_date']) < min_purchase) |
    (pd.to_datetime(retail_df['purchase_date']) > max_purchase)
]

print(f"Retail Data - purchase_date fuera de rango o nula: {len(out_of_range_or_null_purchase)} registros")
if len(out_of_range_or_null_purchase) > 0:
    print(out_of_range_or_null_purchase[['transaction_id', 'purchase_date']].head())

Customer Data - signup_date fuera de rango o nula: 0 registros
Retail Data - purchase_date fuera de rango o nula: 0 registros
