Initializing **Great Expectations context**

In [1]:
import great_expectations as gx
import pandas as pd
from great_expectations.core.batch import RuntimeBatchRequest
import os

# Inicializar contexto GX (Ephemeral)
context = gx.get_context()
print("GX context type:", type(context).__name__)
print("GE version:", gx.__version__)


GX context type: EphemeralDataContext
GE version: 1.6.1


Configure DataFrames as **GE Data Sources** and **Assets**

In [2]:
customer_data_source_name = "customer_data_source"
customer_data_source = context.data_sources.add_pandas(name=customer_data_source_name)

# Add a Data Asset to the Data Source para customer data
customer_data_asset_name = "customer_data_asset"
customer_data_asset = customer_data_source.add_dataframe_asset(name=customer_data_asset_name)

print(f"Customer data source '{customer_data_source_name}' created")
print(f"Customer data asset '{customer_data_asset_name}' created")


Customer data source 'customer_data_source' created
Customer data asset 'customer_data_asset' created


In [3]:
# Add a Pandas Data Source para retail data
retail_data_source_name = "retail_data_source"
retail_data_source = context.data_sources.add_pandas(name=retail_data_source_name)

# Add a Data Asset to the Data Source para retail data
retail_data_asset_name = "retail_data_asset"
retail_data_asset = retail_data_source.add_dataframe_asset(name=retail_data_asset_name)

print(f"Retail data source '{retail_data_source_name}' created")
print(f"Retail data asset '{retail_data_asset_name}' created")

Retail data source 'retail_data_source' created
Retail data asset 'retail_data_asset' created


In [4]:
# =============================================================================
# VERIFICAR QUE TODO ESTÁ CONFIGURADO
# =============================================================================

print("\nRESUMEN DE CONFIGURACIÓN:")
print("Data Sources disponibles:")
for ds_name in context.data_sources.all():
    print(f"  - {ds_name}")

print("\nData Assets por Data Source:")
for ds_name, ds in context.data_sources.all().items():
    print(f"  {ds_name}:")
    for asset_name in ds.assets:
        print(f"    - {asset_name}")


RESUMEN DE CONFIGURACIÓN:
Data Sources disponibles:
  - customer_data_source
  - retail_data_source

Data Assets por Data Source:
  customer_data_source:
    - batch_metadata: {}
id: 17c749d2-c233-4b9b-9c02-bf21f8d918fb
name: customer_data_asset
type: dataframe

  retail_data_source:
    - batch_metadata: {}
id: d302d238-eeca-4d75-9528-09314b3e1999
name: retail_data_asset
type: dataframe



In [5]:
# Define the Batch Definition name para customer data
customer_batch_definition_name = "customer_data_batch"
# Add the Batch Definition para customer data
customer_batch_definition = customer_data_asset.add_batch_definition_whole_dataframe(customer_batch_definition_name)
assert customer_batch_definition.name == customer_batch_definition_name


# Define the Batch Definition name para retail data
retail_batch_definition_name = "retail_data_batch"  
# Add the Batch Definition para retail data
retail_batch_definition = retail_data_asset.add_batch_definition_whole_dataframe(retail_batch_definition_name)
assert retail_batch_definition.name == retail_batch_definition_name

In [6]:
customer_csv_path = "../data/raw/customer_data.csv"
customer_dataframe = pd.read_csv(customer_csv_path)

customer_batch_parameters = {"dataframe": customer_dataframe}
# Retrieve the Batch para customer data
customer_batch = customer_batch_definition.get_batch(batch_parameters=customer_batch_parameters)

print(f"Customer batch creado desde: {customer_csv_path}")
print(f"   Shape: {customer_dataframe.shape}")
print(f"   Columnas: {list(customer_dataframe.columns)}")

retail_csv_path = "../data/raw/retail_data.csv"
retail_dataframe = pd.read_csv(retail_csv_path)

retail_batch_parameters = {"dataframe": retail_dataframe}
# Retrieve the Batch para retail data
retail_batch = retail_batch_definition.get_batch(batch_parameters=retail_batch_parameters)

print(f"Retail batch creado desde: {retail_csv_path}")
print(f"   Shape: {retail_dataframe.shape}")
print(f"   Columnas: {list(retail_dataframe.columns)}")

Customer batch creado desde: ../data/raw/customer_data.csv
   Shape: (1000, 9)
   Columnas: ['id', 'full_name', 'email', 'phone', 'address', 'signup_date', 'name', 'gender', 'age']
Retail batch creado desde: ../data/raw/retail_data.csv
   Shape: (820, 5)
   Columnas: ['customer_id', 'purchase_date', 'product_category', 'amount', 'transaction_id']


Checks for specific qualities and properties within a dataset related to customers.

In [7]:
# =============================================================================
# CREATE EXPECTATION SUITES - CUSTOMER DATA 
# =============================================================================

print("\nCreando Expectation Suite para Customer Data...")

# Create an Expectation Suite para Customer Data
customer_expectation_suite_name = "customer_data_suite"
customer_suite = gx.ExpectationSuite(name=customer_expectation_suite_name)

print("Agregando expectativas específicas para Customer Data:")

# 1. ID debe ser único y no nulo (columna clave)
customer_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="id")
)
print("id: ExpectColumnValuesToNotBeNull")

customer_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeUnique(column="id")
)
print("id: ExpectColumnValuesToBeUnique")


# 2. Email debe tener formato válido y ser único
customer_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="email")
)
print("email: ExpectColumnValuesToNotBeNull")

customer_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeUnique(column="email")
)
print("email: ExpectColumnValuesToBeUnique")

customer_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToMatchRegex(
        column="email", 
        regex=r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
    )
)
print("email: ExpectColumnValuesToMatchRegex (formato email)")

# 3. Teléfono no debe ser nulo y estar en formato válido y unico
customer_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="phone")
)
print("phone: ExpectColumnValuesToNotBeNull")

customer_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToMatchRegex(
        column="phone",
        regex=r'^\+?\d[\d\s-]+$'
    )
)
print("phone: ExpectColumnValuesToMatchRegex (formato teléfono)")

customer_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeUnique(column="phone")
)
print("phone: ExpectColumnToBeUnique")

# 4. Edad debe ser numérica y en rango razonable
customer_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeOfType(column="age", type_="int")
)
print("age: ExpectColumnValuesToBeOfType (int)")

customer_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeBetween(column="age", min_value=18, max_value=100)
)
print("age: ExpectColumnValuesToBeBetween (18-100)")

# 5. Género debe estar en valores específicos
customer_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeInSet(
        column="gender", 
        value_set=["Male", "Female", "M", "F", "male", "female", "Other", "other"]
    )
)
print("gender: ExpectColumnValuesToBeInSet")

# 6. Fecha de registro debe ser tipo fecha y no nula
customer_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeOfType(column="signup_date", type_="datetime64[ns]")
)
print("signup_date: ExpectColumnValuesToBeOfType (datetime)")

# 7. Fecha de registro debe estar en rango razonable (no antes de 1900 y no en el futuro)
customer_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeBetween(
        column="signup_date",
        min_value="1900-01-01",
        max_value=pd.Timestamp.today()
    )
)
print("signup_date: ExpectColumnValuesToBeBetween (1900-hoy)")

# 8. Signup_date no debe ser nulo
customer_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="signup_date")
)

# Add the Expectation Suite to the Context
context.suites.add(customer_suite)
print(f"\nCustomer suite '{customer_expectation_suite_name}' agregado al contexto")



Creando Expectation Suite para Customer Data...
Agregando expectativas específicas para Customer Data:
id: ExpectColumnValuesToNotBeNull
id: ExpectColumnValuesToBeUnique
email: ExpectColumnValuesToNotBeNull
email: ExpectColumnValuesToBeUnique
email: ExpectColumnValuesToMatchRegex (formato email)
phone: ExpectColumnValuesToNotBeNull
phone: ExpectColumnValuesToMatchRegex (formato teléfono)
phone: ExpectColumnToBeUnique
age: ExpectColumnValuesToBeOfType (int)
age: ExpectColumnValuesToBeBetween (18-100)
gender: ExpectColumnValuesToBeInSet
signup_date: ExpectColumnValuesToBeOfType (datetime)
signup_date: ExpectColumnValuesToBeBetween (1900-hoy)

Customer suite 'customer_data_suite' agregado al contexto


Checks for specific qualities and properties within a dataset related to retail transactions.

In [8]:
# =============================================================================
# CREATE EXPECTATION SUITES - RETAIL DATA (CORREGIDO)
# =============================================================================

print("\nCreando Expectation Suite para Retail Data...")

# Nombre de la suite
retail_expectation_suite_name = "retail_data_suite"
retail_suite = gx.ExpectationSuite(name=retail_expectation_suite_name)

print("Agregando expectativas específicas para Retail Data:")

# 1. Transaction ID debe ser único y no nulo
retail_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="transaction_id")
)
print("transaction_id: ExpectColumnValuesToNotBeNull")

retail_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeUnique(column="transaction_id")
)
print("transaction_id: ExpectColumnValuesToBeUnique")

# 2. Customer ID debe existir y no ser nulo (FK a customer table)
retail_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="customer_id")
)
print("customer_id: ExpectColumnValuesToNotBeNull")

# 3. Fecha de compra no debe ser nula, debe ser tipo fecha y en rango razonable
retail_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="purchase_date")
)
print("purchase_date: ExpectColumnValuesToNotBeNull")

retail_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeOfType(column="purchase_date", type_="datetime64[ns]")
)
print("purchase_date: ExpectColumnValuesToBeOfType (datetime)")

retail_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeBetween(
        column="purchase_date",
        min_value="2025-01-01",
        max_value=pd.Timestamp.today()
    )
)
print("purchase_date: ExpectColumnValuesToBeBetween (2025-hoy)")

# 4. Categoría de producto no debe ser nula y debe estar en lista predefinida
retail_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="product_category")
)
print("product_category: ExpectColumnValuesToNotBeNull")

retail_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeInSet(
        column="product_category",
        value_set=["Clothing", "Electronics", "Home & Kitchen", "Sports", "Toys", "No specified"]
    )
)
print("product_category: ExpectColumnValuesToBeInSet")

# 5. Monto debe ser numérico y positivo
retail_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToNotBeNull(column="amount")
)
print("amount: ExpectColumnValuesToNotBeNull")

retail_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeOfType(column="amount", type_="float")
)
print("amount: ExpectColumnValuesToBeOfType (float)")

retail_suite.add_expectation(
    gx.expectations.ExpectColumnValuesToBeBetween(column="amount", min_value=0.01, max_value=10000)
)
print("amount: ExpectColumnValuesToBeBetween (0.01 - 10000)")

# Agregar la suite al contexto
context.suites.add(retail_suite)
print(f"\nRetail suite '{retail_expectation_suite_name}' agregado al contexto correctamente.")



Creando Expectation Suite para Retail Data...
Agregando expectativas específicas para Retail Data:
transaction_id: ExpectColumnValuesToNotBeNull
transaction_id: ExpectColumnValuesToBeUnique
customer_id: ExpectColumnValuesToNotBeNull
purchase_date: ExpectColumnValuesToNotBeNull
purchase_date: ExpectColumnValuesToBeOfType (datetime)
purchase_date: ExpectColumnValuesToBeBetween (2025-hoy)
product_category: ExpectColumnValuesToNotBeNull
product_category: ExpectColumnValuesToBeInSet
amount: ExpectColumnValuesToNotBeNull
amount: ExpectColumnValuesToBeOfType (float)
amount: ExpectColumnValuesToBeBetween (0.01 - 10000)

Retail suite 'retail_data_suite' agregado al contexto correctamente.


 Execute data quality checks and then **report the results**

In [9]:
# =============================================================================
# VALIDATE THE DATA AGAINST THE SUITES
# =============================================================================

print("\nValidando datos contra las suites...")

# Validate the Customer Data Against the Suite
print("Validating Customer Data...")
customer_validation_results = customer_batch.validate(customer_suite)

# Evaluate the Customer Results
print("\nCustomer Data Validation Results:")
print("="*70)
print(f"Overall Success: {'PASS' if customer_validation_results.success else 'FAIL'}")
print(f"Total Expectations: {len(customer_validation_results.results)}")
print(f"Successful: {sum(1 for r in customer_validation_results.results if r.success)}")
print(f"Failed: {sum(1 for r in customer_validation_results.results if not r.success)}")

print("\nDetailed Results:")
for i, result in enumerate(customer_validation_results.results, 1):
    status = "PASS" if result.success else "FAIL"
    # expectation_type = result.expectation_config.expectation_type.replace("expect_column_values_to_", "")
    expectation_type = result.expectation_config.type.replace("expect_column_values_to_", "")
    column = result.expectation_config.kwargs.get('column', 'N/A')
    print(f"  {i:2d}. {status} - {expectation_type} on '{column}'")
    
    # Mostrar detalles si falla
    if not result.success:
        unexpected_count = result.result.get('unexpected_count', 'N/A')
        element_count = result.result.get('element_count', 'N/A')
        print(f"Unexpected values: {unexpected_count}/{element_count}")

print("="*70)


Validando datos contra las suites...
Validating Customer Data...


Calculating Metrics:  94%|█████████▎| 72/77 [00:00<00:00, 573.70it/s]


Customer Data Validation Results:
Overall Success: FAIL
Total Expectations: 14
Successful: 6
Failed: 8

Detailed Results:
   1. FAIL - be_between on 'signup_date'
Unexpected values: N/A/N/A
   2. PASS - not_be_null on 'id'
   3. PASS - be_unique on 'id'
   4. FAIL - not_be_null on 'email'
Unexpected values: 108/1000
   5. PASS - be_unique on 'email'
   6. PASS - match_regex on 'email'
   7. FAIL - not_be_null on 'phone'
Unexpected values: 98/1000
   8. FAIL - match_regex on 'phone'
Unexpected values: 667/1000
   9. PASS - be_unique on 'phone'
  10. FAIL - be_of_type on 'age'
Unexpected values: N/A/N/A
  11. FAIL - be_between on 'age'
Unexpected values: 25/1000
  12. PASS - be_in_set on 'gender'
  13. FAIL - be_of_type on 'signup_date'
Unexpected values: 902/1000
  14. FAIL - not_be_null on 'signup_date'
Unexpected values: 98/1000





In [10]:
# Validate the Retail Data Against the Suite  
print("\nValidating Retail Data...")
retail_validation_results = retail_batch.validate(retail_suite)

# Evaluate the Retail Results
print("\nRetail Data Validation Results:")
print("="*70)
print(f"Overall Success: {'PASS' if retail_validation_results.success else 'FAIL'}")
print(f"Total Expectations: {len(retail_validation_results.results)}")
print(f"Successful: {sum(1 for r in retail_validation_results.results if r.success)}")
print(f"Failed: {sum(1 for r in retail_validation_results.results if not r.success)}")

print("\nDetailed Results:")
for i, result in enumerate(retail_validation_results.results, 1):
    status = "PASS" if result.success else "FAIL"
    # expectation_type = result.expectation_config.expectation_type.replace("expect_column_values_to_", "")
    expectation_type = result.expectation_config.type.replace("expect_column_values_to_", "")
    column = result.expectation_config.kwargs.get('column', 'N/A')
    print(f"  {i:2d}. {status} - {expectation_type} on '{column}'")
    
    # Mostrar detalles si falla
    if not result.success:
        unexpected_count = result.result.get('unexpected_count', 'N/A')
        element_count = result.result.get('element_count', 'N/A')
        print(f" Unexpected values: {unexpected_count}/{element_count}")

print("="*70)



Validating Retail Data...


Calculating Metrics:  85%|████████▌ | 57/67 [00:00<00:00, 378.06it/s]


Retail Data Validation Results:
Overall Success: FAIL
Total Expectations: 11
Successful: 3
Failed: 8

Detailed Results:
   1. FAIL - be_between on 'purchase_date'
 Unexpected values: N/A/N/A
   2. FAIL - be_between on 'amount'
 Unexpected values: N/A/N/A
   3. PASS - not_be_null on 'transaction_id'
   4. FAIL - be_unique on 'transaction_id'
 Unexpected values: 820/820
   5. PASS - not_be_null on 'customer_id'
   6. FAIL - not_be_null on 'purchase_date'
 Unexpected values: 24/820
   7. FAIL - be_of_type on 'purchase_date'
 Unexpected values: 796/820
   8. FAIL - not_be_null on 'product_category'
 Unexpected values: 24/820
   9. PASS - be_in_set on 'product_category'
  10. FAIL - not_be_null on 'amount'
 Unexpected values: 24/820
  11. FAIL - be_of_type on 'amount'
 Unexpected values: 796/820





Takes the detailed validation results from the **customer_validation_results** and **retail_validation_results** objects and aggregates them into a concise, **easy-to-read report.**

In [11]:
# =============================================================================
# RESUMEN FINAL Y RECOMENDACIONES
# =============================================================================

print("\nRESUMEN FINAL DE CALIDAD DE DATOS:")
print("="*70)
print(f"Customer Data Suite: {len(customer_suite.expectations)} expectativas")
print(f"   Success Rate: {sum(1 for r in customer_validation_results.results if r.success)}/{len(customer_validation_results.results)} ({sum(1 for r in customer_validation_results.results if r.success)/len(customer_validation_results.results)*100:.1f}%)")

print(f"\nRetail Data Suite: {len(retail_suite.expectations)} expectativas") 
print(f"   Success Rate: {sum(1 for r in retail_validation_results.results if r.success)}/{len(retail_validation_results.results)} ({sum(1 for r in retail_validation_results.results if r.success)/len(retail_validation_results.results)*100:.1f}%)")

# Función auxiliar para obtener el tipo de expectativa de forma segura
def get_expectation_type(expectation_config):
    if hasattr(expectation_config, 'type'):
        return expectation_config.type.replace("expect_column_values_to_", "").replace("expect_column_", "")
    elif hasattr(expectation_config, 'expectation_type'):
        return expectation_config.expectation_type.replace("expect_column_values_to_", "").replace("expect_column_", "")
    else:
        # Usar el nombre de la clase como fallback
        return expectation_config.__class__.__name__.replace("Expect", "").replace("Column", "")

# Mostrar problemas de calidad encontrados
customer_failures = [r for r in customer_validation_results.results if not r.success]
retail_failures = [r for r in retail_validation_results.results if not r.success]

if customer_failures or retail_failures:
    print(f"\nPROBLEMAS DE CALIDAD DETECTADOS:")
    
    if customer_failures:
        print(f"   Customer Data: {len(customer_failures)} problemas")
        for failure in customer_failures:
            column = failure.expectation_config.kwargs.get('column', 'N/A')
            issue_type = get_expectation_type(failure.expectation_config)
            print(f"     - Columna '{column}': {issue_type}")
    
    if retail_failures:
        print(f"   Retail Data: {len(retail_failures)} problemas")
        for failure in retail_failures:
            column = failure.expectation_config.kwargs.get('column', 'N/A')
            issue_type = get_expectation_type(failure.expectation_config)
            print(f"     - Columna '{column}': {issue_type}")
    
   
else:
    print(f"\n¡EXCELENTE! Toodos los datos pasan las validaciones de calidad")

print("="*70)
print("Validación de calidad de datos completada exitosamente!")


RESUMEN FINAL DE CALIDAD DE DATOS:
Customer Data Suite: 14 expectativas
   Success Rate: 6/14 (42.9%)

Retail Data Suite: 11 expectativas
   Success Rate: 3/11 (27.3%)

PROBLEMAS DE CALIDAD DETECTADOS:
   Customer Data: 8 problemas
     - Columna 'signup_date': be_between
     - Columna 'email': not_be_null
     - Columna 'phone': not_be_null
     - Columna 'phone': match_regex
     - Columna 'age': be_of_type
     - Columna 'age': be_between
     - Columna 'signup_date': be_of_type
     - Columna 'signup_date': not_be_null
   Retail Data: 8 problemas
     - Columna 'purchase_date': be_between
     - Columna 'amount': be_between
     - Columna 'transaction_id': be_unique
     - Columna 'purchase_date': not_be_null
     - Columna 'purchase_date': be_of_type
     - Columna 'product_category': not_be_null
     - Columna 'amount': not_be_null
     - Columna 'amount': be_of_type
Validación de calidad de datos completada exitosamente!


In [13]:

# Crear listas para almacenar los resultados
customer_results = []
retail_results = []

# Función auxiliar para tipo de expectativa
def get_expectation_type(expectation_config):
    if hasattr(expectation_config, 'type'):
        return expectation_config.type.replace("expect_column_values_to_", "").replace("expect_column_", "")
    elif hasattr(expectation_config, 'expectation_type'):
        return expectation_config.expectation_type.replace("expect_column_values_to_", "").replace("expect_column_", "")
    else:
        return expectation_config.__class__.__name__.replace("Expect", "").replace("Column", "")

# Procesar resultados de Customer Data
for result in customer_validation_results.results:
    column = result.expectation_config.kwargs.get('column', 'N/A')
    expectation = get_expectation_type(result.expectation_config)
    success = "PASS" if result.success else "FAIL"
    unexpected_count = result.result.get('unexpected_count', 'N/A')
    element_count = result.result.get('element_count', 'N/A')
    customer_results.append({
        "Columna": column,
        "Expectativa": expectation,
        "Resultado": success,
        "Valores problemáticos": f"{unexpected_count}/{element_count}"
    })

# Procesar resultados de Retail Data
for result in retail_validation_results.results:
    column = result.expectation_config.kwargs.get('column', 'N/A')
    expectation = get_expectation_type(result.expectation_config)
    success = "PASS" if result.success else "FAIL"
    unexpected_count = result.result.get('unexpected_count', 'N/A')
    element_count = result.result.get('element_count', 'N/A')
    retail_results.append({
        "Columna": column,
        "Expectativa": expectation,
        "Resultado": success,
        "Valores problemáticos": f"{unexpected_count}/{element_count}"
    })

# Convertir a DataFrame
df_customer = pd.DataFrame(customer_results)
df_retail = pd.DataFrame(retail_results)

# Crear carpeta si no existe
os.makedirs("../data/raw", exist_ok=True)

# Guardar en Excel con dos hojas
excel_path = "../data/raw/data_quality_report_RAW.xlsx"
with pd.ExcelWriter(excel_path, engine="xlsxwriter") as writer:
    df_customer.to_excel(writer, sheet_name="Customer Data", index=False)
    df_retail.to_excel(writer, sheet_name="Retail Data", index=False)

# Print resumido en consola
print("="*70)
print("RESUMEN DE CALIDAD DE DATOS (CONSOLa)")
print("="*70)
print("\nCustomer Data:")
print(df_customer.to_string(index=False))
print("\nRetail Data:")
print(df_retail.to_string(index=False))
print("="*70)
print(f"Reporte de calidad de datos exportado exitosamente a: {excel_path}")


ModuleNotFoundError: No module named 'xlsxwriter'