In [9]:
import pandas as pd
import great_expectations as gx
import os
from dotenv import load_dotenv

# Load environment variables from a .env file (if one exists)
load_dotenv()
context = gx.get_context()

# --- ENVIRONMENT CONFIGURATION (TECHNICAL DECISION) ---
# The USE_CLOUD flag selects the data source: S3 when True, the local folder
# otherwise. This mimics a "Dev vs Prod" setup.
USE_CLOUD = False  # Set to True to test a real S3 connection

# Regex for numeric identifiers.
# The pattern r"^\d+(\.0)?$" accepts "123" or "123.0" (common for pandas floats).
# NOTE: in Python's `re`, \d on str patterns also matches non-ASCII Unicode digits.
NUMERIC_REGEX = r"^\d+(\.0)?$"

BASE_PATH = "../data/raw"
if USE_CLOUD:
    BUCKET_NAME = os.getenv("S3_BUCKET_NAME")
    # os.getenv returns None when the variable is unset, which would silently
    # produce the bogus path "s3://None"; fail fast with an actionable message.
    if not BUCKET_NAME:
        raise RuntimeError(
            "USE_CLOUD=True requer S3_BUCKET_NAME definido no ambiente ou no .env"
        )
    BASE_PATH = f"s3://{BUCKET_NAME}"

print(f"üåç Ambiente de Execu√ß√£o: {'CLOUD (S3)' if USE_CLOUD else 'LOCAL (Mock)'}")
print(f"üìÇ Caminho dos Dados: {BASE_PATH}")

üåç Ambiente de Execu√ß√£o: LOCAL (Mock)
üìÇ Caminho dos Dados: ../data/raw


In [10]:
try:
    # Malformed rows are reported as warnings instead of aborting the load.
    df_listings = pd.read_csv(
        f"{BASE_PATH}/listings.csv",
        quotechar='"',
        on_bad_lines='warn',
        low_memory=False,
    )
    # reviews.csv is parsed with ';' as the field separator.
    df_reviews = pd.read_csv(
        f"{BASE_PATH}/reviews.csv",
        quotechar='"',
        on_bad_lines='warn',
        sep=';',
    )
except FileNotFoundError as missing_file:
    # Give the reader a hint about the expected location, then re-raise so
    # the notebook still stops at this cell.
    print(f"‚ùå Erro: Arquivo n√£o encontrado em {BASE_PATH}. Verifique se moveu os CSVs para a pasta data/raw!")
    raise missing_file
else:
    # Both frames loaded: report their (rows, columns) shapes.
    print(f"‚úÖ Listings carregado: {df_listings.shape}")
    print(f"‚úÖ Reviews carregado: {df_reviews.shape}")

‚úÖ Listings carregado: (43068, 18)
‚úÖ Reviews carregado: (268350, 6)


In [11]:
def print_report(results, table_name):
    """Print a human-readable summary of a validation run to stdout.

    The validation result may be a plain dict or an object exposing the same
    fields as attributes; both shapes go through a single accessor.

    Args:
        results: Validation result with ``success`` and ``results``. Each entry
            of ``results`` carries ``success``, ``expectation_config`` (with
            ``expectation_type`` and ``kwargs``) and ``result``.
        table_name: Label shown in the report header.

    Returns:
        None. One line is printed per expectation; failed expectations get an
        extra line with the number of unexpected records when it is available.
    """

    def _field(container, key, default=None):
        """Read ``key`` from a dict, or as an attribute from any other object."""
        if isinstance(container, dict):
            return container.get(key, default)
        return getattr(container, key, default)

    print(f"\nüìä RELAT√ìRIO: {table_name}")

    is_success = _field(results, "success")
    print(f"Status Global: {'‚úÖ APROVADO' if is_success else '‚ùå FALHOU (Esperado em dados Raw)'}")
    print("-" * 40)

    # A missing/None result list is treated as "no expectations" instead of
    # raising TypeError in the loop.
    for res in _field(results, "results") or []:
        success = _field(res, "success")
        config = _field(res, "expectation_config")
        result_detail = _field(res, "result")

        status = "‚úÖ" if success else "‚ùå"
        regra = _field(config, "expectation_type")
        # Expectations without a "column" kwarg are labelled "Table".
        coluna = _field(_field(config, "kwargs") or {}, "column", "Table")

        print(f"{status} [{coluna}] {regra}")

        if not success:
            unexpected_count = _field(result_detail, "unexpected_count")
            if unexpected_count is None:
                # The result carries no per-record count; avoid printing
                # a misleading "None registros".
                print("   ‚ö†Ô∏è Falha sem contagem de registros")
            else:
                print(f"   ‚ö†Ô∏è Falhas encontradas: {unexpected_count} registros")


In [14]:
# Wrap the listings frame in a validator through the default pandas datasource.
validator_listings = context.sources.pandas_default.read_dataframe(df_listings)

print("üõ°Ô∏è Great Expectations inicializado com sucesso!")
print(f"Tipo do validador: {type(validator_listings)}")

# --- RULES FOR LISTINGS (properties) ---
print("üîç Validando Tabela: LISTINGS")

# Identifier: must be present, unique and numeric.
validator_listings.expect_column_values_to_not_be_null(column="id")
validator_listings.expect_column_values_to_be_unique(column="id")
validator_listings.expect_column_values_to_match_regex(
    column="id", regex=NUMERIC_REGEX
)

# Host reference: numeric.
validator_listings.expect_column_values_to_match_regex(
    column="host_id", regex=NUMERIC_REGEX
)

# Price: must be present and numeric.
validator_listings.expect_column_values_to_not_be_null(column="price")
validator_listings.expect_column_values_to_match_regex(
    column="price", regex=NUMERIC_REGEX
)

# Availability is a day count within one year.
validator_listings.expect_column_values_to_be_between(
    column="availability_365", min_value=0, max_value=365
)

# Run every registered expectation and print the summary.
results_listings = validator_listings.validate()
print_report(results_listings, "Listings")

üõ°Ô∏è Great Expectations inicializado com sucesso!
Tipo do validador: <class 'great_expectations.validator.validator.Validator'>
üîç Validando Tabela: LISTINGS


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/28 [00:00<?, ?it/s]


üìä RELAT√ìRIO: Listings
Status Global: ‚ùå FALHOU (Esperado em dados Raw)
----------------------------------------
‚úÖ [id] expect_column_values_to_not_be_null
‚úÖ [id] expect_column_values_to_be_unique
‚úÖ [id] expect_column_values_to_match_regex
‚úÖ [host_id] expect_column_values_to_match_regex
‚ùå [price] expect_column_values_to_not_be_null
   ‚ö†Ô∏è Falhas encontradas: 4398 registros
‚úÖ [price] expect_column_values_to_match_regex
‚úÖ [availability_365] expect_column_values_to_be_between


In [15]:

# Wrap the reviews frame in a validator through the default pandas datasource.
validator_reviews = context.sources.pandas_default.read_dataframe(df_reviews)

# --- RULES FOR REVIEWS (ratings) ---
print("\nüîç Validando Tabela: REVIEWS")

# Review identifier: numeric.
validator_reviews.expect_column_values_to_match_regex(
    column="id", regex=NUMERIC_REGEX
)

# Reference to the listing: must be present and numeric.
validator_reviews.expect_column_values_to_not_be_null(column="listing_id")
validator_reviews.expect_column_values_to_match_regex(
    column="listing_id", regex=NUMERIC_REGEX
)

# Schema check only: the reviewer name column has to exist.
validator_reviews.expect_column_to_exist(column="reviewer_name")

# Run every registered expectation and print the summary.
results_reviews = validator_reviews.validate()
print_report(results_reviews, "Reviews")


üîç Validando Tabela: REVIEWS


Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/14 [00:00<?, ?it/s]


üìä RELAT√ìRIO: Reviews
Status Global: ‚ùå FALHOU (Esperado em dados Raw)
----------------------------------------
‚ùå [id] expect_column_values_to_match_regex
   ‚ö†Ô∏è Falhas encontradas: 3789 registros
‚úÖ [listing_id] expect_column_values_to_not_be_null
‚ùå [listing_id] expect_column_values_to_match_regex
   ‚ö†Ô∏è Falhas encontradas: 3789 registros
‚úÖ [reviewer_name] expect_column_to_exist
