In [None]:
import os
import pandas as pd
import great_expectations as gx

from dotenv import load_dotenv
from google.cloud import storage

# Regex that accepts digit-only values: r"^\d+(\.0)?$" matches "123" or
# "123.0" (the latter form is common when pandas renders floats).
NUMERIC_REGEX = r"^\d+(\.0)?$"
GCS_PATH = "gs://airbnb-datalake-jpcs/raw"
USE_CLOUD = False
# Data root: the GCS bucket when running in the cloud, otherwise the local folder.
BASE_PATH = GCS_PATH if USE_CLOUD else "../data/raw"

# Load environment variables (if a .env file exists), then create the
# Great Expectations context.
load_dotenv()
context = gx.get_context()

print("Ambiente de Execu√ß√£o: " + ("CLOUD (GCP)" if USE_CLOUD else "LOCAL (Mock)"))
print("Caminho dos Dados: " + BASE_PATH)

üåç Ambiente de Execu√ß√£o: LOCAL (Mock)
üìÇ Caminho dos Dados: ../data/silver


In [3]:
# Load both CSV extracts from BASE_PATH. on_bad_lines='warn' asks pandas to
# warn about malformed rows instead of raising.
try:
    df_listings = pd.read_csv(f"{BASE_PATH}/dim_listings.csv", quotechar='"', on_bad_lines='warn', low_memory=False)
    df_reviews = pd.read_csv(f"{BASE_PATH}/fact_reviews.csv", quotechar='"', on_bad_lines='warn')
except FileNotFoundError:
    # The hint interpolates BASE_PATH instead of naming a hard-coded folder,
    # so it always points at the directory that was actually searched.
    print(f"‚ùå Erro: Arquivo n√£o encontrado em {BASE_PATH}. Verifique se moveu os CSVs para a pasta {BASE_PATH}!")
    # Bare raise re-raises the active exception unchanged.
    raise
else:
    # Only reached when both files were read successfully.
    print(f"‚úÖ Listings carregado: {df_listings.shape}")
    print(f"‚úÖ Reviews carregado: {df_reviews.shape}")

‚úÖ Listings carregado: (38670, 11)
‚úÖ Reviews carregado: (264561, 5)


In [4]:
def _field(obj, key):
    """Read ``key`` from a dict (via ``.get``) or from an object (attribute access)."""
    return obj.get(key) if isinstance(obj, dict) else getattr(obj, key)


def print_report(results, table_name):
    """Print a per-expectation pass/fail summary of a validation result.

    Args:
        results: Validation result, given either as a dict or as an object
            exposing ``success`` and ``results``. Each entry of ``results``
            carries ``success``, ``expectation_config`` (with
            ``expectation_type`` and ``kwargs``) and ``result``, again as
            dict keys or as attributes.
        table_name: Label shown in the report header.

    Returns:
        None. The report is written to stdout.
    """
    print(f"\nüìä RELAT√ìRIO: {table_name}")

    is_success = _field(results, "success")
    print(f"Status Global: {'‚úÖ APROVADO' if is_success else '‚ùå FALHOU (Esperado em dados Raw)'}")
    print("-" * 40)

    for res in _field(results, "results"):
        success = _field(res, "success")
        config = _field(res, "expectation_config")
        result_detail = _field(res, "result")

        status = "‚úÖ" if success else "‚ùå"
        regra = _field(config, "expectation_type")
        # Expectations without a "column" kwarg are reported under "Table".
        coluna = (_field(config, "kwargs") or {}).get("column", "Table")

        print(f"{status} [{coluna}] {regra}")

        if not success:
            # Not every failed expectation reports an unexpected_count (the
            # result may lack the key or be None), so read it defensively.
            if isinstance(result_detail, dict):
                unexpected_count = result_detail.get("unexpected_count")
            else:
                unexpected_count = getattr(result_detail, "unexpected_count", None)
            # Skip the count line when there is no count, instead of printing
            # "None"; a genuine count of 0 is still shown.
            if unexpected_count is not None:
                print(f"   ‚ö†Ô∏è Falhas encontradas: {unexpected_count} registros")


In [5]:
validator_listings = context.sources.pandas_default.read_dataframe(df_listings)

print("üõ°Ô∏è Great Expectations inicializado com sucesso!")
print(f"Tipo do validador: {type(validator_listings)}")

# --- Rules for LISTINGS (properties) ---
print("üîç Validando Tabela: LISTINGS")

# Each rule is (expectation name, positional args, keyword args); the
# expectations are invoked on the validator in exactly this order.
for _rule_name, _rule_args, _rule_kwargs in (
    ("expect_column_values_to_not_be_null", ("SK_LISTING",), {}),
    ("expect_column_values_to_be_unique", ("SK_LISTING",), {}),
    ("expect_column_values_to_match_regex", ("SK_LISTING",), {"regex": NUMERIC_REGEX}),
    ("expect_column_values_to_match_regex", ("SK_HOST",), {"regex": NUMERIC_REGEX}),
    ("expect_column_values_to_not_be_null", ("VLR_DIARIA_BRL",), {}),
    ("expect_column_values_to_match_regex", ("VLR_DIARIA_BRL",), {"regex": NUMERIC_REGEX}),
    ("expect_column_values_to_be_between", ("QTD_DIAS_DISPONIVEIS",), {"min_value": 0, "max_value": 365}),
):
    getattr(validator_listings, _rule_name)(*_rule_args, **_rule_kwargs)

# Run the validation and print the summary.
results_listings = validator_listings.validate()
print_report(results_listings, "Listings")

üõ°Ô∏è Great Expectations inicializado com sucesso!
Tipo do validador: <class 'great_expectations.validator.validator.Validator'>
üîç Validando Tabela: LISTINGS


Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/28 [00:00<?, ?it/s]


üìä RELAT√ìRIO: Listings
Status Global: ‚úÖ APROVADO
----------------------------------------
‚úÖ [SK_LISTING] expect_column_values_to_not_be_null
‚úÖ [SK_LISTING] expect_column_values_to_be_unique
‚úÖ [SK_LISTING] expect_column_values_to_match_regex
‚úÖ [SK_HOST] expect_column_values_to_match_regex
‚úÖ [VLR_DIARIA_BRL] expect_column_values_to_not_be_null
‚úÖ [VLR_DIARIA_BRL] expect_column_values_to_match_regex
‚úÖ [QTD_DIAS_DISPONIVEIS] expect_column_values_to_be_between


In [None]:

validator_reviews = context.sources.pandas_default.read_dataframe(df_reviews)

# --- Rules for REVIEWS (ratings) ---
print("\nüîç Validando Tabela: REVIEWS")

# Each rule is (expectation name, positional args, keyword args); the
# expectations are invoked on the validator in exactly this order.
for _rule_name, _rule_args, _rule_kwargs in (
    ("expect_column_values_to_match_regex", ("SK_REVIEW",), {"regex": NUMERIC_REGEX}),
    ("expect_column_values_to_not_be_null", ("SK_LISTING",), {}),
    ("expect_column_values_to_match_regex", ("SK_LISTING",), {"regex": NUMERIC_REGEX}),
    ("expect_column_to_exist", ("NM_REVIEWER",), {}),
):
    getattr(validator_reviews, _rule_name)(*_rule_args, **_rule_kwargs)

# Run the validation and print the summary.
results_reviews = validator_reviews.validate()
print_report(results_reviews, "Reviews")