In [27]:
import great_expectations as gx

context = gx.get_context()

# Data Source

In [28]:
source_data_source = context.data_sources.add_pandas_filesystem(name="source", base_directory="./datalake/source/")
raw_data_source = context.data_sources.add_pandas_filesystem(name="raw", base_directory="./datalake/raw/")
trusted_data_source = context.data_sources.add_pandas_filesystem(name="trusted", base_directory="./datalake/trusted/")
delivery_data_source = context.data_sources.add_pandas_filesystem(name="delivery", base_directory="./datalake/delivery/")

## Data Assets

In [29]:
raw_bancos = raw_data_source.add_parquet_asset(name="raw_bancos", glob_directive="bancos/*.parquet")
raw_glassdoor = raw_data_source.add_parquet_asset(name="raw_glassdoor", glob_directive="glassdoor/*.parquet")
raw_reclamacoes = raw_data_source.add_parquet_asset(name="raw_reclamacoes", glob_directive="reclamacoes/*.parquet")

In [30]:
trusted_bancos = trusted_data_source.add_parquet_asset(name="trusted_bancos", glob_directive="bancos/*.parquet")
trusted_glassdoor = trusted_data_source.add_parquet_asset(name="trusted_glassdoor", glob_directive="glassdoor/*.parquet")
trusted_reclamacoes = trusted_data_source.add_parquet_asset(name="trusted_reclamacoes", glob_directive="reclamacoes/*.parquet")

In [31]:
delivery = delivery_data_source.add_parquet_asset(name="delivery", glob_directive="*.parquet")

## Batch Definition

In [32]:
raw_bancos_batch_definition = raw_bancos.add_batch_definition(name="raw_bancos")
raw_glassdoor_batch_definition = raw_glassdoor.add_batch_definition(name="raw_glassdoor")
raw_reclamacoes_batch_definition = raw_reclamacoes.add_batch_definition(name="raw_reclamacoes")

In [33]:
trusted_bancos_batch_definition = trusted_bancos.add_batch_definition(name="trusted_bancos")
trusted_glassdoor_batch_definition = trusted_glassdoor.add_batch_definition(name="trusted_glassdoor")
trusted_reclamacoes_batch_definition = trusted_reclamacoes.add_batch_definition(name="trusted_reclamacoes")

In [34]:
delivery_batch_definition = delivery.add_batch_definition(name="delivery")

# Expectation

## Raw

In [35]:
raw_suite = gx.ExpectationSuite(name="raw_suite")
context.suites.add(raw_suite)

raw_suite.add_expectation(gx.expectations.ExpectTableRowCountToBeBetween(min_value=1))
raw_suite.add_expectation(gx.expectations.ExpectTableColumnCountToBeBetween(min_value=1))

ExpectTableColumnCountToBeBetween(id='13a84e20-817a-48b1-8cce-af75f18d86fd', meta=None, notes=None, result_format=<ResultFormat.BASIC: 'BASIC'>, description=None, catch_exceptions=False, rendered_content=None, severity=<FailureSeverity.CRITICAL: 'critical'>, windows=None, batch_id=None, min_value=1.0, max_value=None)

In [36]:
# raw_bancos_batch_definition.validate(raw_suite)
# raw_glassdoor_batch_definition.validate(raw_suite)
# raw_reclamacoes_batch_definition.validate(raw_suite)

## Trusted

### Bancos

In [37]:
trusted_bancos_suite = gx.ExpectationSuite(name="trusted_bancos_suite")
context.suites.add(trusted_bancos_suite)

# General checks
trusted_bancos_suite.add_expectation(gx.expectations.ExpectTableRowCountToBeBetween(min_value=1))
trusted_bancos_suite.add_expectation(gx.expectations.ExpectTableColumnCountToBeBetween(min_value=3, max_value=3))

# Column-specific checks
trusted_bancos_suite.add_expectation(gx.expectations.ExpectColumnValuesToNotBeNull(column="segment"))
trusted_bancos_suite.add_expectation(gx.expectations.ExpectColumnValuesToNotBeNull(column="cnpj"))
trusted_bancos_suite.add_expectation(gx.expectations.ExpectColumnValuesToNotBeNull(column="name"))
trusted_bancos_suite.add_expectation(gx.expectations.ExpectColumnValuesToBeUnique(column="cnpj"))

ExpectColumnValuesToBeUnique(id='34077452-9631-477e-b78d-661912d9b173', meta=None, notes=None, result_format=<ResultFormat.BASIC: 'BASIC'>, description=None, catch_exceptions=True, rendered_content=None, severity=<FailureSeverity.CRITICAL: 'critical'>, windows=None, batch_id=None, column='cnpj', mostly=1, row_condition=None, condition_parser=None)

In [38]:
# trusted_bancos_batch_definition.validate(trusted_bancos_suite)

### glassdoor

In [39]:
trusted_glassdoor_suite = gx.ExpectationSuite(name="trusted_glassdoor_suite")
context.suites.add(trusted_glassdoor_suite)

trusted_glassdoor_suite.add_expectation(gx.expectations.ExpectTableRowCountToBeBetween(min_value=1))
trusted_glassdoor_suite.add_expectation(gx.expectations.ExpectTableColumnCountToBeBetween(min_value=17, max_value=17))

for col in ["name", "segment", "revenue"]:
    trusted_glassdoor_suite.add_expectation(gx.expectations.ExpectColumnValuesToNotBeNull(column=col))

for col in ["reviews_count", "culture_count", "salaries_count", "benefits_count"]:
    trusted_glassdoor_suite.add_expectation(gx.expectations.ExpectColumnValuesToBeBetween(column=col, min_value=0))

for col in [
    "culture_score", "diversity_score", "quality_of_life_score",
    "leadership_score", "compensation_score", "career_opportunities_score"
]:
    trusted_glassdoor_suite.add_expectation(gx.expectations.ExpectColumnValuesToBeBetween(column=col, min_value=0, max_value=5))

for col in ["recommend_percent", "positive_outlook_percent", "match_percent"]:
    trusted_glassdoor_suite.add_expectation(gx.expectations.ExpectColumnValuesToBeBetween(column=col, min_value=0, max_value=100))

In [40]:
# trusted_glassdoor_batch_definition.validate(trusted_glassdoor_suite)

### reclamacoes

In [41]:
trusted_reclamacoes_suite = gx.ExpectationSuite(name="trusted_reclamacoes_suite")
context.suites.add(trusted_reclamacoes_suite)

trusted_reclamacoes_suite.add_expectation(gx.expectations.ExpectTableRowCountToBeBetween(min_value=1))
trusted_reclamacoes_suite.add_expectation(gx.expectations.ExpectTableColumnCountToBeBetween(min_value=14, max_value=14))

for col in ["cnpj", "name", "category", "type", "quarter", "year"]:
    trusted_reclamacoes_suite.add_expectation(gx.expectations.ExpectColumnValuesToNotBeNull(column=col))

trusted_reclamacoes_suite.add_expectation(gx.expectations.ExpectColumnValuesToBeBetween(column="year", min_value=2000))

for col in [
    "regulated_complaints_upheld", "regulated_complaints_other", "unregulated_complaints", "total_complaints"
]:
    trusted_reclamacoes_suite.add_expectation(gx.expectations.ExpectColumnValuesToBeBetween(column=col, min_value=0))

for col in ["total_clients_ccs_scr", "clients_ccs", "clients_scr"]:
    trusted_reclamacoes_suite.add_expectation(gx.expectations.ExpectColumnValuesToBeBetween(column=col, min_value=0))

In [42]:
# trusted_reclamacoes_batch_definition.validate(trusted_reclamacoes_suite)

## delivery

In [43]:
delivery_suite = gx.ExpectationSuite(name="delivery_suite")
context.suites.add(delivery_suite)

delivery_suite.add_expectation(gx.expectations.ExpectColumnValuesToNotBeNull(column="cnpj"),)
delivery_suite.add_expectation(gx.expectations.ExpectCompoundColumnsToBeUnique(column_list=["cnpj", "year", "quarter"]))
delivery_suite.add_expectation(gx.expectations.ExpectColumnMostCommonValueToBeInSet(column="segment", value_set=["S1", "S2", "S3"]))
delivery_suite.add_expectation(gx.expectations.ExpectColumnValuesToBeBetween(column="culture_score", min_value=0, max_value=5))
delivery_suite.add_expectation(gx.expectations.ExpectColumnValuesToBeBetween(column="diversity_score", min_value=0, max_value=5))
delivery_suite.add_expectation(gx.expectations.ExpectColumnValuesToBeBetween(column="quality_of_life_score", min_value=0, max_value=5))
delivery_suite.add_expectation(gx.expectations.ExpectColumnValuesToBeBetween(column="leadership_score", min_value=0, max_value=5))
delivery_suite.add_expectation(gx.expectations.ExpectColumnValuesToBeBetween(column="compensation_score", min_value=0, max_value=5))
delivery_suite.add_expectation(gx.expectations.ExpectColumnValuesToBeBetween(column="career_opportunities_score", min_value=0, max_value=5))
delivery_suite.add_expectation(gx.expectations.ExpectColumnValuesToBeBetween(column="reviews_count", min_value=0))
delivery_suite.add_expectation(gx.expectations.ExpectColumnValuesToBeBetween(column="culture_count", min_value=0))
delivery_suite.add_expectation(gx.expectations.ExpectColumnValuesToBeBetween(column="salaries_count", min_value=0))

ExpectColumnValuesToBeBetween(id='8b6ca3b9-50d9-41a7-9aac-f0795a99caf9', meta=None, notes=None, result_format=<ResultFormat.BASIC: 'BASIC'>, description=None, catch_exceptions=True, rendered_content=None, severity=<FailureSeverity.CRITICAL: 'critical'>, windows=None, batch_id=None, column='salaries_count', mostly=1, row_condition=None, condition_parser=None, min_value=0.0, max_value=None, strict_min=False, strict_max=False)

In [44]:
# delivery_batch_definition.validate(delivery_suite)

# Documents

In [45]:
from pathlib import Path

In [46]:
base_directory = str(Path().resolve() / "great_expectations/data_docs/local_site/")
base_directory

'/home/miguel/Documents/programming/mba/ingestao de dados/eEDB-011_2025-3_ingestao_de_dados_atividade5/atividade5/great_expectations/data_docs/local_site'

In [47]:
site_config = {
    "class_name": "SiteBuilder",
    "site_index_builder": {"class_name": "DefaultSiteIndexBuilder"},
    "store_backend": {
        "class_name": "TupleFilesystemStoreBackend",
        "base_directory": base_directory,
    },
}

site_name = "atividade5_docs"
context.add_data_docs_site(site_name=site_name, site_config=site_config)
context.build_data_docs(site_names=site_name)

{'atividade5_docs': 'file:///home/miguel/Documents/programming/mba/ingestao de dados/eEDB-011_2025-3_ingestao_de_dados_atividade5/atividade5/great_expectations/data_docs/local_site/index.html'}

## Checkpoints

In [55]:
raw_validation_definition = gx.ValidationDefinition(
    name="raw",
    data=raw_bancos_batch_definition,
    suite=raw_suite,
)
context.validation_definitions.add(raw_validation_definition)

trusted_bancos_validation_definition = gx.ValidationDefinition(
    name="trusted_bancos",
    data=trusted_bancos_batch_definition,
    suite=trusted_bancos_suite,
)
context.validation_definitions.add(trusted_bancos_validation_definition)

trusted_glassdoor_validation_definition = gx.ValidationDefinition(
    name="trusted_glassdoor",
    data=trusted_glassdoor_batch_definition,
    suite=trusted_glassdoor_suite,
)
context.validation_definitions.add(trusted_glassdoor_validation_definition)

trusted_reclamacoes_validation_definition = gx.ValidationDefinition(
    name="trusted_reclamacoes",
    data=trusted_reclamacoes_batch_definition,
    suite=trusted_reclamacoes_suite,
)
context.validation_definitions.add(trusted_reclamacoes_validation_definition)

delivery_validation_definition = gx.ValidationDefinition(
    name="delivery",
    data=delivery_batch_definition,
    suite=delivery_suite,
)
context.validation_definitions.add(delivery_validation_definition)


ValidationDefinition(name='delivery', data=BatchDefinition(id=UUID('a367553a-ac1d-4ba9-9561-f5fd3b12d588'), name='delivery', partitioner=None), suite={
  "name": "delivery_suite",
  "id": "c1b5b90d-aa9c-4d1e-9295-9a0e4465cc82",
  "expectations": [
    {
      "type": "expect_column_values_to_not_be_null",
      "kwargs": {
        "column": "cnpj"
      },
      "meta": {},
      "id": "48810e49-74c1-483f-bee9-6999016da132"
    },
    {
      "type": "expect_compound_columns_to_be_unique",
      "kwargs": {
        "column_list": [
          "cnpj",
          "year",
          "quarter"
        ]
      },
      "meta": {},
      "id": "17c58dbf-bb10-43cb-818e-f9f6bb2fa19a"
    },
    {
      "type": "expect_column_most_common_value_to_be_in_set",
      "kwargs": {
        "column": "segment",
        "value_set": [
          "S1",
          "S2",
          "S3"
        ]
      },
      "meta": {},
      "id": "78aef317-b70a-4dcc-9968-2743aec6fa59"
    },
    {
      "type": "expect_col

In [58]:
checkpoint = context.checkpoints.add(
    gx.Checkpoint(
        name="atividade5_checkpoint",
        validation_definitions=context.validation_definitions.all(),
        actions=[
            gx.checkpoint.actions.UpdateDataDocsAction(
                name="update_my_site", site_names=[site_name]
            )
        ],
    )
)

result = checkpoint.run()

Calculating Metrics: 100%|██████████| 4/4 [00:00<00:00, 777.37it/s] 
Calculating Metrics: 100%|██████████| 25/25 [00:00<00:00, 2274.32it/s]
Calculating Metrics: 100%|██████████| 111/111 [00:00<00:00, 3550.27it/s]
Calculating Metrics: 100%|██████████| 89/89 [00:00<00:00, 2544.17it/s]
Calculating Metrics: 100%|██████████| 78/78 [00:00<00:00, 1910.07it/s]


## Open

In [59]:
context.open_data_docs()