# Data Quality Checks: Bronze vs Silver
This notebook connects to DuckLake and queries paired Bronze and Silver tables to compare raw vs processed data.

In [5]:
# Install the packages this notebook needs; the %pip magic installs into the
# environment of the running kernel.
# TODO(review): versions are unpinned — consider pinning them for reproducible runs.
%pip install duckdb pandas python-dotenv requests boto3

Note: you may need to restart the kernel to use updated packages.


In [9]:
import duckdb
import pandas as pd
import os
from dotenv import load_dotenv

# Read settings from the .env file two directories up (path is relative to the
# working directory). override=True lets values from the file replace variables
# that are already set in the process environment.
load_dotenv(dotenv_path='../../.env', override=True)

True

In [10]:
# RustFS configuration (used below for DuckDB's S3 settings).
# Every value is read from the environment, with a fallback default.
RUSTFS_HOST = os.environ.get('RUSTFS_HOST', 'localhost')
RUSTFS_PORT = os.environ.get('RUSTFS_PORT', '8080')
RUSTFS_USER = os.environ.get('RUSTFS_USER', 'admin')
RUSTFS_PASSWORD = os.environ.get('RUSTFS_PASSWORD', 'password')
RUSTFS_BUCKET = os.environ.get('RUSTFS_BUCKET', 'mitma')
RUSTFS_SSL = os.environ.get('RUSTFS_SSL', 'false')

# Postgres configuration (used below as the DuckLake catalog database)
POSTGRES_USER = os.environ.get('POSTGRES_USER', 'postgres')
POSTGRES_PASSWORD = os.environ.get('POSTGRES_PASSWORD', 'password')
POSTGRES_HOST = os.environ.get('POSTGRES_HOST', 'localhost')
POSTGRES_PORT = os.environ.get('POSTGRES_PORT', '5432')
POSTGRES_DB = os.environ.get('POSTGRES_DB', 'muceim')

# S3 endpoint in plain "host:port" form. No URL scheme is included here;
# SSL is switched on or off separately through RUSTFS_SSL.
S3_ENDPOINT = ":".join((RUSTFS_HOST, RUSTFS_PORT))

In [12]:
print(f"Connecting to RustFS at {S3_ENDPOINT}...")

# Open a DuckDB connection (no database file is given)
con = duckdb.connect()

# Install and load the extensions used below, in this order:
# httpfs (S3 access), postgres (catalog database), ducklake.
for extension in ("httpfs", "postgres", "ducklake"):
    con.execute(f"INSTALL {extension};")
    con.execute(f"LOAD {extension};")

# Point DuckDB's S3 settings at RustFS
con.execute(f"SET s3_endpoint='{S3_ENDPOINT}';")
con.execute(f"SET s3_access_key_id='{RUSTFS_USER}';")
con.execute(f"SET s3_secret_access_key='{RUSTFS_PASSWORD}';")
con.execute(f"SET s3_use_ssl={RUSTFS_SSL};")
con.execute("SET s3_url_style='path';")
# Resource settings: do not require insertion order to be preserved, and cap
# the temporary directory at 40GiB.
con.execute("SET preserve_insertion_order=false;")
con.execute("SET max_temp_directory_size='40GiB';")


def _postgres_connection_string(password):
    """Return the Postgres key=value connection string using *password*."""
    return f"dbname={POSTGRES_DB} host={POSTGRES_HOST} user={POSTGRES_USER} password={password} port={POSTGRES_PORT}"


def _attach_statement(connection_string):
    """Return the ATTACH statement for DuckLake with a Postgres catalog and the RustFS bucket as data path."""
    return f"ATTACH 'ducklake:postgres:{connection_string}' AS ducklake (DATA_PATH 's3://{RUSTFS_BUCKET}/');"


# Attach DuckLake with Postgres Catalog
postgres_connection_string = _postgres_connection_string(POSTGRES_PASSWORD)
attach_query = _attach_statement(postgres_connection_string)

# Log a redacted copy of the statement: the real one embeds the Postgres
# password, which must not end up in the saved notebook output.
redacted_attach_query = _attach_statement(_postgres_connection_string("***"))
print(f"Attaching DuckLake with query: {redacted_attach_query}")

# NOTE: if the attach fails, DuckDB's own error message echoes the connection
# string (password included) — clear this cell's output before sharing.
con.execute(attach_query)
con.execute("USE ducklake;")

print("DuckLake configured with Postgres catalog and RustFS storage.")

Connecting to RustFS at localhost:9000...
Attaching DuckLake with query: ATTACH 'ducklake:postgres:dbname=mitma host=localhost user=admin password=*** port=30432' AS ducklake (DATA_PATH 's3://mitma/');


IOException: IO Error: Failed to attach DuckLake MetaData "__ducklake_metadata_ducklake" at path + "postgres:dbname=mitma host=localhost user=admin password=*** port=30432"Unable to connect to Postgres at "dbname=mitma host=localhost user=admin password=*** port=30432": connection to server at "localhost" (127.0.0.1), port 30432 failed: Connection refused
	Is the server running on that host and accepting TCP/IP connections?


In [None]:
def SQL(query):
    """Run ``query`` on the notebook's DuckDB connection and return a DataFrame.

    Args:
        query: SQL text passed unchanged to ``con.execute``.

    Returns:
        The result of ``fetchdf()`` on success. If anything raises while
        executing or fetching, the error is printed and ``None`` is returned
        instead of propagating the exception.
    """
    frame = None
    try:
        frame = con.execute(query).fetchdf()
    except Exception as err:
        # Best-effort helper: report the failure and fall through to return None.
        print(f"Error executing query: {err}")
    return frame

# List the available tables to verify the table names used by the queries below.
# SQL() prints the error and returns None if the statement fails.
SQL("SHOW TABLES")

## 1. MITMA OD (Origin-Destination)

In [None]:
# Bronze: OD Municipios (Raw) — peek at up to 5 rows.
# There is no ORDER BY, so which rows come back is not guaranteed.
SQL("SELECT * FROM bronze_mitma_od_municipios LIMIT 5")

In [None]:
# Silver: OD Unified (Processed) — peek at up to 5 rows.
# There is no ORDER BY, so which rows come back is not guaranteed.
SQL("SELECT * FROM silver_mitma_od LIMIT 5")

## 2. MITMA People Day

In [None]:
# Bronze: People Day Municipios (Raw) — peek at up to 5 rows.
# There is no ORDER BY, so which rows come back is not guaranteed.
SQL("SELECT * FROM bronze_mitma_people_day_municipios LIMIT 5")

In [None]:
# Silver: People Day Unified (Processed) — peek at up to 5 rows.
# There is no ORDER BY, so which rows come back is not guaranteed.
SQL("SELECT * FROM silver_mitma_people_day LIMIT 5")

## 3. MITMA Overnight Stay

In [None]:
# Bronze: Overnight Stay Municipios (Raw) — peek at up to 5 rows.
# There is no ORDER BY, so which rows come back is not guaranteed.
SQL("SELECT * FROM bronze_mitma_overnight_stay_municipios LIMIT 5")

In [None]:
# Silver: Overnight Stay Unified (Processed) — peek at up to 5 rows.
# There is no ORDER BY, so which rows come back is not guaranteed.
SQL("SELECT * FROM silver_mitma_overnight_stay LIMIT 5")

## 4. MITMA Zonification

In [None]:
# Bronze: Zonification Municipios (Raw) — peek at up to 5 rows.
# There is no ORDER BY, so which rows come back is not guaranteed.
SQL("SELECT * FROM bronze_mitma_municipios LIMIT 5")

In [None]:
# Silver: Zonification Unified (Processed) — peek at up to 5 rows.
# There is no ORDER BY, so which rows come back is not guaranteed.
SQL("SELECT * FROM silver_mitma_zonification LIMIT 5")

## 5. INE Empresas

In [None]:
# Bronze: Empresas Municipio (Raw JSON) — peek at up to 5 rows.
# There is no ORDER BY, so which rows come back is not guaranteed.
SQL("SELECT * FROM bronze_ine_empresas_municipio LIMIT 5")

In [None]:
# Silver: Business Unified (Processed) — peek at up to 5 rows.
# There is no ORDER BY, so which rows come back is not guaranteed.
SQL("SELECT * FROM silver_business LIMIT 5")

## 6. INE Poblacion

In [None]:
# Bronze: Poblacion Municipio (Raw JSON) — peek at up to 5 rows.
# There is no ORDER BY, so which rows come back is not guaranteed.
SQL("SELECT * FROM bronze_ine_poblacion_municipio LIMIT 5")

In [None]:
# Silver: Population Unified (Processed) — peek at up to 5 rows.
# There is no ORDER BY, so which rows come back is not guaranteed.
SQL("SELECT * FROM silver_population LIMIT 5")

## 7. INE Renta

In [None]:
# Bronze: Renta Municipio (Raw JSON) — peek at up to 5 rows.
# There is no ORDER BY, so which rows come back is not guaranteed.
SQL("SELECT * FROM bronze_ine_renta_municipio LIMIT 5")

In [None]:
# Silver: Income Unified (Processed) — peek at up to 5 rows.
# There is no ORDER BY, so which rows come back is not guaranteed.
SQL("SELECT * FROM silver_income LIMIT 5")