In [1]:
from pyspark.sql import SparkSession, functions as F
import os
from dotenv import load_dotenv

# Load Snowflake settings from the local .env file. override=True makes the
# file take precedence over variables already set in the process environment.
load_dotenv(".env", override=True)

# Fail fast when a mandatory connection setting is absent. Without this check a
# missing SNOWFLAKE_ACCOUNT silently turns into the host
# "None.snowflakecomputing.com" and the problem only surfaces later as an
# opaque connector error. SNOWFLAKE_WAREHOUSE is deliberately not enforced here
# so the previous behaviour for an unset warehouse is left untouched.
_REQUIRED_SF_ENV_VARS = (
    "SNOWFLAKE_ACCOUNT",
    "SNOWFLAKE_USER",
    "SNOWFLAKE_PASSWORD",
    "SNOWFLAKE_DATABASE",
)
_missing_sf_env_vars = [name for name in _REQUIRED_SF_ENV_VARS if not os.getenv(name)]
if _missing_sf_env_vars:
    raise RuntimeError(
        "Faltan variables de entorno requeridas para Snowflake: "
        + ", ".join(_missing_sf_env_vars)
    )

# Reuse an existing Spark session if one is active, otherwise create it.
# The shuffle partition count is set to 4 for this notebook.
spark = (
    SparkSession.builder
    .appName("Deber04_CalidadAuditoria")
    .config("spark.sql.shuffle.partitions", "4")
    .getOrCreate()
)

# Options handed to the Snowflake Spark connector. Credentials come from the
# environment only; nothing sensitive is hard-coded in the notebook.
SF_OPTIONS = {
    "sfURL": f"{os.getenv('SNOWFLAKE_ACCOUNT')}.snowflakecomputing.com",
    "sfDatabase": os.getenv("SNOWFLAKE_DATABASE"),
    "sfWarehouse": os.getenv("SNOWFLAKE_WAREHOUSE"),
    "sfUser": os.getenv("SNOWFLAKE_USER"),
    "sfPassword": os.getenv("SNOWFLAKE_PASSWORD"),
    "sfSchema": "ANALYTICS"
}

# DataFrame over ANALYTICS.TRIPS_ENRICHED (the schema comes from SF_OPTIONS).
df = (
    spark.read.format("snowflake")
    .options(**SF_OPTIONS)
    .option("dbtable", "TRIPS_ENRICHED")
    .load()
)

# count() is an action, so this line triggers the actual read.
print("Datos cargados:", df.count(), "filas")
df.printSchema()


Datos cargados: 16033686 filas
root
 |-- VENDORID: decimal(38,0) (nullable = true)
 |-- VENDOR_NAME: string (nullable = true)
 |-- PICKUP_DATETIME: timestamp (nullable = true)
 |-- DROPOFF_DATETIME: timestamp (nullable = true)
 |-- PASSENGER_COUNT: double (nullable = true)
 |-- TRIP_DISTANCE: double (nullable = true)
 |-- RATECODEID: double (nullable = true)
 |-- RATE_CODE_DESC: string (nullable = true)
 |-- STORE_AND_FWD_FLAG: string (nullable = true)
 |-- PULOCATIONID: decimal(38,0) (nullable = true)
 |-- DOLOCATIONID: decimal(38,0) (nullable = true)
 |-- PAYMENT_TYPE: double (nullable = true)
 |-- PAYMENT_TYPE_DESC: string (nullable = true)
 |-- FARE_AMOUNT: double (nullable = true)
 |-- EXTRA: double (nullable = true)
 |-- MTA_TAX: double (nullable = true)
 |-- TIP_AMOUNT: double (nullable = true)
 |-- TOLLS_AMOUNT: double (nullable = true)
 |-- IMPROVEMENT_SURCHARGE: double (nullable = true)
 |-- TOTAL_AMOUNT: double (nullable = true)
 |-- CONGESTION_SURCHARGE: double (nullable = 

In [5]:
# === Block 02: data-quality control (nulls, duplicates, ranges) ===
print("Ejecutando control de calidad directamente en Snowflake...")

# Independent copy of the base connector options with the schema set to ANALYTICS.
SF_OPTIONS_ANALYTICS = dict(SF_OPTIONS)
SF_OPTIONS_ANALYTICS["sfSchema"] = "ANALYTICS"

# The whole check is one SQL statement sent through the connector's "query"
# option, so only a single summary row comes back to Spark.
#
# Duplicate estimate: rows minus distinct (vendorid, pickup_datetime,
# dropoff_datetime, total_amount) tuples. The tuples are counted with
# SELECT DISTINCT rather than COUNT(DISTINCT CONCAT_WS(...)) because:
#   * Snowflake's CONCAT_WS returns NULL when any argument is NULL, and
#     COUNT(DISTINCT ...) skips NULLs, so every row with a NULL key column was
#     being reported as a duplicate;
#   * joining the parts with '-' can make different keys collide, since
#     timestamps and negative amounts contain '-' themselves.
# SELECT DISTINCT compares the columns directly and treats NULLs as equal.
df_quality = (
    spark.read.format("snowflake")
    .options(**SF_OPTIONS_ANALYTICS)
    .option("query", """
        SELECT
            s.total_filas,
            s.ejecuciones,
            s.nulos_trip_distance,
            s.nulos_total_amount,
            s.outliers,
            s.total_filas - k.claves_unicas AS duplicados_aprox
        FROM (
            SELECT
                COUNT(*) AS total_filas,
                COUNT(DISTINCT run_id) AS ejecuciones,
                SUM(CASE WHEN trip_distance IS NULL THEN 1 ELSE 0 END) AS nulos_trip_distance,
                SUM(CASE WHEN total_amount IS NULL THEN 1 ELSE 0 END) AS nulos_total_amount,
                SUM(CASE WHEN trip_distance < 0 OR total_amount < 0 THEN 1 ELSE 0 END) AS outliers
            FROM trips_enriched
        ) s
        CROSS JOIN (
            SELECT COUNT(*) AS claves_unicas
            FROM (
                SELECT DISTINCT vendorid, pickup_datetime, dropoff_datetime, total_amount
                FROM trips_enriched
            ) d
        ) k
    """)
    .load()
)

print("Resultados de control de calidad (Snowflake-side):")
df_quality.show(truncate=False)


Ejecutando control de calidad directamente en Snowflake...
esultados de control de calidad (Snowflake-side):
+-----------+-----------+-------------------+------------------+--------+----------------+
|TOTAL_FILAS|EJECUCIONES|NULOS_TRIP_DISTANCE|NULOS_TOTAL_AMOUNT|OUTLIERS|DUPLICADOS_APROX|
+-----------+-----------+-------------------+------------------+--------+----------------+
|16033686   |4          |0                  |0                 |21257   |1146            |
+-----------+-----------+-------------------+------------------+--------+----------------+



In [8]:
# Verificación de auditoría RAW.INGEST_AUDIT
import snowflake.connector
import pandas as pd
from dotenv import load_dotenv
import os
import warnings

# Silence pandas UserWarnings about non-SQLAlchemy connectors.
warnings.filterwarnings("ignore", category=UserWarning, module="pandas")

# Load environment variables (the .env file overrides existing values).
load_dotenv(".env", override=True)

# Direct connection to Snowflake, using the RAW schema.
conn = snowflake.connector.connect(
    account=os.getenv("SNOWFLAKE_ACCOUNT"),
    user=os.getenv("SNOWFLAKE_USER"),
    password=os.getenv("SNOWFLAKE_PASSWORD"),
    warehouse=os.getenv("SNOWFLAKE_WAREHOUSE"),
    database=os.getenv("SNOWFLAKE_DATABASE"),
    schema="RAW"
)

# Audit query: the 20 most recent INGEST_AUDIT entries, newest first.
query = """
SELECT
    service_type,
    source_year,
    source_month,
    run_id,
    rows_ingested,
    logged_at_utc
FROM INGEST_AUDIT
ORDER BY logged_at_utc DESC
LIMIT 20
"""

# Run the query and materialise the rows in pandas. The try/finally ensures
# the connection is closed even when execute()/fetchall() raises; previously a
# failed query left the session open.
try:
    with conn.cursor() as cur:
        cur.execute(query)
        # Column names are taken from the cursor metadata.
        df_audit = pd.DataFrame(cur.fetchall(), columns=[desc[0] for desc in cur.description])
finally:
    conn.close()

print("Últimos registros de auditoría:")
display(df_audit)



Últimos registros de auditoría:


Unnamed: 0,SERVICE_TYPE,SOURCE_YEAR,SOURCE_MONTH,RUN_ID,ROWS_INGESTED,LOGGED_AT_UTC
0,green,2019,2,p3_run_20251017_205741,615594,2025-10-17T20:58:05.784979
1,green,2019,1,p3_run_20251017_205708,672105,2025-10-17T20:57:34.634569
2,yellow,2019,2,p3_run_20251017_205443,7049370,2025-10-17T20:57:01.882916
3,yellow,2019,1,p3_run_20251017_205201,7696617,2025-10-17T20:54:29.091247


In [10]:
# === Bloque 04: Resumen global de control de calidad + auditoría ===
from datetime import datetime

# Quality summary: first row of the Block 02 result as a plain dict keyed by
# column name.
quality = df_quality.collect()[0].asDict()

# Audit summary: row 0 of df_audit is used as the latest entry (the audit
# query sorts by logged_at_utc DESC). "N/A" is the fallback for an empty table.
total_auditorias = len(df_audit)
if total_auditorias > 0:
    ultimo_run = df_audit["RUN_ID"].iloc[0]
    ultima_fecha = df_audit["LOGGED_AT_UTC"].iloc[0]
else:
    ultimo_run = "N/A"
    ultima_fecha = "N/A"

# Assemble the report line by line and emit it with a single print call.
separador = "----------------------------------------------------"
report_lines = [
    "\n**Resumen de Control de Calidad y Auditoría**",
    separador,
    f"Fecha de verificación: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
    f"Total de filas en ANALYTICS.TRIPS_ENRICHED: {quality['TOTAL_FILAS']}",
    f"Ejecuciones únicas: {quality['EJECUCIONES']}",
    f"Nulos en trip_distance: {quality['NULOS_TRIP_DISTANCE']}",
    f"Nulos en total_amount: {quality['NULOS_TOTAL_AMOUNT']}",
    f"Valores fuera de rango: {quality['OUTLIERS']}",
    f"Duplicados aproximados: {quality['DUPLICADOS_APROX']}",
    separador,
    f"Auditorías registradas: {total_auditorias}",
    f"Último run_id: {ultimo_run}",
    f"Último log registrado: {ultima_fecha}",
    separador,
    "Control y auditoría completados correctamente.",
]
print("\n".join(report_lines))



**Resumen de Control de Calidad y Auditoría**
----------------------------------------------------
Fecha de verificación: 2025-10-18 23:17:39
Total de filas en ANALYTICS.TRIPS_ENRICHED: 16033686
Ejecuciones únicas: 4
Nulos en trip_distance: 0
Nulos en total_amount: 0
Valores fuera de rango: 21257
Duplicados aproximados: 1146
----------------------------------------------------
Auditorías registradas: 4
Último run_id: p3_run_20251017_205741
Último log registrado: 2025-10-17T20:58:05.784979
----------------------------------------------------
Control y auditoría completados correctamente.
