In [2]:
pip install tabulate

Collecting tabulate
  Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Installing collected packages: tabulate
Successfully installed tabulate-0.9.0
Note: you may need to restart the kernel to use updated packages.


In [3]:
# -*- coding: utf-8 -*-
# 04_validaciones_y_exploracion.ipynb
# ----------------------------------
# Objetivo:
# - Validar la calidad de datos en ANALYTICS.OBT_TRIPS
# - Revisar nulos, rangos, coherencia temporal y conteos
# - Generar KPIs de sanity check (porcentaje de nulos, distancias, duraciones, etc.)

import os
import pandas as pd
from snowflake.snowpark import Session
from tabulate import tabulate

# --- Snowflake connection ---
# Every connection parameter is read from the environment so that no
# credentials are hard-coded in the notebook. Only the role has a fallback.
cfg = {
    "account":   os.getenv("SNOWFLAKE_ACCOUNT"),
    "user":      os.getenv("SNOWFLAKE_USER"),
    "password":  os.getenv("SNOWFLAKE_PASSWORD"),
    "role":      os.getenv("SNOWFLAKE_ROLE", "SYSADMIN"),
    "warehouse": os.getenv("SNOWFLAKE_WAREHOUSE"),
    "database":  os.getenv("SNOWFLAKE_DATABASE"),
}

# Fail fast: os.getenv returns None for unset variables, which would otherwise
# surface as an opaque connector error or, for the database, as a table name
# of the form "None.ANALYTICS.OBT_TRIPS" in every query below.
_required_keys = ("account", "user", "password", "database")
_missing_vars = [f"SNOWFLAKE_{key.upper()}" for key in _required_keys if not cfg[key]]
if _missing_vars:
    raise RuntimeError(
        "Faltan variables de entorno de Snowflake: " + ", ".join(_missing_vars)
    )

session = Session.builder.configs(cfg).create()
DB = cfg["database"]
AN_SCHEMA = "ANALYTICS"
# Fully qualified name of the table that every check below reads from.
OBT_TABLE = f"{DB}.{AN_SCHEMA}.OBT_TRIPS"

print(f"✅ Conectado a {DB} (schema={AN_SCHEMA})")

# ------------------------------------------------------------
# 1️⃣ NULL VALIDATION
# ------------------------------------------------------------
# One row per SERVICE with the number of NULLs found in each of the main
# columns (pickup/dropoff timestamps and locations, distance, duration, total).
print("\n 1. Nulos por campo principal")
nulls_sql = f"""
SELECT
  SERVICE,
  COUNT_IF(PICKUP_DATETIME IS NULL) AS N_PU_NULL,
  COUNT_IF(DROPOFF_DATETIME IS NULL) AS N_DO_NULL,
  COUNT_IF(PULOCATIONID IS NULL) AS N_PU_LOC_NULL,
  COUNT_IF(DOLOCATIONID IS NULL) AS N_DO_LOC_NULL,
  COUNT_IF(TRIP_DISTANCE IS NULL) AS N_DIST_NULL,
  COUNT_IF(TRIP_DURATION_MIN IS NULL) AS N_DUR_NULL,
  COUNT_IF(TOTAL_AMOUNT IS NULL) AS N_TOTAL_NULL
FROM {OBT_TABLE}
GROUP BY SERVICE
ORDER BY SERVICE;
"""
# Run the query in Snowflake and pull the (small) aggregate into pandas.
nulls_query = session.sql(nulls_sql)
df_nulls = nulls_query.to_pandas()
nulls_table = tabulate(df_nulls, headers="keys", tablefmt="pretty")
print(nulls_table)

# ------------------------------------------------------------
# 2️⃣ DATE RANGES AND COHERENCE
# ------------------------------------------------------------
# Per-SERVICE minimum and maximum of the pickup/dropoff timestamps, trip
# duration, trip distance and average speed, for eyeballing extreme values.
print("\n 2. Coherencia temporal y rangos")
time_checks_sql = f"""
SELECT
  SERVICE,
  MIN(PICKUP_DATETIME) AS MIN_PU,
  MAX(PICKUP_DATETIME) AS MAX_PU,
  MIN(DROPOFF_DATETIME) AS MIN_DO,
  MAX(DROPOFF_DATETIME) AS MAX_DO,
  MIN(TRIP_DURATION_MIN) AS MIN_DUR,
  MAX(TRIP_DURATION_MIN) AS MAX_DUR,
  MIN(TRIP_DISTANCE) AS MIN_DIST,
  MAX(TRIP_DISTANCE) AS MAX_DIST,
  MIN(AVG_SPEED_MPH) AS MIN_VEL,
  MAX(AVG_SPEED_MPH) AS MAX_VEL
FROM {OBT_TABLE}
GROUP BY SERVICE
ORDER BY SERVICE;
"""
# Execute remotely, then materialise the aggregate locally for printing.
time_checks_query = session.sql(time_checks_sql)
df_time = time_checks_query.to_pandas()
time_table = tabulate(df_time, headers="keys", tablefmt="pretty")
print(time_table)

# ------------------------------------------------------------
# 3️⃣ LOGICAL RANGE VALIDATION (FLAGS)
# ------------------------------------------------------------
# Upper bounds used to flag a trip as out of range. They are named here so
# they can be tuned in one place instead of being buried in the SQL text.
MAX_TRIP_DURATION_MIN = 48 * 60  # 48 hours expressed in minutes
MAX_TRIP_DISTANCE = 150          # same unit as TRIP_DISTANCE (presumably miles — confirm)
MAX_AVG_SPEED_MPH = 100

# Per-SERVICE count of rows whose duration or distance is negative or above
# its bound, and of rows whose average speed is above its bound.
# NOTE(review): unlike duration and distance, speed has no lower-bound check,
# so negative speeds are not counted in SPEED_OUT — confirm this is intended.
print("\n 3. Rango lógico (fuera de rango)")
range_sql = f"""
SELECT
  SERVICE,
  SUM(IFF(TRIP_DURATION_MIN < 0 OR TRIP_DURATION_MIN > {MAX_TRIP_DURATION_MIN},1,0)) AS DUR_OUT,
  SUM(IFF(TRIP_DISTANCE < 0 OR TRIP_DISTANCE > {MAX_TRIP_DISTANCE},1,0)) AS DIST_OUT,
  SUM(IFF(AVG_SPEED_MPH > {MAX_AVG_SPEED_MPH},1,0)) AS SPEED_OUT
FROM {OBT_TABLE}
GROUP BY SERVICE
ORDER BY SERVICE;
"""
df_range = session.sql(range_sql).to_pandas()
print(tabulate(df_range, headers="keys", tablefmt="pretty"))

# ------------------------------------------------------------
# 4️⃣ COHERENCE BETWEEN DATES AND FIELDS
# ------------------------------------------------------------
# Per-SERVICE count of rows where the dropoff precedes the pickup, and of rows
# whose YEAR / MONTH columns disagree with the year / month of the pickup
# timestamp.
print("\n 4. Checks de coherencia (orden temporal, mes/año, etc.)")
coherence_sql = f"""
SELECT
  SERVICE,
  SUM(IFF(DROPOFF_DATETIME < PICKUP_DATETIME,1,0)) AS BAD_ORDER,
  SUM(IFF(EXTRACT(YEAR FROM PICKUP_DATETIME) != YEAR,1,0)) AS YEAR_MISMATCH,
  SUM(IFF(EXTRACT(MONTH FROM PICKUP_DATETIME) != MONTH,1,0)) AS MONTH_MISMATCH
FROM {OBT_TABLE}
GROUP BY SERVICE
ORDER BY SERVICE;
"""
# Execute remotely, then materialise the aggregate locally for printing.
coherence_query = session.sql(coherence_sql)
df_coherence = coherence_query.to_pandas()
coherence_table = tabulate(df_coherence, headers="keys", tablefmt="pretty")
print(coherence_table)

# ------------------------------------------------------------
# 5️⃣ COUNTS PER MONTH AND SERVICE
# ------------------------------------------------------------
# Row count and distinct TRIP_ID count for every SERVICE / YEAR / MONTH
# combination, used as a coverage control.
print("\n 5. Conteos por servicio/mes (control de cobertura)")
counts_sql = f"""
SELECT
  SERVICE, YEAR, MONTH,
  COUNT(*) AS N_TRIPS,
  COUNT(DISTINCT TRIP_ID) AS N_UNIQUE
FROM {OBT_TABLE}
GROUP BY 1,2,3
ORDER BY 1,2,3;
"""
df_counts = session.sql(counts_sql).to_pandas()

# Only a preview is printed to keep the output short; say so explicitly when
# rows are being left out.
PREVIEW_ROWS = 12
print(tabulate(df_counts.head(PREVIEW_ROWS), headers="keys", tablefmt="pretty"))
if len(df_counts) > PREVIEW_ROWS:
    print(f"(mostrando {PREVIEW_ROWS} de {len(df_counts)} filas)")

# The preview hides every row after the first 12, so evaluate the
# N_TRIPS vs N_UNIQUE comparison over the full result. COUNT(DISTINCT ...)
# skips NULLs, so a mismatch means duplicated or NULL TRIP_ID values.
df_count_mismatch = df_counts[df_counts["N_TRIPS"] != df_counts["N_UNIQUE"]]
if df_count_mismatch.empty:
    print(f"Sin TRIP_ID duplicados o nulos en {len(df_counts)} combinaciones servicio/año/mes")
else:
    print(f"⚠️ {len(df_count_mismatch)} combinaciones servicio/año/mes con TRIP_ID duplicados o nulos:")
    print(tabulate(df_count_mismatch, headers="keys", tablefmt="pretty"))

# ------------------------------------------------------------
# 6️⃣ NULL PERCENTAGES AND QUALITY METRICS
# ------------------------------------------------------------
# Per-SERVICE row total plus, as a percentage of that total rounded to two
# decimals: NULL durations, NULL distances, NULL totals, and rows where the
# tip is greater than the total amount.
print("\n 6. Métricas de calidad globales")
quality_sql = f"""
SELECT
  SERVICE,
  COUNT(*) AS TOTAL,
  ROUND(100 * COUNT_IF(TRIP_DURATION_MIN IS NULL) / COUNT(*), 2) AS PCT_DUR_NULL,
  ROUND(100 * COUNT_IF(TRIP_DISTANCE IS NULL) / COUNT(*), 2) AS PCT_DIST_NULL,
  ROUND(100 * COUNT_IF(TOTAL_AMOUNT IS NULL) / COUNT(*), 2) AS PCT_TOTAL_NULL,
  ROUND(100 * COUNT_IF(TIP_AMOUNT > TOTAL_AMOUNT) / COUNT(*), 2) AS PCT_TIP_OVER_TOTAL
FROM {OBT_TABLE}
GROUP BY SERVICE
ORDER BY SERVICE;
"""
# Execute remotely, then materialise the aggregate locally for printing.
quality_query = session.sql(quality_sql)
df_quality = quality_query.to_pandas()
quality_table = tabulate(df_quality, headers="keys", tablefmt="pretty")
print(quality_table)

# ------------------------------------------------------------
# 7️⃣ EXPORT of the results to a local CSV
# ------------------------------------------------------------
# Persist the per-service quality metrics next to the notebook; the path is
# relative, so the file lands in the current working directory.
quality_csv_path = "validacion_obt_quality_summary.csv"
df_quality.to_csv(quality_csv_path, index=False)
print("\n💾 Resultados guardados en validacion_obt_quality_summary.csv")

# ------------------------------------------------------------
# 8️⃣ Overall summary
# ------------------------------------------------------------
# Final marker so the end of the validation run is visible in the output.
print("\n✅ Validación terminada.")


✅ Conectado a NYC_TAXI_DM (schema=ANALYTICS)

 1. Nulos por campo principal
+---+---------+-----------+-----------+---------------+---------------+-------------+------------+--------------+
|   | SERVICE | N_PU_NULL | N_DO_NULL | N_PU_LOC_NULL | N_DO_LOC_NULL | N_DIST_NULL | N_DUR_NULL | N_TOTAL_NULL |
+---+---------+-----------+-----------+---------------+---------------+-------------+------------+--------------+
| 0 |  green  |     0     |     0     |       0       |       0       |      0      |     0      |      0       |
| 1 | yellow  |     0     |     0     |       0       |       0       |      0      |     0      |      0       |
+---+---------+-----------+-----------+---------------+---------------+-------------+------------+--------------+

 2. Coherencia temporal y rangos
+---+---------+---------------------+---------------------+---------------------+---------------------+---------+---------+----------+----------+------------------------+---------+
|   | SERVICE |       MIN