In [1]:
from snowflake.snowpark import Session
import os, time, requests, tempfile, sys
from pathlib import Path

# Snowflake connection settings (read from the .env used by your docker-compose)
cfg = {
    "account":   os.getenv("SNOWFLAKE_ACCOUNT"),
    "user":      os.getenv("SNOWFLAKE_USER"),
    "password":  os.getenv("SNOWFLAKE_PASSWORD"),
    "role":      os.getenv("SNOWFLAKE_ROLE","SYSADMIN"),
    "warehouse": os.getenv("SNOWFLAKE_WAREHOUSE"),
    "database":  os.getenv("SNOWFLAKE_DATABASE"),
    "schema":    os.getenv("SNOWFLAKE_SCHEMA_RAW","RAW"),
}

# Fail fast on missing settings. Without this check an unset SNOWFLAKE_DATABASE
# would silently produce identifiers such as "None.RAW.TRIPS_RAW" below.
_missing_env = [
    f"SNOWFLAKE_{key.upper()}"
    for key in ("account", "user", "password", "database")
    if not cfg[key]
]
if _missing_env:
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(_missing_env)
    )

session = Session.builder.configs(cfg).create()

# Fully qualified object names shared by the rest of the notebook
DB       = cfg["database"]
RAW_SCH  = cfg["schema"]
AN_SCH   = os.getenv("SNOWFLAKE_SCHEMA_ANALYTICS","ANALYTICS")
RAW_TBL  = f"{DB}.{RAW_SCH}.TRIPS_RAW"
STAGE    = f"{DB}.{RAW_SCH}.RAW_STAGE"            # created in Step 2
FF_PQ    = f"{DB}.{RAW_SCH}.FF_PARQUET_TRIPS"
FF_CSV   = f"{DB}.{RAW_SCH}.FF_CSV"               # created later in this notebook
ZONES_T  = f"{DB}.{RAW_SCH}.DIM_TAXI_ZONES"       # dimension tables
CAT_VENDOR_T = f"{DB}.{RAW_SCH}.DIM_VENDOR"
CAT_PAY_T    = f"{DB}.{RAW_SCH}.DIM_PAYMENT_TYPE"
CAT_RATE_T   = f"{DB}.{RAW_SCH}.DIM_RATE_CODE"
STG_TBL  = f"{DB}.{AN_SCH}.STG_TRIPS_ENRICHED"    # enriched / unified output

print(f"✅ Conectado a {DB}.{RAW_SCH}  (RAW={RAW_TBL})")


✅ Conectado a NYC_TAXI_DM.RAW  (RAW=NYC_TAXI_DM.RAW.TRIPS_RAW)


In [2]:
# Bootstrap objects, executed in order:
#   1. the ANALYTICS schema (target of the staging output), if it is missing
#   2. a generic CSV file format
bootstrap_ddl = (
    f"CREATE SCHEMA IF NOT EXISTS {DB}.{AN_SCH}",
    f"CREATE FILE FORMAT IF NOT EXISTS {FF_CSV} TYPE=CSV SKIP_HEADER=1 FIELD_OPTIONALLY_ENCLOSED_BY='\"'",
)
for ddl in bootstrap_ddl:
    session.sql(ddl).collect()

# (1) Download to a temp file (the URL can be overridden through the environment)
ZONES_URL = os.getenv(
    "TAXI_ZONES_URL",
    "https://d37ci6vzurychx.cloudfront.net/misc/taxi+_zone_lookup.csv",
)

def download_temp(url: str, suffix=".csv"):
    """Stream ``url`` into a new temporary file and return its path.

    Args:
        url: Address to fetch with an HTTP GET (120 s timeout).
        suffix: Suffix given to the temporary file name.

    Returns:
        pathlib.Path of the downloaded file. The caller owns the file and is
        responsible for deleting it.

    Raises:
        Whatever ``requests.get`` / ``raise_for_status`` raise for connection
        problems or non-success HTTP statuses, plus any I/O error hit while
        writing. On failure the partially written temp file is removed.
    """
    chunk = 1024 * 1024
    # Using the response as a context manager guarantees the streamed
    # connection is released even when the body is not fully consumed.
    with requests.get(url, stream=True, timeout=120) as r:
        r.raise_for_status()
        total = int(r.headers.get("Content-Length", 0))
        done = 0
        fd, tmp_path = tempfile.mkstemp(suffix=suffix)
        try:
            with os.fdopen(fd, "wb") as f:
                for part in r.iter_content(chunk_size=chunk):
                    if part:
                        f.write(part)
                        done += len(part)
                        # Progress is only shown when the server reports a size.
                        if total:
                            pct = 100 * done / total
                            sys.stdout.write(f"\r⬇️  {Path(tmp_path).name} {pct:5.1f}%")
                            sys.stdout.flush()
        except BaseException:
            # Do not leave a truncated file behind in the temp directory.
            Path(tmp_path).unlink(missing_ok=True)
            raise
    if total:
        sys.stdout.write("\n")
    return Path(tmp_path)

tmp = download_temp(ZONES_URL, ".csv")

# (2) Upload to the stage and load the dimension
prefix = f"@{STAGE}/ref/lookup_zones/"
try:
    # The temp file gets a new random name on every run, so overwrite=True never
    # replaces an earlier upload. Clear the prefix first: the COPY below reads
    # every file under it, and because the table is recreated each run, older
    # copies would be loaded again and duplicate the zone rows.
    session.sql(f"REMOVE {prefix}").collect()
    session.file.put(str(tmp), prefix, auto_compress=False, overwrite=True, parallel=8)
finally:
    # Always delete the local copy, even when the upload fails.
    tmp.unlink(missing_ok=True)

# DIM_TAXI_ZONES (idempotent: DROP/CREATE)
session.sql(f"DROP TABLE IF EXISTS {ZONES_T}").collect()
session.sql(f"""
CREATE TABLE {ZONES_T} (
  LOCATIONID   NUMBER(38,0),
  BOROUGH      STRING,
  ZONE         STRING,
  SERVICE_ZONE STRING
)
""").collect()

session.sql(f"""
COPY INTO {ZONES_T}
FROM (
  SELECT
    TO_NUMBER($1) AS LOCATIONID,
    $2::STRING    AS BOROUGH,
    $3::STRING    AS ZONE,
    $4::STRING    AS SERVICE_ZONE
  FROM {prefix} (FILE_FORMAT => {FF_CSV})
)
ON_ERROR = CONTINUE;
""").collect()

# ON_ERROR = CONTINUE skips bad rows silently, so verify the result: the table
# must not be empty, and LOCATIONID must be unique or joins on it multiply rows.
zone_stats = session.sql(f"""
SELECT COUNT(*) AS N, COUNT(LOCATIONID) AS K, COUNT(DISTINCT LOCATIONID) AS D
FROM {ZONES_T}
""").collect()[0]
if zone_stats["N"] == 0 or zone_stats["K"] != zone_stats["D"]:
    raise RuntimeError(
        f"{ZONES_T} failed validation: rows={zone_stats['N']}, "
        f"non-null ids={zone_stats['K']}, distinct ids={zone_stats['D']}"
    )

print("✅ DIM_TAXI_ZONES cargada")


✅ DIM_TAXI_ZONES cargada


In [3]:
def _load_catalog(table, key_col, label_col, rows):
    """Recreate a two-column lookup table and fill it with ``rows``.

    Args:
        table: Fully qualified name of the catalog table.
        key_col: Name of the NUMBER(38,0) code column.
        label_col: Name of the STRING description column.
        rows: Iterable of ``(code, label)`` pairs.

    Uses the notebook-level ``session``. CREATE OR REPLACE swaps the table in
    one statement instead of a separate DROP followed by CREATE, and the INSERT
    always names its columns so it does not depend on column order.
    """
    values_sql = ",\n".join(
        "({}, '{}')".format(int(code), label.replace("'", "''"))
        for code, label in rows
    )
    session.sql(f"""
CREATE OR REPLACE TABLE {table} ({key_col} NUMBER(38,0), {label_col} STRING)
""").collect()
    session.sql(f"""
INSERT INTO {table} ({key_col}, {label_col}) VALUES
{values_sql}
""").collect()


# Vendors (covers the codes present in the data: 1, 2, 6, 7)
_load_catalog(CAT_VENDOR_T, "VENDORID", "VENDOR_NAME", [
    (1, "Creative Mobile Technologies, LLC"),
    (2, "Curb Mobility, LLC"),
    (6, "Myle Technologies Inc"),
    (7, "Helix"),
])

# Payment types (includes flex fare 0 and voided 6)
_load_catalog(CAT_PAY_T, "PAYMENT_TYPE", "PAYMENT_DESC", [
    (0, "Flex Fare trip"),
    (1, "Credit card"),
    (2, "Cash"),
    (3, "No charge"),
    (4, "Dispute"),
    (5, "Unknown"),
    (6, "Voided trip"),
])

# Rate codes
_load_catalog(CAT_RATE_T, "RATECODEID", "RATE_DESC", [
    (1, "Standard rate"),
    (2, "JFK"),
    (3, "Newark"),
    (4, "Nassau or Westchester"),
    (5, "Negotiated fare"),
    (6, "Group ride"),
    (99, "Null/unknown"),
])

print("✅ Catálogos normalizados listos")


✅ Catálogos normalizados listos


In [4]:
# Processing plan.
#   PROCESS_ALL = True  -> every month of 2015..2025 (currently selected)
#   PROCESS_ALL = False -> only January 2015 (quick run)
# Both taxi services (yellow and green) are processed in either mode.
PROCESS_ALL = True

services = ["yellow","green"]
years, months = (
    (list(range(2015, 2026)), list(range(1, 13)))
    if PROCESS_ALL
    else ([2015], [1])
)

print("Plan:", services, years, months)


Plan: ['yellow', 'green'] [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025] [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]


In [5]:
# DDL for the enriched staging table. IF NOT EXISTS means an existing table
# is kept as it is; the table is clustered by (SERVICE, YEAR, MONTH).
stg_ddl = f"""
CREATE TABLE IF NOT EXISTS {STG_TBL} (
  SERVICE               STRING,
  YEAR                  NUMBER(38,0),
  MONTH                 NUMBER(38,0),

  VENDORID              NUMBER(38,0),
  VENDOR_NAME           STRING,
  PICKUP_DATETIME       TIMESTAMP_NTZ,
  DROPOFF_DATETIME      TIMESTAMP_NTZ,
  PASSENGER_COUNT       NUMBER(38,0),
  TRIP_DISTANCE         FLOAT,

  RATECODEID            NUMBER(38,0),
  RATE_DESC             STRING,
  STORE_AND_FWD_FLAG    STRING,

  PULOCATIONID          NUMBER(38,0),
  PU_BOROUGH            STRING,
  PU_ZONE               STRING,
  PU_SERVICE_ZONE       STRING,

  DOLOCATIONID          NUMBER(38,0),
  DO_BOROUGH            STRING,
  DO_ZONE               STRING,
  DO_SERVICE_ZONE       STRING,

  PAYMENT_TYPE          NUMBER(38,0),
  PAYMENT_DESC          STRING,

  FARE_AMOUNT           FLOAT,
  EXTRA                 FLOAT,
  MTA_TAX               FLOAT,
  TIP_AMOUNT            FLOAT,
  TOLLS_AMOUNT          FLOAT,
  IMPROVEMENT_SURCHARGE FLOAT,
  TOTAL_AMOUNT          FLOAT,
  CONGESTION_SURCHARGE  FLOAT,
  AIRPORT_FEE           FLOAT,
  TRIP_TYPE             NUMBER(38,0),
  CBD_CONGESTION_FEE    FLOAT,

  _RUN_ID               STRING,
  _ENRICHED_AT          TIMESTAMP_NTZ
)
CLUSTER BY (SERVICE, YEAR, MONTH)
"""
session.sql(stg_ddl).collect()

def build_enriched_month(service: str, year: int, month: int, run_id: str="step2_enrich"):
    """Rebuild one (service, year, month) slice of the enriched staging table.

    The slice is deleted from ``STG_TBL`` and re-inserted from ``RAW_TBL``,
    LEFT JOINed to the vendor, rate-code, payment-type and taxi-zone lookup
    tables (zones are joined twice: pickup and drop-off).

    Delete and insert run inside one explicit transaction, so a failed insert
    rolls back and leaves the previously loaded slice in place.

    Args:
        service: Value matched against the SERVICE column (e.g. "yellow").
        year: Value matched against the YEAR column.
        month: Value matched against the MONTH column.
        run_id: Tag written to the _RUN_ID column of every inserted row.

    Returns:
        int: number of rows in ``STG_TBL`` for the slice after the rebuild.

    Raises:
        Re-raises any error from the DELETE/INSERT after issuing a ROLLBACK.
    """
    print(f"\n🧩 Enriqueciendo {service} {year}-{str(month).zfill(2)}")

    # Double any single quote so the values are always valid SQL string literals.
    service_lit = service.replace("'", "''")
    run_id_lit = run_id.replace("'", "''")

    # Idempotency: the slice is wiped before being rebuilt.
    delete_sql = f"""
      DELETE FROM {STG_TBL}
      WHERE SERVICE='{service_lit}' AND YEAR={year} AND MONTH={month}
    """

    # Enriched insert. There is no column list, so the SELECT order must match
    # the column order of the staging table.
    ins_sql = f"""
    INSERT INTO {STG_TBL}
    SELECT
      r.SERVICE, r.YEAR, r.MONTH,
      r.VENDORID, v.VENDOR_NAME,
      r.PICKUP_DATETIME, r.DROPOFF_DATETIME,
      r.PASSENGER_COUNT, r.TRIP_DISTANCE,

      r.RATECODEID, rc.RATE_DESC, r.STORE_AND_FWD_FLAG,

      r.PULOCATIONID, zpu.BOROUGH AS PU_BOROUGH, zpu.ZONE AS PU_ZONE, zpu.SERVICE_ZONE AS PU_SERVICE_ZONE,
      r.DOLOCATIONID, zdo.BOROUGH AS DO_BOROUGH, zdo.ZONE AS DO_ZONE, zdo.SERVICE_ZONE AS DO_SERVICE_ZONE,

      r.PAYMENT_TYPE, p.PAYMENT_DESC,

      r.FARE_AMOUNT, r.EXTRA, r.MTA_TAX, r.TIP_AMOUNT, r.TOLLS_AMOUNT,
      r.IMPROVEMENT_SURCHARGE, r.TOTAL_AMOUNT, r.CONGESTION_SURCHARGE,
      r.AIRPORT_FEE, r.TRIP_TYPE, r.CBD_CONGESTION_FEE,

      '{run_id_lit}' AS _RUN_ID, SYSDATE() AS _ENRICHED_AT
    FROM {RAW_TBL} r
    LEFT JOIN {CAT_VENDOR_T} v ON r.VENDORID=v.VENDORID
    LEFT JOIN {CAT_RATE_T}   rc ON r.RATECODEID=rc.RATECODEID
    LEFT JOIN {CAT_PAY_T}    p  ON r.PAYMENT_TYPE=p.PAYMENT_TYPE
    LEFT JOIN {ZONES_T}      zpu ON r.PULOCATIONID=zpu.LOCATIONID
    LEFT JOIN {ZONES_T}      zdo ON r.DOLOCATIONID=zdo.LOCATIONID
    WHERE r.SERVICE='{service_lit}' AND r.YEAR={year} AND r.MONTH={month}
    """

    session.sql("BEGIN").collect()
    try:
        session.sql(delete_sql).collect()
        session.sql(ins_sql).collect()
        session.sql("COMMIT").collect()
    except BaseException:
        # BaseException also covers a kernel interrupt during a long insert,
        # which would otherwise leave the transaction open on the session.
        session.sql("ROLLBACK").collect()
        raise

    cnt = session.sql(f"""
      SELECT COUNT(*) AS C FROM {STG_TBL}
      WHERE SERVICE='{service_lit}' AND YEAR={year} AND MONTH={month}
    """).collect()[0]['C']
    print(f"✅ Enriquecidos: {cnt:,} filas")
    return int(cnt)

# Run the plan: one enrichment call per (service, year, month), in plan order.
# Each summary entry is (service, year, month, enriched_row_count).
summary = []
for s in services:
    for y in years:
        for m in months:
            enriched_rows = build_enriched_month(s, y, m)
            summary.append((s, y, m, enriched_rows))
summary[-5:]



🧩 Enriqueciendo yellow 2015-01
✅ Enriquecidos: 12,741,035 filas

🧩 Enriqueciendo yellow 2015-02
✅ Enriquecidos: 12,442,394 filas

🧩 Enriqueciendo yellow 2015-03
✅ Enriquecidos: 13,342,951 filas

🧩 Enriqueciendo yellow 2015-04
✅ Enriquecidos: 13,063,758 filas

🧩 Enriqueciendo yellow 2015-05
✅ Enriquecidos: 13,157,677 filas

🧩 Enriqueciendo yellow 2015-06
✅ Enriquecidos: 12,324,936 filas

🧩 Enriqueciendo yellow 2015-07
✅ Enriquecidos: 11,559,666 filas

🧩 Enriqueciendo yellow 2015-08
✅ Enriquecidos: 11,123,123 filas

🧩 Enriqueciendo yellow 2015-09
✅ Enriquecidos: 11,218,122 filas

🧩 Enriqueciendo yellow 2015-10
✅ Enriquecidos: 12,307,333 filas

🧩 Enriqueciendo yellow 2015-11
✅ Enriquecidos: 11,305,240 filas

🧩 Enriqueciendo yellow 2015-12
✅ Enriquecidos: 11,452,996 filas

🧩 Enriqueciendo yellow 2016-01
✅ Enriquecidos: 10,905,067 filas

🧩 Enriqueciendo yellow 2016-02
✅ Enriquecidos: 11,375,412 filas

🧩 Enriqueciendo yellow 2016-03
✅ Enriquecidos: 12,203,824 filas

🧩 Enriqueciendo yellow 2

[('green', 2025, 8, 46306),
 ('green', 2025, 9, 0),
 ('green', 2025, 10, 0),
 ('green', 2025, 11, 0),
 ('green', 2025, 12, 0)]

In [6]:
# Row count per month and service
monthly_counts_sql = f"""
SELECT SERVICE, YEAR, MONTH, COUNT(*) AS rowss
FROM {STG_TBL}
GROUP BY 1,2,3
ORDER BY YEAR, MONTH, SERVICE
"""

# Sample of the zone enrichment: the ten busiest pickup/drop-off combinations
top_routes_sql = f"""
SELECT SERVICE, YEAR, MONTH, PU_BOROUGH, PU_ZONE, DO_BOROUGH, DO_ZONE, COUNT(*) AS trips
FROM {STG_TBL}
GROUP BY 1,2,3,4,5,6,7
ORDER BY trips DESC
LIMIT 10
"""

# Catalog check: trips per payment type with its description
payment_mix_sql = f"""
SELECT PAYMENT_TYPE, PAYMENT_DESC, COUNT(*) c
FROM {STG_TBL}
GROUP BY 1,2
ORDER BY c DESC
"""

for check_sql in (monthly_counts_sql, top_routes_sql, payment_mix_sql):
    session.sql(check_sql).show()


-------------------------------------------
|"SERVICE"  |"YEAR"  |"MONTH"  |"ROWSS"   |
-------------------------------------------
|green      |2016    |5        |1536979   |
|yellow     |2016    |5        |11832049  |
|green      |2016    |6        |1404727   |
|yellow     |2016    |6        |11131645  |
|green      |2016    |7        |1332510   |
|yellow     |2016    |7        |10294080  |
|green      |2016    |8        |1247675   |
|yellow     |2016    |8        |9942263   |
|green      |2016    |9        |1162373   |
|yellow     |2016    |9        |10116018  |
-------------------------------------------

------------------------------------------------------------------------------------------------
|"SERVICE"  |"YEAR"  |"MONTH"  |"PU_BOROUGH"  |"PU_ZONE"  |"DO_BOROUGH"  |"DO_ZONE"  |"TRIPS"  |
------------------------------------------------------------------------------------------------
|yellow     |2015    |3        |Unknown       |N/A        |Unknown       |N/A        |216220