In [2]:
pip install "snowflake-snowpark-python[pandas]"

Collecting snowflake-snowpark-python[pandas]
  Downloading snowflake_snowpark_python-1.40.0-py3-none-any.whl.metadata (170 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m170.6/170.6 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting snowflake-connector-python<4.0.0,>=3.17.0 (from snowflake-snowpark-python[pandas])
  Downloading snowflake_connector_python-3.18.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (74 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.8/74.8 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
Collecting tzlocal (from snowflake-snowpark-python[pandas])
  Downloading tzlocal-5.3.1-py3-none-any.whl.metadata (7.6 kB)
Collecting asn1crypto<2.0.0,>0.24.0 (from snowflake-connector-python<4.0.0,>=3.17.0->snowflake-snowpark-python[pandas])
  Downloading asn1crypto-1.5.1-py2.py3-none-any.whl.metadata (13 kB)
Collecting boto3>=1.24 (from snowflake-connector-python<4.0.0,>=3.17.0->snowflake-sn

In [3]:
import os, time
from snowflake.snowpark import Session

# --- Connection settings come from environment variables (.env in Docker) ---
cfg = {
    "account":   os.getenv("SNOWFLAKE_ACCOUNT"),
    "user":      os.getenv("SNOWFLAKE_USER"),
    "password":  os.getenv("SNOWFLAKE_PASSWORD"),
    "role":      os.getenv("SNOWFLAKE_ROLE", "SYSADMIN"),
    "warehouse": os.getenv("SNOWFLAKE_WAREHOUSE"),
    "database":  os.getenv("SNOWFLAKE_DATABASE"),
    "schema":    os.getenv("SNOWFLAKE_SCHEMA_RAW", "RAW"),
}

# Fail fast: without these values the object names below would be built from
# the literal text "None" (e.g. "None.RAW") and the error would only surface
# later, inside the destructive DROP/CREATE statements.
_missing = [key for key in ("account", "user", "password", "database") if not cfg[key]]
if _missing:
    raise RuntimeError(
        "Missing required environment variables: "
        + ", ".join(f"SNOWFLAKE_{key.upper()}" for key in _missing)
    )

session = Session.builder.configs(cfg).create()
print(f"✅ Conectado: {cfg['database']}.{cfg['schema']}  (WH={cfg['warehouse']}, ROLE={cfg['role']})")

# --- Notebook-wide names used by the later cells ---
DB, SCHEMA = cfg["database"], cfg["schema"]
STAGE  = "RAW_STAGE"
FF_PQ  = "FF_PARQUET_TRIPS"
TABLE  = f"{DB}.{SCHEMA}.TRIPS_RAW"

# --- Clean slate: WARNING, this drops the RAW table (and all rows already
# --- loaded into it), the stage and the file format every time the cell runs.
session.sql(f"CREATE SCHEMA IF NOT EXISTS {DB}.{SCHEMA}").collect()
session.sql(f"DROP TABLE IF EXISTS {TABLE}").collect()
session.sql(f"DROP STAGE IF EXISTS {DB}.{SCHEMA}.{STAGE}").collect()
session.sql(f"DROP FILE FORMAT IF EXISTS {DB}.{SCHEMA}.{FF_PQ}").collect()

# --- Create stage and file format (PARQUET is declared with no extra options) ---
session.sql(f"CREATE STAGE {DB}.{SCHEMA}.{STAGE}").collect()
session.sql(f"CREATE FILE FORMAT {DB}.{SCHEMA}.{FF_PQ} TYPE = PARQUET").collect()

# --- Create the RAW table (superset of the Yellow and Green columns) ---
session.sql(f"""
CREATE TABLE {TABLE} (
  SERVICE                STRING,
  YEAR                   NUMBER(38,0),
  MONTH                  NUMBER(38,0),

  VENDORID               NUMBER(38,0),
  PICKUP_DATETIME        TIMESTAMP_NTZ,
  DROPOFF_DATETIME       TIMESTAMP_NTZ,
  PASSENGER_COUNT        NUMBER(38,0),
  TRIP_DISTANCE          FLOAT,
  RATECODEID             NUMBER(38,0),
  STORE_AND_FWD_FLAG     STRING,
  PULOCATIONID           NUMBER(38,0),
  DOLOCATIONID           NUMBER(38,0),
  PAYMENT_TYPE           NUMBER(38,0),
  FARE_AMOUNT            FLOAT,
  EXTRA                  FLOAT,
  MTA_TAX                FLOAT,
  TIP_AMOUNT             FLOAT,
  TOLLS_AMOUNT           FLOAT,
  IMPROVEMENT_SURCHARGE  FLOAT,
  TOTAL_AMOUNT           FLOAT,
  CONGESTION_SURCHARGE   FLOAT,
  AIRPORT_FEE            FLOAT,          -- solo aplica a yellow
  TRIP_TYPE              NUMBER(38,0),   -- solo aplica a green
  CBD_CONGESTION_FEE     FLOAT,          -- recientes

  _RUN_ID                STRING,
  _INGESTED_AT           TIMESTAMP_NTZ,
  _BATCH_TAG             STRING,
  _CHUNK_ID              NUMBER(38,0)
);
""").collect()

print("✅ Objetos listos:")
print("   Tabla RAW:", TABLE)
print("   Stage:", f"@{DB}.{SCHEMA}.{STAGE}")
print("   File Format:", f"{DB}.{SCHEMA}.{FF_PQ}")


✅ Conectado: NYC_TAXI_DM.RAW  (WH=COMPUTE_WH, ROLE=ACCOUNTADMIN)
✅ Objetos listos:
   Tabla RAW: NYC_TAXI_DM.RAW.TRIPS_RAW
   Stage: @NYC_TAXI_DM.RAW.RAW_STAGE
   File Format: NYC_TAXI_DM.RAW.FF_PARQUET_TRIPS


In [4]:
import sys, tempfile, requests
from pathlib import Path

# Run parameters. YEARS, MONTHS and RUN_ID are read from the environment (.env)
# with the defaults shown; SERVICES is fixed in code.
# YEARS and MONTHS are inclusive "start-end" ranges, e.g. "2015-2015" / "01-01"
# for a single-month smoke test.
SERVICES = ["green", "yellow"]
YEARS = os.environ.get("YEARS", "2019-2019")
MONTHS = os.environ.get("MONTHS", "01-03")
RUN_ID = os.environ.get("RUN_ID", "manual_0001")

# Download tuning: per-request timeout passed to requests and number of attempts.
DOWNLOAD_TIMEOUT = int(os.environ.get("DOWNLOAD_TIMEOUT", "120"))
RETRIES = int(os.environ.get("DOWNLOAD_RETRIES", "3"))

banner = f"🗓️ SERVICES={SERVICES}  YEARS={YEARS}  MONTHS={MONTHS}  RUN_ID={RUN_ID}"
print(banner)

def expand_range(rng: str, pad=False):
    """Expand an inclusive ``"start-end"`` range into a list of strings.

    A single value without a dash (e.g. ``"2019"``) is accepted and treated as
    a one-element range. Surrounding whitespace is ignored.

    Args:
        rng: Range text such as ``"2015-2019"``, ``"01-03"`` or ``"2019"``.
        pad: When true, each value is left-padded with zeros to width 2.

    Returns:
        The values from start to end (both included) as strings; an empty list
        when start is greater than end.

    Raises:
        ValueError: If either bound is not an integer.
    """
    first, dash, last = rng.strip().partition("-")
    lo = int(first)
    # No dash means a single value: the range collapses to [lo, lo].
    hi = int(last) if dash else lo
    width = 2 if pad else 0  # zfill(0) leaves the text unchanged
    return [str(value).zfill(width) for value in range(lo, hi + 1)]

# Concrete year values and zero-padded month values the ingestion loop iterates over.
YEARS_LIST, MONTHS_LIST = expand_range(YEARS), expand_range(MONTHS, pad=True)

def tlc_url(service, year, month, ext="parquet"):
    """Return the URL of the ``{service}_tripdata_{year}-{month}.{ext}`` file.

    The values are inserted as given, so ``month`` must already be in the form
    the file name uses (the notebook passes zero-padded months).
    """
    base = "https://d37ci6vzurychx.cloudfront.net/trip-data"
    filename = f"{service}_tripdata_{year}-{month}.{ext}"
    return f"{base}/{filename}"

def download_temp(url: str, suffix: str):
    """Stream ``url`` into a temporary file and return its Path.

    Up to RETRIES attempts are made. Network errors and transient HTTP
    statuses (429 and 5xx) are retried; any other non-200 status returns
    None immediately. A partially written temp file left by a failed attempt
    is deleted before the next attempt, and the HTTP response is always
    closed.

    Args:
        url: File to download.
        suffix: Suffix for the temporary file name (e.g. ".parquet").

    Returns:
        Path of the downloaded temp file (the caller must delete it), or None
        when the file is not available or every attempt failed.
    """
    chunk = 1024 * 1024
    for attempt in range(1, RETRIES + 1):
        tmp_path = None
        try:
            # The context manager releases the streamed connection on every exit path.
            with requests.get(url, stream=True, timeout=DOWNLOAD_TIMEOUT) as r:
                if r.status_code == 429 or r.status_code >= 500:
                    # Transient server-side condition: let the retry loop handle it.
                    raise RuntimeError(f"HTTP {r.status_code}")
                if r.status_code != 200:
                    return None
                total = int(r.headers.get("Content-Length", 0))
                done = 0
                fd, tmp_path = tempfile.mkstemp(suffix=suffix)
                with os.fdopen(fd, "wb") as f:
                    for part in r.iter_content(chunk_size=chunk):
                        if part:
                            f.write(part)
                            done += len(part)
                            if total:
                                pct = 100 * done / max(total, 1)
                                sys.stdout.write(f"\r⬇️  {Path(tmp_path).name} {pct:5.1f}%")
                                sys.stdout.flush()
                if total:
                    sys.stdout.write("\n")
            return Path(tmp_path)
        except Exception as e:
            # Do not leave a truncated file behind for each failed attempt.
            if tmp_path is not None:
                try:
                    os.remove(tmp_path)
                except OSError:
                    pass
            print(f"   intento {attempt}/{RETRIES} falló → {e}")
    return None


🗓️ SERVICES=['green', 'yellow']  YEARS=2015-2025  MONTHS=01-12  RUN_ID=manual_0001


In [5]:
def make_copy_select_sql(service: str, year: str, month: str, run_id: str) -> str:
    """Build the ``COPY INTO ... FROM (SELECT ...)`` statement for one month.

    The SELECT reads the staged Parquet rows (``$1``) from
    ``@DB.SCHEMA.STAGE/{service}/{year}/{MM}/`` and maps them onto the RAW
    table columns, adding the SERVICE/YEAR/MONTH and lineage columns.

    Args:
        service: ``"yellow"`` or ``"green"``; selects the tpep_/lpep_ timestamp fields.
        year: Year, convertible with ``int()``.
        month: Month, convertible with ``int()``; zero-padded to two digits in
            the stage path and the batch tag.
        run_id: Value stored in ``_RUN_ID``. It is escaped before being embedded
            in the SQL string constant.

    Returns:
        The COPY statement as text.

    Raises:
        ValueError: If ``service`` is not ``"yellow"`` or ``"green"``, or if
            ``year``/``month`` are not integers.
    """
    db, sc, stg, ff, table = DB, SCHEMA, STAGE, FF_PQ, TABLE
    y, m = int(year), int(month)

    # run_id ends up inside a single-quoted SQL constant: escape backslashes
    # (an escape character there) and double any quote so that the value can
    # neither break the statement nor alter it.
    run_id_lit = str(run_id).replace("\\", "\\\\").replace("'", "''")

    def fld(name: str) -> str:
        # Returns the Parquet field as a VARIANT, trying four spellings of the
        # key (as given, lower, UPPER, Capitalized). This is not a fully
        # case-insensitive lookup: other mixed-case spellings are not matched.
        cand = [
            f'$1:"{name}"',
            f'$1:"{name.lower()}"',
            f'$1:"{name.upper()}"',
            f'$1:"{name.capitalize()}"',
        ]
        return "COALESCE(" + ", ".join(cand) + ")"

    def ts_auto(name: str) -> str:
        v = fld(name)  # VARIANT
        # 1) If the VARIANT already holds a TIMESTAMP_* value, cast it straight to NTZ
        # 2) Otherwise try to parse it as a string
        # 3) Otherwise try it as a number, guessing the scale from the digit count
        num_expr = f"TRY_TO_NUMBER(TO_VARCHAR({v}))"
        num_len  = f"LENGTH(TO_VARCHAR({num_expr}))"
        return f"""
        CASE
          WHEN TYPEOF({v}) IN ('TIMESTAMP_NTZ','TIMESTAMP_LTZ','TIMESTAMP_TZ') THEN {v}::TIMESTAMP_NTZ
          WHEN TRY_TO_TIMESTAMP_NTZ(TO_VARCHAR({v})) IS NOT NULL THEN TRY_TO_TIMESTAMP_NTZ(TO_VARCHAR({v}))
          WHEN {num_expr} IS NOT NULL THEN
            CASE
              WHEN {num_len} >= 16 THEN TO_TIMESTAMP_NTZ({num_expr}, 6)  -- microsegundos
              WHEN {num_len} >= 13 THEN TO_TIMESTAMP_NTZ({num_expr}, 3)  -- milisegundos
              ELSE TO_TIMESTAMP_NTZ({num_expr}, 0)                       -- segundos
            END
          ELSE NULL
        END
        """

    # Shared fields (TRY_* conversions yield NULL instead of failing the load)
    vendorid              = f"TRY_TO_NUMBER(TO_VARCHAR({fld('VendorID')}))"
    passenger_count       = f"TRY_TO_NUMBER(TO_VARCHAR({fld('passenger_count')}))"
    trip_distance         = f"TRY_TO_DOUBLE(TO_VARCHAR({fld('trip_distance')}))"
    ratecodeid            = f"TRY_TO_NUMBER(TO_VARCHAR({fld('RatecodeID')}))"
    store_and_fwd_flag    = f"TO_VARCHAR({fld('store_and_fwd_flag')})"
    pulocationid          = f"TRY_TO_NUMBER(TO_VARCHAR({fld('PULocationID')}))"
    dolocationid          = f"TRY_TO_NUMBER(TO_VARCHAR({fld('DOLocationID')}))"
    payment_type          = f"TRY_TO_NUMBER(TO_VARCHAR({fld('payment_type')}))"
    fare_amount           = f"TRY_TO_DOUBLE(TO_VARCHAR({fld('fare_amount')}))"
    extra                 = f"TRY_TO_DOUBLE(TO_VARCHAR({fld('extra')}))"
    mta_tax               = f"TRY_TO_DOUBLE(TO_VARCHAR({fld('mta_tax')}))"
    tip_amount            = f"TRY_TO_DOUBLE(TO_VARCHAR({fld('tip_amount')}))"
    tolls_amount          = f"TRY_TO_DOUBLE(TO_VARCHAR({fld('tolls_amount')}))"
    improvement_surcharge = f"TRY_TO_DOUBLE(TO_VARCHAR({fld('improvement_surcharge')}))"
    total_amount          = f"TRY_TO_DOUBLE(TO_VARCHAR({fld('total_amount')}))"
    congestion_surcharge  = f"TRY_TO_DOUBLE(TO_VARCHAR({fld('congestion_surcharge')}))"
    airport_fee           = f"TRY_TO_DOUBLE(TO_VARCHAR({fld('airport_fee')}))"   # yellow
    trip_type             = f"TRY_TO_NUMBER(TO_VARCHAR({fld('trip_type')}))"     # green
    cbd_congestion_fee    = f"TRY_TO_DOUBLE(TO_VARCHAR({fld('cbd_congestion_fee')}))"

    # The pickup/dropoff field names differ per service.
    if service == "yellow":
        pu_ts = ts_auto('tpep_pickup_datetime')
        do_ts = ts_auto('tpep_dropoff_datetime')
    elif service == "green":
        pu_ts = ts_auto('lpep_pickup_datetime')
        do_ts = ts_auto('lpep_dropoff_datetime')
    else:
        raise ValueError("service debe ser 'yellow' o 'green'")

    select_sql = f"""
    SELECT
      '{service}'::STRING                    AS SERVICE,
      {y}::NUMBER                            AS YEAR,
      {m}::NUMBER                            AS MONTH,

      {vendorid}                             AS VENDORID,
      {pu_ts}                                AS PICKUP_DATETIME,
      {do_ts}                                AS DROPOFF_DATETIME,
      {passenger_count}                      AS PASSENGER_COUNT,
      {trip_distance}                        AS TRIP_DISTANCE,
      {ratecodeid}                           AS RATECODEID,
      {store_and_fwd_flag}                   AS STORE_AND_FWD_FLAG,
      {pulocationid}                         AS PULOCATIONID,
      {dolocationid}                         AS DOLOCATIONID,
      {payment_type}                         AS PAYMENT_TYPE,
      {fare_amount}                          AS FARE_AMOUNT,
      {extra}                                AS EXTRA,
      {mta_tax}                              AS MTA_TAX,
      {tip_amount}                           AS TIP_AMOUNT,
      {tolls_amount}                         AS TOLLS_AMOUNT,
      {improvement_surcharge}                AS IMPROVEMENT_SURCHARGE,
      {total_amount}                         AS TOTAL_AMOUNT,
      {congestion_surcharge}                 AS CONGESTION_SURCHARGE,
      {airport_fee}                          AS AIRPORT_FEE,
      {trip_type}                            AS TRIP_TYPE,
      {cbd_congestion_fee}                   AS CBD_CONGESTION_FEE,

      '{run_id_lit}'::STRING                 AS _RUN_ID,
      CURRENT_TIMESTAMP()                    AS _INGESTED_AT,
      '{service}/{y}/{str(m).zfill(2)}'      AS _BATCH_TAG,
      NULL::NUMBER                           AS _CHUNK_ID
    FROM @{db}.{sc}.{stg}/{service}/{y}/{str(m).zfill(2)}/
      (FILE_FORMAT => {db}.{sc}.{ff})
    """

    # FORCE = TRUE reloads files regardless of load history; PURGE = FALSE
    # leaves the staged files in place after the load.
    return f"""
    COPY INTO {table}
    FROM (
      {select_sql}
    )
    ON_ERROR = 'ABORT_STATEMENT'
    PURGE = FALSE
    FORCE = TRUE;
    """


In [6]:
def ingest_month(service: str, year: str, month: str, run_id: str) -> int:
    """Download one month of trip data, stage it and load it into the RAW table.

    Steps: download the Parquet file to a temp file, PUT it to the stage,
    replace that service/year/month slice of the table (DELETE + COPY inside
    one transaction), remove the staged file and delete the temp file.

    Args:
        service: Service name passed to tlc_url and make_copy_select_sql.
        year: Year, convertible with ``int()``.
        month: Month, convertible with ``int()``.
        run_id: Run identifier stored with the loaded rows.

    Returns:
        Number of rows reported as loaded by COPY, or 0 when the file could
        not be downloaded.

    Raises:
        ValueError: If ``year`` or ``month`` is not an integer.
        Exception: Any error from the PUT/DELETE/COPY/REMOVE statements is
            propagated after the open transaction has been rolled back.
    """
    print(f"\n📅 {service} {year}-{month} — inicio")

    # Canonical path components. make_copy_select_sql reads from
    # {int(year)}/{zero-padded month}, so the upload must use the same form
    # or COPY would look in a different stage folder than PUT wrote to.
    yyyy = str(int(year))
    mm = str(int(month)).zfill(2)

    # Download the Parquet file to a temp file
    url = tlc_url(service, yyyy, mm, "parquet")
    print("→ descargando:", url)
    tmp = download_temp(url, ".parquet")
    if tmp is None:
        print("⚠️ no encontrado (parquet). Saltando.")
        return 0

    try:
        prefix = f"@{DB}.{SCHEMA}.{STAGE}/{service}/{yyyy}/{mm}/"

        # PUT to the stage (parallel upload)
        t_put = time.time()
        put_res = session.file.put(
            local_file_name=str(tmp),
            stage_location=prefix,
            auto_compress=False,
            overwrite=True,
            parallel=8
        )
        print(f"   ⬆️ PUT ({len(put_res)} file/s) t={time.time()-t_put:,.1f}s")

        # Idempotent reload: delete this service/month and load it again.
        # Both statements run in one transaction so that a failed COPY does
        # not leave the table without the rows that were there before.
        t_copy = time.time()
        copy_sql = make_copy_select_sql(service, year, month, run_id)
        session.sql("BEGIN").collect()
        try:
            session.sql(f"""
              DELETE FROM {TABLE}
              WHERE SERVICE='{service}' AND YEAR={int(year)} AND MONTH={int(month)}
            """).collect()
            res = session.sql(copy_sql).collect()
            session.sql("COMMIT").collect()
        except Exception:
            try:
                session.sql("ROLLBACK").collect()
            except Exception as rb_err:
                print("⚠️ ROLLBACK falló:", rb_err)
            raise

        # Rows without a rows_loaded value (or with NULL) count as 0.
        loaded = sum((r.asDict().get('rows_loaded') or 0) for r in res)
        print(f"   ✅ COPY rows={loaded:,}  t={time.time()-t_copy:,.1f}s")

        # Remove this month's files from the stage (only after a successful load)
        session.sql(f"REMOVE {prefix}").collect()
        print("🧽 Stage limpio para", f"{service}/{year}/{month}")

        print(f"✅ {service} {year}-{month} — OK")
        return int(loaded)
    finally:
        # The local temp file is deleted whether or not the load succeeded.
        try:
            tmp.unlink(missing_ok=True)
            print(f"🧹 temp borrado: {tmp.name}")
        except Exception as e:
            print("⚠️ no pude borrar temp:", e)


In [7]:
import pandas as pd

# Ingest every service/year/month combination. A failure in one month is
# recorded as rows_loaded = -1 so that the remaining months still run.
summary = []
t_all = time.time()
for s in SERVICES:
    for y in YEARS_LIST:
        for m in MONTHS_LIST:
            try:
                n = ingest_month(s, y, m, RUN_ID)
                summary.append((s, y, m, n))
            except Exception as e:
                print(f"❌ Error en {s} {y}-{m} → {e}")
                summary.append((s, y, m, -1))

print(f"\n🏁 Proceso completo en {time.time()-t_all:,.1f}s")
df = pd.DataFrame(summary, columns=["service","year","month","rows_loaded"])
display(df.tail(12))

# Make failures visible even when they are not among the last 12 rows shown.
failed = int((df["rows_loaded"] < 0).sum())
if failed:
    print(f"⚠️ {failed} mes(es) con error (rows_loaded = -1)")

from pathlib import Path
# The evidence folder can be overridden with EVIDENCE_DIR; the default is the
# path this notebook has always written to.
EVID = Path(os.getenv("EVIDENCE_DIR", "/home/jovyan/work/evidence"))
EVID.mkdir(parents=True, exist_ok=True)
out = EVID / f"raw_ingest_summary_{RUN_ID}.csv"
df.to_csv(out, index=False)
print("📄 evidencia:", out)



📅 green 2015-01 — inicio
→ descargando: https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2015-01.parquet
⬇️  tmpxem1_u0t.parquet 100.0%
   ⬆️ PUT (1 file/s) t=3.9s
   ✅ COPY rows=1,508,493  t=35.1s
🧽 Stage limpio para green/2015/01
✅ green 2015-01 — OK
🧹 temp borrado: tmpxem1_u0t.parquet

📅 green 2015-02 — inicio
→ descargando: https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2015-02.parquet
⬇️  tmpew1txu93.parquet 100.0%
   ⬆️ PUT (1 file/s) t=3.2s
   ✅ COPY rows=1,574,830  t=35.7s
🧽 Stage limpio para green/2015/02
✅ green 2015-02 — OK
🧹 temp borrado: tmpew1txu93.parquet

📅 green 2015-03 — inicio
→ descargando: https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2015-03.parquet
⬇️  tmprx0a44s8.parquet 100.0%
   ⬆️ PUT (1 file/s) t=3.8s
   ✅ COPY rows=1,722,574  t=39.3s
🧽 Stage limpio para green/2015/03
✅ green 2015-03 — OK
🧹 temp borrado: tmprx0a44s8.parquet

📅 green 2015-04 — inicio
→ descargando: https://d37ci6vzurychx.cloudfront.net/trip-data

Unnamed: 0,service,year,month,rows_loaded
252,yellow,2025,1,3475226
253,yellow,2025,2,3577543
254,yellow,2025,3,4145257
255,yellow,2025,4,3970553
256,yellow,2025,5,4591845
257,yellow,2025,6,4322960
258,yellow,2025,7,3898963
259,yellow,2025,8,3574091
260,yellow,2025,9,0
261,yellow,2025,10,0


📄 evidencia: /home/jovyan/work/evidence/raw_ingest_summary_manual_0001.csv
