In [1]:
import os
from datetime import datetime

def expand_range_csv(spec):
    if "-" in spec and "," not in spec:
        a,b = spec.split("-")
        return list(range(int(a), int(b)+1))
    return [int(x.strip()) for x in spec.split(",") if x.strip()]

YEARS = expand_range_csv(os.getenv("YEARS", "2015"))
MONTHS = expand_range_csv(os.getenv("MONTHS", "1-12"))
SERVICES = [s.strip().lower() for s in os.getenv("SERVICES","yellow,green").split(",")]

RUN_ID = os.getenv("RUN_ID", f"run_{datetime.utcnow().strftime('%Y%m%d_%H%M%S')}")
PARQUET_BASE_URL = os.getenv("PARQUET_BASE_URL")

print("RUN_ID:", RUN_ID)
print("YEARS:", YEARS)
print("MONTHS:", MONTHS)
print("SERVICES:", SERVICES)
print("PARQUET_BASE_URL:", PARQUET_BASE_URL)


RUN_ID: dev_0001
YEARS: [2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024, 2025]
MONTHS: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
SERVICES: ['yellow', 'green']
PARQUET_BASE_URL: https://d37ci6vzurychx.cloudfront.net/trip-data


In [2]:
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("Deber03_Ingesta")
    .config("spark.sql.shuffle.partitions", "4")
    .getOrCreate()
)
spark


In [18]:
SF_OPTIONS = {
    "sfURL": f"{os.getenv('SNOWFLAKE_ACCOUNT')}.snowflakecomputing.com",
    "sfAccount": os.getenv("SNOWFLAKE_ACCOUNT"),
    "sfUser": os.getenv("SNOWFLAKE_USER"),
    "sfPassword": os.getenv("SNOWFLAKE_PASSWORD"),
    "sfDatabase": os.getenv("SNOWFLAKE_DATABASE"),
    "sfWarehouse": os.getenv("SNOWFLAKE_WAREHOUSE"),
    "sfSchema": os.getenv("SNOWFLAKE_SCHEMA", "RAW"),
}
SCHEMA_RAW = os.getenv("SNOWFLAKE_SCHEMA_RAW", "RAW")
print("Conectando a Snowflake en:", SF_OPTIONS["sfURL"])


Conectando a Snowflake en: ALNWMMV-NJ56428.snowflakecomputing.com


In [19]:
!cat .env

# --- Snowflake ---
SNOWFLAKE_ACCOUNT=ALNWMMV-NJ56428
SNOWFLAKE_USER=USERSNOW
SNOWFLAKE_PASSWORD=Password123456
SNOWFLAKE_DATABASE=NYCTAXI_P3
SNOWFLAKE_SCHEMA_RAW=RAW
SNOWFLAKE_SCHEMA_ANALYTICS=ANALYTICS
SNOWFLAKE_WAREHOUSE=COMPUTE_WH
SNOWFLAKE_ROLE=ACCOUNTADMIN
SNOWFLAKE_PRIVATE_KEY_PATH=null
SNOWFLAKE_PRIVATE_KEY_PASSPHRASE=null
SNOWFLAKE_TIMEOUT=null

# --- Parámetros de procesamiento ---
YEARS=2015-2025
MONTHS=1-12
SERVICES=yellow,green
RUN_ID=dev_0001
PARQUET_BASE_URL=https://d37ci6vzurychx.cloudfront.net/trip-data

# --- Jupyter / Spark ---
JUPYTER_TOKEN=class123
JUPYTER_PORT=8888
SPARK_UI_PORT=4040


In [20]:
import snowflake.connector, os
from dotenv import load_dotenv
load_dotenv(".env", override=True)

conn = snowflake.connector.connect(
    account=os.getenv("SNOWFLAKE_ACCOUNT"),
    user=os.getenv("SNOWFLAKE_USER"),
    password=os.getenv("SNOWFLAKE_PASSWORD"),
    warehouse=os.getenv("SNOWFLAKE_WAREHOUSE"),
    database=os.getenv("SNOWFLAKE_DATABASE"),
    schema=os.getenv("SNOWFLAKE_SCHEMA_RAW")
)
cur = conn.cursor()
cur.execute("SELECT current_region(), current_version()")
print("Conexión OK:", cur.fetchone())
cur.close()
conn.close()


Conexión OK: ('AWS_SA_EAST_1', '9.32.1')


In [21]:
from pyspark.sql import SparkSession
import os
from dotenv import load_dotenv

# Cargar variables desde .env
load_dotenv(".env", override=True)

spark = (
    SparkSession.builder
    .appName("Deber03_Ingesta_RAW")
    .config("spark.sql.shuffle.partitions", "4")
    .getOrCreate()
)

SF_OPTIONS = {
    "sfURL": f"{os.getenv('SNOWFLAKE_ACCOUNT')}.snowflakecomputing.com",
    "sfAccount": os.getenv("SNOWFLAKE_ACCOUNT"),
    "sfUser": os.getenv("SNOWFLAKE_USER"),
    "sfPassword": os.getenv("SNOWFLAKE_PASSWORD"),
    "sfDatabase": os.getenv("SNOWFLAKE_DATABASE"),
    "sfWarehouse": os.getenv("SNOWFLAKE_WAREHOUSE"),
    "sfSchema": os.getenv("SNOWFLAKE_SCHEMA_RAW", "RAW"),
}

print("Probando conexión Spark–Snowflake...")
test_df = spark.createDataFrame([(1, "ok")], ["id", "msg"])
(
    test_df.write
    .format("snowflake")
    .options(**SF_OPTIONS)
    .option("dbtable", "RAW.CONN_TEST_SPARK")
    .mode("overwrite")
    .save()
)
print("Spark conectado correctamente a Snowflake RAW")


Probando conexión Spark–Snowflake...
Spark conectado correctamente a Snowflake RAW


In [22]:
import os, requests, tempfile
from pyspark.sql import functions as F
from datetime import datetime

BASE_URL = os.getenv("PARQUET_BASE_URL", "https://d37ci6vzurychx.cloudfront.net/trip-data")
SCHEMA_RAW = os.getenv("SNOWFLAKE_SCHEMA_RAW", "RAW")

SERVICES = ["yellow", "green"]
YEARS = [2019]   # luego puedes ampliar a 2015–2025
MONTHS = [1, 2]  # luego ampliar a 1–12


In [23]:
def read_parquet(service, year, month):
    """Descarga un Parquet, lo lee con Spark, estandariza columnas y agrega metadatos."""
    file_name = f"{service}_tripdata_{year}-{month:02d}.parquet"
    url = f"{BASE_URL}/{file_name}"
    print(f"Descargando {url}")

    tmp_path = os.path.join(tempfile.gettempdir(), file_name)
    try:
        r = requests.get(url, timeout=60)
        r.raise_for_status()
        open(tmp_path, "wb").write(r.content)
    except Exception as e:
        raise RuntimeError(f"No se pudo descargar {url}: {e}")

    # Leer con Spark
    df = spark.read.parquet(tmp_path)

    # Estandarizar fechas
    if service == "yellow":
        df = (
            df
            .withColumnRenamed("tpep_pickup_datetime", "pickup_datetime")
            .withColumnRenamed("tpep_dropoff_datetime", "dropoff_datetime")
        )
    else:
        df = (
            df
            .withColumnRenamed("lpep_pickup_datetime", "pickup_datetime")
            .withColumnRenamed("lpep_dropoff_datetime", "dropoff_datetime")
        )

    # Agregar metadatos
    run_id = f"p3_run_{datetime.utcnow():%Y%m%d_%H%M%S}"
    df = (
        df
        .withColumn("service_type", F.lit(service))
        .withColumn("source_year", F.lit(year))
        .withColumn("source_month", F.lit(month))
        .withColumn("run_id", F.lit(run_id))
        .withColumn("ingested_at_utc", F.lit(datetime.utcnow().isoformat()))
        .withColumn("source_path", F.lit(url))
    )

    print(f"{service}_{year}-{month:02d} leído correctamente ({df.count():,} filas)")
    return df


In [24]:
def write_to_raw(df, service, year, month):
    """Escribe DataFrame en Snowflake RAW.{SERVICE}_{YEAR}_{MONTH}."""
    table_name = f"{SCHEMA_RAW}.{service}_{year}_{month:02d}"
    options = dict(SF_OPTIONS)
    options["dbtable"] = table_name

    # Convertir timestamps a string
    for c in ["pickup_datetime", "dropoff_datetime"]:
        if c in df.columns:
            df = df.withColumn(c, F.date_format(F.col(c), "yyyy-MM-dd HH:mm:ss"))

    total_rows = df.count()
    print(f"Guardando en {table_name} ({total_rows:,} filas)...")

    (
        df.write
        .format("snowflake")
        .options(**options)
        .mode("overwrite")  # idempotente por mes/servicio
        .save()
    )

    print(f"Escrito correctamente → {table_name}")


In [27]:
from pyspark.sql import Row

def log_audit(service, year, month, rows_ingested, run_id):
    """Registra metadatos de cada carga en la tabla RAW.INGEST_AUDIT."""
    audit_df = spark.createDataFrame([
        Row(
            service_type=service,
            source_year=year,
            source_month=month,
            run_id=run_id,
            rows_ingested=rows_ingested,
            logged_at_utc=datetime.utcnow().isoformat()
        )
    ])

    (
        audit_df.write
        .format("snowflake")
        .options(**SF_OPTIONS)
        .option("dbtable", f"{SCHEMA_RAW}.INGEST_AUDIT")
        .mode("append")
        .save()
    )

    print(f"Registro auditoría → RAW.INGEST_AUDIT ({service} {year}-{month:02d}: {rows_ingested:,} filas)")


In [28]:
for s in SERVICES:
    for y in YEARS:
        for m in MONTHS:
            try:
                df = read_parquet(s, y, m)
                write_to_raw(df, s, y, m)
                rows = df.count()
                run_id = df.select("run_id").first()["run_id"]
                log_audit(s, y, m, rows, run_id)
            except Exception as e:
                print(f"Error con {s} {y}-{m:02d}: {e}")


Descargando https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2019-01.parquet
yellow_2019-01 leído correctamente (7,696,617 filas)
Guardando en RAW.yellow_2019_01 (7,696,617 filas)...
Escrito correctamente → RAW.yellow_2019_01
Registro auditoría → RAW.INGEST_AUDIT (yellow 2019-01: 7,696,617 filas)
Descargando https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2019-02.parquet
yellow_2019-02 leído correctamente (7,049,370 filas)
Guardando en RAW.yellow_2019_02 (7,049,370 filas)...
Escrito correctamente → RAW.yellow_2019_02
Registro auditoría → RAW.INGEST_AUDIT (yellow 2019-02: 7,049,370 filas)
Descargando https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2019-01.parquet
green_2019-01 leído correctamente (672,105 filas)
Guardando en RAW.green_2019_01 (672,105 filas)...
Escrito correctamente → RAW.green_2019_01
Registro auditoría → RAW.INGEST_AUDIT (green 2019-01: 672,105 filas)
Descargando https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripda