In [None]:
import os
# Snowflake connection settings, read from the process environment.
# Account, user, password and database have no fallback and are None when
# the variable is unset; schema and warehouse fall back to fixed defaults.
SNOWFLAKE_ACCOUNT = os.environ.get('SNOWFLAKE_ACCOUNT')
SNOWFLAKE_USER = os.environ.get('SNOWFLAKE_USER')
SNOWFLAKE_PASSWORD = os.environ.get('SNOWFLAKE_PASSWORD')
SNOWFLAKE_DATABASE = os.environ.get('SNOWFLAKE_DATABASE')
SNOWFLAKE_SCHEMA_ANALYTICS = os.environ.get('SNOWFLAKE_SCHEMA_ANALYTICS', 'ANALYTICS')
SNOWFLAKE_WAREHOUSE = os.environ.get('SNOWFLAKE_WAREHOUSE', 'COMPUTE_WH')

In [None]:
# ============================================================
# DIRECT SNOWFLAKE CONNECTION (NATIVE SQL)
# ============================================================
print("=== CONEXIÓN SNOWFLAKE PARA SQL NATIVO ===")

# Leading keywords of statements whose rows are fetched when
# fetch_results=True.
_ROW_RETURNING_KEYWORDS = ("select", "with", "show", "describe", "desc")


def _split_sql_statements(sql_text):
    """
    Split a SQL script into individual statements.

    Semicolons are treated as separators only when they appear outside
    single-quoted strings, double-quoted identifiers, $$-delimited bodies,
    `--` line comments and `/* */` block comments.

    Returns a list of (text, bare) tuples: `text` is the statement exactly as
    written (comments included, surrounding whitespace stripped) and `bare` is
    the same statement with comments removed. Fragments that contain nothing
    but comments/whitespace are dropped, since there is nothing to execute.
    """
    statements = []
    raw = []    # characters of the current statement, comments included
    code = []   # characters of the current statement, comments excluded
    mode = None  # None, "'", '"', "--", "/*" or "$$"
    pos = 0
    size = len(sql_text)

    def flush():
        text = "".join(raw).strip()
        bare = "".join(code).strip()
        if bare:
            statements.append((text, bare))
        raw.clear()
        code.clear()

    while pos < size:
        char = sql_text[pos]
        pair = sql_text[pos:pos + 2]

        if mode is None:
            if pair in ("--", "/*"):
                mode = pair
                raw.append(pair)
                pos += 2
            elif pair == "$$":
                mode = "$$"
                raw.append(pair)
                code.append(pair)
                pos += 2
            elif char == ";":
                flush()
                pos += 1
            else:
                if char in ("'", '"'):
                    mode = char
                raw.append(char)
                code.append(char)
                pos += 1
        elif mode == "--":
            raw.append(char)
            if char == "\n":
                mode = None
                # Keep the line break so the tokens around the comment
                # stay separated in the comment-free text.
                code.append(char)
            pos += 1
        elif mode == "/*":
            if pair == "*/":
                raw.append(pair)
                code.append(" ")
                mode = None
                pos += 2
            else:
                raw.append(char)
                pos += 1
        elif mode == "$$":
            if pair == "$$":
                mode = None
                raw.append(pair)
                code.append(pair)
                pos += 2
            else:
                raw.append(char)
                code.append(char)
                pos += 1
        else:
            # Inside a quoted string/identifier. A doubled quote ('') closes
            # and immediately reopens the string, which needs no special case.
            if char == "\\" and mode == "'" and pos + 1 < size:
                # Backslash escape: keep the escaped character in the string.
                raw.append(pair)
                code.append(pair)
                pos += 2
            else:
                if char == mode:
                    mode = None
                raw.append(char)
                code.append(char)
                pos += 1

    flush()
    return statements


def _returns_rows(bare_sql):
    """Tell whether a comment-free statement starts with a row-returning keyword."""
    return bare_sql.lstrip("( \t\r\n").lower().startswith(_ROW_RETURNING_KEYWORDS)


try:
    import snowflake.connector
    print("snowflake-connector-python disponible")

    # Connection settings, taken from the SNOWFLAKE_* values read earlier.
    conn_config = {
        'user': SNOWFLAKE_USER,
        'password': SNOWFLAKE_PASSWORD,
        'account': SNOWFLAKE_ACCOUNT,
        'warehouse': SNOWFLAKE_WAREHOUSE,
        'database': SNOWFLAKE_DATABASE,
        'schema': SNOWFLAKE_SCHEMA_ANALYTICS
    }

    # ------------------------------------------------------------
    # Helper to run SQL (multi-statement, optionally with results)
    # ------------------------------------------------------------
    def execute_snowflake_sql(sql_query: str, fetch_results: bool = False):
        """
        Run one or more SQL statements directly in Snowflake.

        Args:
            sql_query: SQL script; statements are separated by ';'.
                Semicolons inside quotes or comments do not split.
            fetch_results: when True, rows are fetched for every
                row-returning statement (SELECT/WITH/SHOW/DESCRIBE), even if
                it is preceded by comments.

        Returns:
            The rows (list of tuples) of the last row-returning statement when
            fetch_results is True; otherwise None. None is also returned when
            any error occurs (the error is printed, not raised).
        """
        conn = None
        cursor = None
        try:
            # Parse before connecting so malformed input fails without
            # opening a session.
            statements = _split_sql_statements(sql_query)

            conn = snowflake.connector.connect(**conn_config)
            cursor = conn.cursor()
            results = None

            for stmt, bare in statements:
                print(f"Ejecutando: {stmt[:80]}{'...' if len(stmt) > 80 else ''}")
                cursor.execute(stmt)

                # Classify on the comment-free text: a leading `--` comment
                # must not hide the SELECT from this check.
                if fetch_results and _returns_rows(bare):
                    results = cursor.fetchall()
                    print(f"🔹 {len(results)} filas obtenidas.")

            conn.commit()
            print("Ejecución completada correctamente.")
            return results

        except Exception as e:
            print(f" Error ejecutando SQL: {e}")
            return None

        finally:
            # Close each resource independently so a failure closing the
            # cursor can never leave the connection open.
            for label, resource in (("cursor", cursor), ("conexión", conn)):
                if resource is None:
                    continue
                try:
                    resource.close()
                except Exception as close_error:
                    print(f" Aviso: no se pudo cerrar {label}: {close_error}")

    print("Función execute_snowflake_sql() configurada y lista")

except ImportError:
    print(" snowflake-connector-python no instalado")
    print("Instala con: pip install snowflake-connector-python")

    def execute_snowflake_sql(sql_query: str, fetch_results: bool = False):
        """
        Fallback used when the Snowflake connector is not installed.

        Prints the SQL so it can be run manually and always returns None.
        """
        print("Connector no disponible. SQL para ejecutar manualmente:")
        print("-" * 60)
        print(sql_query)
        print("-" * 60)
        return None

print("Configuración lista para ejecutar SQL nativo en Snowflake.")


=== CONEXIÓN SNOWFLAKE PARA SQL NATIVO ===
snowflake-connector-python disponible
Función execute_snowflake_sql() configurada y lista
Configuración lista para ejecutar SQL nativo en Snowflake.


## Construcción de la tabla OBT

In [13]:
# Create the ANALYTICS schema and the OBT_TRIPS table structure.
# The DDL below is sent to Snowflake as-is by execute_snowflake_sql().
# CREATE OR REPLACE drops any existing analytics.obt_trips, so running this
# cell discards previously loaded rows.
create_obt_schema_sql = """
-- =====================================================
-- CREAR SCHEMA ANALYTICS Y TABLA OBT_TRIPS
-- Grano: 1 fila = 1 viaje con todas las derivadas
-- =====================================================

-- Crear schema ANALYTICS si no existe
CREATE SCHEMA IF NOT EXISTS analytics
COMMENT = 'Schema para tablas OBT (Operational Business Tables) de NYC Taxi';

-- Crear tabla OBT_TRIPS con estructura completa
CREATE OR REPLACE TABLE analytics.obt_trips (
    -- IDENTIFICADORES ÚNICOS
    trip_id STRING PRIMARY KEY,  -- Clave primaria generada
    natural_key STRING,          -- Clave natural original

    -- DIMENSIONES TEMPORALES ENRIQUECIDAS
    pickup_datetime TIMESTAMP_NTZ,
    dropoff_datetime TIMESTAMP_NTZ,
    pickup_date DATE,
    pickup_hour INT,
    dropoff_date DATE,
    dropoff_hour INT,
    day_of_week INT,      -- 1=Lunes, 7=Domingo
    day_of_week_name STRING,
    month INT,
    month_name STRING,
    year INT,
    quarter INT,
    is_weekend BOOLEAN,

    -- DIMENSIONES GEOGRÁFICAS
    pu_location_id INT,
    pu_zone STRING,
    pu_borough STRING,
    pu_service_zone STRING,
    do_location_id INT,
    do_zone STRING,
    do_borough STRING,
    do_service_zone STRING,

    -- DIMENSIONES DE SERVICIO Y CÓDIGOS
    service_type STRING,      -- yellow/green
    vendor_id INT,
    vendor_name STRING,
    rate_code_id INT,
    rate_code_desc STRING,
    payment_type INT,
    payment_type_desc STRING,
    trip_type FLOAT,          -- Solo para Green taxi
    trip_type_desc STRING,
    store_and_fwd_flag STRING,

    -- MÉTRICAS DEL VIAJE
    passenger_count INT,
    trip_distance FLOAT,

    -- MÉTRICAS FINANCIERAS
    fare_amount FLOAT,
    extra FLOAT,
    mta_tax FLOAT,
    tip_amount FLOAT,
    tolls_amount FLOAT,
    improvement_surcharge FLOAT,
    congestion_surcharge FLOAT,
    airport_fee FLOAT,
    cbd_congestion_fee FLOAT,
    total_amount FLOAT,

    -- MÉTRICAS DERIVADAS
    trip_duration_min FLOAT,     -- Duración en minutos
    avg_speed_mph FLOAT,         -- Velocidad promedio MPH
    tip_pct FLOAT,               -- Porcentaje de propina
    revenue_per_mile FLOAT,      -- Ingreso por milla
    cost_per_minute FLOAT,       -- Costo por minuto

    -- INDICADORES ANALÍTICOS
    is_airport_trip BOOLEAN,     -- Si involucra aeropuerto
    is_long_trip BOOLEAN,        -- Viaje largo (>30 min)
    is_short_trip BOOLEAN,       -- Viaje corto (<5 min)
    is_premium_fare BOOLEAN,     -- Tarifa premium (>$50)
    trip_category STRING,        -- Categorización del viaje

    -- LINEAGE Y CALIDAD DE DATOS
    run_id STRING,
    ingested_at_utc TIMESTAMP_NTZ,
    source_service STRING,       -- Servicio origen (yellow/green)
    source_year INT,
    source_month INT,
    source_path STRING,
    obt_created_at TIMESTAMP_NTZ DEFAULT CURRENT_TIMESTAMP(),
    obt_version STRING DEFAULT '1.0'
)
COMMENT = 'OBT principal para análisis de viajes NYC Taxi - Grano: 1 fila = 1 viaje';

"""

# Human-readable summary of the table layout. The per-group counts mirror the
# column groups in the DDL above (13 + 8 + 10 + 10 + 5 + 5 + 8, plus the two
# identifiers and the two trip metrics, for 63 columns in total).
print("SQL para crear schema OBT y tabla OBT_TRIPS generado")
print()
print("ESTRUCTURA OBT_TRIPS:")
print("• Grano: 1 fila = 1 viaje")
print("• Dimensiones temporales: 13 campos (fecha, hora, día semana, etc.)")
print("• Dimensiones geográficas: 8 campos (pickup/dropoff zones)")
print("• Dimensiones de servicio: 10 campos (vendor, payment, rate codes)")
print("• Métricas financieras: 10 campos (tarifas, impuestos, propinas)")
print("• Métricas derivadas: 5 campos (duración, velocidad, ratios)")
print("• Indicadores analíticos: 5 campos (categorización de viajes)")
print("• Lineage/Calidad: 8 campos (tracking de origen y versión)")
print()
# Run the DDL; execute_snowflake_sql prints progress and any error itself.
execute_snowflake_sql(create_obt_schema_sql)

SQL para crear schema OBT y tabla OBT_TRIPS generado

ESTRUCTURA OBT_TRIPS:
• Grano: 1 fila = 1 viaje
• Dimensiones temporales: 14 campos (fecha, hora, día semana, etc.)
• Dimensiones geográficas: 8 campos (pickup/dropoff zones)
• Dimensiones de servicio: 10 campos (vendor, payment, rate codes)
• Métricas financieras: 10 campos (tarifas, impuestos, propinas)
• Métricas derivadas: 5 campos (duración, velocidad, ratios)
• Indicadores analíticos: 5 campos (categorización de viajes)
• Lineage/Calidad: 8 campos (tracking de origen y versión)

-- CREAR SCHEMA ANALYTI...
Ejecutando: -- Crear tabla OBT_TRIPS con estructura completa
CREATE OR REPLACE TABLE analyti...
Ejecución completada correctamente.


## Inserción de datos en la tabla

In [14]:
# SQL that loads analytics.obt_trips from the enriched source table,
# computing every derived column.
#
# Notes for maintainers:
# - day_of_week uses DAYOFWEEKISO (1=Monday ... 7=Sunday), matching the
#   "1=Lunes, 7=Domingo" contract in the table DDL. Plain DAYOFWEEK depends on
#   the session's WEEK_START parameter and, with the default setting, returns
#   0 (Sunday) to 6 (Saturday), so testing it against (1, 7) flagged Mondays
#   as weekend and never matched Saturday or Sunday.
# - The INSERT names its target columns explicitly so a later change to the
#   table's column order cannot silently shift values into the wrong columns.
# - NOTE(review): this is a plain append. Re-running the cell without first
#   recreating the table loads every trip again (Snowflake does not enforce
#   the PRIMARY KEY declared on trip_id).
build_obt_sql = """
-- =====================================================
-- CONSTRUCCIÓN OBT_TRIPS CON TODAS LAS DERIVADAS
-- Fuente: NY_TAXI.ENRICHED.UNIFIED_TAXI_ENRICHED
-- Destino: analytics.obt_trips
-- =====================================================

INSERT INTO analytics.obt_trips (
    trip_id,
    natural_key,
    pickup_datetime,
    dropoff_datetime,
    pickup_date,
    pickup_hour,
    dropoff_date,
    dropoff_hour,
    day_of_week,
    day_of_week_name,
    month,
    month_name,
    year,
    quarter,
    is_weekend,
    pu_location_id,
    pu_zone,
    pu_borough,
    pu_service_zone,
    do_location_id,
    do_zone,
    do_borough,
    do_service_zone,
    service_type,
    vendor_id,
    vendor_name,
    rate_code_id,
    rate_code_desc,
    payment_type,
    payment_type_desc,
    trip_type,
    trip_type_desc,
    store_and_fwd_flag,
    passenger_count,
    trip_distance,
    fare_amount,
    extra,
    mta_tax,
    tip_amount,
    tolls_amount,
    improvement_surcharge,
    congestion_surcharge,
    airport_fee,
    cbd_congestion_fee,
    total_amount,
    trip_duration_min,
    avg_speed_mph,
    tip_pct,
    revenue_per_mile,
    cost_per_minute,
    is_airport_trip,
    is_long_trip,
    is_short_trip,
    is_premium_fare,
    trip_category,
    run_id,
    ingested_at_utc,
    source_service,
    source_year,
    source_month,
    source_path,
    obt_created_at,
    obt_version
)
SELECT
    -- IDENTIFICADORES ÚNICOS
    CONCAT(service_type, '_', natural_key, '_', DATE_PART('epoch', pickup_datetime)) as trip_id,
    natural_key,

    -- DIMENSIONES TEMPORALES ENRIQUECIDAS
    pickup_datetime,
    dropoff_datetime,
    DATE(pickup_datetime) as pickup_date,
    HOUR(pickup_datetime) as pickup_hour,
    DATE(dropoff_datetime) as dropoff_date,
    HOUR(dropoff_datetime) as dropoff_hour,
    DAYOFWEEKISO(pickup_datetime) as day_of_week,  -- ISO: 1=Lunes, 7=Domingo
    DAYNAME(pickup_datetime) as day_of_week_name,
    MONTH(pickup_datetime) as month,
    MONTHNAME(pickup_datetime) as month_name,
    YEAR(pickup_datetime) as year,
    QUARTER(pickup_datetime) as quarter,
    CASE WHEN DAYOFWEEKISO(pickup_datetime) IN (6, 7) THEN TRUE ELSE FALSE END as is_weekend,

    -- DIMENSIONES GEOGRÁFICAS
    pickup_location_id as pu_location_id,
    pickup_zone as pu_zone,
    pickup_borough as pu_borough,
    pickup_service_zone as pu_service_zone,
    dropoff_location_id as do_location_id,
    dropoff_zone as do_zone,
    dropoff_borough as do_borough,
    dropoff_service_zone as do_service_zone,

    -- DIMENSIONES DE SERVICIO Y CÓDIGOS
    service_type,
    VendorID as vendor_id,
    vendor_name,
    rate_code_id,
    rate_code_name as rate_code_desc,
    payment_type_id as payment_type,
    payment_type_name as payment_type_desc,
    trip_type,
    trip_type_name as trip_type_desc,
    store_and_fwd_flag,

    -- MÉTRICAS DEL VIAJE
    passenger_count,
    trip_distance,

    -- MÉTRICAS FINANCIERAS
    fare_amount,
    extra,
    mta_tax,
    tip_amount,
    tolls_amount,
    improvement_surcharge,
    congestion_surcharge,
    airport_fee,
    cbd_congestion_fee,
    total_amount,

    -- MÉTRICAS DERIVADAS CALCULADAS
    trip_duration_minutes as trip_duration_min,

    -- Velocidad promedio (MPH) = millas / (minutos/60)
    CASE
        WHEN trip_distance > 0 AND trip_duration_minutes > 0
        THEN ROUND(trip_distance / (trip_duration_minutes / 60.0), 2)
        ELSE NULL
    END as avg_speed_mph,

    -- Porcentaje de propina
    CASE
        WHEN fare_amount > 0
        THEN ROUND((tip_amount / fare_amount) * 100, 2)
        ELSE 0
    END as tip_pct,

    -- Revenue per mile (ya existe como revenue_per_mile)
    revenue_per_mile,

    -- Costo por minuto
    CASE
        WHEN trip_duration_minutes > 0
        THEN ROUND(total_amount / trip_duration_minutes, 3)
        ELSE NULL
    END as cost_per_minute,

    -- INDICADORES ANALÍTICOS
    -- Es viaje a aeropuerto (JFK, Newark, etc.)
    CASE
        WHEN rate_code_name IN ('JFK', 'Newark')
             OR pickup_zone LIKE '%Airport%'
             OR dropoff_zone LIKE '%Airport%'
        THEN TRUE
        ELSE FALSE
    END as is_airport_trip,

    -- Es viaje largo (>30 minutos)
    CASE WHEN trip_duration_minutes > 30 THEN TRUE ELSE FALSE END as is_long_trip,

    -- Es viaje corto (<5 minutos)
    CASE WHEN trip_duration_minutes < 5 THEN TRUE ELSE FALSE END as is_short_trip,

    -- Es tarifa premium (>$50)
    CASE WHEN total_amount > 50 THEN TRUE ELSE FALSE END as is_premium_fare,

    -- Categorización del viaje
    CASE
        WHEN trip_duration_minutes < 5 THEN 'Short Trip'
        WHEN trip_duration_minutes > 60 THEN 'Long Trip'
        WHEN rate_code_name IN ('JFK', 'Newark') THEN 'Airport Trip'
        WHEN total_amount > 100 THEN 'Premium Trip'
        WHEN pickup_borough = dropoff_borough THEN 'Intra-Borough'
        WHEN pickup_borough != dropoff_borough THEN 'Inter-Borough'
        ELSE 'Standard Trip'
    END as trip_category,

    -- LINEAGE Y CALIDAD DE DATOS
    run_id,
    ingested_at_utc,
    service_type as source_service,
    source_year,
    source_month,
    source_path,
    CURRENT_TIMESTAMP() as obt_created_at,
    '1.0' as obt_version

FROM NY_TAXI.ENRICHED.UNIFIED_TAXI_ENRICHED

-- Filtros de calidad para OBT
WHERE pickup_datetime IS NOT NULL
    AND dropoff_datetime IS NOT NULL
    AND pickup_datetime < dropoff_datetime
    AND total_amount >= 0
    AND trip_distance >= 0
    AND trip_duration_minutes > 0
    AND trip_duration_minutes < 1440  -- Menos de 24 horas

ORDER BY pickup_datetime;
"""

# Human-readable summary of what the load does. The temporal group has 13
# columns (pickup_datetime through is_weekend), matching the table DDL.
print("SQL para construcción OBT generado")
print()
print("CARACTERÍSTICAS DEL SQL OBT:")
print("Fuente: NY_TAXI.ENRICHED.UNIFIED_TAXI_ENRICHED")
print("Destino: analytics.obt_trips")
print("Grano: 1 fila = 1 viaje")
print("Dimensiones temporales enriquecidas (13 campos)")
print("Métricas derivadas calculadas (velocidad, tip_pct, etc.)")
print("Indicadores analíticos (airport_trip, long_trip, etc.)")
print("Categorización automática de viajes")
print("Filtros de calidad de datos")
print("Lineage completo y versionado")
print()
print(" Para construir OBT:")
# Run the load; execute_snowflake_sql prints progress and any error itself.
execute_snowflake_sql(build_obt_sql)

SQL para construcción OBT generado

CARACTERÍSTICAS DEL SQL OBT:
Fuente: NY_TAXI.ENRICHED.UNIFIED_TAXI_ENRICHED
Destino: analytics.obt_trips
Grano: 1 fila = 1 viaje
Dimensiones temporales enriquecidas (14 campos)
Métricas derivadas calculadas (velocidad, tip_pct, etc.)
Indicadores analíticos (airport_trip, long_trip, etc.)
Categorización automática de viajes
Filtros de calidad de datos
Lineage completo y versionado

💡 Para construir OBT:
-- CONSTRUCCIÓN OBT_TRI...
Ejecución completada correctamente.
