In [None]:
import os
# Snowflake connection settings, read from the process environment.
# Account, user, password and database have no fallback and are None when unset;
# the enriched schema and the warehouse fall back to the defaults given here.
SNOWFLAKE_ACCOUNT = os.environ.get("SNOWFLAKE_ACCOUNT")
SNOWFLAKE_USER = os.environ.get("SNOWFLAKE_USER")
SNOWFLAKE_PASSWORD = os.environ.get("SNOWFLAKE_PASSWORD")
SNOWFLAKE_DATABASE = os.environ.get("SNOWFLAKE_DATABASE")
SNOWFLAKE_SCHEMA_ENRICHED = os.environ.get("SNOWFLAKE_SCHEMA_ENRICHED", "ENRICHED")
SNOWFLAKE_WAREHOUSE = os.environ.get("SNOWFLAKE_WAREHOUSE", "COMPUTE_WH")

In [None]:
# CONEXIÓN DIRECTA A SNOWFLAKE (SQL PURO)
print("=== CONEXIÓN SNOWFLAKE PARA SQL NATIVO ===")

# Opción 1: Usar snowflake-connector-python
try:
    import snowflake.connector
    print(" snowflake-connector-python disponible")

    # Keyword arguments later unpacked into snowflake.connector.connect();
    # every value comes from the SNOWFLAKE_* settings read from the environment.
    conn_config = dict(
        user=SNOWFLAKE_USER,
        password=SNOWFLAKE_PASSWORD,
        account=SNOWFLAKE_ACCOUNT,
        warehouse=SNOWFLAKE_WAREHOUSE,
        database=SNOWFLAKE_DATABASE,
        schema=SNOWFLAKE_SCHEMA_ENRICHED,
    )

    def execute_snowflake_sql(sql_query, fetch_results=False):
        """
        Run a SQL statement directly in Snowflake.

        A new connection is opened from ``conn_config`` for every call and is
        always closed before returning.

        Args:
            sql_query: SQL text passed as-is to ``cursor.execute``.
            fetch_results: When True, fetch every row and return it together
                with the column names; when False, return the cursor's row count.

        Returns:
            ``(results, columns)`` when ``fetch_results`` is True, the value of
            ``cursor.rowcount`` otherwise, or ``None`` if any step raised (the
            error is printed, not re-raised).
        """
        # Pre-bind both handles so the cleanup below can tell which ones were
        # actually opened, even when connect() or cursor() fails.
        conn = None
        cursor = None
        try:
            conn = snowflake.connector.connect(**conn_config)
            cursor = conn.cursor()

            print("Ejecutando SQL en Snowflake...")
            cursor.execute(sql_query)

            if fetch_results:
                results = cursor.fetchall()
                # First element of each description entry is the column name.
                columns = [desc[0] for desc in cursor.description]
                return results, columns

            rows_affected = cursor.rowcount
            print(f"SQL ejecutado. Filas afectadas: {rows_affected:,}")
            return rows_affected

        except Exception as e:
            print(f" Error ejecutando SQL: {e}")
            return None
        finally:
            # Close each handle on its own: a failure (or absence) of the cursor
            # must not prevent the connection from being released.
            for handle in (cursor, conn):
                if handle is None:
                    continue
                try:
                    handle.close()
                except Exception:
                    # Best-effort cleanup; raising here would replace the
                    # function's return value with a close() error.
                    pass

    print("Función execute_snowflake_sql() configurada")

except ImportError:
    print("snowflake-connector-python no instalado")
    print("Instalar con: pip install snowflake-connector-python")

    def execute_snowflake_sql(sql_query, fetch_results=False):
        """Fallback used when snowflake-connector-python cannot be imported.

        Nothing is executed: the statement is echoed between two 60-dash rules
        so it can be run by hand. ``fetch_results`` is accepted only to keep the
        same signature as the real implementation and is ignored.

        Returns:
            Always ``None``.
        """
        rule = "-" * 60
        for line in (
            "Connector no disponible. SQL para ejecutar manualmente:",
            rule,
            sql_query,
            rule,
        ):
            print(line)
        return None

print("Configuración lista para ejecutar SQL nativo en Snowflake")

## Unificación y enriquecimiento

In [None]:
# SQL that creates the unified table with full normalization.
#
# The string holds a single CREATE OR REPLACE TABLE ... AS SELECT statement that:
#   1. normalizes NY_TAXI.RAW.YELLOW_TAXI and NY_TAXI.RAW.GREEN_TAXI in two CTEs
#      that expose the same column list in the same order (the later
#      `SELECT * ... UNION ALL SELECT *` depends on that ordering);
#      yellow fills ehail_fee / trip_type / trip_type_name with NULL and
#      green fills cbd_congestion_fee with NULL;
#   2. decodes VendorID, RatecodeID and payment_type (plus trip_type for green)
#      into readable *_name columns via CASE expressions;
#   3. LEFT JOINs NY_TAXI.RAW.TAXI_ZONE_LOOKUP twice to add borough, zone and
#      service_zone for the pickup and dropoff location ids;
#   4. adds trip_duration_minutes, revenue_per_mile (NULL when trip_distance
#      is not > 0) and processed_at_utc.
#
# NOTE(review): source and target objects are hard-coded as NY_TAXI.* and do not
# use SNOWFLAKE_DATABASE / SNOWFLAKE_SCHEMA_ENRICHED — confirm this is intended.
# NOTE(review): both CTEs filter on `pickup_datetime`, which is only the SELECT
# alias of tpep_/lpep_pickup_datetime; this presumably relies on Snowflake
# resolving SELECT aliases in WHERE — verify against the RAW table schemas.
# NOTE(review): the CTEs already keep only rows with total_amount > 0 and a
# non-NULL pickup_datetime, so the matching predicates in the final WHERE
# (`u.total_amount >= 0`, `u.pickup_datetime IS NOT NULL`) look redundant.
unified_taxi_sql = """
-- =====================================================
-- CREAR TABLA UNIFICADA: YELLOW + GREEN TAXI
-- Procesa 760M+ registros con normalización completa
-- =====================================================

CREATE OR REPLACE TABLE NY_TAXI.ENRICHED.UNIFIED_TAXI_ENRICHED AS
WITH yellow_normalized AS (
    SELECT
        -- Metadatos
        run_id,
        'yellow' as service_type,  -- Columna unificada service_type
        source_year,
        source_month,
        ingested_at_utc,
        source_path,
        natural_key,

        -- Información del viaje básica
        VendorID,
        CASE VendorID
            WHEN 1 THEN 'Creative Mobile Technologies, LLC'
            WHEN 2 THEN 'Curb Mobility, LLC'
            WHEN 6 THEN 'Myle Technologies Inc'
            WHEN 7 THEN 'Helix'
            ELSE 'Unknown'
        END as vendor_name,

        -- Timestamps unificados (nombres consistentes)
        tpep_pickup_datetime as pickup_datetime,
        tpep_dropoff_datetime as dropoff_datetime,

        -- Información geográfica
        PULocationID as pickup_location_id,
        DOLocationID as dropoff_location_id,

        -- Información del viaje
        passenger_count,
        trip_distance,
        RatecodeID as rate_code_id,
        CASE RatecodeID
            WHEN 1 THEN 'Standard rate'
            WHEN 2 THEN 'JFK'
            WHEN 3 THEN 'Newark'
            WHEN 4 THEN 'Nassau or Westchester'
            WHEN 5 THEN 'Negotiated fare'
            WHEN 6 THEN 'Group ride'
            WHEN 99 THEN 'Null/unknown'
            ELSE 'Unknown'
        END as rate_code_name,
        store_and_fwd_flag,

        -- Información financiera
        fare_amount,
        extra,
        mta_tax,
        tip_amount,
        tolls_amount,
        improvement_surcharge,
        total_amount,
        congestion_surcharge,
        airport_fee,
        cbd_congestion_fee,

        -- Información de pago normalizada
        payment_type as payment_type_id,
        CASE payment_type
            WHEN 0 THEN 'Flex Fare trip'
            WHEN 1 THEN 'Credit card'
            WHEN 2 THEN 'Cash'
            WHEN 3 THEN 'No charge'
            WHEN 4 THEN 'Dispute'
            WHEN 5 THEN 'Unknown'
            WHEN 6 THEN 'Voided trip'
            ELSE 'Unknown'
        END as payment_type_name,

        -- Campos específicos de Yellow (NULL para compatibilidad con Green)
        NULL as ehail_fee,
        NULL as trip_type,
        NULL as trip_type_name

    FROM NY_TAXI.RAW.YELLOW_TAXI
    WHERE pickup_datetime IS NOT NULL
        AND total_amount > 0
),

green_normalized AS (
    SELECT
        -- Metadatos
        run_id,
        'green' as service_type,  -- Columna unificada service_type
        source_year,
        source_month,
        ingested_at_utc,
        source_path,
        natural_key,

        -- Información del viaje básica
        VendorID,
        CASE VendorID
            WHEN 1 THEN 'Creative Mobile Technologies, LLC'
            WHEN 2 THEN 'Curb Mobility, LLC'
            WHEN 6 THEN 'Myle Technologies Inc'
            ELSE 'Unknown'
        END as vendor_name,

        -- Timestamps unificados (nombres consistentes)
        lpep_pickup_datetime as pickup_datetime,
        lpep_dropoff_datetime as dropoff_datetime,

        -- Información geográfica
        PULocationID as pickup_location_id,
        DOLocationID as dropoff_location_id,

        -- Información del viaje
        passenger_count,
        trip_distance,
        RatecodeID as rate_code_id,
        CASE RatecodeID
            WHEN 1 THEN 'Standard rate'
            WHEN 2 THEN 'JFK'
            WHEN 3 THEN 'Newark'
            WHEN 4 THEN 'Nassau or Westchester'
            WHEN 5 THEN 'Negotiated fare'
            WHEN 6 THEN 'Group ride'
            WHEN 99 THEN 'Null/unknown'
            ELSE 'Unknown'
        END as rate_code_name,
        store_and_fwd_flag,

        -- Información financiera
        fare_amount,
        extra,
        mta_tax,
        tip_amount,
        tolls_amount,
        improvement_surcharge,
        total_amount,
        congestion_surcharge,
        airport_fee,
        NULL as cbd_congestion_fee,  -- Solo en Yellow reciente

        -- Información de pago normalizada (mismos códigos que Yellow)
        payment_type as payment_type_id,
        CASE payment_type
            WHEN 0 THEN 'Flex Fare trip'
            WHEN 1 THEN 'Credit card'
            WHEN 2 THEN 'Cash'
            WHEN 3 THEN 'No charge'
            WHEN 4 THEN 'Dispute'
            WHEN 5 THEN 'Unknown'
            WHEN 6 THEN 'Voided trip'
            ELSE 'Unknown'
        END as payment_type_name,

        -- Campos específicos de Green
        ehail_fee,
        trip_type,
        CASE trip_type
            WHEN 1 THEN 'Street-hail'
            WHEN 2 THEN 'Dispatch'
            ELSE 'Unknown'
        END as trip_type_name

    FROM NY_TAXI.RAW.GREEN_TAXI
    WHERE pickup_datetime IS NOT NULL
        AND total_amount > 0
),

unified_data AS (
    -- UNION de Yellow y Green con esquema idéntico
    SELECT * FROM yellow_normalized
    UNION ALL
    SELECT * FROM green_normalized
)

-- ENRIQUECIMIENTO CON TAXI ZONES Y RESULTADO FINAL
SELECT
    -- Metadatos originales
    u.run_id,
    u.service_type,
    u.source_year,
    u.source_month,
    u.ingested_at_utc,
    u.source_path,
    u.natural_key,

    -- Información del viaje normalizada
    u.VendorID,
    u.vendor_name,
    u.pickup_datetime,
    u.dropoff_datetime,

    -- Información geográfica ENRIQUECIDA con Taxi Zones
    u.pickup_location_id,
    pickup_zone.Borough as pickup_borough,
    pickup_zone.Zone as pickup_zone,
    pickup_zone.service_zone as pickup_service_zone,

    u.dropoff_location_id,
    dropoff_zone.Borough as dropoff_borough,
    dropoff_zone.Zone as dropoff_zone,
    dropoff_zone.service_zone as dropoff_service_zone,

    -- Información del viaje
    u.passenger_count,
    u.trip_distance,
    u.rate_code_id,
    u.rate_code_name,
    u.store_and_fwd_flag,

    -- Información financiera
    u.fare_amount,
    u.extra,
    u.mta_tax,
    u.tip_amount,
    u.tolls_amount,
    u.improvement_surcharge,
    u.total_amount,
    u.congestion_surcharge,
    u.airport_fee,
    u.cbd_congestion_fee,

    -- Información de pago normalizada
    u.payment_type_id,
    u.payment_type_name,

    -- Campos específicos por tipo
    u.ehail_fee,      -- Solo Green
    u.trip_type,      -- Solo Green
    u.trip_type_name, -- Solo Green

    -- Métricas calculadas
    DATEDIFF('minute', u.pickup_datetime, u.dropoff_datetime) as trip_duration_minutes,
    CASE
        WHEN u.trip_distance > 0 THEN ROUND(u.total_amount / u.trip_distance, 2)
        ELSE NULL
    END as revenue_per_mile,

    -- Timestamps de procesamiento
    CURRENT_TIMESTAMP() as processed_at_utc

FROM unified_data u

-- LEFT JOIN para enriquecimiento con Pickup Zones
LEFT JOIN NY_TAXI.RAW.TAXI_ZONE_LOOKUP pickup_zone
    ON u.pickup_location_id = pickup_zone.LocationID

-- LEFT JOIN para enriquecimiento con Dropoff Zones
LEFT JOIN NY_TAXI.RAW.TAXI_ZONE_LOOKUP dropoff_zone
    ON u.dropoff_location_id = dropoff_zone.LocationID

-- Filtros de calidad final
WHERE u.pickup_datetime IS NOT NULL
    AND u.dropoff_datetime IS NOT NULL
    AND u.pickup_datetime < u.dropoff_datetime
    AND u.total_amount >= 0
    AND u.trip_distance >= 0;

"""

# Summarize what the generated statement does. Every line below corresponds to
# something unified_taxi_sql actually contains; the former "indexes for
# performance" line was dropped because the statement defines no index or
# clustering clause.
print("SQL de Unificación y Normalización generado")
print("Características del SQL:")
for feature in (
    "Unifica Yellow + Green con service_type",
    "Normaliza VendorID, RatecodeID, payment_type",
    "Enriquece con taxi zones (pickup/dropoff)",
    "Calcula métricas adicionales",
    "Filtros de calidad de datos",
):
    print(feature)

# Execute the CREATE OR REPLACE TABLE statement; as the cell's last expression,
# the helper's return value (None on failure) is shown as the cell output.
execute_snowflake_sql(unified_taxi_sql)