In [81]:
from pyspark.sql import SparkSession, functions as F, types as T
from pyspark import SparkConf
from dotenv import load_dotenv
import os


In [82]:
# Spark configuration: "spark.jars.packages" lists the Maven coordinates of the
# Snowflake JDBC driver and the Spark-Snowflake connector used by this session.
conf = SparkConf().set(
    "spark.jars.packages",
    "net.snowflake:snowflake-jdbc:3.24.2,net.snowflake:spark-snowflake_2.12:3.1.2",
)

# Reuse an already-running session if there is one; otherwise start a new one
# with the configuration above.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [None]:
# Load the Snowflake settings from the project's .env file.
# override=True matters here: by default load_dotenv keeps any variable that is
# already present in the process environment, and generic names such as USER
# are commonly pre-set by the operating system, which would make
# os.getenv("USER") return the OS account instead of the Snowflake login.
load_dotenv(dotenv_path="/home/jovyan/work/.env", override=True)

# Connection parameters consumed by the Spark-Snowflake connector (the packages
# configured on the SparkConf): where to connect and with which identity.
sfOptions = {
    "sfURL": os.getenv("URL"),
    "sfDatabase": os.getenv("DB"),
    "sfSchema": "RAW",
    "sfWarehouse": os.getenv("WAREHOUSE"),
    "sfRole": os.getenv("ROLE"),
    "sfUser": os.getenv("USER"),
    "sfPassword": os.getenv("PASSWORD"),
}


In [83]:
# Connection parameters for the Spark-Snowflake connector.
# Credentials must never be written in the notebook: every value that
# identifies the account or authenticates against it is read from the
# environment, populated from the project's .env file.
# override=True makes the .env values win over variables the OS may already
# define under the same generic names (e.g. USER).
load_dotenv(dotenv_path="/home/jovyan/work/.env", override=True)

sfOptions = {
    "sfURL": os.getenv("URL"),
    "sfDatabase": os.getenv("DB"),
    "sfSchema": "RAW",
    "sfWarehouse": os.getenv("WAREHOUSE"),
    "sfRole": os.getenv("ROLE"),
    "sfUser": os.getenv("USER"),
    "sfPassword": os.getenv("PASSWORD"),
}

# Fail fast with a readable message instead of letting the connector report an
# obscure authentication error when a setting is absent.
_missing_options = [key for key, value in sfOptions.items() if not value]
if _missing_options:
    raise RuntimeError(
        "Missing Snowflake settings (check the .env file): " + ", ".join(_missing_options)
    )

In [84]:
# SQL that (re)builds NY_TAXI.RAW.TAXI_TRIPS_ENRICHED in one statement:
#   1. `yellow` / `green` project both raw tables onto the same column list, in
#      the same order (columns missing on one side are filled with NULL), so
#      the positional `union all` lines up.
#   2. `standardized_trips` adds the time-zone-converted timestamps, the
#      human-readable labels for the coded columns and the trip duration.
#   3. `enriched_with_zones` joins the taxi-zone lookup twice (pickup/dropoff).
#   4. `final` selects and orders the columns that are persisted.
# Label fixes relative to the previous version: trip_type 1 is 'Street-hail'
# (was misspelled 'Street-hall') and the 'Flex Fare trip' label no longer
# carries a trailing space.
# NOTE(review): convert_timezone('UTC', 'America/New_York', ...) treats the raw
# timestamps as UTC; confirm that the ingested values are not already in New
# York local time, otherwise the result is shifted.
query = """
create or replace table NY_TAXI.RAW.TAXI_TRIPS_ENRICHED as
with yellow as (
  select
    run_id,VendorID,
    tpep_pickup_datetime as PICKUP_DATETIME,
    tpep_dropoff_datetime  as DROPOFF_DATETIME,
    passenger_count,trip_distance,
    RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,
    mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,
    congestion_surcharge,cbd_congestion_fee,
    null as EHAIL_FEE,
    null as TRIP_TYPE,
    Airport_fee,
    service_type,source_year,source_month,ingested_at_utc,source_path
  from NY_TAXI.RAW.YELLOW_TRIPS
),
green as (
  select
    run_id,VendorID,
    lpep_pickup_datetime  as PICKUP_DATETIME,
    lpep_dropoff_datetime  as DROPOFF_DATETIME,passenger_count,trip_distance,
    RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,
    mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,
    cbd_congestion_fee,ehail_fee,trip_type,
    null as AIRPORT_FEE,
    service_type,source_year,source_month,ingested_at_utc,source_path
  from NY_TAXI.RAW.GREEN_TRIPS
),
unioned_trips as (
  select * from green
  union all
  select * from yellow
),
-- Estandarización de zonas horarias y normalización
standardized_trips as (
    select
        *,
        -- Estandarizar zonas horarias
        convert_timezone('UTC', 'America/New_York', PICKUP_DATETIME) as PICKUP_DATETIME_EST,
        convert_timezone('UTC', 'America/New_York', DROPOFF_DATETIME) as DROPOFF_DATETIME_EST,
        -- Normalizar
        case VENDORID
            when 1 then 'Creative Mobile Technologies, LLC'
            when 2 then 'Curb Mobility, LLC'
            when 6 then 'Myle Technologies Inc'
            when 7 then 'Helix'
            else 'Not specified'
        end as VENDORID_DESC,
        case RATECODEID
            when 1 then 'Standard rate'
            when 2 then 'JFK'
            when 3 then 'Newark'
            when 4 then 'Nassau or Westchester'
            when 5 then 'Negotiated fare'
            when 6 then 'Group ride'
            else 'Unknown'
        end as RATECODE_DESC,

        case PAYMENT_TYPE
            when 0 then 'Flex Fare trip'
            when 1 then 'Credit card'
            when 2 then 'Cash'
            when 3 then 'No charge'
            when 4 then 'Dispute'
            when 5 then 'Unknown'
            when 6 then 'Voided trip'
            else 'Not specified'
        end as PAYMENT_TYPE_DESC,

        case TRIP_TYPE
            when 1 then 'Street-hail'
            when 2 then 'Dispatch'
            else 'Unknown'
         end as TRIP_TYPE_DESC,

        case STORE_AND_FWD_FLAG
            when 'Y' then 'Yes'
            when 'N' then 'No'
            else 'Unknown'
        end as STORE_AND_FWD_FLAG_DESC,

        -- Duración del viaje en minutos
        datediff('minute', PICKUP_DATETIME, DROPOFF_DATETIME) as TRIP_DURATION_MINUTES,
    from unioned_trips
),
-- Enriquecer con Taxi Zones
enriched_with_zones as (
    select
        st.*,
        -- Información de pickup location
        pz.Zone as PICKUP_ZONE,
        pz.Borough as PICKUP_BOROUGH,
        pz.service_zone as PICKUP_SERVICE_ZONE,

        -- Información de dropoff location  
        dz.Zone as DROPOFF_ZONE,
        dz.Borough as DROPOFF_BOROUGH,
        dz.service_zone as DROPOFF_SERVICE_ZONE,

    from standardized_trips st
    left join NY_TAXI.RAW.taxi_zones pz 
        on st.PULOCATIONID = pz.LocationID
    left join NY_TAXI.RAW.taxi_zones dz 
        on st.DOLOCATIONID = dz.LocationID
),
-- Métricas adicionales y limpieza final
final as (
    select
        -- Identificadores y metadatos
        RUN_ID,
        INGESTED_AT_UTC,
        SERVICE_TYPE,

        -- Fechas y tiempos
        SOURCE_YEAR,
        SOURCE_MONTH,
        PICKUP_DATETIME_EST as PICKUP_DATETIME,
        DROPOFF_DATETIME_EST as DROPOFF_DATETIME,
        TRIP_DURATION_MINUTES,

        -- Datos del viaje

        VENDORID,
        VENDORID_DESC,
        PASSENGER_COUNT,
        TRIP_DISTANCE,
        RATECODEID,
        RATECODE_DESC,
        STORE_AND_FWD_FLAG_DESC,

        -- Información de ubicación
        PULOCATIONID,
        PICKUP_ZONE,
        PICKUP_BOROUGH,
        PICKUP_SERVICE_ZONE,

        DOLOCATIONID, 
        DROPOFF_ZONE,
        DROPOFF_BOROUGH,
        DROPOFF_SERVICE_ZONE,

        -- Información de pago
        PAYMENT_TYPE,
        PAYMENT_TYPE_DESC,
        FARE_AMOUNT,
        TIP_AMOUNT,
        EXTRA,
        MTA_TAX,
        TOLLS_AMOUNT,
        IMPROVEMENT_SURCHARGE,
        CONGESTION_SURCHARGE,
        CBD_CONGESTION_FEE,
        EHAIL_FEE,
        TOTAL_AMOUNT,
        AIRPORT_FEE,

        -- Campos específicos
        TRIP_TYPE,
        TRIP_TYPE_DESC,

    from enriched_with_zones
)

select * from final
"""


In [86]:
import snowflake.connector

# Run the enrichment statement through the Snowflake Python connector.
# Connection settings are taken from sfOptions so that no credential is written
# in this cell and both connectors share a single source of configuration.
# The Python connector expects the account identifier rather than the host
# name, so any URL scheme and the ".snowflakecomputing.com" suffix are removed
# from sfURL.
account_id = sfOptions["sfURL"].split("://")[-1].split(".snowflakecomputing.com")[0]

conn = snowflake.connector.connect(
    user=sfOptions["sfUser"],
    password=sfOptions["sfPassword"],
    account=account_id,
    warehouse=sfOptions["sfWarehouse"],
    database=sfOptions["sfDatabase"],
    schema=sfOptions["sfSchema"],
    role=os.getenv('ROLE'),
)

# try/finally guarantees the cursor and the connection are released even when
# the statement fails, instead of leaking the session.
try:
    cur = conn.cursor()
    print("Cursor creado")
    try:
        cur.execute(query)
        print("Tabla RAW.TAXI_TRIPS_ENRICHED creada exitosamente")
    finally:
        cur.close()
finally:
    conn.close()


Cursor creado
Tabla RAW.TAXI_TRIPS_ENRICHED creada exitosamente
