In [4]:
# -*- coding: utf-8 -*-
# 05_data_analysis.ipynb (versión Snowpark Python)
# - Lee ANALYTICS.OBT_TRIPS y responde (a)–(t)
# - Sin Spark. Solo Snowflake + Snowpark Python
# - Resultados también a CSV en evidence/analysis_05/

import os, time, textwrap, pathlib
import pandas as pd
from snowflake.snowpark import Session

# -----------------------------
# 0) Snowflake connection
# -----------------------------
# Every connection parameter is read from the environment; only the role has
# a fallback (SYSADMIN).
cfg = {
    "account":   os.getenv("SNOWFLAKE_ACCOUNT"),
    "user":      os.getenv("SNOWFLAKE_USER"),
    "password":  os.getenv("SNOWFLAKE_PASSWORD"),
    "role":      os.getenv("SNOWFLAKE_ROLE", "SYSADMIN"),
    "warehouse": os.getenv("SNOWFLAKE_WAREHOUSE"),
    "database":  os.getenv("SNOWFLAKE_DATABASE"),
}

# Fail fast with a readable message. Without this check an unset variable
# only shows up later, either as an opaque connector error or as the literal
# statement "USE DATABASE None". The warehouse is intentionally not checked.
_missing = [
    f"SNOWFLAKE_{key.upper()}"
    for key in ("account", "user", "password", "database")
    if not cfg[key]
]
if _missing:
    raise RuntimeError(
        "Faltan variables de entorno requeridas: " + ", ".join(_missing)
    )

session = Session.builder.configs(cfg).create()
DB = cfg["database"]
SCH = os.getenv("SNOWFLAKE_SCHEMA", "ANALYTICS")
# Fully qualified name of the table every query below reads from.
TABLE = f"{DB}.{SCH}.OBT_TRIPS"

# Pin the session context so unqualified names resolve to the same place.
session.sql(f"USE DATABASE {DB}").collect()
session.sql(f"USE SCHEMA {SCH}").collect()
print(f"✅ Conectado a {DB} (schema={SCH})")

# Evidence folder: every query result is written here as a CSV file.
OUTDIR = pathlib.Path("/home/jovyan/work/evidence/analysis_05")
# Create the folder (and any missing parents); an existing folder is fine.
os.makedirs(OUTDIR, exist_ok=True)

def run(name: str, sql: str) -> pd.DataFrame:
    """Execute a SQL statement, preview it, and persist the result as CSV.

    Args:
        name: Label printed before the preview; also the stem of the CSV
            file written inside ``OUTDIR``.
        sql: SQL text. Common leading indentation is stripped with
            ``textwrap.dedent`` before the statement is sent to Snowflake.

    Returns:
        The complete query result as a pandas DataFrame (not only the
        20-row preview that is printed).
    """
    print(f"\n▶ {name}")

    statement = textwrap.dedent(sql)
    result = session.sql(statement).to_pandas()

    # Only the first rows are echoed; the CSV below holds the full result.
    preview = result.head(20)
    print(preview.to_string(index=False))

    csv_path = OUTDIR / f"{name}.csv"
    result.to_csv(csv_path, index=False)
    print(f"💾 Guardado: {csv_path}")
    return result

# Global parameters: inclusive range of years analysed by every query.
# Narrow the range (e.g. 2015, 2015) to restrict the whole analysis.
year_from, year_to = 2015, 2025
# Predicate injected into the WHERE clause of each query.
where_period = f"YEAR BETWEEN {year_from} AND {year_to}"

# Helper: SQL expression mapping PICKUP_HOUR to the requested time bands
# (06-09 and 17-20); every other hour is labelled 'OTRAS'.
FRANJA_CASE = """
CASE
  WHEN PICKUP_HOUR BETWEEN 6  AND 9  THEN '06-09'
  WHEN PICKUP_HOUR BETWEEN 17 AND 20 THEN '17-20'
  ELSE 'OTRAS'
END
"""

# Helper: SQL expression bucketing trips by TRIP_DISTANCE (thresholds can be
# adjusted). NULL distances get their own label: without the first branch
# both comparisons evaluate to NULL and the row would fall through to ELSE,
# being counted as a long trip.
SHORT_LONG_CASE = """
CASE
  WHEN TRIP_DISTANCE IS NULL THEN 'UNKNOWN'
  WHEN TRIP_DISTANCE < 1.0 THEN 'SHORT<1mi'
  WHEN TRIP_DISTANCE < 5.0 THEN 'MED<5mi'
  ELSE 'LONG>=5mi'
END
"""

# -----------------------------
# (a) Top 10 pickup zones by monthly volume
# -----------------------------
# Counts trips per (YEAR, MONTH, PU_BOROUGH, PU_ZONE) and keeps the 10 busiest
# zones of each month. The window ORDER BY carries the same tie-breakers as
# the final ORDER BY, so the zone kept at rank 10 is reproducible when two
# zones have the same count.
run("a_top10_pickup_por_mes", f"""
SELECT
  YEAR, MONTH, PU_BOROUGH, PU_ZONE,
  COUNT(*) AS TRIPS
FROM {TABLE}
WHERE {where_period}
GROUP BY YEAR, MONTH, PU_BOROUGH, PU_ZONE
QUALIFY ROW_NUMBER() OVER (
  PARTITION BY YEAR, MONTH
  ORDER BY COUNT(*) DESC, PU_BOROUGH, PU_ZONE
) <= 10
ORDER BY YEAR, MONTH, TRIPS DESC, PU_BOROUGH, PU_ZONE
""")

# -----------------------------
# (b) Top 10 dropoff zones by monthly volume
# -----------------------------
# Counts trips per (YEAR, MONTH, DO_BOROUGH, DO_ZONE) and keeps the 10 busiest
# zones of each month. The window ORDER BY carries the same tie-breakers as
# the final ORDER BY, so the zone kept at rank 10 is reproducible when two
# zones have the same count.
run("b_top10_dropoff_por_mes", f"""
SELECT
  YEAR, MONTH, DO_BOROUGH, DO_ZONE,
  COUNT(*) AS TRIPS
FROM {TABLE}
WHERE {where_period}
GROUP BY YEAR, MONTH, DO_BOROUGH, DO_ZONE
QUALIFY ROW_NUMBER() OVER (
  PARTITION BY YEAR, MONTH
  ORDER BY COUNT(*) DESC, DO_BOROUGH, DO_ZONE
) <= 10
ORDER BY YEAR, MONTH, TRIPS DESC, DO_BOROUGH, DO_ZONE
""")

# -----------------------------
# (c) Monthly evolution of total_amount and tip_pct by pickup borough
# -----------------------------
# One row per (YEAR, MONTH, PU_BOROUGH) with the mean TOTAL_AMOUNT and the
# mean TIP_PCT of the trips inside the configured period.
run("c_evol_total_y_tip_por_borough", f"""
SELECT
  YEAR, MONTH, PU_BOROUGH,
  AVG(TOTAL_AMOUNT) AS AVG_TOTAL,
  AVG(TIP_PCT)      AS AVG_TIP_PCT
FROM {TABLE}
WHERE {where_period}
GROUP BY YEAR, MONTH, PU_BOROUGH
ORDER BY YEAR, MONTH, PU_BOROUGH
""")

# -----------------------------
# (d) Average ticket (avg total_amount) by service_type and month
# -----------------------------
# The SERVICE column is exposed as SERVICE_TYPE in the output; one row per
# (YEAR, MONTH, SERVICE) with the mean TOTAL_AMOUNT.
run("d_ticket_promedio_por_service_mes", f"""
SELECT
  YEAR, MONTH, SERVICE AS SERVICE_TYPE,
  AVG(TOTAL_AMOUNT) AS AVG_TICKET
FROM {TABLE}
WHERE {where_period}
GROUP BY YEAR, MONTH, SERVICE
ORDER BY YEAR, MONTH, SERVICE
""")

# -----------------------------
# (e) Trips by hour of day and day of week (peaks)
# -----------------------------
# Trip count for every (DAY_OF_WEEK, PICKUP_HOUR) combination over the whole
# configured period (not split by year or month).
run("e_trips_por_hora_y_dow", f"""
SELECT
  DAY_OF_WEEK, PICKUP_HOUR,
  COUNT(*) AS TRIPS
FROM {TABLE}
WHERE {where_period}
GROUP BY DAY_OF_WEEK, PICKUP_HOUR
ORDER BY DAY_OF_WEEK, PICKUP_HOUR
""")

# -----------------------------
# (f) p50/p90 of trip_duration_min by pickup borough
# -----------------------------
# Uses APPROX_PERCENTILE, so P50_MIN and P90_MIN are approximations rather
# than exact percentiles.
run("f_p50_p90_duracion_por_borough", f"""
SELECT
  PU_BOROUGH,
  APPROX_PERCENTILE(TRIP_DURATION_MIN, 0.50) AS P50_MIN,
  APPROX_PERCENTILE(TRIP_DURATION_MIN, 0.90) AS P90_MIN
FROM {TABLE}
WHERE {where_period}
GROUP BY PU_BOROUGH
ORDER BY PU_BOROUGH
""")

# -----------------------------
# (g) avg_speed_mph by time band (6-9, 17-20) and borough
# -----------------------------
# Only the two requested bands are reported. The band filter is applied in
# WHERE (reusing FRANJA_CASE so the hour limits live in one place) instead of
# in HAVING, so the discarded 'OTRAS' rows are never aggregated. The period
# predicate is parenthesised so it stays intact when combined with AND.
run("g_speed_por_franja_y_borough", f"""
SELECT
  {FRANJA_CASE} AS FRANJA,
  PU_BOROUGH,
  AVG(AVG_SPEED_MPH) AS AVG_SPEED
FROM {TABLE}
WHERE ({where_period})
  AND ({FRANJA_CASE}) IN ('06-09','17-20')
GROUP BY FRANJA, PU_BOROUGH
ORDER BY FRANJA, PU_BOROUGH
""")

# -----------------------------
# (h) Share of trips by payment_desc and its relation to tip_pct
# -----------------------------
# `agg` counts trips and averages TIP_PCT per PAYMENT_DESC; `total` is the
# single-row grand total, cross-joined so each row can be divided by it to
# obtain SHARE_TRIPS.
run("h_share_pago_y_tip", f"""
WITH base AS (
  SELECT PAYMENT_DESC, TIP_PCT FROM {TABLE} WHERE {where_period}
),
agg AS (
  SELECT
    PAYMENT_DESC,
    COUNT(*) AS TRIPS,
    AVG(TIP_PCT) AS AVG_TIP_PCT
  FROM base
  GROUP BY PAYMENT_DESC
),
total AS (
  SELECT SUM(TRIPS) AS T FROM agg
)
SELECT
  a.PAYMENT_DESC,
  a.TRIPS,
  (a.TRIPS / t.T)::FLOAT AS SHARE_TRIPS,
  a.AVG_TIP_PCT
FROM agg a CROSS JOIN total t
ORDER BY a.TRIPS DESC
""")

# -----------------------------
# (i) Rate_code_desc with the largest trip_distance and total_amount
# -----------------------------
# Ranks rate codes by the SUM (not the average) of TRIP_DISTANCE, using the
# SUM of TOTAL_AMOUNT as the secondary sort key.
run("i_ratecode_dist_y_total", f"""
SELECT
  RATE_DESC,
  SUM(TRIP_DISTANCE) AS SUM_DISTANCE,
  SUM(TOTAL_AMOUNT)  AS SUM_TOTAL
FROM {TABLE}
WHERE {where_period}
GROUP BY RATE_DESC
ORDER BY SUM_DISTANCE DESC, SUM_TOTAL DESC
""")

# -----------------------------
# (j) Yellow vs green mix by month and borough
# -----------------------------
# `grp` counts trips per (YEAR, MONTH, PU_BOROUGH, SERVICE); MIX is each
# service's share of its month/borough total. The total is a window SUM
# instead of a join back to an aggregated CTE: an equality join never matches
# NULL keys, so rows with a NULL PU_BOROUGH would be silently dropped, whereas
# PARTITION BY keeps them as their own group.
run("j_mix_service_por_mes_borough", f"""
WITH grp AS (
  SELECT YEAR, MONTH, PU_BOROUGH, SERVICE, COUNT(*) AS TRIPS
  FROM {TABLE}
  WHERE {where_period}
  GROUP BY YEAR, MONTH, PU_BOROUGH, SERVICE
)
SELECT
  YEAR, MONTH, PU_BOROUGH,
  SERVICE,
  TRIPS,
  (TRIPS / SUM(TRIPS) OVER (PARTITION BY YEAR, MONTH, PU_BOROUGH))::FLOAT AS MIX
FROM grp
ORDER BY YEAR, MONTH, PU_BOROUGH, SERVICE
""")

# -----------------------------
# (k) Top 20 PU->DO flows by volume and their average ticket
# -----------------------------
# One candidate row per (pickup zone, dropoff zone) pair. The zone columns
# are appended to ORDER BY as tie-breakers so that LIMIT 20 returns the same
# rows on every run when two flows have the same trip count.
run("k_top20_flujos_pu_do", f"""
SELECT
  PU_BOROUGH, PU_ZONE, DO_BOROUGH, DO_ZONE,
  COUNT(*) AS TRIPS,
  AVG(TOTAL_AMOUNT) AS AVG_TICKET
FROM {TABLE}
WHERE {where_period}
GROUP BY PU_BOROUGH, PU_ZONE, DO_BOROUGH, DO_ZONE
ORDER BY TRIPS DESC, PU_BOROUGH, PU_ZONE, DO_BOROUGH, DO_ZONE
LIMIT 20
""")

# -----------------------------
# (l) Distribution of passenger_count and its effect on total_amount
# -----------------------------
# Per PASSENGER_COUNT value: number of trips, mean TOTAL_AMOUNT and an
# approximate median (APPROX_PERCENTILE at 0.5) of TOTAL_AMOUNT.
run("l_dist_passenger_y_ticket", f"""
SELECT
  PASSENGER_COUNT,
  COUNT(*) AS TRIPS,
  AVG(TOTAL_AMOUNT) AS AVG_TICKET,
  APPROX_PERCENTILE(TOTAL_AMOUNT, 0.5) AS P50_TICKET
FROM {TABLE}
WHERE {where_period}
GROUP BY PASSENGER_COUNT
ORDER BY PASSENGER_COUNT
""")

# -----------------------------
# (m) Impact of tolls_amount and congestion_surcharge by zone
# -----------------------------
# Mean TOLLS_AMOUNT, CONGESTION_SURCHARGE and TOTAL_AMOUNT per pickup zone,
# highest average congestion surcharge first.
# NOTE(review): Snowflake places NULLs first under ORDER BY ... DESC by
# default, so a zone whose averages are NULL would lead the list — confirm
# whether NULLS LAST is wanted.
run("m_impacto_tolls_congestion_por_zona", f"""
SELECT
  PU_BOROUGH, PU_ZONE,
  AVG(TOLLS_AMOUNT)            AS AVG_TOLLS,
  AVG(CONGESTION_SURCHARGE)    AS AVG_CONG,
  AVG(TOTAL_AMOUNT)            AS AVG_TOTAL
FROM {TABLE}
WHERE {where_period}
GROUP BY PU_BOROUGH, PU_ZONE
ORDER BY AVG_CONG DESC, AVG_TOLLS DESC
""")

# -----------------------------
# (n) Share of short vs long trips by borough and seasonality (month)
# -----------------------------
# `b` labels every trip with its SHORT_LONG_CASE bucket and `agg` counts trips
# per (YEAR, MONTH, PU_BOROUGH, bucket). SHARE_BUCKET divides each count by
# the month/borough total, computed as a window SUM: the previous equality
# join back to a totals CTE never matched NULL PU_BOROUGH values and silently
# dropped those rows.
run("n_short_long_por_borough_mes", f"""
WITH b AS (
  SELECT YEAR, MONTH, PU_BOROUGH,
         {SHORT_LONG_CASE} AS LENGTH_BUCKET
  FROM {TABLE}
  WHERE {where_period}
),
agg AS (
  SELECT YEAR, MONTH, PU_BOROUGH, LENGTH_BUCKET, COUNT(*) AS TRIPS
  FROM b
  GROUP BY YEAR, MONTH, PU_BOROUGH, LENGTH_BUCKET
)
SELECT
  YEAR, MONTH, PU_BOROUGH, LENGTH_BUCKET,
  TRIPS,
  (TRIPS / SUM(TRIPS) OVER (PARTITION BY YEAR, MONTH, PU_BOROUGH))::FLOAT AS SHARE_BUCKET
FROM agg
ORDER BY YEAR, MONTH, PU_BOROUGH, LENGTH_BUCKET
""")

# -----------------------------
# (o) Differences by vendor in avg_speed_mph and trip_duration_min
# -----------------------------
# Per VENDOR_NAME: trip count, mean speed, mean duration and approximate
# p50/p90 of the duration; vendors with the most trips come first.
run("o_vendor_speed_y_duracion", f"""
SELECT
  VENDOR_NAME,
  COUNT(*) AS TRIPS,
  AVG(AVG_SPEED_MPH)     AS AVG_SPEED,
  AVG(TRIP_DURATION_MIN) AS AVG_DUR,
  APPROX_PERCENTILE(TRIP_DURATION_MIN, 0.50) AS P50_DUR,
  APPROX_PERCENTILE(TRIP_DURATION_MIN, 0.90) AS P90_DUR
FROM {TABLE}
WHERE {where_period}
GROUP BY VENDOR_NAME
ORDER BY TRIPS DESC
""")

# -----------------------------
# (p) Relation between payment method and tip_amount by hour
# -----------------------------
# One row per (PAYMENT_DESC, PICKUP_HOUR) with the trip count, the mean
# TIP_AMOUNT and the mean TIP_PCT.
run("p_tip_por_pago_y_hora", f"""
SELECT
  PAYMENT_DESC,
  PICKUP_HOUR,
  COUNT(*) AS TRIPS,
  AVG(TIP_AMOUNT) AS AVG_TIP,
  AVG(TIP_PCT)    AS AVG_TIP_PCT
FROM {TABLE}
WHERE {where_period}
GROUP BY PAYMENT_DESC, PICKUP_HOUR
ORDER BY PAYMENT_DESC, PICKUP_HOUR
""")

# -----------------------------
# (q) Zones with high p99 duration/distance (possible congestion/events)
# -----------------------------
# Approximate 99th percentile of TRIP_DURATION_MIN and TRIP_DISTANCE per
# pickup zone, sorted so the zones with the longest tail come first.
# NOTE(review): Snowflake places NULLs first under ORDER BY ... DESC by
# default — confirm whether zones with a NULL percentile should be pushed
# to the end instead.
run("q_p99_outliers_por_zona", f"""
SELECT
  PU_BOROUGH, PU_ZONE,
  APPROX_PERCENTILE(TRIP_DURATION_MIN, 0.99) AS P99_DUR,
  APPROX_PERCENTILE(TRIP_DISTANCE,     0.99) AS P99_DIST
FROM {TABLE}
WHERE {where_period}
GROUP BY PU_BOROUGH, PU_ZONE
ORDER BY P99_DUR DESC, P99_DIST DESC
""")

# -----------------------------
# (r) Yield per mile (total_amount / trip_distance) by borough and hour
# -----------------------------
# Averages the per-trip ratio TOTAL_AMOUNT / TRIP_DISTANCE. The CASE has no
# ELSE branch, so trips whose distance is not > 0 yield NULL and are ignored
# by AVG; this also avoids dividing by zero.
run("r_yield_por_borough_y_hora", f"""
SELECT
  PU_BOROUGH,
  PICKUP_HOUR,
  AVG(CASE WHEN TRIP_DISTANCE > 0 THEN TOTAL_AMOUNT / TRIP_DISTANCE END) AS AVG_YIELD_PER_MI
FROM {TABLE}
WHERE {where_period}
GROUP BY PU_BOROUGH, PICKUP_HOUR
ORDER BY PU_BOROUGH, PICKUP_HOUR
""")

# -----------------------------
# (s) YoY changes in volume and average ticket by service_type
# -----------------------------
# `m` holds one row per (SERVICE, YEAR, MONTH). Each month is matched to the
# same calendar month of the previous year through a self-join on
# (SERVICE, YEAR - 1, MONTH). A positional LAG(..., 12) would only be correct
# if every service had a row for every month: a single missing month shifts
# the comparison onto the wrong month. Months without a year-ago counterpart
# keep NULL in the *_YAGO and YOY_* columns. EQUAL_NULL keeps a NULL SERVICE
# comparable with itself.
run("s_yoy_vol_y_ticket_por_service", f"""
WITH m AS (
  SELECT
    SERVICE,
    YEAR,
    MONTH,
    DATE_FROM_PARTS(YEAR, MONTH, 1) AS YM,
    COUNT(*) AS TRIPS,
    AVG(TOTAL_AMOUNT) AS AVG_TICKET
  FROM {TABLE}
  WHERE {where_period}
  GROUP BY SERVICE, YEAR, MONTH
)
SELECT
  c.SERVICE, c.YM,
  c.TRIPS, p.TRIPS AS TRIPS_YAGO,
  CASE WHEN p.TRIPS IS NOT NULL AND p.TRIPS <> 0
       THEN (c.TRIPS - p.TRIPS) / p.TRIPS::FLOAT END AS YOY_TRIPS,
  c.AVG_TICKET, p.AVG_TICKET AS AVG_TICKET_YAGO,
  CASE WHEN p.AVG_TICKET IS NOT NULL AND p.AVG_TICKET <> 0
       THEN (c.AVG_TICKET - p.AVG_TICKET) / p.AVG_TICKET::FLOAT END AS YOY_TICKET
FROM m c
LEFT JOIN m p
  ON EQUAL_NULL(p.SERVICE, c.SERVICE)
 AND p.YEAR = c.YEAR - 1
 AND p.MONTH = c.MONTH
ORDER BY c.SERVICE, c.YM
""")


# -----------------------------
# (t) Days with high congestion_surcharge: effect on total_amount vs "normal"
# -----------------------------
# `d` aggregates every (day, pickup zone) pair; `p` is the approximate 75th
# percentile of the daily surcharge sum per zone. A day is "high" only when
# it is strictly above that percentile (adjust if preferred). With >= a zone
# whose p75 is 0 would have every day labelled ALTO_CONG, leaving no NORMAL
# baseline to compare against. The join uses EQUAL_NULL so zones with a NULL
# borough or zone name are not dropped by the equality match.
run("t_impacto_congestion_dias_altos", f"""
WITH d AS (
  SELECT
    TO_DATE(PICKUP_DATETIME) AS D,
    PU_BOROUGH, PU_ZONE,
    SUM(CONGESTION_SURCHARGE) AS SUM_CONG,
    AVG(TOTAL_AMOUNT)         AS AVG_TOTAL
  FROM {TABLE}
  WHERE {where_period}
  GROUP BY TO_DATE(PICKUP_DATETIME), PU_BOROUGH, PU_ZONE
),
p AS (
  SELECT
    PU_BOROUGH, PU_ZONE,
    APPROX_PERCENTILE(SUM_CONG, 0.75) AS P75_CONG
  FROM d
  GROUP BY PU_BOROUGH, PU_ZONE
),
labeled AS (
  SELECT
    d.*,
    CASE WHEN d.SUM_CONG > p.P75_CONG THEN 'ALTO_CONG' ELSE 'NORMAL' END AS DIA_TIPO
  FROM d JOIN p
    ON EQUAL_NULL(d.PU_BOROUGH, p.PU_BOROUGH)
   AND EQUAL_NULL(d.PU_ZONE, p.PU_ZONE)
)
SELECT
  PU_BOROUGH, PU_ZONE, DIA_TIPO,
  AVG(AVG_TOTAL) AS AVG_TOTAL_DIA
FROM labeled
GROUP BY PU_BOROUGH, PU_ZONE, DIA_TIPO
ORDER BY PU_BOROUGH, PU_ZONE, DIA_TIPO
""")

# Final status line, printed once every query above has run; each result was
# written as a CSV into OUTDIR by run().
print("\n✅ Análisis 05 completado. Archivos CSV en evidence/analysis_05/")


✅ Conectado a NYC_TAXI_DM (schema=ANALYTICS)

▶ a_top10_pickup_por_mes
 YEAR  MONTH PU_BOROUGH                      PU_ZONE  TRIPS
 2015      1  Manhattan        Upper East Side South 465778
 2015      1  Manhattan               Midtown Center 444793
 2015      1  Manhattan        Upper East Side North 443941
 2015      1  Manhattan                 East Village 436165
 2015      1  Manhattan    Times Sq/Theatre District 427351
 2015      1  Manhattan                     Union Sq 421911
 2015      1  Manhattan                  Murray Hill 409741
 2015      1  Manhattan                 Midtown East 409089
 2015      1  Manhattan                 Clinton East 399299
 2015      1  Manhattan Penn Station/Madison Sq West 391445
 2015      2  Manhattan        Upper East Side South 445672
 2015      2  Manhattan                 East Village 420282
 2015      2  Manhattan               Midtown Center 416770
 2015      2  Manhattan        Upper East Side North 415773
 2015      2  Manhattan      