In [0]:
# Databricks notebook source
# MAGIC %md
# MAGIC # Capa Plata - Uber NYC Data Pipeline
# MAGIC 
# MAGIC ## Objetivo
# MAGIC Limpiar, transformar y enriquecer los datos de la capa Bronce para crear datos listos para análisis.
# MAGIC 
# MAGIC ## Responsabilidades
# MAGIC - Leer datos desde la capa Bronce
# MAGIC - Limpiar y validar datos (tipos, nulos, formatos)
# MAGIC - Enriquecer con ingeniería de características
# MAGIC - Aplicar reglas de negocio
# MAGIC - Guardar datos curados en formato Delta Lake
# MAGIC 
# MAGIC ## Entrada
# MAGIC - Tabla Delta: `uber_bronze` desde `/Volumes/workspace/default/uber_etl_azure/bronze/`
# MAGIC 
# MAGIC ## Salida
# MAGIC - Tabla Delta: `uber_silver` en `/Volumes/workspace/default/uber_etl_azure/silver/`


In [0]:
# COMMAND ----------

# MAGIC %md
# MAGIC ## 1. Configuración e Imports

# COMMAND ----------

In [0]:
# Importar librerías necesarias
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import os

In [0]:
# Storage locations for the Bronze (input) and Silver (output) layers.
# The trailing slash matters: other cells append the table name directly
# to these strings (e.g. f"{bronze_path}uber_bronze").
_etl_root = "/Volumes/workspace/default/uber_etl_azure"
bronze_path = f"{_etl_root}/bronze/"
silver_path = f"{_etl_root}/silver/"

In [0]:
# Create the Silver output directory if it does not exist yet.
dbutils.fs.mkdirs(silver_path)

# Echo the resolved configuration so the run log shows source and target.
print(
    "✅ Configuración de Capa Plata inicializada",
    f"🥉 Origen Bronce: {bronze_path}",
    f"🥈 Destino Plata: {silver_path}",
    sep="\n",
)

✅ Configuración de Capa Plata inicializada
🥉 Origen Bronce: /Volumes/workspace/default/uber_etl_azure/bronze/
🥈 Destino Plata: /Volumes/workspace/default/uber_etl_azure/silver/


In [0]:
# COMMAND ----------

# MAGIC %md
# MAGIC ## 2. Validación de Dependencias

# COMMAND ----------

In [0]:
# Dependency check: the Bronze table directory must exist and be listable.
try:
    bronze_files = dbutils.fs.ls(f"{bronze_path}uber_bronze")
    print("✅ Tabla Bronce encontrada:")

    # Print at most the first five entries, then summarise whatever is left.
    preview_limit = 5
    for entry in bronze_files[:preview_limit]:
        print(f"   📄 {entry.name}")

    remaining = len(bronze_files) - preview_limit
    if remaining > 0:
        print(f"   ... y {remaining} archivos más")

except Exception as e:
    # Report the failure, point at the upstream notebook, and abort by re-raising.
    print(f"❌ Error: Tabla Bronce no encontrada - {e}")
    print("💡 Asegúrate de ejecutar primero el notebook '01_Uber_Bronze_Layer'")
    raise

✅ Tabla Bronce encontrada:
   📄 _delta_log/
   📄 part-00000-354416d4-62a6-4cfa-aca1-782036f37426.c000.snappy.parquet
   📄 part-00000-37e02841-b576-4696-a44d-aabb9a1f6a62.c000.snappy.parquet
   📄 part-00000-7295e936-95fa-45f8-b62d-8d8b6936aba1.c000.snappy.parquet
   📄 part-00000-a1fec7a3-f5d5-48b2-a135-cc059da438bd.c000.snappy.parquet
   ... y 12 archivos más


In [0]:
# COMMAND ----------

# MAGIC %md
# MAGIC ## 3. Lectura de Datos Bronce

# COMMAND ----------

In [0]:
# Load the Bronze Delta table. The resulting DataFrame is named df_silver
# because every later cell transforms it in place towards the Silver schema.
print("📥 Leyendo datos de la capa Bronce...")

try:
    bronze_table_path = f"{bronze_path}uber_bronze"
    df_silver = spark.read.format("delta").load(bronze_table_path)

    # bronze_count is reused by later cells as the baseline row count.
    bronze_count = df_silver.count()
    print(f"📊 Registros leídos de Bronce: {bronze_count:,}")

    # An empty Bronze table is treated as a hard failure.
    if not bronze_count:
        raise Exception("❌ No se encontraron datos en la capa Bronce")

    print("✅ Datos Bronce cargados exitosamente")

except Exception as e:
    # Log and re-raise so the notebook run stops here.
    print(f"❌ Error al leer capa Bronce: {e}")
    raise

📥 Leyendo datos de la capa Bronce...
📊 Registros leídos de Bronce: 4,534,327
✅ Datos Bronce cargados exitosamente


In [0]:
# COMMAND ----------

# MAGIC %md
# MAGIC ## 4. Análisis de Calidad de Datos

# COMMAND ----------

In [0]:
# Section banner for the Bronze data-quality report.
print("🔍 ANÁLISIS DE CALIDAD DE DATOS BRONCE:", "=" * 45, sep="\n")


🔍 ANÁLISIS DE CALIDAD DE DATOS BRONCE:


In [0]:
# Show the current schema.
# df_silver still carries the original Bronze column names at this point
# (e.g. "Date/Time", "Lat", "Lon", "Base"); renaming happens in the cleaning step.
print("📋 Esquema actual:")
df_silver.printSchema()

📋 Esquema actual:
root
 |-- Date/Time: string (nullable = true)
 |-- Lat: double (nullable = true)
 |-- Lon: double (nullable = true)
 |-- Base: string (nullable = true)
 |-- ingestion_timestamp: timestamp (nullable = true)
 |-- source_file: string (nullable = true)
 |-- bronze_layer_version: string (nullable = true)



In [0]:
# Null-value analysis per column.
# All per-column null counts are computed in ONE aggregation (a single pass over
# the data) instead of launching a separate filter().count() job for each column.
print("\n🔍 Análisis de valores nulos:")

# count(when(cond, 1)) counts only the rows where cond is true, because when()
# without otherwise() yields NULL for the remaining rows and count() skips NULLs.
# Positional aliases (n0, n1, ...) avoid relying on the raw column names, which
# contain characters such as "/" in the Bronze schema.
null_counts_row = df_silver.select(
    [
        count(when(col(col_name).isNull(), 1)).alias(f"n{position}")
        for position, col_name in enumerate(df_silver.columns)
    ]
).first()

# The aggregated Row has one value per column, in the same order as df_silver.columns.
for col_name, null_count in zip(df_silver.columns, null_counts_row):
    null_percentage = (null_count / bronze_count) * 100
    print(f"   {col_name}: {null_count:,} nulos ({null_percentage:.2f}%)")


🔍 Análisis de valores nulos:
   Date/Time: 0 nulos (0.00%)
   Lat: 0 nulos (0.00%)
   Lon: 0 nulos (0.00%)
   Base: 0 nulos (0.00%)
   ingestion_timestamp: 0 nulos (0.00%)
   source_file: 0 nulos (0.00%)
   bronze_layer_version: 0 nulos (0.00%)


In [0]:
# Duplicate analysis: compare the total row count with the number of fully
# distinct rows. This cell only reports; df_silver itself is not modified.
print("\n🔍 Análisis de duplicados:")

unique_count = df_silver.distinct().count()
duplicate_count = bronze_count - unique_count

print(f"   Registros únicos: {unique_count:,}")
print(f"   Registros duplicados: {duplicate_count:,}")


🔍 Análisis de duplicados:
   Registros únicos: 4,451,746
   Registros duplicados: 82,581


In [0]:
# Data sample: print the first three rows without truncating long cell values.
print(f"\n📋 Muestra de datos Bronce:")
df_silver.show(3, truncate=False)


📋 Muestra de datos Bronce:
+----------------+-------+--------+------+--------------------------+-----------------------+--------------------+
|Date/Time       |Lat    |Lon     |Base  |ingestion_timestamp       |source_file            |bronze_layer_version|
+----------------+-------+--------+------+--------------------------+-----------------------+--------------------+
|7/1/2014 0:03:00|40.7586|-73.9706|B02512|2025-07-28 17:06:30.446467|uber-raw-data-jul14.csv|1.0                 |
|7/1/2014 0:05:00|40.7605|-73.9994|B02512|2025-07-28 17:06:30.446467|uber-raw-data-jul14.csv|1.0                 |
|7/1/2014 0:06:00|40.732 |-73.9999|B02512|2025-07-28 17:06:30.446467|uber-raw-data-jul14.csv|1.0                 |
+----------------+-------+--------+------+--------------------------+-----------------------+--------------------+
only showing top 3 rows


In [0]:
# COMMAND ----------

# MAGIC %md
# MAGIC ## 5. Limpieza de Datos

# COMMAND ----------

In [0]:
# Section banner for the cleaning stage.
print("🧹 INICIANDO LIMPIEZA DE DATOS...", "=" * 35, sep="\n")

🧹 INICIANDO LIMPIEZA DE DATOS...


In [0]:
# 1. Rename the raw Bronze columns to snake_case names.
print("📝 Paso 1: Renombrando columnas...")

column_renames = {
    "Date/Time": "pickup_datetime",
    "Lat": "latitude",
    "Lon": "longitude",
    "Base": "base_code",
}
for raw_name, clean_name in column_renames.items():
    df_silver = df_silver.withColumnRenamed(raw_name, clean_name)

print("✅ Columnas renombradas exitosamente")

📝 Paso 1: Renombrando columnas...
✅ Columnas renombradas exitosamente


In [0]:
# 2. Convert data types.
print("🔧 Paso 2: Convirtiendo tipos de datos...")

# pickup_datetime arrives as a string such as "7/1/2014 0:03:00"
# (month/day/year, no zero padding on month, day or hour).
parsed_pickup = to_timestamp(col("pickup_datetime"), "M/d/yyyy H:mm:ss")

df_silver = (
    df_silver
    .withColumn("pickup_datetime", parsed_pickup)
    # Explicit casts make sure both coordinates are doubles.
    .withColumn("latitude", col("latitude").cast("double"))
    .withColumn("longitude", col("longitude").cast("double"))
)

print("✅ Tipos de datos convertidos exitosamente")

🔧 Paso 2: Convirtiendo tipos de datos...
✅ Tipos de datos convertidos exitosamente


In [0]:
# 3. Validate geographic ranges against an approximate NYC bounding box.
# This step only counts offending rows; the actual filtering is done in step 5.
print("🗺️ Paso 3: Validando coordenadas geográficas...")

# Approximate bounding box for NYC (reused by the filtering step).
NYC_LAT_MIN = 40.4774
NYC_LAT_MAX = 40.9176
NYC_LON_MIN = -74.2591
NYC_LON_MAX = -73.7004

# A row is out of bounds when either coordinate falls outside its range.
lat_outside = (col("latitude") < NYC_LAT_MIN) | (col("latitude") > NYC_LAT_MAX)
lon_outside = (col("longitude") < NYC_LON_MIN) | (col("longitude") > NYC_LON_MAX)
out_of_bounds = df_silver.where(lat_outside | lon_outside).count()

within_bounds = bronze_count - out_of_bounds
print(f"   Registros fuera de NYC bounds: {out_of_bounds:,}")
print(f"   Registros válidos geográficamente: {within_bounds:,}")

🗺️ Paso 3: Validando coordenadas geográficas...
   Registros fuera de NYC bounds: 31,910
   Registros válidos geográficamente: 4,502,417


In [0]:
# 4. Drop rows that have a NULL in any critical field.
# pickup_datetime can be NULL here when the timestamp parsing in step 2 yielded no value.
print("🚫 Paso 4: Eliminando registros con valores nulos críticos...")

records_before_null_filter = df_silver.count()

has_pickup = col("pickup_datetime").isNotNull()
has_latitude = col("latitude").isNotNull()
has_longitude = col("longitude").isNotNull()
has_base = col("base_code").isNotNull()
df_silver = df_silver.where(has_pickup & has_latitude & has_longitude & has_base)

records_after_null_filter = df_silver.count()
removed_null_records = records_before_null_filter - records_after_null_filter

print(f"   Registros eliminados por nulos: {removed_null_records:,}")
print(f"   Registros restantes: {records_after_null_filter:,}")

🚫 Paso 4: Eliminando registros con valores nulos críticos...
   Registros eliminados por nulos: 0
   Registros restantes: 4,534,327


In [0]:
# 5. Additional quality filters: keep only rows inside the NYC bounding box
#    whose pickup timestamp falls in 2014.
print("✨ Paso 5: Aplicando filtros de calidad...")

# between() is inclusive on both ends, matching the >= / <= comparisons.
inside_nyc = (
    col("latitude").between(NYC_LAT_MIN, NYC_LAT_MAX)
    & col("longitude").between(NYC_LON_MIN, NYC_LON_MAX)
)
pickup_in_2014 = year(col("pickup_datetime")) == 2014

df_silver = df_silver.where(inside_nyc).where(pickup_in_2014)

# Summarise the whole cleaning stage relative to the Bronze row count.
final_count_after_cleaning = df_silver.count()
total_removed = bronze_count - final_count_after_cleaning
removed_pct = (total_removed/bronze_count)*100
retention_pct = (final_count_after_cleaning/bronze_count)*100

print("✅ LIMPIEZA COMPLETADA:")
print(f"   Registros originales: {bronze_count:,}")
print(f"   Registros después de limpieza: {final_count_after_cleaning:,}")
print(f"   Registros removidos: {total_removed:,} ({removed_pct:.2f}%)")
print(f"   Tasa de retención: {retention_pct:.2f}%")

✨ Paso 5: Aplicando filtros de calidad...
✅ LIMPIEZA COMPLETADA:
   Registros originales: 4,534,327
   Registros después de limpieza: 4,502,417
   Registros removidos: 31,910 (0.70%)
   Tasa de retención: 99.30%


In [0]:
# COMMAND ----------

# MAGIC %md
# MAGIC ## 6. Ingeniería de Características

# COMMAND ----------

In [0]:
print("🔧 INGENIERÍA DE CARACTERÍSTICAS...")
print("=" * 35)

# Derive calendar/time features from the pickup timestamp.
print("📅 Creando características temporales...")

pickup_ts = col("pickup_datetime")
temporal_features = [
    ("pickup_hour", hour(pickup_ts)),
    # Full weekday name, e.g. "Tuesday".
    ("pickup_day_of_week", date_format(pickup_ts, "EEEE")),
    # Numeric weekday; the recorded output shows Saturday mapped to 7.
    ("pickup_day_of_week_num", dayofweek(pickup_ts)),
    ("pickup_month", month(pickup_ts)),
    ("pickup_year", year(pickup_ts)),
    # Stored as a formatted string ("yyyy-MM-dd"), not as a date type.
    ("pickup_date", date_format(pickup_ts, "yyyy-MM-dd")),
    ("pickup_week_of_year", weekofyear(pickup_ts)),
]

# Columns are appended in list order, so the resulting schema order is preserved.
for feature_name, feature_expr in temporal_features:
    df_silver = df_silver.withColumn(feature_name, feature_expr)

🔧 INGENIERÍA DE CARACTERÍSTICAS...
📅 Creando características temporales...


In [0]:
# Bucket the pickup hour into named parts of the day.
# Each band is (first_hour, last_hour, label), inclusive on both ends; any hour
# not covered by a band (0-5) falls through to the default label.
pickup_hour_col = col("pickup_hour")
hour_bands = [
    (6, 9, "Morning Rush"),
    (10, 16, "Midday"),
    (17, 20, "Evening Rush"),
    (21, 23, "Night"),
]

first_start, first_end, first_label = hour_bands[0]
time_category_expr = when(pickup_hour_col.between(first_start, first_end), first_label)
for band_start, band_end, band_label in hour_bands[1:]:
    time_category_expr = time_category_expr.when(
        pickup_hour_col.between(band_start, band_end), band_label
    )

df_silver = df_silver.withColumn(
    "time_category",
    time_category_expr.otherwise("Late Night/Early Morning"),
)

In [0]:
# Classify each pickup as weekday or weekend.
# Weekday numbers 2..6 are treated as working days; everything else
# (1 and 7 in the numbering produced by dayofweek) is labelled as weekend.
weekday_numbers = [2, 3, 4, 5, 6]
is_weekday = col("pickup_day_of_week_num").isin(*weekday_numbers)

df_silver = df_silver.withColumn(
    "day_type",
    when(is_weekday, "Weekday").otherwise("Weekend"),
)

In [0]:
# Build a trip identifier of the form
#   <base_code>_<yyyyMMdd_HHmmss>_<lat * 10000>_<lon * 10000>
# NOTE(review): this key is not guaranteed to be unique - two pickups with the same
# base, timestamp and rounded coordinates get the same id (the recorded run
# reports 98.19% uniqueness).
# `round` below is the Spark column function brought in by the star import of
# pyspark.sql.functions, not the Python builtin.
id_separator = lit("_")
pickup_stamp = date_format(col("pickup_datetime"), "yyyyMMdd_HHmmss")
latitude_scaled = round(col("latitude") * 10000).cast("int")
longitude_scaled = round(col("longitude") * 10000).cast("int")

trip_id_expr = concat(
    col("base_code"),
    id_separator,
    pickup_stamp,
    id_separator,
    latitude_scaled,
    id_separator,
    longitude_scaled,
)
df_silver = df_silver.withColumn("trip_id", trip_id_expr)

In [0]:
# Attach processing metadata: when the Silver row was produced and which
# version of the Silver logic produced it.
processing_ts_expr = current_timestamp()
layer_version_expr = lit("1.0")

df_silver = (
    df_silver
    .withColumn("silver_processing_timestamp", processing_ts_expr)
    .withColumn("silver_layer_version", layer_version_expr)
)

print("✅ Características creadas exitosamente")

✅ Características creadas exitosamente


In [0]:
# COMMAND ----------

# MAGIC %md
# MAGIC ## 7. Validación de Datos Transformados

# COMMAND ----------

In [0]:
# Section banner followed by the schema after all transformations.
print(
    "🔍 VALIDACIÓN DE DATOS TRANSFORMADOS:",
    "=" * 40,
    "📋 Esquema final:",
    sep="\n",
)
df_silver.printSchema()

🔍 VALIDACIÓN DE DATOS TRANSFORMADOS:
📋 Esquema final:
root
 |-- pickup_datetime: timestamp (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- base_code: string (nullable = true)
 |-- ingestion_timestamp: timestamp (nullable = true)
 |-- source_file: string (nullable = true)
 |-- bronze_layer_version: string (nullable = true)
 |-- pickup_hour: integer (nullable = true)
 |-- pickup_day_of_week: string (nullable = true)
 |-- pickup_day_of_week_num: integer (nullable = true)
 |-- pickup_month: integer (nullable = true)
 |-- pickup_year: integer (nullable = true)
 |-- pickup_date: string (nullable = true)
 |-- pickup_week_of_year: integer (nullable = true)
 |-- time_category: string (nullable = false)
 |-- day_type: string (nullable = false)
 |-- trip_id: string (nullable = true)
 |-- silver_processing_timestamp: timestamp (nullable = false)
 |-- silver_layer_version: string (nullable = false)



In [0]:
# General statistics: row and column counts of the transformed DataFrame.
print("\n📊 Estadísticas generales:")

total_rows = df_silver.count()
total_columns = len(df_silver.columns)

print(f"   Total registros: {total_rows:,}")
print(f"   Total columnas: {total_columns}")


📊 Estadísticas generales:
   Total registros: 4,502,417
   Total columnas: 19


In [0]:
# Row counts per value for each categorical column, most frequent first.
for dimension in ("time_category", "day_type", "base_code"):
    print(f"\n📈 Distribución por {dimension}:")
    (
        df_silver
        .groupBy(dimension)
        .count()
        .orderBy(col("count").desc())
        .show()
    )


📈 Distribución por time_category:
+--------------------+-------+
|       time_category|  count|
+--------------------+-------+
|              Midday|1498592|
|        Evening Rush|1232653|
|               Night| 687346|
|        Morning Rush| 682876|
|Late Night/Early ...| 400950|
+--------------------+-------+


📈 Distribución por day_type:
+--------+-------+
|day_type|  count|
+--------+-------+
| Weekday|3376700|
| Weekend|1125717|
+--------+-------+


📈 Distribución por base_code:
+---------+-------+
|base_code|  count|
+---------+-------+
|   B02617|1449558|
|   B02598|1383720|
|   B02682|1205017|
|   B02764| 261394|
|   B02512| 202728|
+---------+-------+



In [0]:
# Preview three transformed rows, restricted to the most relevant columns.
preview_columns = [
    "trip_id",
    "pickup_datetime",
    "latitude",
    "longitude",
    "base_code",
    "pickup_hour",
    "pickup_day_of_week",
    "time_category",
    "day_type",
]

print("\n📋 Muestra de datos transformados:")
df_silver.select(*preview_columns).show(3, truncate=False)


📋 Muestra de datos transformados:
+-------------------------------------+-------------------+--------+---------+---------+-----------+------------------+------------------------+--------+
|trip_id                              |pickup_datetime    |latitude|longitude|base_code|pickup_hour|pickup_day_of_week|time_category           |day_type|
+-------------------------------------+-------------------+--------+---------+---------+-----------+------------------+------------------------+--------+
|B02512_20140401_001100_407690_-739549|2014-04-01 00:11:00|40.769  |-73.9549 |B02512   |0          |Tuesday           |Late Night/Early Morning|Weekday |
|B02512_20140401_001700_407267_-740345|2014-04-01 00:17:00|40.7267 |-74.0345 |B02512   |0          |Tuesday           |Late Night/Early Morning|Weekday |
|B02512_20140401_002100_407316_-739873|2014-04-01 00:21:00|40.7316 |-73.9873 |B02512   |0          |Tuesday           |Late Night/Early Morning|Weekday |
+-------------------------------------+--

In [0]:
# COMMAND ----------

# MAGIC %md
# MAGIC ## 8. Guardar en Capa Plata

# COMMAND ----------

In [0]:
print("💾 Guardando datos en la capa Plata...")

try:
    silver_table_path = f"{silver_path}uber_silver"

    # Write as Delta, fully replacing any previous table contents;
    # overwriteSchema allows the stored schema to be replaced as well.
    (
        df_silver.write
        .format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "true")
        .save(silver_table_path)
    )
    print("✅ Datos guardados exitosamente en formato Delta")

    # Integrity check: read the table back and compare its row count with the
    # count taken right after cleaning.
    verification_df = spark.read.format("delta").load(silver_table_path)
    verification_count = verification_df.count()
    print(f"🔍 Verificación: {verification_count:,} registros en la tabla Delta")

    if verification_count != final_count_after_cleaning:
        raise Exception("❌ Error de integridad detectado")

    print("✅ Verificación de integridad exitosa")

except Exception as e:
    # Log and re-raise so a failed write or failed check stops the notebook.
    print(f"❌ Error al guardar en capa Plata: {e}")
    raise

💾 Guardando datos en la capa Plata...
✅ Datos guardados exitosamente en formato Delta
🔍 Verificación: 4,502,417 registros en la tabla Delta
✅ Verificación de integridad exitosa


In [0]:
# COMMAND ----------

# MAGIC %md
# MAGIC ## 9. Análisis de Calidad Final

# COMMAND ----------

In [0]:
print("📊 ANÁLISIS DE CALIDAD FINAL:")
print("=" * 35)

# Data-quality metrics.
# These are measured on verification_df, i.e. the Delta table read back from
# storage in the save step, rather than on the lazy df_silver. That keeps every
# metric consistent with total_trips (which also comes from the stored table)
# and avoids re-running the whole Bronze -> Silver transformation for each of
# the actions below.
total_trips = verification_count
unique_trip_ids = verification_df.select("trip_id").distinct().count()
unique_bases = verification_df.select("base_code").distinct().count()

# min/max here are the Spark aggregate functions from the star import of
# pyspark.sql.functions, not the Python builtins.
date_range = verification_df.agg(
    min("pickup_datetime").alias("min_date"),
    max("pickup_datetime").alias("max_date")
).first()

# Guard the ratio so an empty table reports 0% instead of raising ZeroDivisionError.
trip_id_uniqueness = (unique_trip_ids / total_trips) * 100 if total_trips else 0.0

print(f"📈 Métricas de calidad:")
print(f"   Total viajes procesados: {total_trips:,}")
print(f"   IDs únicos de viajes: {unique_trip_ids:,}")
print(f"   Bases únicas: {unique_bases}")
print(f"   Rango de fechas: {date_range['min_date']} a {date_range['max_date']}")
print(f"   Tasa de unicidad de trip_id: {trip_id_uniqueness:.2f}%")

# Per-month statistics: trip volume, number of active bases and mean coordinates.
print(f"\n📅 Estadísticas por mes:")
monthly_stats = verification_df.groupBy("pickup_month") \
    .agg(
        count("*").alias("trip_count"),
        countDistinct("base_code").alias("unique_bases"),
        avg("latitude").alias("avg_lat"),
        avg("longitude").alias("avg_lon")
    ) \
    .orderBy("pickup_month")

monthly_stats.show()

📊 ANÁLISIS DE CALIDAD FINAL:
📈 Métricas de calidad:
   Total viajes procesados: 4,502,417
   IDs únicos de viajes: 4,420,746
   Bases únicas: 5
   Rango de fechas: 2014-04-01 00:00:00 a 2014-09-30 22:59:00
   Tasa de unicidad de trip_id: 98.19%

📅 Estadísticas por mes:
+------------+----------+------------+------------------+------------------+
|pickup_month|trip_count|unique_bases|           avg_lat|           avg_lon|
+------------+----------+------------+------------------+------------------+
|           4|    562101|           5| 40.73952012076089|-73.97738297619023|
|           5|    648998|           5|40.739550199075666|-73.97578937284925|
|           6|    659436|           5| 40.73941212748435|-73.97509419337048|
|           7|    789920|           5|40.738619726048285|-73.97374905914631|
|           8|    821662|           5|  40.7372086684074|-73.97187460367057|
|           9|   1020300|           5| 40.73847831412437|-73.97315561903594|
+------------+----------+------------

In [0]:
# COMMAND ----------

# MAGIC %md
# MAGIC ## 10. Resumen Final y Próximos Pasos

# COMMAND ----------

In [0]:
# Final run summary: volumes, retention, schema growth and output location.
print("📊 RESUMEN FINAL - CAPA PLATA")
print("=" * 50)

records_filtered = bronze_count - verification_count
retention_pct = (verification_count/bronze_count)*100

print(f"🥉 Registros de entrada (Bronce): {bronze_count:,}")
print(f"🥈 Registros de salida (Plata): {verification_count:,}")
print(f"📉 Registros filtrados: {records_filtered:,}")
print(f"📈 Tasa de retención: {retention_pct:.2f}%")

# The Bronze table is re-read here only to obtain its column count, because
# df_silver no longer has the original Bronze shape.
bronze_column_count = len(spark.read.format('delta').load(f'{bronze_path}uber_bronze').columns)
columns_added = len(df_silver.columns) - bronze_column_count
print(f"🏷️ Columnas añadidas: {columns_added}")
print("💾 Formato de salida: Delta Lake")
print(f"📍 Ubicación: {silver_path}uber_silver")

print("\n🔧 Transformaciones aplicadas:")
for applied_step in (
    "   ✅ Limpieza de datos y validación de tipos",
    "   ✅ Filtros geográficos (NYC bounds)",
    "   ✅ Filtros temporales (año 2014)",
    "   ✅ Eliminación de valores nulos",
    "   ✅ Ingeniería de características temporales",
    "   ✅ Categorización de horarios y días",
    "   ✅ Creación de identificadores únicos",
):
    print(applied_step)

print("\n🚀 Próximos pasos:")
for next_step in (
    "   📋 Ejecutar notebook '03_Uber_Gold_Layer' para crear agregaciones",
    "   📊 Los datos están listos para análisis avanzado",
    "   🔍 Calidad de datos validada y documentada",
):
    print(next_step)

print("\n✅ CAPA PLATA COMPLETADA EXITOSAMENTE")

📊 RESUMEN FINAL - CAPA PLATA
🥉 Registros de entrada (Bronce): 4,534,327
🥈 Registros de salida (Plata): 4,502,417
📉 Registros filtrados: 31,910
📈 Tasa de retención: 99.30%
🏷️ Columnas añadidas: 12
💾 Formato de salida: Delta Lake
📍 Ubicación: /Volumes/workspace/default/uber_etl_azure/silver/uber_silver

🔧 Transformaciones aplicadas:
   ✅ Limpieza de datos y validación de tipos
   ✅ Filtros geográficos (NYC bounds)
   ✅ Filtros temporales (año 2014)
   ✅ Eliminación de valores nulos
   ✅ Ingeniería de características temporales
   ✅ Categorización de horarios y días
   ✅ Creación de identificadores únicos

🚀 Próximos pasos:
   📋 Ejecutar notebook '03_Uber_Gold_Layer' para crear agregaciones
   📊 Los datos están listos para análisis avanzado
   🔍 Calidad de datos validada y documentada

✅ CAPA PLATA COMPLETADA EXITOSAMENTE


In [0]:
# COMMAND ----------

# MAGIC %md
# MAGIC ## 11. Registro de Vista Temporal

# COMMAND ----------

In [0]:
# Expose the transformed DataFrame to SQL cells under the name "uber_silver_temp".
# The view is defined over df_silver (the in-notebook DataFrame), not over the
# Delta table written in the save step.
df_silver.createOrReplaceTempView("uber_silver_temp")

for message in (
    "📋 Tabla registrada como vista temporal: 'uber_silver_temp'",
    "💡 Ejemplos de consultas SQL:",
    "   SELECT time_category, COUNT(*) FROM uber_silver_temp GROUP BY time_category;",
    "   SELECT base_code, pickup_month, COUNT(*) FROM uber_silver_temp GROUP BY base_code, pickup_month;",
    "   SELECT * FROM uber_silver_temp WHERE day_type = 'Weekend' LIMIT 10;",
):
    print(message)

print("\n🎉 Datos Plata listos para la capa Oro")

📋 Tabla registrada como vista temporal: 'uber_silver_temp'
💡 Ejemplos de consultas SQL:
   SELECT time_category, COUNT(*) FROM uber_silver_temp GROUP BY time_category;
   SELECT base_code, pickup_month, COUNT(*) FROM uber_silver_temp GROUP BY base_code, pickup_month;
   SELECT * FROM uber_silver_temp WHERE day_type = 'Weekend' LIMIT 10;

🎉 Datos Plata listos para la capa Oro


In [0]:
%sql
-- Preview ten weekend pickups from the temporary Silver view.
SELECT *
FROM uber_silver_temp
WHERE day_type = 'Weekend'
LIMIT 10;


pickup_datetime,latitude,longitude,base_code,ingestion_timestamp,source_file,bronze_layer_version,pickup_hour,pickup_day_of_week,pickup_day_of_week_num,pickup_month,pickup_year,pickup_date,pickup_week_of_year,time_category,day_type,trip_id,silver_processing_timestamp,silver_layer_version
2014-04-05T00:00:00.000Z,40.769,-73.9825,B02512,2025-07-28T17:06:30.446Z,uber-raw-data-apr14.csv,1.0,0,Saturday,7,4,2014,2014-04-05,14,Late Night/Early Morning,Weekend,B02512_20140405_000000_407690_-739825,2025-07-28T17:26:15.261Z,1.0
2014-04-05T00:00:00.000Z,40.7594,-73.9641,B02512,2025-07-28T17:06:30.446Z,uber-raw-data-apr14.csv,1.0,0,Saturday,7,4,2014,2014-04-05,14,Late Night/Early Morning,Weekend,B02512_20140405_000000_407594_-739641,2025-07-28T17:26:15.261Z,1.0
2014-04-05T00:01:00.000Z,40.7113,-74.0173,B02512,2025-07-28T17:06:30.446Z,uber-raw-data-apr14.csv,1.0,0,Saturday,7,4,2014,2014-04-05,14,Late Night/Early Morning,Weekend,B02512_20140405_000100_407113_-740173,2025-07-28T17:26:15.261Z,1.0
2014-04-05T00:02:00.000Z,40.806,-73.9652,B02512,2025-07-28T17:06:30.446Z,uber-raw-data-apr14.csv,1.0,0,Saturday,7,4,2014,2014-04-05,14,Late Night/Early Morning,Weekend,B02512_20140405_000200_408060_-739652,2025-07-28T17:26:15.261Z,1.0
2014-04-05T00:04:00.000Z,40.7211,-74.0042,B02512,2025-07-28T17:06:30.446Z,uber-raw-data-apr14.csv,1.0,0,Saturday,7,4,2014,2014-04-05,14,Late Night/Early Morning,Weekend,B02512_20140405_000400_407211_-740042,2025-07-28T17:26:15.261Z,1.0
2014-04-05T00:04:00.000Z,40.7073,-74.0088,B02512,2025-07-28T17:06:30.446Z,uber-raw-data-apr14.csv,1.0,0,Saturday,7,4,2014,2014-04-05,14,Late Night/Early Morning,Weekend,B02512_20140405_000400_407073_-740088,2025-07-28T17:26:15.261Z,1.0
2014-04-05T00:04:00.000Z,40.7566,-73.9975,B02512,2025-07-28T17:06:30.446Z,uber-raw-data-apr14.csv,1.0,0,Saturday,7,4,2014,2014-04-05,14,Late Night/Early Morning,Weekend,B02512_20140405_000400_407566_-739975,2025-07-28T17:26:15.261Z,1.0
2014-04-05T00:05:00.000Z,40.7594,-73.989,B02512,2025-07-28T17:06:30.446Z,uber-raw-data-apr14.csv,1.0,0,Saturday,7,4,2014,2014-04-05,14,Late Night/Early Morning,Weekend,B02512_20140405_000500_407594_-739890,2025-07-28T17:26:15.261Z,1.0
2014-04-05T00:05:00.000Z,40.6771,-73.9525,B02512,2025-07-28T17:06:30.446Z,uber-raw-data-apr14.csv,1.0,0,Saturday,7,4,2014,2014-04-05,14,Late Night/Early Morning,Weekend,B02512_20140405_000500_406771_-739525,2025-07-28T17:26:15.261Z,1.0
2014-04-05T00:07:00.000Z,40.7278,-73.9853,B02512,2025-07-28T17:06:30.446Z,uber-raw-data-apr14.csv,1.0,0,Saturday,7,4,2014,2014-04-05,14,Late Night/Early Morning,Weekend,B02512_20140405_000700_407278_-739853,2025-07-28T17:26:15.261Z,1.0
