In [17]:
#--------------------------------Librerías--------------------------------
import os
import warnings
warnings.filterwarnings('ignore')

# PySpark
os.environ["HADOOP_HOME"] = "C:\\hadoop" # Configuración de Spark/Hadoop
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.ml.feature import StandardScaler, VectorAssembler
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.functions import col, rand
from pyspark.sql.types import DoubleType, StructType, StructField

# Keras/TensorFlow
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# Elephas - Deep Learning sobre Spark
from elephas.spark_model import SparkModel
from elephas.utils.rdd_utils import to_simple_rdd
#---------------------------------------------------------------------------------

In [27]:
#----------------Create SparkSession----------------------------------------------
# Local-mode session on all available cores (`local[*]`), with 8g of driver and
# executor memory and a 4g cap on results collected back to the driver.
spark = (
    SparkSession.builder
    .appName("RedNeuronal_Elephas")
    .master("local[*]")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memory", "8g")
    .config("spark.driver.maxResultSize", "4g")
    # When a Python worker crashes Spark only reports "Python worker exited
    # unexpectedly"; its error message recommends this flag to obtain the
    # underlying Python traceback.
    .config("spark.python.worker.faulthandler.enabled", "true")
    .getOrCreate()
)

sc = spark.sparkContext
sc.setLogLevel("ERROR")  # silence INFO/WARN log noise in the notebook output
#---------------------------------------------------------------------------------

In [19]:
#----------------------Load data----------------------------------------------------------
# Location of the January 2024 yellow-taxi parquet file. Set the NYC_TAXI_PARQUET
# environment variable to run the notebook against a copy stored elsewhere; when
# it is unset the original machine-specific path is used.
DATA_PATH = os.environ.get(
    "NYC_TAXI_PARQUET",
    "C:/Users/PC/Documents/DocumentosGustavo/Github/Maestria/BigData/nyc-taxi-spark/data/yellow/2024/yellow_tripdata_2024-01.parquet",
)
df = spark.read.parquet(DATA_PATH)
df.show(5)  # preview the first 5 rows (the Spark counterpart of pandas' head)

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|Airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       2| 2024-01-01 00:57:55|  2024-01-01 01:17:43|              1|         1.72|         1|                 N|         186|          79|           2|       17.7|  1.0|    0.5|       0.

In [20]:
# Quick overview of the loaded DataFrame: row count, column count and schema.
total_rows = df.count()
total_columns = len(df.columns)

print(f"  Total de registros: {total_rows}")
print(f"  Columnas: {total_columns}")
print("\nEsquema del dataset:")
df.printSchema()

  Total de registros: 2964624
  Columnas: 19

Esquema del dataset:
root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- Airport_fee: double (nullable = true)



In [21]:
# Keep only the columns the model needs and expose them as an RDD of plain
# tuples: (trip_distance, passenger_count, tpep_pickup_datetime, fare_amount).
selected_columns_df = df.select(
    "trip_distance",
    "passenger_count",
    "tpep_pickup_datetime",
    "fare_amount",
)
rdd_features = selected_columns_df.rdd.map(
    lambda record: (
        record["trip_distance"],
        record["passenger_count"],
        record["tpep_pickup_datetime"],
        record["fare_amount"],
    )
)

print(f" Se creocon exito el RDD con las columnas seleccionadas.")
feature_record_count = rdd_features.count()
print(f"  Total registros: {feature_record_count:,}")
print(f"\nEjemplo de registro:")
first_records = rdd_features.take(1)
print(first_records[0])

 Se creocon exito el RDD con las columnas seleccionadas.
  Total registros: 2,964,624

Ejemplo de registro:
(1.72, 1, datetime.datetime(2024, 1, 1, 0, 57, 55), 17.7)


Variables del modelo:
- trip_distance: Distancia del viaje (normalizada a [0,1])
- passenger_count: Número de pasajeros (normalizada a [0,1])
- hour: Hora del día 0-23 (normalizada a [0,1])
- day_of_week: Día de la semana 1-7 (normalizada a [0,1])

Variable objetivo (a predecir):
- fare_amount: Tarifa del viaje (normalizada a [0,1])

In [22]:
def extract_and_normalize_features(row):
    """
    Extract and normalize the model features from a single trip record.

    Args:
        row: Tuple ``(trip_distance, passenger_count, pickup_datetime,
            fare_amount)``.

    Returns:
        ``(features, label)`` for a valid record, where ``features`` is the list
        ``[trip_distance / 100, passenger_count / 6, hour / 24, weekday / 7]``
        and ``label`` is the fare mapped linearly from [2, 200] onto [0, 1]
        (clamped). Returns ``None`` when any field is missing, is NaN, or lies
        outside the accepted ranges, so callers can filter the record out.

    All values are scaled into [0, 1] so they match the range of a sigmoid
    output.
    """
    trip_distance, passenger_count, pickup_dt, fare_amount = row

    # Reject records with missing fields.
    if (trip_distance is None or passenger_count is None or
            pickup_dt is None or fare_amount is None):
        return None

    # Reject invalid or outlier values. The ranges are written as chained
    # comparisons on purpose: any comparison with NaN is False, so NaN values
    # fail these checks and are rejected instead of slipping through and
    # producing NaN features/labels.
    if not (0 < trip_distance < 100):
        return None
    if not (0 < passenger_count <= 6):
        return None
    if not (0 < fare_amount < 200):
        return None

    # Temporal components of the pickup timestamp.
    hour_value = pickup_dt.hour               # 0-23
    day_of_week = pickup_dt.weekday() + 1     # 1=Monday, 7=Sunday

    # Scale every feature into [0, 1].
    trip_distance_norm = min(trip_distance / 100.0, 1.0)  # max: 100 miles
    passenger_count_norm = passenger_count / 6.0          # max: 6 passengers
    hour_norm = hour_value / 24.0                         # divisor: 24 hours
    day_norm = day_of_week / 7.0                          # max: 7 days

    # Target: fare mapped from [$2, $200] onto [0, 1]; fares below $2 would
    # come out negative, hence the clamp.
    fare_norm = (fare_amount - 2.0) / (200.0 - 2.0)
    fare_norm = max(0.0, min(fare_norm, 1.0))

    # Output shape: ([features], label) with the label as a bare float rather
    # than a list.
    features = [
        float(trip_distance_norm),
        float(passenger_count_norm),
        float(hour_norm),
        float(day_norm)
    ]

    label = float(fare_norm)

    return (features, label)

print("Función definida")

Función definida


In [23]:
# Apply the normalization to every record of the RDD, drop the records the
# function rejected (returned as None) and cache the result, since it is
# reused by the following cells.
rdd_normalized = (
    rdd_features
    .map(extract_and_normalize_features)
    .filter(lambda x: x is not None)
    .cache()
)

# Count each RDD exactly once: rdd_features is not cached, so every extra
# count() on it would trigger another full pass over the source data.
total_original = rdd_features.count()
total_normalized = rdd_normalized.count()

print(f"  Registros originales: {total_original:,}")
print(f"  Registros normalizados: {total_normalized:,}")
print(f"  Registros filtrados: {total_original - total_normalized:,}")

# Show one normalized record as a sanity check
print("\n Ejemplo de registro normalizado:")
sample = rdd_normalized.take(1)[0]
print(f"  Features: {sample[0]}")
print(f"  Label (fare): {sample[1]:.4f}")

  Registros originales: 2,964,624
  Registros normalizados: 2,722,784
  Registros filtrados: 241,840

 Ejemplo de registro normalizado:
  Features: [0.0172, 0.16666666666666666, 0.0, 0.14285714285714285]
  Label (fare): 0.0793


In [24]:
# Split the normalized RDD into training (80%) and test (20%) subsets with
# randomSplit; the fixed seed keeps the split reproducible.
split_parts = rdd_normalized.randomSplit([0.8, 0.2], seed=42)

# Cache both subsets so later actions reuse them instead of recomputing.
train_rdd, test_rdd = (part.cache() for part in split_parts)

train_count = train_rdd.count()
test_count = test_rdd.count()

train_pct = train_count / total_normalized * 100
test_pct = test_count / total_normalized * 100

print(f"  Train: {train_count:,} registros ({train_pct:.1f}%)")
print(f"  Test:  {test_count:,} registros ({test_pct:.1f}%)")

  Train: 2,178,428 registros (80.0%)
  Test:  544,356 registros (20.0%)


Arquitectura de la red neuronal: 
- Capa oculta: 4 neuronas con activación sigmoide
- Capa de salida: 1 neurona con activación sigmoide
- Función de pérdida: MAE
- Optimizador: Adam

In [25]:
# Number of input features fed to the first layer
NUM_FEATURES = 4

def create_model():
    """
    Build and compile the feed-forward network.

    The network has one hidden Dense layer of 4 sigmoid units that receives
    NUM_FEATURES inputs, followed by a single sigmoid output unit. It is
    compiled with Adam (learning rate 0.001), mean absolute error as the loss,
    and MAE/MSE as tracked metrics.

    Returns:
        The compiled Sequential model.
    """
    hidden_layer = Dense(
        units=4,
        activation='sigmoid',
        input_shape=(NUM_FEATURES,),
        name='capa_oculta'
    )
    output_layer = Dense(
        units=1,
        activation='sigmoid',
        name='capa_salida'
    )

    network = Sequential([hidden_layer, output_layer])

    # MAE is the training loss; 'mae' and 'mse' are also reported as metrics.
    network.compile(
        optimizer=Adam(learning_rate=0.001),
        loss='mean_absolute_error',
        metrics=['mae', 'mse']
    )

    return network

# Build the model and print its layer-by-layer summary
model = create_model()
print(" Modelo de red neuronal creado")
print("Arquitectura:")
model.summary()

 Modelo de red neuronal creado
Arquitectura:


Entrenamiento del modelo con Elephas (distribuido).
- Mini-batch: 500
- Epocas: 5
- Modo: entrenamiento distribuido síncrono (synchronous)
- Workers: 2 (paralelismo)

In [29]:
# Training hyper-parameters. They are defined once and reused both when
# building the SparkModel and in the banner below, so the printed values
# always match what is actually executed.
BATCH_SIZE = 500
EPOCHS = 5
NUM_WORKERS = 2
TRAINING_MODE = 'synchronous'

# Wrap the compiled Keras model in an Elephas SparkModel
spark_model = SparkModel(
    model=model,
    frequency='batch',
    mode=TRAINING_MODE,
    num_workers=NUM_WORKERS
)

print(" SparkModel de Elephas creado")
print("INICIANDO ENTRENAMIENTO DISTRIBUIDO")
print(f"Registros de entrenamiento: {train_count:,}")
print(f"Batch size: {BATCH_SIZE:,}")
print(f"Epochs: {EPOCHS}")
print(f"Workers: {NUM_WORKERS}")
print(f"Mode: {TRAINING_MODE}")
print("="*70 + "\n")

# Train the model in a distributed fashion.
# NOTE(review): the recorded run of this cell aborted with "Python worker
# exited unexpectedly (crashed)" while collecting results; the root cause is
# not visible in the Java stack trace. Enable
# 'spark.python.worker.faulthandler.enabled' in the Spark session to get the
# Python-side traceback before changing anything else here.
spark_model.fit(
    train_rdd,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    verbose=1,
    validation_split=0.1  # hold out 10% of the training data for validation
)

print(" ENTRENAMIENTO COMPLETADO")

 SparkModel de Elephas creado
INICIANDO ENTRENAMIENTO DISTRIBUIDO
Registros de entrenamiento: 2,178,428
Batch size: 500
Epochs: 5
Workers: 2
Mode: asynchronous

>>> Fit model


Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 44.0 failed 1 times, most recent failure: Lost task 1.0 in stage 44.0 (TID 205) (DESKTOP-OEGMP3C executor driver): org.apache.spark.SparkException: Python worker exited unexpectedly (crashed). Consider setting 'spark.sql.execution.pyspark.udf.faulthandler.enabled' or'spark.python.worker.faulthandler.enabled' configuration to 'true' for the better Python traceback.
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:678)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:663)
	at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:35)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:1034)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:1014)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:596)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.mutable.Growable.addAll(Growable.scala:61)
	at scala.collection.mutable.Growable.addAll$(Growable.scala:57)
	at scala.collection.mutable.ArrayBuilder.addAll(ArrayBuilder.scala:75)
	at scala.collection.IterableOnceOps.toArray(IterableOnce.scala:1528)
	at scala.collection.IterableOnceOps.toArray$(IterableOnce.scala:1521)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD.$anonfun$collect$2(RDD.scala:1057)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2536)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:180)
	at org.apache.spark.scheduler.Task.run(Task.scala:147)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$5(Executor.scala:716)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:86)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:83)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:97)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:719)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	at java.base/java.lang.Thread.run(Thread.java:842)
Caused by: java.io.EOFException
	at java.base/java.io.DataInputStream.readInt(DataInputStream.java:398)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:1022)
	... 22 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$3(DAGScheduler.scala:3122)
	at scala.Option.getOrElse(Option.scala:201)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:3122)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:3114)
	at scala.collection.immutable.List.foreach(List.scala:323)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:3114)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1303)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1303)
	at scala.Option.foreach(Option.scala:437)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1303)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3397)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:3328)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:3317)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:50)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:1017)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2496)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2517)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2536)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2561)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1057)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:417)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1056)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:205)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:184)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:108)
	at java.base/java.lang.Thread.run(Thread.java:842)
Caused by: org.apache.spark.SparkException: Python worker exited unexpectedly (crashed). Consider setting 'spark.sql.execution.pyspark.udf.faulthandler.enabled' or'spark.python.worker.faulthandler.enabled' configuration to 'true' for the better Python traceback.
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:678)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:663)
	at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:35)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:1034)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:1014)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:596)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.mutable.Growable.addAll(Growable.scala:61)
	at scala.collection.mutable.Growable.addAll$(Growable.scala:57)
	at scala.collection.mutable.ArrayBuilder.addAll(ArrayBuilder.scala:75)
	at scala.collection.IterableOnceOps.toArray(IterableOnce.scala:1528)
	at scala.collection.IterableOnceOps.toArray$(IterableOnce.scala:1521)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD.$anonfun$collect$2(RDD.scala:1057)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2536)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:180)
	at org.apache.spark.scheduler.Task.run(Task.scala:147)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$5(Executor.scala:716)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:86)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:83)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:97)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:719)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
	... 1 more
Caused by: java.io.EOFException
	at java.base/java.io.DataInputStream.readInt(DataInputStream.java:398)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:1022)
	... 22 more
