In [44]:
#--------------------------------Libraries---------------------------------
# Standard library. All Python warnings are silenced for the whole notebook,
# which also hides deprecation warnings raised by the libraries below.
import os
import warnings
warnings.filterwarnings('ignore')

# PySpark
# HADOOP_HOME is set before pyspark is imported.
# NOTE(review): presumably needed so Spark on Windows can locate the Hadoop
# binaries (winutils) under C:\hadoop -- confirm on the target machine.
os.environ["HADOOP_HOME"] = "C:\\hadoop"
from pyspark.sql import SparkSession
from pyspark.sql.functions import hour, dayofweek, col, rand, monotonically_increasing_id
import numpy as np
import pandas as pd

# TensorFlow
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# Configure TensorFlow for CPU-only mode: an empty device list hides every GPU.
tf.config.set_visible_devices([], 'GPU')
#---------------------------------------------------------------------------------

In [45]:
#---------------------------------------------------------------------------------
# Create the SparkSession. Every tunable setting lives in one mapping so the
# memory configuration can be read (and edited) at a glance.
spark_conf = {
    "spark.driver.memory": "8g",
    "spark.executor.memory": "8g",
    # Arrow-based conversion is switched off; the fallback flag stays enabled.
    "spark.sql.execution.arrow.pyspark.enabled": "false",
    "spark.sql.execution.arrow.pyspark.fallback.enabled": "true",
    "spark.driver.maxResultSize": "4g",
    "spark.memory.offHeap.enabled": "true",
    "spark.memory.offHeap.size": "4g",
}

builder = SparkSession.builder.appName("RedNeuronal_Taxi").master("local[*]")
for conf_key, conf_value in spark_conf.items():
    # Settings are applied in the order they are declared above.
    builder = builder.config(conf_key, conf_value)
spark = builder.getOrCreate()

# Keep a handle on the SparkContext and only surface ERROR-level logs.
sc = spark.sparkContext
sc.setLogLevel("ERROR")

#---------------------------------------------------------------------------------

In [47]:
#---------------------------------------------------------------------------------
# Load the raw trip records and derive the cleaned modelling frame.
DATA_PATH = "C:/Users/PC/Documents/DocumentosGustavo/Github/Maestria/BigData/nyc-taxi-spark/data/yellow/2024/yellow_tripdata_2024-01.parquet"
df = spark.read.parquet(DATA_PATH)

# Project the numeric columns as doubles and derive hour / day-of-week
# from the pickup timestamp.
pickup_column = "tpep_pickup_datetime"
df_selected = df.select(
    *[col(name).cast("double") for name in ("trip_distance", "passenger_count", "fare_amount")],
    hour(pickup_column).alias("hour"),
    dayofweek(pickup_column).alias("day"),
)

# Row-level validity rules, one named predicate per column.
valid_distance = (col("trip_distance") > 0) & (col("trip_distance") < 50)
valid_passengers = (col("passenger_count") > 0) & (col("passenger_count") <= 6)
valid_fare = (col("fare_amount") > 2) & (col("fare_amount") < 100)

df_clean = df_selected.filter(valid_distance & valid_passengers & valid_fare)

total_registros = df_clean.count()
print(f"Registros limpios: {total_registros:,}")
# Preview of the raw (unfiltered) frame, not of df_clean.
df.show(5)
#---------------------------------------------------------------------------------

Registros limpios: 2,716,383
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|Airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       2| 2024-01-01 00:57:55|  2024-01-01 01:17:43|              1|         1.72|         1|                 N|         186|          79|           2|      

In [None]:
# Draw a random sample of the cleaned data and bring it into pandas.
#
# The sample targets SAMPLE_SIZE rows but never more than 5% of the cleaned
# data. Conversion first tries a single toPandas() call; if that fails, the
# rows are streamed to the driver and assembled in chunks instead.
SAMPLE_SIZE = 100000  # 100k rows
if total_registros <= 0:
    # Guard the division below: an empty cleaned dataset cannot be sampled.
    raise ValueError("No hay registros limpios para muestrear")
sample_fraction = min(SAMPLE_SIZE / total_registros, 0.05)  # at most 5% or ~100k

print(f"\nUsando fracción de muestra: {sample_fraction:.4f} ({int(sample_fraction * total_registros):,} registros)")

df_sample = df_clean.sample(withReplacement=False, fraction=sample_fraction, seed=42)

# Convert to pandas with memory handling
print("Convirtiendo a pandas en lotes...")

# Method 1: direct conversion, capped at SAMPLE_SIZE rows
try:
    pdf = df_sample.limit(SAMPLE_SIZE).toPandas()
    print(f"Shape pandas: {pdf.shape}")
except Exception as e:
    # Deliberate best-effort fallback: any failure of the direct conversion
    # is reported and the streaming path below is used instead.
    print(f"Error en conversión directa: {e}")
    print("Usando método alternativo...")

    # Method 2: stream the sample to the driver and build the frame in
    # chunks. Each row is visited exactly once, so no batch is fetched twice.
    batch_size = 50000
    sample_columns = df_sample.columns

    # Same row cap as the direct path; used only for the progress message.
    sample_count = min(df_sample.count(), SAMPLE_SIZE)
    num_batches = (sample_count + batch_size - 1) // batch_size

    batches = []
    pending_rows = []
    for row in df_sample.limit(SAMPLE_SIZE).toLocalIterator():
        pending_rows.append(tuple(row))
        if len(pending_rows) == batch_size:
            print(f"Procesando lote {len(batches) + 1}/{num_batches}")
            batches.append(pd.DataFrame.from_records(pending_rows, columns=sample_columns))
            pending_rows = []

    # Flush the final, partially filled chunk.
    if pending_rows:
        print(f"Procesando lote {len(batches) + 1}/{num_batches}")
        batches.append(pd.DataFrame.from_records(pending_rows, columns=sample_columns))
        pending_rows = []

    # Concatenate once at the end instead of growing the frame per batch.
    if batches:
        pdf = pd.concat(batches, ignore_index=True)
    else:
        pdf = pd.DataFrame(columns=sample_columns)

    print(f"Shape pandas final: {pdf.shape}")



Usando fracción de muestra: 0.0368 (100,000 registros)
Convirtiendo a pandas en lotes...
Shape pandas: (99980, 5)


In [50]:
#---------------------------------------------------------------------------------
# Preparar features y target
def prepare_data_pandas(pdf):
    """Scale the taxi columns and return model-ready arrays.

    Adds five normalised columns to ``pdf`` in place
    (``trip_distance_norm``, ``passenger_norm``, ``hour_norm``, ``day_norm``
    and ``fare_norm``) and returns them as NumPy arrays.

    Args:
        pdf: pandas DataFrame with the columns ``trip_distance``,
            ``passenger_count``, ``hour``, ``day`` and ``fare_amount``.

    Returns:
        A ``(features, target)`` tuple: ``features`` is a 2-D array with the
        four normalised feature columns (in the order listed above) and
        ``target`` is a 1-D array with the normalised fare clipped to [0, 1].
    """
    feature_columns = ['trip_distance_norm', 'passenger_norm', 'hour_norm', 'day_norm']

    # Divide each feature by the constant used as its scale.
    pdf['trip_distance_norm'] = pdf['trip_distance'].div(50.0)
    pdf['passenger_norm'] = pdf['passenger_count'].div(6.0)
    pdf['hour_norm'] = pdf['hour'].div(24.0)
    # Day values are shifted by one before scaling.
    pdf['day_norm'] = pdf['day'].sub(1).div(6.0)

    # Map the fare from the [2, 100] window onto [0, 1]; anything outside
    # the window is clipped to the nearest bound.
    fare_span = 100.0 - 2.0
    pdf['fare_norm'] = pdf['fare_amount'].sub(2.0).div(fare_span).clip(0, 1)

    return pdf[feature_columns].values, pdf['fare_norm'].values

from sklearn.model_selection import train_test_split

# Build the model inputs from the sampled pandas frame.
X, y = prepare_data_pandas(pdf)
print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical between runs.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"\nTrain: {X_train.shape[0]:,} registros")
print(f"Test: {X_test.shape[0]:,} registros")

#---------------------------------------------------------------------------------

Features shape: (99980, 4)
Target shape: (99980,)

Train: 79,984 registros
Test: 19,996 registros


Arquitectura de la red neuronal:
- Capas ocultas: 8 y 4 neuronas con activación ReLU
- Capa de salida: 1 neurona con activación sigmoide
- Función de pérdida: MSE (MAE como métrica)
- Optimizador: Adam (tasa de aprendizaje 0.001)
- Épocas: 10


In [51]:
# Crear y entrenar modelo
def create_model(input_dim=4, learning_rate=0.001):
    """Build and compile the fare-regression network.

    The network is a small fully connected stack: two ReLU hidden layers
    (8 and 4 units) followed by a single sigmoid output unit, so predictions
    fall in (0, 1).

    Args:
        input_dim: Number of input features. Defaults to 4.
        learning_rate: Step size for the Adam optimizer. Defaults to 0.001.

    Returns:
        A compiled Keras ``Sequential`` model that minimises mean squared
        error and reports mean absolute error as a metric.
    """
    model = Sequential([
        # Declare the input explicitly instead of passing `input_shape` to the
        # first Dense layer, which is deprecated in recent Keras versions.
        keras.Input(shape=(input_dim,)),
        Dense(8, activation='relu'),
        Dense(4, activation='relu'),
        Dense(1, activation='sigmoid')
    ])

    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='mse',
        metrics=['mae']
    )
    return model

# Seed the Python, NumPy and TensorFlow random generators before the model is
# built, so weight initialisation and batch shuffling repeat between runs.
SEED = 42
keras.utils.set_random_seed(SEED)

model = create_model()
model.summary()

# Train locally
print("\n" + "="*50)
print("Entrenando el modelo")
print("="*50)

# 10% of the training arrays are held out for per-epoch validation.
history = model.fit(
    X_train, y_train,
    validation_split=0.1,
    epochs=10,
    batch_size=128,
    verbose=1
)

# Evaluate on the held-out test split
print("\n" + "="*50)
print("Evaluación")
print("="*50)
loss, mae = model.evaluate(X_test, y_test, verbose=0)
print(f"Test Loss (MSE): {loss:.4f}")
print(f"Test MAE: {mae:.4f}")

# Show a few predictions next to the true (normalised) targets
print("\nPredicciones vs Reales (valores normalizados):")
print("-" * 50)
n_preview = 10
predictions = model.predict(X_test[:n_preview], verbose=0)
# Iterate over the rows actually predicted, so a test split with fewer than
# n_preview rows does not raise an IndexError.
for pred, real in zip(predictions[:, 0], y_test[:n_preview]):
    print(f"Pred: {pred:.3f} | Real: {real:.3f} | Diff: {abs(pred - real):.3f}")


Entrenando el modelo
Epoch 1/10
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 7ms/step - loss: 0.0353 - mae: 0.1467 - val_loss: 0.0059 - val_mae: 0.0600
Epoch 2/10
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - loss: 0.0038 - mae: 0.0427 - val_loss: 0.0028 - val_mae: 0.0364
Epoch 3/10
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - loss: 0.0031 - mae: 0.0365 - val_loss: 0.0027 - val_mae: 0.0351
Epoch 4/10
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - loss: 0.0030 - mae: 0.0353 - val_loss: 0.0026 - val_mae: 0.0343
Epoch 5/10
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - loss: 0.0028 - mae: 0.0339 - val_loss: 0.0024 - val_mae: 0.0327
Epoch 6/10
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - loss: 0.0026 - mae: 0.0318 - val_loss: 0.0022 - val_mae: 0.0298
Epoch 7/10
[1m563/563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 