# 🚀 Tarea 2: Red Neuronal Big Data - OPTIMIZADO PARA VELOCIDAD
## Dataset: NYC Taxi Enero 2024
### Configuración: 16GB RAM + 8 Cores = Entrenamiento Ultra-Rápido

**Optimizaciones:**
- Aprovecha 16GB RAM disponible
- Paralelización en 8 cores
- Conversión eficiente de RDD (no batch-by-batch lento)
- Cache agresivo
- Batch size óptimo

In [None]:
#--------------------------------Libraries---------------------------------
# Imports and process-wide setup used by every later cell.
import os
import warnings
# Silence all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')

# PySpark
# HADOOP_HOME is set before the pyspark imports below.
# NOTE(review): presumably points Spark at a local Hadoop/winutils install on
# Windows — confirm the path exists on the target machine.
os.environ["HADOOP_HOME"] = "C:\\hadoop"
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Keras/TensorFlow
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Utilities
import numpy as np
import time
from datetime import datetime

print("‚úì Librer√≠as importadas")
#---------------------------------------------------------------------------------

In [None]:
#----------------SparkSession tuned for 16GB + 8 cores--------------------
print("Configurando Spark para m√°ximo rendimiento...")

# Build (or reuse) the session on a local[8] master with 12g driver/executor
# memory, an 8g driver result limit, 16 shuffle/default partitions, and Arrow
# plus adaptive query execution (with partition coalescing) switched on.
# NOTE(review): getOrCreate() returns an already-running session if one
# exists; confirm these settings are actually applied in that case.
spark = SparkSession.builder \
    .appName("RedNeuronal_OPTIMIZADO") \
    .master("local[8]") \
    .config("spark.driver.memory", "12g") \
    .config("spark.executor.memory", "12g") \
    .config("spark.driver.maxResultSize", "8g") \
    .config("spark.sql.shuffle.partitions", "16") \
    .config("spark.default.parallelism", "16") \
    .config("spark.sql.execution.arrow.pyspark.enabled", "true") \
    .config("spark.sql.adaptive.enabled", "true") \
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true") \
    .getOrCreate()

# Keep a handle on the underlying SparkContext and only log errors.
sc = spark.sparkContext
sc.setLogLevel("ERROR")

# The figures printed below are hard-coded text, not read back from the session.
print("\n‚úì Spark configurado para m√°ximo rendimiento")
print(f"  Cores: 8 (todos disponibles)")
print(f"  RAM: 12GB (de 16GB disponibles)")
print(f"  Paralelismo: 16 particiones")
#---------------------------------------------------------------------------------

In [None]:
#----------------------Load data-----------------------------------------------
# Absolute, machine-specific location of the January 2024 yellow-taxi parquet file.
DATA_PATH = "C:/Users/PC/Documents/DocumentosGustavo/Github/Maestria/BigData/nyc-taxi-spark/data/yellow/2024/yellow_tripdata_2024-01.parquet"

print("\n" + "="*80)
print("CARGANDO DATASET NYC TAXI")
print("="*80)

# Read the parquet file into a Spark DataFrame.
df = spark.read.parquet(DATA_PATH)

# df.count() runs a Spark job to report the row count; show(5) previews five rows.
print(f"\n‚úì Dataset cargado: {df.count():,} registros")
df.show(5)
#---------------------------------------------------------------------------------

In [None]:
#----------------------Distributed feature engineering---------------------------
# Console banner announcing step 1.
print("\n" + "="*80)
print("PASO 1: FEATURE ENGINEERING DISTRIBUIDO")
print("="*80)

def extract_and_scale_features(row):
    """
    Validate one trip record and convert it into a scaled (features, label) pair.

    Args:
        row: 4-tuple ``(trip_distance, passenger_count, pickup_datetime,
            fare_amount)``. The third element must expose ``.hour`` and
            ``.weekday()`` (for example a ``datetime.datetime``).

    Returns:
        ``(features, fare)`` where ``features`` is a list of four floats
        ``[distance, passengers, hour, day_of_week]`` after centering and
        scaling, and ``fare`` is the unscaled fare as a float. Returns
        ``None`` when any field is missing, NaN, or outside the accepted
        range, so callers can filter the record out.
    """
    trip_distance, passenger_count, pickup_dt, fare_amount = row

    # Accepted ranges: 0 < distance < 100, 0 < passengers <= 6, 0 < fare < 200.
    # They are written as positive chained comparisons on purpose: every
    # comparison against NaN is False, so a NaN value fails the range test and
    # is rejected instead of slipping through into the training data.
    if (trip_distance is None or not (0 < trip_distance < 100) or
            passenger_count is None or not (0 < passenger_count <= 6) or
            pickup_dt is None or
            fare_amount is None or not (0 < fare_amount < 200)):
        return None

    # Time-derived features: hour of day, and day of week shifted to 1..7
    # (weekday() is 0-based, Monday first).
    hour_value = float(pickup_dt.hour)
    day_of_week = float(pickup_dt.weekday() + 1)

    # Z-score-style scaling with fixed, hard-coded center/scale constants
    # (they are not fitted to the data in this function).
    features = [
        float((trip_distance - 3.0) / 5.0),
        float((passenger_count - 1.5) / 1.0),
        float((hour_value - 12.0) / 7.0),
        float((day_of_week - 4.0) / 2.0),
    ]

    return (features, float(fare_amount))

print("\nüîÑ Procesamiento distribuido con 8 cores...")
start_processing = time.time()

# Crear RDD con selecci√≥n de columnas
rdd_features = df.select(
    "trip_distance",
    "passenger_count", 
    "tpep_pickup_datetime",
    "fare_amount"
).rdd.map(lambda row: (
    row.trip_distance,
    row.passenger_count,
    row.tpep_pickup_datetime,
    row.fare_amount
))

# Feature engineering distribuido
rdd_scaled = rdd_features.map(extract_and_scale_features) \
    .filter(lambda x: x is not None) \
    .repartition(16) \
    .cache()

# Forzar evaluaci√≥n
total_scaled = rdd_scaled.count()

processing_time = time.time() - start_processing

print(f"\n‚úì Procesamiento completado en {processing_time:.1f}s")
print(f"  Registros v√°lidos: {total_scaled:,}")
print(f"  Velocidad: {total_scaled/processing_time:,.0f} registros/segundo")
#---------------------------------------------------------------------------------

In [None]:
#----------------------Train/test split----------------------------------------
print("\n" + "="*80)
print("PASO 2: DIVISI√ìN TRAIN/TEST")
print("="*80)

# Random 80/20 split with a fixed seed.
train_rdd, test_rdd = rdd_scaled.randomSplit([0.8, 0.2], seed=42)

# Repartition (16 partitions for train, 8 for test) and mark both for caching.
train_rdd = train_rdd.repartition(16).cache()
test_rdd = test_rdd.repartition(8).cache()

# count() is an action: it materializes both splits and yields their sizes,
# which later cells reuse for reporting.
train_count = train_rdd.count()
test_count = test_rdd.count()

print(f"\n‚úì Divisi√≥n completada")
print(f"  Train: {train_count:,} registros")
print(f"  Test:  {test_count:,} registros")
#---------------------------------------------------------------------------------

In [None]:
#----------------------Efficient conversion to NumPy------------------------------
# Console banner for step 3 followed by a fixed description of the strategy.
print("\n" + "="*80)
print("PASO 3: CONVERSI√ìN EFICIENTE A NUMPY (Aprovechando 16GB RAM)")
print("="*80)

print("\nüí° Estrategia:")
print("   ‚Ä¢ Procesamiento Big Data completado (distribuido en 8 cores)")
print("   ‚Ä¢ Conversi√≥n eficiente aprovechando 16GB RAM disponible")
print("   ‚Ä¢ Entrenamiento ultra-r√°pido con datos en memoria")

def rdd_to_numpy_parallel(rdd):
    """
    Materialize an RDD of ``(features, label)`` records as two NumPy arrays.

    ``rdd.collect()`` returns every record to the calling process as a list;
    the records are then split locally into a feature matrix and a label
    vector, both stored as ``float32``.

    Args:
        rdd: Object exposing ``collect()`` that returns a sequence of records
            indexable as ``record[0]`` (feature list) and ``record[1]`` (label).

    Returns:
        Tuple ``(X, y)`` of ``float32`` arrays built from the collected
        features and labels, in collection order.
    """
    feature_rows = []
    targets = []

    # Single pass over the collected records, splitting features from labels.
    for record in rdd.collect():
        feature_rows.append(record[0])
        targets.append(record[1])

    feature_matrix = np.array(feature_rows, dtype=np.float32)
    target_vector = np.array(targets, dtype=np.float32)
    return feature_matrix, target_vector

print("\nüì¶ Convirtiendo Train RDD...")
start_train = time.time()
X_train, y_train = rdd_to_numpy_parallel(train_rdd)
train_time = time.time() - start_train
print(f"   ‚úì Train convertido en {train_time:.1f}s ({train_count/train_time:,.0f} reg/s)")

print("\nüì¶ Convirtiendo Test RDD...")
start_test = time.time()
X_test, y_test = rdd_to_numpy_parallel(test_rdd)
test_time = time.time() - start_test
print(f"   ‚úì Test convertido en {test_time:.1f}s ({test_count/test_time:,.0f} reg/s)")

print(f"\n‚úì Conversi√≥n total: {train_time + test_time:.1f}s")
print(f"  X_train: {X_train.shape} - {X_train.nbytes / 1024**2:.1f} MB")
print(f"  X_test:  {X_test.shape} - {X_test.nbytes / 1024**2:.1f} MB")
print(f"  Total RAM: {(X_train.nbytes + X_test.nbytes + y_train.nbytes + y_test.nbytes) / 1024**2:.1f} MB")

# Liberar RDDs de memoria
train_rdd.unpersist()
test_rdd.unpersist()
rdd_scaled.unpersist()

print("\n‚úì RDDs liberados de cache (ya no son necesarios)")
#---------------------------------------------------------------------------------

In [None]:
#----------------------Optimized model------------------------------------------
# Console banner announcing step 4.
print("\n" + "="*80)
print("PASO 4: CONSTRUCCI√ìN DEL MODELO")
print("="*80)

def create_model(n_features=4, learning_rate=0.001, dropout_rate=0.2):
    """
    Build and compile the fully connected regression network.

    Architecture: Dense 64 -> 32 -> 16 -> 8 (all ReLU) -> Dense 1 (linear).
    Batch normalization follows the first three hidden layers and dropout
    follows the first two. The model is compiled with Adam, MSE loss and MAE
    as an extra metric.

    Args:
        n_features: Number of input features per sample. Defaults to 4, the
            value the original hard-coded.
        learning_rate: Adam learning rate. Defaults to 0.001.
        dropout_rate: Dropout probability used after the first two hidden
            layers. Defaults to 0.2.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    model = Sequential([
        # Declare the input explicitly; passing input_shape= to a layer is
        # deprecated in recent Keras releases.
        keras.Input(shape=(n_features,)),

        Dense(64, activation='relu', name='capa_1'),
        BatchNormalization(),
        Dropout(dropout_rate),

        Dense(32, activation='relu', name='capa_2'),
        BatchNormalization(),
        Dropout(dropout_rate),

        Dense(16, activation='relu', name='capa_3'),
        BatchNormalization(),

        Dense(8, activation='relu', name='capa_4'),

        # Single linear unit: unbounded scalar regression output.
        Dense(1, activation='linear', name='salida'),
    ])

    model.compile(
        optimizer=Adam(learning_rate=learning_rate),
        loss='mse',
        metrics=['mae'],
    )

    return model

# Instantiate the network with its default configuration.
model = create_model()

print("\n‚úì Modelo creado")
# Print the layer-by-layer summary.
model.summary()
#---------------------------------------------------------------------------------

In [None]:
#----------------------Fast training---------------------------------
print("\n" + "="*80)
print("PASO 5: ENTRENAMIENTO OPTIMIZADO")
print("="*80)

# Callbacks: stop after 3 epochs without val_loss improvement (restoring the
# best weights), and halve the learning rate after 2 stagnant epochs, never
# going below 1e-6.
callbacks = [
    EarlyStopping(
        monitor='val_loss',
        patience=3,
        restore_best_weights=True,
        verbose=1
    ),
    ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=2,
        min_lr=1e-6,
        verbose=1
    )
]

# Training configuration
EPOCHS = 20
BATCH_SIZE = 2048  # Large batch for speed
VALIDATION_SPLIT = 0.1

# Keras holds out the trailing VALIDATION_SPLIT fraction of the arrays for
# validation, so only the remaining samples are batched each epoch, and a
# final partial batch still counts as a step (ceiling division). The previous
# `train_count // BATCH_SIZE` ignored both and reported the wrong number.
fit_sample_count = int(len(X_train) * (1.0 - VALIDATION_SPLIT))
batches_per_epoch = -(-fit_sample_count // BATCH_SIZE)

print(f"\n‚öôÔ∏è  Configuraci√≥n:")
print(f"   √âpocas m√°ximas: {EPOCHS}")
print(f"   Batch size: {BATCH_SIZE} (optimizado para velocidad)")
print(f"   Validation split: {VALIDATION_SPLIT*100:.0f}%")
print(f"   Early stopping: S√≠ (patience=3)")
print(f"   Reduce LR: S√≠ (patience=2)")

print(f"\nüí° Ventajas:")
print(f"   ‚Ä¢ Batch grande = menos iteraciones = m√°s r√°pido")
print(f"   ‚Ä¢ Datos en RAM = acceso instant√°neo")
print(f"   ‚Ä¢ Early stopping = detiene si no mejora")
print(f"   ‚Ä¢ Total batches por √©poca: {batches_per_epoch}")

print("\nüéØ Iniciando entrenamiento...\n")

# Wall-clock timer around fit() only.
start_time = time.time()

# Train on the in-memory arrays.
history = model.fit(
    X_train, y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=VALIDATION_SPLIT,
    callbacks=callbacks,
    verbose=1
)

training_time = time.time() - start_time

# len(history.history['loss']) is the number of epochs actually run, which
# may be below EPOCHS when early stopping triggers.
print("\n" + "="*80)
print("‚úì ENTRENAMIENTO COMPLETADO")
print("="*80)
print(f"  Tiempo total: {training_time/60:.2f} minutos")
print(f"  Tiempo por √©poca: {training_time/len(history.history['loss']):.1f}s")
print(f"  √âpocas ejecutadas: {len(history.history['loss'])} de {EPOCHS}")
print(f"  Mejor val_loss: {min(history.history['val_loss']):.4f}")

# Per-stage timing recap (Spark processing, NumPy conversion, training).
print(f"\nüöÄ Rendimiento:")
print(f"   ‚Ä¢ Procesamiento Big Data: ‚úì ({processing_time:.1f}s)")
print(f"   ‚Ä¢ Conversi√≥n eficiente: ‚úì ({train_time + test_time:.1f}s)")
print(f"   ‚Ä¢ Entrenamiento r√°pido: ‚úì ({training_time:.1f}s)")
print(f"   ‚Ä¢ Total: {processing_time + train_time + test_time + training_time:.1f}s")
#---------------------------------------------------------------------------------

In [None]:
#----------------------Evaluation-------------------------------------------------
print("\n" + "="*80)
print("PASO 6: EVALUACI√ìN COMPLETA")
print("="*80)

print("\nüìä Evaluando modelo...")

# Loss and MAE on the test arrays, as reported by Keras itself.
test_loss, test_mae = model.evaluate(X_test, y_test, verbose=0)

# Predictions for every test sample.
print("üîÆ Generando predicciones...")
y_pred = model.predict(X_test, batch_size=4096, verbose=0)

# Regression metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Relative error per sample, computed once and reused for both MAPE and the
# share of predictions that land within 10% of the real fare.
flat_predictions = y_pred.flatten()
relative_errors = np.abs((y_test - flat_predictions) / y_test)
mape = np.mean(relative_errors) * 100
percent_errors = relative_errors * 100
accuracy_10pct = np.mean(percent_errors <= 10) * 100

print("\n" + "="*80)
print("RESULTADOS FINALES")
print("="*80)

print("\nüìà M√©tricas de Regresi√≥n:")
print(f"   MSE:  {mse:.4f}")
print(f"   RMSE: ${rmse:.4f}")
print(f"   MAE:  ${mae:.4f}")
print(f"   R¬≤:   {r2:.4f} ({r2*100:.1f}%)")

print("\nüìä M√©tricas de Negocio:")
print(f"   MAPE:         {mape:.2f}%")
print(f"   Accuracy@10%: {accuracy_10pct:.2f}%")

# Map R² onto a qualitative verdict: the first band whose threshold is
# strictly exceeded wins; the fallback message is used when none is.
print("\nüí° Interpretaci√≥n:")
quality_bands = (
    (0.80, "   ‚úì EXCELENTE - Modelo de muy alta calidad"),
    (0.70, "   ‚úì MUY BUENO - Modelo s√≥lido"),
    (0.60, "   ‚úì BUENO - Modelo aceptable"),
)
for threshold, verdict in quality_bands:
    if r2 > threshold:
        print(verdict)
        break
else:
    print("   ‚ö† MODERADO - Considerar mejoras")

print(f"\n   Error promedio: ¬±${mae:.2f} (¬±{mape:.1f}%)")
print(f"   {accuracy_10pct:.0f}% predicciones dentro de ¬±10%")
#---------------------------------------------------------------------------------

In [None]:
#----------------------Examples---------------------------------------------------
print("\n" + "="*80)
print("EJEMPLOS DE PREDICCIONES")
print("="*80)

# Pick 20 distinct test indices at random (no seed is set in this cell, so
# the sample differs between runs) and pair real fares with predictions.
sample_indices = np.random.choice(len(y_test), size=20, replace=False)
sample_real = y_test[sample_indices]
sample_pred = y_pred[sample_indices].flatten()

print("\nüîç 20 ejemplos:\n")
print(f"{'Predicci√≥n':<15} {'Real':<15} {'Error':<15} {'Error %':<15}")
print("-" * 60)

# One table row per sample: signed error (prediction - real) and the same
# error as a percentage of the real fare (0 when the real fare is 0).
for pred, real in zip(sample_pred, sample_real):
    error = pred - real
    error_pct = (error / real) * 100 if real != 0 else 0
    print(f"${pred:<14.2f} ${real:<14.2f} ${error:<14.2f} {error_pct:<14.1f}%")

# Summary statistics of the signed error over the whole test set.
all_errors = y_pred.flatten() - y_test
print(f"\nüìä Estad√≠sticas:")
print(f"   Error min:  ${np.min(all_errors):.2f}")
print(f"   Error max:  ${np.max(all_errors):.2f}")
print(f"   Error mean: ${np.mean(all_errors):.2f}")
print(f"   Error std:  ${np.std(all_errors):.2f}")
#---------------------------------------------------------------------------------

In [None]:
#----------------------Save model---------------------------------------------
print("\n" + "="*80)
print("GUARDAR MODELO")
print("="*80)

# Write the model under ./modelos (created if missing) with a timestamped
# .h5 file name so successive runs do not overwrite each other.
os.makedirs("modelos", exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
model_path = f"modelos/taxi_fare_OPTIMIZADO_{timestamp}.h5"
model.save(model_path)

# Report the saved file's size in KB alongside the R² computed earlier.
print(f"\n‚úì Modelo guardado: {model_path}")
print(f"  Tama√±o: {os.path.getsize(model_path) / 1024:.2f} KB")
print(f"  R¬≤ Score: {r2:.4f}")
#---------------------------------------------------------------------------------

In [None]:
#----------------------Final summary----------------------------------------------
print("\n" + "="*80)
print("RESUMEN FINAL - VERSI√ìN OPTIMIZADA")
print("="*80)

# End-to-end time = Spark processing + both NumPy conversions + training.
total_time = processing_time + train_time + test_time + training_time

# Multi-line report assembled from values computed in the earlier cells.
summary = f"""
üöÄ OPTIMIZACI√ìN PARA 16GB RAM + 8 CORES:
   ‚úì Procesamiento Big Data distribuido (8 cores)
   ‚úì Conversi√≥n eficiente aprovechando RAM
   ‚úì Entrenamiento ultra-r√°pido (batch_size=2048)
   ‚úì Toda la base de datos procesada: {total_scaled:,} registros

‚è±Ô∏è  TIEMPOS DE EJECUCI√ìN:
   ‚Ä¢ Procesamiento Spark:  {processing_time:.1f}s
   ‚Ä¢ Conversi√≥n a numpy:   {train_time + test_time:.1f}s
   ‚Ä¢ Entrenamiento:        {training_time:.1f}s ({training_time/60:.1f} min)
   ‚Ä¢ TOTAL:                {total_time:.1f}s ({total_time/60:.1f} min)

üìä DATOS:
   ‚Ä¢ Dataset completo: {total_scaled:,} registros
   ‚Ä¢ Train: {train_count:,}
   ‚Ä¢ Test: {test_count:,}
   ‚Ä¢ Sin muestreo ni subconjuntos

üèóÔ∏è  MODELO:
   ‚Ä¢ Arquitectura: 64-32-16-8-1
   ‚Ä¢ Par√°metros: {model.count_params():,}
   ‚Ä¢ √âpocas: {len(history.history['loss'])}
   ‚Ä¢ Batch size: {BATCH_SIZE}

üìà RESULTADOS:
   ‚Ä¢ R¬≤:   {r2:.4f} ({r2*100:.1f}%)
   ‚Ä¢ RMSE: ${rmse:.4f}
   ‚Ä¢ MAE:  ${mae:.4f}
   ‚Ä¢ MAPE: {mape:.2f}%
   ‚Ä¢ Accuracy@10%: {accuracy_10pct:.2f}%

üíæ GUARDADO:
   ‚Ä¢ {model_path}
"""

print(summary)

print("="*80)
print("‚úÖ TAREA COMPLETADA - VERSI√ìN OPTIMIZADA PARA VELOCIDAD")
print("="*80)
# The speed-up figure divides a hard-coded 600 s (10 min) baseline by the
# measured total; the baseline itself is not measured in this file.
print(f"\nüéØ Tiempo total: {total_time/60:.1f} minutos (vs >10 min anterior)")
print(f"   Mejora: ~{(600/total_time):.1f}x m√°s r√°pido")
#---------------------------------------------------------------------------------