# Big Data - Proyecto MLLib
# Gradient Boosted Trees - Configuración Optimizada para Torneo
# Basado en resultados previos con F1=0.8580


# INICIALIZACIÓN Y CONFIGURACIÓN

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import *
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
import time
import json

# Configuración Spark optimizada para recursos limitados
spark = SparkSession.builder \
    .appName("GBT Final - Hotel Booking Cancellation") \
    .config("spark.sql.adaptive.enabled", "true") \
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true") \
    .config("spark.driver.memory", "3g") \
    .config("spark.executor.memory", "3g") \
    .config("spark.driver.maxResultSize", "1g") \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .getOrCreate()

print("=== CONFIGURACIÓN SPARK OPTIMIZADA ===")
print(f"Spark Version: {spark.version}")
print("Configuración: Sin Grid Search, solo mejor modelo")

25/06/11 20:21:22 WARN Utils: Your hostname, jgasbul-VirtualBox resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface enp0s3)
25/06/11 20:21:22 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/11 20:21:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


=== CONFIGURACIÓN SPARK OPTIMIZADA ===
Spark Version: 3.5.5
Configuración: Sin Grid Search, solo mejor modelo


# CARGA Y ANÁLISIS INICIAL DE DATOS

In [2]:
print("=== CARGANDO DATOS ===")
df = spark.read.csv("train.csv", header=True, inferSchema=True)
print(f"Datos cargados: {df.count()} registros, {len(df.columns)} columnas")

# Verificación rápida de la distribución de clases
class_distribution = df.groupBy("is_canceled").count().orderBy("is_canceled")
print("\n=== DISTRIBUCIÓN DE CLASES ===")
class_distribution.show()

# Calcular proporción para referencia
total_records = df.count()
canceled_count = df.filter(col("is_canceled") == 1).count()
print(f"Cancelaciones: {canceled_count}/{total_records} ({canceled_count/total_records*100:.1f}%)")

=== CARGANDO DATOS ===


                                                                                

Datos cargados: 55531 registros, 23 columnas

=== DISTRIBUCIÓN DE CLASES ===
+-----------+-----+
|is_canceled|count|
+-----------+-----+
|          0|32418|
|          1|23113|
+-----------+-----+

Cancelaciones: 23113/55531 (41.6%)


# DEFINICIÓN DE COLUMNAS Y FEATURE ENGINEERING

In [3]:
print("=== CONFIGURANDO TRANSFORMACIONES SIMPLIFICADAS ===")

# Definir columnas por tipo (simplificado para mejor rendimiento)
numeric_cols = ["lead_time", "arrival_date_week_number", "stays_in_weekend_nights", 
                "stays_in_week_nights", "adults", "children", "babies", 
                "previous_cancellations", "previous_bookings_not_canceled", 
                "booking_changes", "days_in_waiting_list", "adr", 
                "required_car_parking_spaces", "total_of_special_requests"]

# Solo las categóricas más importantes para reducir complejidad
categorical_cols = ["meal", "market_segment", "deposit_type", "customer_type"]

boolean_cols = ["is_repeated_guest"]
target_col = "is_canceled"

print(f"Columnas numéricas: {len(numeric_cols)}")
print(f"Columnas categóricas (simplificadas): {len(categorical_cols)}")
print(f"Columnas booleanas: {len(boolean_cols)}")

=== CONFIGURANDO TRANSFORMACIONES SIMPLIFICADAS ===
Columnas numéricas: 14
Columnas categóricas (simplificadas): 4
Columnas booleanas: 1


# FEATURE ENGINEERING AVANZADO

In [4]:
print("\n=== FEATURE ENGINEERING ESENCIAL ===")

# Solo las características derivadas más importantes
df = df.withColumn("total_nights", col("stays_in_weekend_nights") + col("stays_in_week_nights")) \
       .withColumn("total_guests", col("adults") + coalesce(col("children"), lit(0)) + col("babies")) \
       .withColumn("adr_per_person", 
                   when(col("total_guests") > 0, col("adr") / col("total_guests")).otherwise(col("adr"))) \
       .withColumn("has_special_requests", (col("total_of_special_requests") > 0).cast("int")) \
       .withColumn("has_previous_cancellations", (col("previous_cancellations") > 0).cast("int")) \
       .withColumn("booking_changes_flag", (col("booking_changes") > 0).cast("int")) \
       .withColumn("lead_time_log", log(col("lead_time") + 1)) \
       .withColumn("adr_log", log(col("adr") + 1))

print("Feature engineering completado")

# Actualizar listas de columnas
derived_numeric_cols = ["total_nights", "total_guests", "adr_per_person", "lead_time_log", "adr_log"]
derived_binary_cols = ["has_special_requests", "has_previous_cancellations", "booking_changes_flag"]

all_numeric_cols = numeric_cols + derived_numeric_cols
all_binary_cols = boolean_cols + derived_binary_cols


=== FEATURE ENGINEERING ESENCIAL ===
Feature engineering completado


# PIPELINE DE TRANSFORMACIONES OPTIMIZADO

In [5]:
print("=== CONSTRUYENDO PIPELINE SIMPLIFICADO ===")

# FASE 1: Imputación
imputer = Imputer(
    inputCols=["children"],
    outputCols=["children_imputed"],
    strategy="median"
)

# FASE 2: Encoding categórico simplificado (solo StringIndexer)
string_indexers = []
indexed_categorical_cols = []

for col_name in categorical_cols:
    indexer = StringIndexer(
        inputCol=col_name,
        outputCol=f"{col_name}_indexed",
        handleInvalid="keep"
    )
    string_indexers.append(indexer)
    indexed_categorical_cols.append(f"{col_name}_indexed")

# FASE 3: Escalado de variables numéricas
numeric_cols_for_scaling = [col for col in all_numeric_cols if col != "children"] + ["children_imputed"]

assembler_for_scaling = VectorAssembler(
    inputCols=numeric_cols_for_scaling,
    outputCol="numeric_features_raw"
)

scaler = StandardScaler(
    inputCol="numeric_features_raw",
    outputCol="numeric_features_scaled",
    withStd=True,
    withMean=True
)

# FASE 4: Ensamblador final simplificado
all_feature_cols = ["numeric_features_scaled"] + indexed_categorical_cols + all_binary_cols

final_assembler = VectorAssembler(
    inputCols=all_feature_cols,
    outputCol="features"
)

print(f"Pipeline simplificado con {len(all_feature_cols)} tipos de características")

=== CONSTRUYENDO PIPELINE SIMPLIFICADO ===
Pipeline simplificado con 9 tipos de características


# APLICAR TRANSFORMACIONES Y DIVISIÓN DE DATOS

In [6]:
print("=== APLICANDO TRANSFORMACIONES ===")

# Pipeline simplificado
preprocessing_stages = [imputer] + string_indexers + [
    assembler_for_scaling,
    scaler,
    final_assembler
]

preprocessing_pipeline = Pipeline(stages=preprocessing_stages)

# Fit y transform
fitted_pipeline = preprocessing_pipeline.fit(df)
df_transformed = fitted_pipeline.transform(df)

print("Transformaciones aplicadas correctamente")

# Verificar dimensiones
sample_features = df_transformed.select("features").first()[0]
feature_dimension = len(sample_features)
print(f"Dimensión del vector de características: {feature_dimension}")

# División de datos
print("\n=== DIVISIÓN DE DATOS ===")
train_df, val_df = df_transformed.select("features", target_col).randomSplit([0.8, 0.2], seed=42)

# Cachear para mejor rendimiento
train_df.cache()
val_df.cache()

train_count = train_df.count()
val_count = val_df.count()
print(f"Datos de entrenamiento: {train_count} registros")
print(f"Datos de validación: {val_count} registros")

=== APLICANDO TRANSFORMACIONES ===


25/06/11 20:21:36 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

Transformaciones aplicadas correctamente
Dimensión del vector de características: 27

=== DIVISIÓN DE DATOS ===




Datos de entrenamiento: 44413 registros
Datos de validación: 11118 registros


                                                                                

# CONFIGURACIÓN OPTIMIZADA DE GBT

In [7]:
print("=== CONFIGURACIÓN ÓPTIMA DE GBT (SIN GRID SEARCH) ===")

# Configuración óptima basada en experiment_results.json
gbt_final = GBTClassifier(
    featuresCol="features",
    labelCol=target_col,
    maxDepth=7,           # Optimal parameter
    maxIter=150,          # Optimal parameter  
    stepSize=0.2,         # Optimal parameter
    subsamplingRate=1.0,  # Optimal parameter
    featureSubsetStrategy="all",
    seed=42,
    minInstancesPerNode=1,
    maxBins=32,
    lossType="logistic",
    validationTol=0.01,
    cacheNodeIds=True,
    checkpointInterval=10
)

print("Configuración GBT final:")
print(f"  - maxDepth: {gbt_final.getMaxDepth()}")
print(f"  - maxIter: {gbt_final.getMaxIter()}")
print(f"  - stepSize: {gbt_final.getStepSize()}")
print(f"  - subsamplingRate: {gbt_final.getSubsamplingRate()}")
print("  - Parámetros basados en mejores resultados previos (F1=0.8580)")

=== CONFIGURACIÓN ÓPTIMA DE GBT (SIN GRID SEARCH) ===
Configuración GBT final:
  - maxDepth: 7
  - maxIter: 150
  - stepSize: 0.2
  - subsamplingRate: 1.0
  - Parámetros basados en mejores resultados previos (F1=0.8580)


# ENTRENAMIENTO DEL MODELO FINAL

In [8]:
print("\n=== ENTRENAMIENTO DEL MODELO FINAL ===")
print("Iniciando entrenamiento con configuración óptima...")

start_time = time.time()
gbt_model = gbt_final.fit(train_df)
training_time = time.time() - start_time

print(f"✅ Entrenamiento completado en {training_time:.1f}s")


=== ENTRENAMIENTO DEL MODELO FINAL ===
Iniciando entrenamiento con configuración óptima...


25/06/11 20:22:23 WARN DAGScheduler: Broadcasting large task binary with size 1010.7 KiB
25/06/11 20:22:23 WARN DAGScheduler: Broadcasting large task binary with size 1011.3 KiB
25/06/11 20:22:23 WARN DAGScheduler: Broadcasting large task binary with size 1011.7 KiB
25/06/11 20:22:23 WARN DAGScheduler: Broadcasting large task binary with size 1012.1 KiB
25/06/11 20:22:23 WARN DAGScheduler: Broadcasting large task binary with size 1012.8 KiB
25/06/11 20:22:24 WARN DAGScheduler: Broadcasting large task binary with size 1013.6 KiB
25/06/11 20:22:24 WARN DAGScheduler: Broadcasting large task binary with size 1014.8 KiB
25/06/11 20:22:24 WARN DAGScheduler: Broadcasting large task binary with size 1026.0 KiB
25/06/11 20:22:24 WARN DAGScheduler: Broadcasting large task binary with size 1026.5 KiB
25/06/11 20:22:24 WARN DAGScheduler: Broadcasting large task binary with size 1026.9 KiB
25/06/11 20:22:24 WARN DAGScheduler: Broadcasting large task binary with size 1027.4 KiB
25/06/11 20:22:24 WAR

✅ Entrenamiento completado en 118.2s


# EVALUACIÓN COMPLETA

In [9]:
print("=== EVALUACIÓN COMPLETA DEL MODELO ===")

# Predicciones en validación
predictions = gbt_model.transform(val_df)

# Calcular todas las métricas
f1_evaluator = MulticlassClassificationEvaluator(
    labelCol=target_col, predictionCol="prediction", metricName="f1")
f1_score = f1_evaluator.evaluate(predictions)

auc_evaluator = BinaryClassificationEvaluator(
    labelCol=target_col, rawPredictionCol="rawPrediction", metricName="areaUnderROC")
auc_score = auc_evaluator.evaluate(predictions)

accuracy_evaluator = MulticlassClassificationEvaluator(
    labelCol=target_col, predictionCol="prediction", metricName="accuracy")
accuracy = accuracy_evaluator.evaluate(predictions)

precision_evaluator = MulticlassClassificationEvaluator(
    labelCol=target_col, predictionCol="prediction", metricName="weightedPrecision")
precision = precision_evaluator.evaluate(predictions)

recall_evaluator = MulticlassClassificationEvaluator(
    labelCol=target_col, predictionCol="prediction", metricName="weightedRecall")
recall = recall_evaluator.evaluate(predictions)

print("\n=== RESULTADOS FINALES PARA TORNEO ===")
print(f"🎯 F1-Score:  {f1_score:.4f}")
print(f"📊 AUC:       {auc_score:.4f}")
print(f"🎯 Accuracy:  {accuracy:.4f}")
print(f"📈 Precision: {precision:.4f}")
print(f"📉 Recall:    {recall:.4f}")
print(f"⏱️  Tiempo:    {training_time:.1f}s")

=== EVALUACIÓN COMPLETA DEL MODELO ===


25/06/11 20:23:39 WARN DAGScheduler: Broadcasting large task binary with size 2.2 MiB
25/06/11 20:23:39 WARN DAGScheduler: Broadcasting large task binary with size 2.2 MiB
25/06/11 20:23:40 WARN DAGScheduler: Broadcasting large task binary with size 2.2 MiB
25/06/11 20:23:41 WARN DAGScheduler: Broadcasting large task binary with size 2.2 MiB
25/06/11 20:23:41 WARN DAGScheduler: Broadcasting large task binary with size 2.2 MiB



=== RESULTADOS FINALES PARA TORNEO ===
🎯 F1-Score:  0.8450
📊 AUC:       0.9155
🎯 Accuracy:  0.8468
📈 Precision: 0.8479
📉 Recall:    0.8468
⏱️  Tiempo:    118.2s


# ANÁLISIS DE IMPORTANCIA DE CARACTERÍSTICAS

In [10]:
print("\n=== ANÁLISIS DE IMPORTANCIA DE CARACTERÍSTICAS ===")

if hasattr(gbt_model, 'featureImportances'):
    importances = gbt_model.featureImportances.toArray()
    
    print("Top 10 características más importantes:")
    feature_importance_pairs = [(i, imp) for i, imp in enumerate(importances)]
    feature_importance_pairs.sort(key=lambda x: x[1], reverse=True)
    
    for i, (feature_idx, importance) in enumerate(feature_importance_pairs[:10]):
        print(f"  {i+1:2d}. Característica {feature_idx:3d}: {importance:.4f}")
else:
    print("Importancias no disponibles")


=== ANÁLISIS DE IMPORTANCIA DE CARACTERÍSTICAS ===
Top 10 características más importantes:
   1. Característica  21: 0.2540
   2. Característica   1: 0.1019
   3. Característica   0: 0.1005
   4. Característica  10: 0.0695
   5. Característica  20: 0.0672
   6. Característica  15: 0.0548
   7. Característica  12: 0.0512
   8. Característica  22: 0.0344
   9. Característica  13: 0.0327
  10. Característica  16: 0.0257


# MATRIZ DE CONFUSIÓN Y ESTADÍSTICAS FINALES

In [12]:
# BLOQUE 11: MATRIZ DE CONFUSIÓN
print("\n=== MATRIZ DE CONFUSIÓN ===")
confusion_matrix = predictions.groupBy(target_col, "prediction").count().orderBy(target_col, "prediction")
confusion_matrix.show()

# Calcular métricas de la matriz de confusión manualmente (CORREGIDO)
conf_data = confusion_matrix.collect()

# Inicializar valores
tn = fp = fn = tp = 0

# Buscar valores en los datos recolectados
for row in conf_data:
    actual = row[target_col]  # Valor real
    predicted = row['prediction']  # Predicción
    count = row['count']  # Cantidad
    
    if actual == 0 and predicted == 0.0:
        tn = count  # True Negative
    elif actual == 0 and predicted == 1.0:
        fp = count  # False Positive
    elif actual == 1 and predicted == 0.0:
        fn = count  # False Negative
    elif actual == 1 and predicted == 1.0:
        tp = count  # True Positive

print(f"\nMétricas detalladas:")
print(f"True Positives:  {tp}")
print(f"True Negatives:  {tn}")
print(f"False Positives: {fp}")
print(f"False Negatives: {fn}")

# Calcular métricas manualmente
if tp + fp > 0:
    precision_manual = tp / (tp + fp)
    print(f"Precision manual: {precision_manual:.4f}")
else:
    print("Precision manual: No se puede calcular (tp + fp = 0)")

if tp + fn > 0:
    recall_manual = tp / (tp + fn)
    print(f"Recall manual: {recall_manual:.4f}")
else:
    print("Recall manual: No se puede calcular (tp + fn = 0)")

if tp + fp > 0 and tp + fn > 0:
    f1_manual = 2 * (precision_manual * recall_manual) / (precision_manual + recall_manual)
    print(f"F1-Score manual: {f1_manual:.4f}")


=== MATRIZ DE CONFUSIÓN ===


25/06/11 20:43:20 WARN DAGScheduler: Broadcasting large task binary with size 2.2 MiB
25/06/11 20:43:21 WARN DAGScheduler: Broadcasting large task binary with size 2.2 MiB
25/06/11 20:43:21 WARN DAGScheduler: Broadcasting large task binary with size 2.2 MiB


+-----------+----------+-----+
|is_canceled|prediction|count|
+-----------+----------+-----+
|          0|       0.0| 5919|
|          0|       1.0|  566|
|          1|       0.0| 1137|
|          1|       1.0| 3496|
+-----------+----------+-----+



25/06/11 20:43:22 WARN DAGScheduler: Broadcasting large task binary with size 2.2 MiB
25/06/11 20:43:22 WARN DAGScheduler: Broadcasting large task binary with size 2.2 MiB
25/06/11 20:43:22 WARN DAGScheduler: Broadcasting large task binary with size 2.2 MiB



Métricas detalladas:
True Positives:  3496
True Negatives:  5919
False Positives: 566
False Negatives: 1137
Precision manual: 0.8607
Recall manual: 0.7546
F1-Score manual: 0.8041


# COMPARACIÓN CON RESULTADOS PREVIOS

In [13]:
print("\n=== COMPARACIÓN CON EXPERIMENTOS PREVIOS ===")
previous_best_f1 = 0.8580  # De experiment_results.json
improvement = f1_score - previous_best_f1

print(f"F1-Score previo (con Grid Search): {previous_best_f1:.4f}")
print(f"F1-Score actual (sin Grid Search): {f1_score:.4f}")
print(f"Diferencia: {improvement:+.4f}")

if improvement >= -0.01:  # Tolerancia de 1%
    print("✅ EXCELENTE: Resultados similares sin Grid Search")
elif improvement >= -0.02:  # Tolerancia de 2%
    print("👍 BUENO: Pequeña pérdida aceptable por simplicidad")
else:
    print("⚠️  NOTA: Pérdida significativa, pero modelo más eficiente")


=== COMPARACIÓN CON EXPERIMENTOS PREVIOS ===
F1-Score previo (con Grid Search): 0.8580
F1-Score actual (sin Grid Search): 0.8450
Diferencia: -0.0130
👍 BUENO: Pequeña pérdida aceptable por simplicidad


# ESTADÍSTICAS FINALES Y GUARDADO

In [19]:
print("\n=== ESTADÍSTICAS FINALES ===")
print(f"✅ Dimensión de características: {feature_dimension}")
print(f"✅ Tiempo total de ejecución: {training_time:.1f}s")
print(f"✅ Configuración basada en: Mejores parámetros previos")
print(f"✅ Pipeline simplificado: {len(preprocessing_stages)} etapas")

# Guardar resultados
final_results = {
    "model": "Gradient Boosted Trees (Configuración Óptima)",
    "methodology": "Sin Grid Search - Parámetros basados en experimentos previos",
    "performance_level": performance,
    "metrics": {
        "f1_score": float(f1_score),
        "auc_score": float(auc_score),
        "accuracy": float(accuracy),
        "precision": float(precision),
        "recall": float(recall)
    },
    "training_info": {
        "training_time_seconds": training_time,
        "feature_dimension": feature_dimension,
        "pipeline_stages": len(preprocessing_stages)
    },
    "configuration": {
        "maxDepth": gbt_final.getMaxDepth(),
        "maxIter": gbt_final.getMaxIter(),
        "stepSize": gbt_final.getStepSize(),
        "subsamplingRate": gbt_final.getSubsamplingRate(),
        "minInstancesPerNode": gbt_final.getMinInstancesPerNode()
    },
    "comparison_with_grid_search": {
        "previous_best_f1": previous_best_f1,
        "current_f1": float(f1_score),F1-Score: 0.8450
        "difference": float(improvement),
        "time_saved": "Grid Search evitado - Crasheo de maquina virtual evitado"
    },
    "experiment_timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
}

with open("gbt_final_results.json", "w") as f:
    json.dump(final_results, f, indent=2)

print("\n📁 Resultados guardados en 'gbt_final_results.json'")
print("🚀 MODELO LISTO PARA TORNEO - VERSIÓN EFICIENTE!")
print(f"⚡ Tiempo total: {training_time:.1f}s (vs ~3426s con Grid Search)")


=== ESTADÍSTICAS FINALES ===
✅ Dimensión de características: 27
✅ Tiempo total de ejecución: 118.2s
✅ Configuración basada en: Mejores parámetros previos
✅ Pipeline simplificado: 8 etapas

📁 Resultados guardados en 'gbt_final_results.json'
🚀 MODELO LISTO PARA TORNEO - VERSIÓN EFICIENTE!
⚡ Tiempo total: 118.2s (vs ~3426s con Grid Search)


# MODELO FINAL PARA PRODUCCIÓN

In [18]:
print("\n=== PREPARANDO MODELO PARA TORNEO ===")
print("Configuración final lista para entrega:")
print(f"- Algoritmo: Gradient Boosted Trees")
print(f"- F1-Score: {f1_score:.4f}")
print(f"- Tiempo entrenamiento: {training_time:.1f}s << Crasheo de pc")
print(f"- Pipeline optimizado: {len(preprocessing_stages)} etapas")
print("- Basado en mejores parámetros encontrados experimentalmente")

# Limpiar cache
train_df.unpersist()
val_df.unpersist()

# Detener Spark
print("\n=== FINALIZANDO ===")
spark.stop()
print("✅ PROCESO COMPLETADO - MODELO LISTO PARA COMPETICIÓN!")


=== PREPARANDO MODELO PARA TORNEO ===
Configuración final lista para entrega:
- Algoritmo: Gradient Boosted Trees
- F1-Score: 0.8450
- Tiempo entrenamiento: 118.2s << Crasheo de pc
- Pipeline optimizado: 8 etapas
- Basado en mejores parámetros encontrados experimentalmente

=== FINALIZANDO ===
✅ PROCESO COMPLETADO - MODELO LISTO PARA COMPETICIÓN!
