# Big Data - Proyecto MLLib
# Experimentación con Modelos de Machine Learning

# Inicializar Spark

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.classification import *
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.feature import VectorAssembler
import time
import json

# Create (or reuse) the Spark session used by every experiment below.
# Adaptive query execution and partition coalescing are switched on, and the
# driver and executors are each given 4g of memory.
spark = (
    SparkSession.builder
    .appName("Hotel Booking Cancellation - Model Experiments")
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)

# Cargar datos preprocesados

In [None]:
print("=== CARGANDO DATOS PREPROCESADOS ===")

# Read the training and validation splits from their Parquet directories,
# in that order.
train_df, val_df = (
    spark.read.parquet(path) for path in ("train_processed", "val_processed")
)

# Each count() is a Spark action over the corresponding split.
print(f"Datos de entrenamiento: {train_df.count()} registros")
print(f"Datos de validación: {val_df.count()} registros")

# Verificar esquema

In [None]:
# Show the schema of the training split so the column names used below
# can be checked by eye.
print("\nEsquema de los datos:")
train_df.printSchema()

# Configurar evaluadores

In [None]:
# Shared evaluators, both scoring against the "is_canceled" label.

# Area under the ROC curve, read from the "rawPrediction" column.
binary_evaluator = BinaryClassificationEvaluator(
    metricName="areaUnderROC",
    labelCol="is_canceled",
    rawPredictionCol="rawPrediction",
)

# F1, read from the hard "prediction" column.
multiclass_evaluator = MulticlassClassificationEvaluator(
    metricName="f1",
    labelCol="is_canceled",
    predictionCol="prediction",
)

# Función para evaluar modelo

In [None]:
def evaluate_model(model, train_data, val_data, model_name):
    """
    Fit ``model`` on ``train_data`` and score it on ``val_data``.

    AUC and F1 come from the module-level ``binary_evaluator`` and
    ``multiclass_evaluator``; accuracy, weighted precision and weighted
    recall are computed with evaluators created inside this function.

    Args:
        model: Unfitted estimator; only its ``fit`` method is called.
        train_data: Dataset handed to ``model.fit``.
        val_data: Dataset handed to the fitted model's ``transform``.
        model_name: Label stored under the ``"model"`` key of the results.

    Returns:
        A ``(results, fitted_model, predictions)`` tuple. ``results`` is a
        dict with the keys ``model``, ``auc``, ``f1``, ``accuracy``,
        ``precision``, ``recall`` and ``training_time`` (seconds spent in
        ``fit``).
    """
    # Time the fit alone. The timer previously stopped only after the five
    # validation evaluations, so "training_time" also included prediction
    # and scoring and was not comparable between models.
    start_time = time.time()
    fitted_model = model.fit(train_data)
    training_time = time.time() - start_time

    # Predictions on the validation split
    predictions = fitted_model.transform(val_data)

    def extra_metric(metric_name):
        # Same columns as multiclass_evaluator, different metric.
        evaluator = MulticlassClassificationEvaluator(
            predictionCol="prediction", labelCol="is_canceled", metricName=metric_name
        )
        return evaluator.evaluate(predictions)

    results = {
        "model": model_name,
        "auc": binary_evaluator.evaluate(predictions),
        "f1": multiclass_evaluator.evaluate(predictions),
        "accuracy": extra_metric("accuracy"),
        "precision": extra_metric("weightedPrecision"),
        "recall": extra_metric("weightedRecall"),
        "training_time": training_time,
    }

    return results, fitted_model, predictions

# FASE 1: EVALUACIÓN DE MODELOS BASE

In [None]:
print("\n=== FASE 1: EVALUACIÓN DE MODELOS BASE ===")

# Shared accumulators for the experiments below: one metrics dict per
# evaluated model, and each fitted model stored under its display name.
all_results, fitted_models = [], {}

# 1. LOGISTIC REGRESSION

In [None]:
print("\n--- EXPERIMENTO 1: LOGISTIC REGRESSION ---")

# Baseline logistic regression with maxIter=100.
lr = LogisticRegression(
    maxIter=100,
    labelCol="is_canceled",
    featuresCol="features",
)

lr_results, lr_model, lr_predictions = evaluate_model(
    model=lr,
    train_data=train_df,
    val_data=val_df,
    model_name="Logistic Regression",
)

# Register the fitted model and its metrics for the later ranking.
fitted_models["Logistic Regression"] = lr_model
all_results.append(lr_results)

print(f"AUC: {lr_results['auc']:.4f}")
print(f"F1-Score: {lr_results['f1']:.4f}")
print(f"Accuracy: {lr_results['accuracy']:.4f}")
print(f"Tiempo: {lr_results['training_time']:.2f}s")

# 2. RANDOM FOREST

In [None]:
print("\n--- EXPERIMENTO 2: RANDOM FOREST ---")

# Baseline random forest: 100 trees, fixed seed.
rf = RandomForestClassifier(
    numTrees=100,
    seed=42,
    labelCol="is_canceled",
    featuresCol="features",
)

rf_results, rf_model, rf_predictions = evaluate_model(
    model=rf,
    train_data=train_df,
    val_data=val_df,
    model_name="Random Forest",
)

# Register the fitted model and its metrics for the later ranking.
fitted_models["Random Forest"] = rf_model
all_results.append(rf_results)

print(f"AUC: {rf_results['auc']:.4f}")
print(f"F1-Score: {rf_results['f1']:.4f}")
print(f"Accuracy: {rf_results['accuracy']:.4f}")
print(f"Tiempo: {rf_results['training_time']:.2f}s")

# 3. GRADIENT BOOSTED TREES

In [None]:
print("\n--- EXPERIMENTO 3: GRADIENT BOOSTED TREES ---")

# Baseline gradient-boosted trees: maxIter=100, fixed seed.
gbt = GBTClassifier(
    maxIter=100,
    seed=42,
    labelCol="is_canceled",
    featuresCol="features",
)

gbt_results, gbt_model, gbt_predictions = evaluate_model(
    model=gbt,
    train_data=train_df,
    val_data=val_df,
    model_name="Gradient Boosted Trees",
)

# Register the fitted model and its metrics for the later ranking.
fitted_models["Gradient Boosted Trees"] = gbt_model
all_results.append(gbt_results)

print(f"AUC: {gbt_results['auc']:.4f}")
print(f"F1-Score: {gbt_results['f1']:.4f}")
print(f"Accuracy: {gbt_results['accuracy']:.4f}")
print(f"Tiempo: {gbt_results['training_time']:.2f}s")

# 4. DECISION TREE

In [None]:
print("\n--- EXPERIMENTO 4: DECISION TREE ---")

# Baseline single decision tree with a fixed seed.
dt = DecisionTreeClassifier(
    seed=42,
    labelCol="is_canceled",
    featuresCol="features",
)

dt_results, dt_model, dt_predictions = evaluate_model(
    model=dt,
    train_data=train_df,
    val_data=val_df,
    model_name="Decision Tree",
)

# Register the fitted model and its metrics for the later ranking.
fitted_models["Decision Tree"] = dt_model
all_results.append(dt_results)

print(f"AUC: {dt_results['auc']:.4f}")
print(f"F1-Score: {dt_results['f1']:.4f}")
print(f"Accuracy: {dt_results['accuracy']:.4f}")
print(f"Tiempo: {dt_results['training_time']:.2f}s")

# 5. NAIVE BAYES

In [None]:
print("\n--- EXPERIMENTO 5: NAIVE BAYES ---")

# Baseline Naive Bayes with library defaults.
# NOTE(review): no modelType is passed here; check in the Spark docs that the
# default model type accepts the value range of the preprocessed features.
nb = NaiveBayes(
    labelCol="is_canceled",
    featuresCol="features",
)

nb_results, nb_model, nb_predictions = evaluate_model(
    model=nb,
    train_data=train_df,
    val_data=val_df,
    model_name="Naive Bayes",
)

# Register the fitted model and its metrics for the later ranking.
fitted_models["Naive Bayes"] = nb_model
all_results.append(nb_results)

print(f"AUC: {nb_results['auc']:.4f}")
print(f"F1-Score: {nb_results['f1']:.4f}")
print(f"Accuracy: {nb_results['accuracy']:.4f}")
print(f"Tiempo: {nb_results['training_time']:.2f}s")

# RANKING DE MODELOS BASE

In [None]:
print("\n=== RANKING DE MODELOS BASE ===")

# Rank a copy of the results by F1, best first; all_results keeps its
# original order. The sort is stable, so ties keep their evaluation order.
sorted_results = list(all_results)
sorted_results.sort(key=lambda entry: entry['f1'], reverse=True)

print("\nRanking por F1-Score:")
for i, result in enumerate(sorted_results, start=1):
    print(f"{i}. {result['model']}: F1={result['f1']:.4f}, AUC={result['auc']:.4f}")

# The three best-ranked models go on to the grid-search phase.
top_3_models = sorted_results[:3]
print(f"\n=== SELECCIONADOS PARA OPTIMIZACIÓN (TOP 3) ===")
for i, result in enumerate(top_3_models, start=1):
    print(f"{i}. {result['model']}: F1={result['f1']:.4f}")

# FASE 2: OPTIMIZACIÓN CON GRID SEARCH

In [None]:
# Banner marking the start of the grid-search phase.
print("\n=== FASE 2: OPTIMIZACIÓN CON GRID SEARCH ===")

# Método para crear un grid de parámetros específico para cada tipo de modelo

In [None]:
def create_param_grid(model_name, model_obj):
    """
    Build the hyper-parameter grid that matches the given model type.

    The model type is recognised by substring match on ``model_name``,
    checked in this order: Logistic Regression, Random Forest, Gradient
    Boosted Trees, Decision Tree, Naive Bayes.

    Args:
        model_name: Display name of the model.
        model_obj: Estimator whose param attributes (``regParam``,
            ``maxDepth``, ...) are used as the grid axes.

    Returns:
        The result of ``ParamGridBuilder.build()`` for the matching type.

    Raises:
        ValueError: If ``model_name`` matches none of the known types.
    """
    def grid(*axes):
        # Register each (param, candidate values) axis in order, then build.
        builder = ParamGridBuilder()
        for param, values in axes:
            builder = builder.addGrid(param, values)
        return builder.build()

    if "Logistic Regression" in model_name:
        return grid(
            (model_obj.regParam, [0.01, 0.1, 1.0]),
            (model_obj.elasticNetParam, [0.0, 0.5, 1.0]),
            (model_obj.maxIter, [100, 200]),
        )

    if "Random Forest" in model_name:
        return grid(
            (model_obj.numTrees, [50, 100, 200]),
            (model_obj.maxDepth, [5, 10, 15]),
            (model_obj.minInstancesPerNode, [1, 5, 10]),
        )

    if "Gradient Boosted Trees" in model_name:
        return grid(
            (model_obj.maxIter, [50, 100, 150]),
            (model_obj.maxDepth, [3, 5, 7]),
            (model_obj.stepSize, [0.01, 0.1, 0.2]),
        )

    if "Decision Tree" in model_name:
        return grid(
            (model_obj.maxDepth, [5, 10, 15, 20]),
            (model_obj.minInstancesPerNode, [1, 5, 10, 20]),
            (model_obj.impurity, ["gini", "entropy"]),
        )

    if "Naive Bayes" in model_name:
        return grid(
            (model_obj.smoothing, [0.1, 1.0, 2.0, 5.0]),
        )

    raise ValueError(f"Tipo de modelo no reconocido: {model_name}")

# Método para optimizar un modelo específico usando Grid Search

In [None]:
def optimize_model(model_name, original_result):
    """
    Tune one model type with a grid search and score the result on validation.

    A fresh estimator of the type named by ``model_name`` is cross-validated
    (3 folds, selected with the module-level ``multiclass_evaluator``) on the
    module-level ``train_df``. The resulting cross-validator model is then
    used to predict on ``val_df`` and those predictions are scored.

    Args:
        model_name: Display name of the model; the type is recognised by
            substring match (e.g. ``"Random Forest"``).
        original_result: Baseline metrics dict for the same model; only its
            ``"f1"`` entry is read, to compute the improvement.

    Returns:
        An ``(optimized_result, cv_model)`` tuple. ``optimized_result`` has
        the same metric keys as a baseline result plus ``f1_improvement``,
        ``improvement_percent`` and ``best_params``.

    Raises:
        ValueError: If ``model_name`` matches none of the supported types.
    """
    print(f"\n--- OPTIMIZANDO: {model_name.upper()} ---")

    # Create a fresh estimator of the same type as the baseline
    if "Logistic Regression" in model_name:
        model_obj = LogisticRegression(featuresCol="features", labelCol="is_canceled", maxIter=100)
    elif "Random Forest" in model_name:
        model_obj = RandomForestClassifier(featuresCol="features", labelCol="is_canceled", numTrees=100, seed=42)
    elif "Gradient Boosted Trees" in model_name:
        model_obj = GBTClassifier(featuresCol="features", labelCol="is_canceled", maxIter=100, seed=42)
    elif "Decision Tree" in model_name:
        model_obj = DecisionTreeClassifier(featuresCol="features", labelCol="is_canceled", seed=42)
    elif "Naive Bayes" in model_name:
        model_obj = NaiveBayes(featuresCol="features", labelCol="is_canceled")
    else:
        # Without this branch model_obj stayed unbound and the next statement
        # failed with an UnboundLocalError that hid the real cause.
        raise ValueError(f"Tipo de modelo no reconocido: {model_name}")

    param_grid = create_param_grid(model_name, model_obj)

    cv = CrossValidator(
        estimator=model_obj,
        estimatorParamMaps=param_grid,
        evaluator=multiclass_evaluator,  # F1-score
        numFolds=3,
        seed=42
    )

    print(f"Ejecutando Grid Search con {len(param_grid)} combinaciones...")

    start_time = time.time()
    cv_model = cv.fit(train_df)
    optimization_time = time.time() - start_time

    def extra_metric(predictions, metric_name):
        # Same columns as multiclass_evaluator, different metric.
        evaluator = MulticlassClassificationEvaluator(
            predictionCol="prediction", labelCol="is_canceled", metricName=metric_name
        )
        return evaluator.evaluate(predictions)

    # The predictions are scored five times; cache them so the transform is
    # not recomputed for every metric, and release the cache afterwards.
    best_predictions = cv_model.transform(val_df).cache()
    try:
        best_auc = binary_evaluator.evaluate(best_predictions)
        best_f1 = multiclass_evaluator.evaluate(best_predictions)
        best_accuracy = extra_metric(best_predictions, "accuracy")
        best_precision = extra_metric(best_predictions, "weightedPrecision")
        best_recall = extra_metric(best_predictions, "weightedRecall")
    finally:
        best_predictions.unpersist()

    # Improvement over the baseline
    original_f1 = original_result['f1']
    f1_improvement = best_f1 - original_f1
    # A relative improvement is undefined for a zero baseline; report 0.0 so
    # the formatting below and the later JSON export keep working.
    improvement_percent = (f1_improvement / original_f1) * 100 if original_f1 else 0.0

    print(f"F1-Score original: {original_f1:.4f}")
    print(f"F1-Score optimizado: {best_f1:.4f}")
    print(f"Mejora: {f1_improvement:+.4f} ({improvement_percent:+.1f}%)")
    print(f"AUC: {best_auc:.4f}")
    print(f"Tiempo optimización: {optimization_time:.1f}s")

    # Parameters of the winning model, keyed by parameter name
    best_params = {
        param.name: value
        for param, value in cv_model.bestModel.extractParamMap().items()
        if hasattr(param, 'name')
    }
    print("Mejores parámetros:")
    for param_name, value in best_params.items():
        print(f"  {param_name}: {value}")

    optimized_result = {
        "model": f"{model_name} (Grid Search)",
        "auc": best_auc,
        "f1": best_f1,
        "accuracy": best_accuracy,
        "precision": best_precision,
        "recall": best_recall,
        "training_time": optimization_time,
        "f1_improvement": f1_improvement,
        "improvement_percent": improvement_percent,
        "best_params": best_params,
    }

    return optimized_result, cv_model

# Optimizar los top 3 modelos

In [None]:
# Tuned metrics (in processing order) and cross-validator models keyed by
# the base model's name.
optimized_results, optimized_models = [], {}

for result in top_3_models:
    model_name = result['model']
    try:
        opt_result, opt_model = optimize_model(model_name, result)
    except Exception as e:
        # Best effort: report the failure and carry on with the next model.
        print(f"Error optimizando {model_name}: {e}")
        continue
    optimized_results.append(opt_result)
    optimized_models[model_name] = opt_model
    all_results.append(opt_result)

# COMPARACIÓN FINAL: TODOS LOS MODELOS

In [None]:
print("\n=== COMPARACIÓN FINAL: TODOS LOS MODELOS ===")

# Comparison table: header, base models, separator, tuned models.
print("\n{:<35} {:<8} {:<8} {:<8} {:<10} {:<12}".format(
    "Modelo", "F1-Score", "AUC", "Accuracy", "Mejora F1", "Tiempo(s)"))
print("-" * 85)

row_format = "{:<35} {:<8.4f} {:<8.4f} {:<8.4f} {:<10} {:<12.2f}"

# Base models (no improvement column)
for result in all_results:
    if "Grid Search" not in result["model"]:
        print(row_format.format(
            result['model'], result['f1'], result['auc'],
            result['accuracy'], "-", result['training_time']))

print("-" * 85)

# Tuned models
for result in all_results:
    if "Grid Search" in result["model"]:
        # Use the "+" format flag rather than a literal "+" prefix, which
        # rendered a negative improvement as "+-0.0123".
        improvement = f"{result['f1_improvement']:+.4f}" if 'f1_improvement' in result else "N/A"
        print(row_format.format(
            result['model'], result['f1'], result['auc'],
            result['accuracy'], improvement, result['training_time']))

# Overall winner: highest F1 across base and tuned models
best_model_overall = max(all_results, key=lambda x: x['f1'])

print(f"\n=== MODELO GANADOR ===")
print(f"🏆 {best_model_overall['model']}")
print(f"📊 F1-Score: {best_model_overall['f1']:.4f}")
print(f"📊 AUC: {best_model_overall['auc']:.4f}")
print(f"📊 Accuracy: {best_model_overall['accuracy']:.4f}")

if 'f1_improvement' in best_model_overall:
    # Same sign handling as in the table above.
    print(f"📈 Mejora vs base: {best_model_overall['f1_improvement']:+.4f} ({best_model_overall['improvement_percent']:+.1f}%)")

# GUARDAR MEJOR MODELO

In [None]:
print("\n=== GUARDAR MEJOR MODELO ===")

# Work out which fitted model corresponds to the winning result
best_model_name = best_model_overall['model']

if "Grid Search" in best_model_name:
    # Tuned model: stored under the base name, wrapped in a cross-validator
    # model whose bestModel is the one to persist.
    base_name = best_model_name.replace(" (Grid Search)", "")
    if base_name in optimized_models:
        best_fitted_model = optimized_models[base_name].bestModel
        print(f"Guardando modelo optimizado: {best_model_name}")
    else:
        print(f"Error: No se encontró el modelo optimizado para {base_name}")
        best_fitted_model = None
else:
    # Base model: stored directly under its own name
    if best_model_name in fitted_models:
        best_fitted_model = fitted_models[best_model_name]
        print(f"Guardando modelo base: {best_model_name}")
    else:
        print(f"Error: No se encontró el modelo base para {best_model_name}")
        best_fitted_model = None

# Compare against None explicitly: the question is whether a model was found,
# not whether the model object happens to be truthy.
if best_fitted_model is not None:
    best_fitted_model.write().overwrite().save("best_model")
    print("✅ Mejor modelo guardado en 'best_model/'")
else:
    print("❌ Error: No se pudo guardar el modelo")

# ANÁLISIS DE IMPORTANCIA DE CARACTERÍSTICAS

In [None]:
print("\n=== ANÁLISIS DE IMPORTANCIA DE CARACTERÍSTICAS ===")

# Only report importances when the winning model exposes a non-None
# featureImportances attribute.
importances_vector = getattr(best_fitted_model, 'featureImportances', None)

if importances_vector is None:
    print(f"El modelo {best_model_name} no soporta análisis de importancia de características")
else:
    feature_importances = importances_vector.toArray()

    print(f"\nTop 10 características más importantes ({best_model_name}):")
    # (feature index, importance) pairs, most important first
    importance_pairs = sorted(
        enumerate(feature_importances),
        key=lambda pair: pair[1],
        reverse=True,
    )

    for i, (feature_idx, importance) in enumerate(importance_pairs[:10]):
        print(f"{i+1:2d}. Característica {feature_idx:3d}: {importance:.4f}")

In [None]:
print("\n=== MATRIZ DE CONFUSIÓN DEL MEJOR MODELO ===")

# Recompute the validation predictions of the winning model
if "Grid Search" in best_model_name:
    # Tuned models are stored under the base model's name
    base_name = best_model_name.replace(" (Grid Search)", "")
    if base_name in optimized_models:
        best_predictions = optimized_models[base_name].transform(val_df)
    else:
        print("Error obteniendo predicciones del modelo optimizado")
        best_predictions = None
else:
    if best_model_name in fitted_models:
        best_predictions = fitted_models[best_model_name].transform(val_df)
    else:
        print("Error obteniendo predicciones del modelo base")
        best_predictions = None

# Compare against None explicitly instead of relying on the truthiness of a
# DataFrame object.
if best_predictions is not None:
    print("\nMatriz de Confusión:")
    # One row per (actual label, predicted label) pair with its count
    confusion_matrix = best_predictions.groupBy("is_canceled", "prediction").count().orderBy("is_canceled", "prediction")
    confusion_matrix.show()

In [None]:
print("\n=== ESTADÍSTICAS FINALES ===")
print(f"✅ Modelos evaluados inicialmente: 5")
print(f"✅ Modelos optimizados: {len(optimized_results)}")
print(f"✅ Mejor F1-Score final: {best_model_overall['f1']:.4f}")
print(f"✅ Tiempo total experimentación: {sum(r['training_time'] for r in all_results):.1f}s")

if optimized_results:
    avg_improvement = sum(r['f1_improvement'] for r in optimized_results) / len(optimized_results)
    # Use the "+" format flag rather than a literal "+" prefix, which rendered
    # a negative average as "+-0.0123".
    print(f"✅ Mejora promedio por optimización: {avg_improvement:+.4f}")

# Persist every result, plus a short description of the methodology, as JSON
final_results = {
    "methodology": {
        "total_models_evaluated": 5,
        "models_optimized": len(optimized_results),
        "selection_criteria": "Top 3 by F1-Score",
        "optimization_method": "Grid Search with 3-fold CV"
    },
    "all_results": all_results,
    "best_model": best_model_overall,
    "experiment_timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
}

# Explicit encoding so the file does not depend on the platform default
with open("experiment_results.json", "w", encoding="utf-8") as f:
    json.dump(final_results, f, indent=2)

print("\n📁 Todos los resultados guardados en 'experiment_results.json'")

print("\n=== RESUMEN EJECUTIVO ===")
print("🎯 PROCESO COMPLETADO:")
print("   1. ✅ Evaluación objetiva de 5 algoritmos")
print("   2. ✅ Selección automática de top 3 por F1-Score")
print("   3. ✅ Optimización con Grid Search + CV")
print("   4. ✅ Selección final del mejor modelo global")
print(f"🏆 RESULTADO: {best_model_overall['model']} - F1: {best_model_overall['f1']:.4f}")

# Release the Spark session now that the experiments are finished
spark.stop()