In [1]:
# Importar librerías necesarias
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    roc_curve,
    precision_score,
    recall_score,
    f1_score,
    accuracy_score,
    log_loss
)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
import seaborn as sns
import mlflow
import mlflow.keras

# Configurar MLflow
mlflow.set_registry_uri("databricks-uc")
experiment = mlflow.set_experiment('/ClasificacionAlejo')

# 1. Cargar y preparar los datos
data = pd.read_csv('DatosSingapore.csv')
data.dropna(inplace=True)

# Crear la variable objetivo para clasificación
data['recommended'] = np.where(
    (data['price'] <= 200) & 
    (data['review_scores_rating'] >= 4.5) & 
    (data['bedrooms'] >= 1) &
    (data['amenities_number'] >= 5) &
    (data['host_response_rate'] >= 0.79), 1, 0
)

# Verificar el balance de clases
print("Distribución de la variable objetivo:")
print(data['recommended'].value_counts())
print(f"Porcentaje de recomendados: {data['recommended'].mean()*100:.2f}%")

# Separar características (X) y variable objetivo (y)
X = data.drop(columns=['recommended', 'price'])
y = data['recommended']

# Codificar variables categóricas
X = pd.get_dummies(X, drop_first=True)

# Normalizar las variables numéricas
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Dividir datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

# ============================================
# INICIO DE EXPERIMENTO MLFLOW
# ============================================

with mlflow.start_run(experiment_id=experiment.experiment_id, run_name="modelo_clasificacion_v1"):
    
    # -------------------------
    # Definir hiperparámetros
    # -------------------------
    input_dim = X_train.shape[1]
    layer1_units = 128
    layer2_units = 64
    layer3_units = 32
    dropout_rate1 = 0.3
    dropout_rate2 = 0.3
    dropout_rate3 = 0.2
    activation = 'relu'
    optimizer = 'adam'
    loss = 'binary_crossentropy'
    epochs = 100
    batch_size = 32
    validation_split = 0.2
    early_stopping_patience = 10
    random_state = 42
    
    # -------------------------
    # Crear el modelo
    # -------------------------
    model = Sequential([
        Dense(layer1_units, input_dim=input_dim, activation=activation),
        Dropout(dropout_rate1),
        Dense(layer2_units, activation=activation),
        Dropout(dropout_rate2),
        Dense(layer3_units, activation=activation),
        Dropout(dropout_rate3),
        Dense(1, activation='sigmoid')
    ])
    
    # Compilar el modelo
    model.compile(
        optimizer=optimizer,
        loss=loss,
        metrics=['accuracy']
    )
    
    # Early stopping
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=early_stopping_patience,
        restore_best_weights=True
    )
    
    # -------------------------
    # Registrar parámetros
    # -------------------------
    mlflow.log_param("input_dim", input_dim)
    mlflow.log_param("layer1_units", layer1_units)
    mlflow.log_param("layer2_units", layer2_units)
    mlflow.log_param("layer3_units", layer3_units)
    mlflow.log_param("dropout_rate1", dropout_rate1)
    mlflow.log_param("dropout_rate2", dropout_rate2)
    mlflow.log_param("dropout_rate3", dropout_rate3)
    mlflow.log_param("activation", activation)
    mlflow.log_param("optimizer", optimizer)
    mlflow.log_param("loss_function", loss)
    mlflow.log_param("epochs", epochs)
    mlflow.log_param("batch_size", batch_size)
    mlflow.log_param("validation_split", validation_split)
    mlflow.log_param("early_stopping_patience", early_stopping_patience)
    mlflow.log_param("random_state", random_state)
    mlflow.log_param("test_size", 0.2)
    
    # -------------------------
    # Entrenar el modelo
    # -------------------------
    print("\nEntrenando el modelo...")
    history = model.fit(
        X_train, y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_split=validation_split,
        callbacks=[early_stopping],
        verbose=1
    )
    
    # -------------------------
    # Predicciones
    # -------------------------
    y_pred = model.predict(X_test)
    y_pred_classes = (y_pred > 0.5).astype("int32")
    
    # -------------------------
    # Calcular métricas
    # -------------------------
    accuracy = accuracy_score(y_test, y_pred_classes)
    precision = precision_score(y_test, y_pred_classes)
    recall = recall_score(y_test, y_pred_classes)
    f1 = f1_score(y_test, y_pred_classes)
    roc_auc = roc_auc_score(y_test, y_pred)
    logloss = log_loss(y_test, y_pred)
    
    # Métricas de la matriz de confusión
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred_classes).ravel()
    specificity = tn / (tn + fp)
    
    # -------------------------
    # Registrar métricas en MLflow
    # -------------------------
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1_score", f1)
    mlflow.log_metric("roc_auc", roc_auc)
    mlflow.log_metric("log_loss", logloss)
    mlflow.log_metric("specificity", specificity)
    mlflow.log_metric("true_positives", int(tp))
    mlflow.log_metric("true_negatives", int(tn))
    mlflow.log_metric("false_positives", int(fp))
    mlflow.log_metric("false_negatives", int(fn))
    
    # Métricas finales del entrenamiento
    final_train_loss = history.history['loss'][-1]
    final_val_loss = history.history['val_loss'][-1]
    final_train_acc = history.history['accuracy'][-1]
    final_val_acc = history.history['val_accuracy'][-1]
    
    mlflow.log_metric("final_train_loss", final_train_loss)
    mlflow.log_metric("final_val_loss", final_val_loss)
    mlflow.log_metric("final_train_accuracy", final_train_acc)
    mlflow.log_metric("final_val_accuracy", final_val_acc)
    mlflow.log_metric("epochs_trained", len(history.history['loss']))
    
    # -------------------------
    # Crear y guardar visualizaciones
    # -------------------------
    
    # 1. Curvas de entrenamiento
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    
    axes[0].plot(history.history['loss'], label='Train Loss')
    axes[0].plot(history.history['val_loss'], label='Validation Loss')
    axes[0].set_xlabel('Época')
    axes[0].set_ylabel('Loss')
    axes[0].set_title('Curva de pérdida durante el entrenamiento')
    axes[0].legend()
    axes[0].grid(True)
    
    axes[1].plot(history.history['accuracy'], label='Train Accuracy')
    axes[1].plot(history.history['val_accuracy'], label='Validation Accuracy')
    axes[1].set_xlabel('Época')
    axes[1].set_ylabel('Accuracy')
    axes[1].set_title('Curva de exactitud durante el entrenamiento')
    axes[1].legend()
    axes[1].grid(True)
    
    plt.tight_layout()
    plt.savefig('training_curves.png')
    mlflow.log_artifact('training_curves.png')
    plt.close()
    
    # 2. Matriz de confusión
    plt.figure(figsize=(8, 6))
    conf_matrix = confusion_matrix(y_test, y_pred_classes)
    sns.heatmap(
        conf_matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=['No Recomendado', 'Recomendado'],
        yticklabels=['No Recomendado', 'Recomendado']
    )
    plt.xlabel('Predicción')
    plt.ylabel('Valor real')
    plt.title('Matriz de confusión')
    plt.savefig('confusion_matrix.png')
    mlflow.log_artifact('confusion_matrix.png')
    plt.close()
    
    # 3. Curva ROC
    fpr, tpr, thresholds = roc_curve(y_test, y_pred)
    
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random Classifier')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Curva ROC')
    plt.legend(loc="lower right")
    plt.grid(True)
    plt.savefig('roc_curve.png')
    mlflow.log_artifact('roc_curve.png')
    plt.close()
    
    # -------------------------
    # Registrar el modelo
    # -------------------------
    mlflow.keras.log_model(model, "clasificacion-model")
    
    # -------------------------
    # Guardar reporte de clasificación
    # -------------------------
    report = classification_report(y_test, y_pred_classes, target_names=['No Recomendado', 'Recomendado'])
    with open('classification_report.txt', 'w') as f:
        f.write(report)
    mlflow.log_artifact('classification_report.txt')
    
    # -------------------------
    # Imprimir resumen
    # -------------------------
    print("\n" + "="*50)
    print("RESUMEN DEL EXPERIMENTO")
    print("="*50)
    print(f"Accuracy:     {accuracy:.4f} ({accuracy*100:.2f}%)")
    print(f"Precision:    {precision:.4f} ({precision*100:.2f}%)")
    print(f"Recall:       {recall:.4f} ({recall*100:.2f}%)")
    print(f"F1-Score:     {f1:.4f}")
    print(f"ROC-AUC:      {roc_auc:.4f}")
    print(f"Log Loss:     {logloss:.4f}")
    print(f"Specificity:  {specificity:.4f}")
    print("="*50)
    print(f"Épocas entrenadas: {len(history.history['loss'])}")
    print(f"Train Loss final: {final_train_loss:.4f}")
    print(f"Val Loss final: {final_val_loss:.4f}")
    print("="*50)

print("\n✓ Experimento registrado exitosamente en MLflow")

  return FileStore(store_uri, store_uri)


Distribución de la variable objetivo:
recommended
0    1711
1     541
Name: count, dtype: int64
Porcentaje de recomendados: 24.02%

Entrenando el modelo...
Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.7653 - loss: 0.5304 - val_accuracy: 0.8449 - val_loss: 0.3873
Epoch 2/100
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8340 - loss: 0.3792 - val_accuracy: 0.8504 - val_loss: 0.3192
Epoch 3/100
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8632 - loss: 0.3093 - val_accuracy: 0.8670 - val_loss: 0.2973
Epoch 4/100
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8944 - loss: 0.2688 - val_accuracy: 0.8643 - val_loss: 0.2912
Epoch 5/100
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8979 - loss: 0.2516 - val_accuracy: 0.8892 - val_loss: 0.2699
Epoch 6/100
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9042 - loss: 0.2305 - val_accuracy: 0.8809 - val_loss: 0.2655
Epoch 7/100
[1m45/45[0m [32m━━━━━━━━━━━━━━━




RESUMEN DEL EXPERIMENTO
Accuracy:     0.8980 (89.80%)
Precision:    0.7818 (78.18%)
Recall:       0.7963 (79.63%)
F1-Score:     0.7890
ROC-AUC:      0.9423
Log Loss:     0.3001
Specificity:  0.9300
Épocas entrenadas: 24
Train Loss final: 0.0892
Val Loss final: 0.2900

✓ Experimento registrado exitosamente en MLflow
