In [53]:
# Load the dataset from a CSV file into the `data` DataFrame.
# The path is relative, so it is resolved against the current working directory.
import pandas as pd
data = pd.read_csv('DatosSingapore2.csv')

In [54]:
import pandas as pd
import numpy as np

# Target column that every numeric feature is compared against.
y = data["price"]

# Pearson correlation with the target, for each numeric column other than the target itself.
corrs = {
    name: data[name].corr(y)
    for name in data.columns
    if name != "price" and pd.api.types.is_numeric_dtype(data[name])
}

# One row per feature, ranked by the absolute value of its correlation (strongest first).
corr_df = pd.DataFrame.from_dict(corrs, orient="index", columns=["correlation"])
corr_df = corr_df.abs().sort_values(by="correlation", ascending=False)

print(corr_df)


                                                  correlation
room_type:Entire home/apt                            0.517606
accommodates                                         0.510686
room_type:Private room                               0.502095
property_type:Private room in rental unit            0.483701
maximum_minimum_nights                               0.432841
...                                                       ...
review_scores_location                               0.007643
neighbourhood_cleansed:Museum                        0.006567
neighbourhood_cleansed:Hougang                       0.006360
property_type:Boat                                   0.006287
property_type:Private room in serviced apartment     0.004485

[129 rows x 1 columns]


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# 1) Numeric features to feed the model.
num_vars = [
    "accommodates",
    "number_of_private_bathrooms",
    "number_of_shared_bathrooms",
    "bedrooms",
    "beds",
    "maximum_minimum_nights",
    "minimum_nights_avg_ntm",
    "minimum_nights",
    "review_scores_rating",
    "review_scores_communication",
    "review_scores_location",
    "estimated_response_time_hours"
]

# 2) Column-name prefixes of the categorical features. One-hot columns are
#    expected to be named "<prefix>:<category>", e.g. "property_type:Tiny home".
cat_prefixes = ["property_type", "room_type"]

# 3) Reuse the one-hot columns when the dataframe already contains them;
#    otherwise build them from the raw categorical columns (if present).
existing_cat_cols = [c for c in data.columns if any(c.startswith(f"{p}:") for p in cat_prefixes)]

if existing_cat_cols:
    cat_df = data[existing_cat_cols].copy()
else:
    to_dummy = [p for p in cat_prefixes if p in data.columns]
    if to_dummy:
        cat_df = pd.get_dummies(data[to_dummy].astype(str), prefix=to_dummy, dummy_na=False)
    else:
        # No categorical information at all: an empty frame keeps the concat below valid.
        cat_df = pd.DataFrame(index=data.index)

# 4) Keep only the requested numeric variables (raises KeyError if one is missing).
num_df = data[num_vars].copy()

# 5) Final feature matrix: numeric columns followed by the one-hot columns.
X = pd.concat([num_df, cat_df], axis=1)

# 6) Columns whose non-null values are all 0/1 are treated as binary and left unscaled.
binary_vars = [col for col in X.columns if set(X[col].dropna().unique()).issubset({0, 1})]

# Continuous numeric columns that will be standardized.
numerical_true = [c for c in num_vars if c in X.columns and c not in binary_vars]

# 7) Choose the train/test rows BEFORE fitting the scaler, so that the test
#    rows cannot influence the scaling statistics (avoids data leakage).
#    The partition depends only on the number of rows and the seed, so it is
#    the same one that splitting the scaled matrix directly would produce.
row_positions = np.arange(len(X))
train_pos, test_pos = train_test_split(row_positions, test_size=0.2, random_state=42)

# 8) Fit the scaler on the training rows only, then apply it to every row.
scaler = StandardScaler()
X_scaled = X.copy()
if numerical_true:
    scaler.fit(X.iloc[train_pos][numerical_true])
    X_scaled[numerical_true] = scaler.transform(X[numerical_true])

# 9) Cast to float32 (the dtype TensorFlow works with by default).
X_scaled = X_scaled.astype("float32")

# 10) Target. NOTE: no NaN handling is performed here, neither on the target
#     nor on the features; the data is assumed to be clean already.
y = data["price"].astype("float32")

X_train, X_test = X_scaled.iloc[train_pos], X_scaled.iloc[test_pos]
y_train, y_test = y.iloc[train_pos], y.iloc[test_pos]


In [None]:
import tensorflow as tf
from tensorflow import keras

def create_model(n_neurons=64, n_layers=2, lr=0.001, dropout_rate=0.0,
                 hidden_activation="relu", output_activation="linear",
                 input_dim=None):
    """Build and compile a fully connected Keras regression network.

    Args:
        n_neurons: Units in each hidden Dense layer.
        n_layers: Number of hidden Dense layers.
        lr: Learning rate passed to the Adam optimizer.
        dropout_rate: If greater than 0, a Dropout layer with this rate is
            added after every hidden layer.
        hidden_activation: Activation used by the hidden layers.
        output_activation: Activation used by the single output unit.
        input_dim: Number of input features. When None (the default), the
            width of the notebook-level ``X_train`` is used, so existing
            calls keep working unchanged.

    Returns:
        A compiled ``keras.Sequential`` model (MSE loss, MAE metric).
    """
    if input_dim is None:
        # Backward-compatible fallback: infer the feature count from the
        # training matrix defined earlier in the notebook.
        input_dim = X_train.shape[1]

    model = keras.Sequential()

    # Input layer
    model.add(keras.layers.Input(shape=(input_dim,)))

    # Hidden layers
    for _ in range(n_layers):
        model.add(keras.layers.Dense(n_neurons, activation=hidden_activation))
        if dropout_rate > 0:
            model.add(keras.layers.Dropout(dropout_rate))

    # Output layer: a single unit for the regression target
    model.add(keras.layers.Dense(1, activation=output_activation))

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=lr),
        loss="mean_squared_error",
        metrics=["mean_absolute_error"]
    )

    return model

In [None]:
import mlflow
import mlflow.tensorflow
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import math

# Set the active MLflow experiment; runs opened afterwards with
# mlflow.start_run() (see train_and_log) are recorded under it.
mlflow.set_experiment("Airbnb_price_regression_2")

def train_and_log(params):
    """Train one model configuration and record it as an MLflow run.

    The model is built with ``create_model``, fitted on the notebook-level
    ``X_train``/``y_train`` (20% of it held out for validation by Keras) and
    evaluated on ``X_test``/``y_test``.

    Args:
        params: Dict with the keys ``n_neurons``, ``n_layers``, ``lr``,
            ``dropout``, ``hidden_activation`` and ``output_activation``.
            The whole dict is logged as the run's parameters.

    Raises:
        KeyError: If any of the required keys is missing from ``params``.
    """
    n_neurons = params["n_neurons"]
    n_layers = params["n_layers"]
    lr = params["lr"]
    dropout = params["dropout"]
    hidden_activation = params["hidden_activation"]
    output_activation = params["output_activation"]

    with mlflow.start_run():

        # Log every hyperparameter once (this already covers the activations).
        mlflow.log_params(params)

        model = create_model(
            n_neurons=n_neurons,
            n_layers=n_layers,
            lr=lr,
            dropout_rate=dropout,
            hidden_activation=hidden_activation,
            output_activation=output_activation
        )

        # Training
        model.fit(
            X_train, y_train,
            validation_split=0.2,
            epochs=50,
            batch_size=32,
            verbose=0
        )

        # Predictions on the held-out test set; flatten (n, 1) -> (n,)
        preds = model.predict(X_test).flatten()

        # Evaluation metrics
        mse = mean_squared_error(y_test, preds)
        rmse = math.sqrt(mse)
        mae = mean_absolute_error(y_test, preds)
        r2 = r2_score(y_test, preds)

        # Log all computed metrics (MAE and R2 were previously not recorded).
        mlflow.log_metric("mse", mse)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("mae", mae)
        mlflow.log_metric("r2", r2)

        # Store the trained model as a run artifact
        mlflow.tensorflow.log_model(model, "model")

        print(f"‚úî Run logged ‚Äî RMSE: {rmse:.2f}, MSE: {mse:.2f}")

In [None]:
from itertools import product

# ----------------------------
# Hyperparameter search space
# ----------------------------

param_grid = dict(
    n_neurons=[128, 256],
    n_layers=[2, 3],
    lr=[0.001],
    dropout=[0.0],
    hidden_activation=["relu"],
    output_activation=["linear"],
)

# Cartesian product of all candidate values, one dict per configuration.
search_space = [
    dict(zip(param_grid, combo))
    for combo in product(*param_grid.values())
]

print(f"Total de combinaciones: {len(search_space)}")

# ----------------------------
# Run the grid search
# ----------------------------

# Each configuration is trained and logged as its own MLflow run.
for params in search_space:
    print(f"\nüöÄ Ejecutando configuraci√≥n: {params}")
    train_and_log(params)
Tarea_4_ Modelamiento Regresion