In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib
matplotlib.use('Agg') 
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.preprocessing import RobustScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# Load the raw Airbnb listings extract (semicolon-separated CSV).
airbnbDB = pd.read_csv("./airbnb-listings-extract.csv", sep=";")

# Enable on-demand GPU memory allocation for every visible GPU.
gpus = tf.config.list_physical_devices('GPU')
for gpu in gpus:
    try:
        tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as err:
        # Memory growth can only be configured before the GPUs are initialised.
        # Re-running this cell in a live kernel would otherwise abort here.
        print(f"Could not enable memory growth on {gpu}: {err}")

# 1. Advanced feature engineering
def create_advanced_features(df):
    """Return a copy of ``df`` extended with derived listing features.

    The input frame is not modified. The following columns are added (or
    overwritten if already present):

    * ``Total_Rooms``      - Bedrooms + Bathrooms, missing values counted as 0.
    * ``Price_per_Person`` - Price / Accommodates (0 guests treated as 1).
    * ``Price_per_Room``   - Price / Total_Rooms (0 rooms treated as 1).
    * ``Quality_Score``    - Review Scores Rating * Review Scores Value,
      missing values counted as 0.
    * ``Popularity``       - log1p(Number of Reviews) * Review Scores Rating.
    * ``Total_Cost``       - Price + Cleaning Fee, missing values counted as 0.
    * ``Cost_per_Person``  - Total_Cost / Accommodates (0 guests treated as 1).

    Note that ``Price_per_Person``, ``Price_per_Room``, ``Total_Cost`` and
    ``Cost_per_Person`` are all computed from ``Price``.
    """
    enriched = df.copy()

    # Shared operands, computed once.
    guests = enriched['Accommodates'].replace(0, 1)
    rating = enriched['Review Scores Rating'].fillna(0)
    value_score = enriched['Review Scores Value'].fillna(0)

    # Basic size features.
    bedrooms = enriched['Bedrooms'].fillna(0)
    bathrooms = enriched['Bathrooms'].fillna(0)
    enriched['Total_Rooms'] = bedrooms.add(bathrooms)
    rooms = enriched['Total_Rooms'].replace(0, 1)

    enriched['Price_per_Person'] = enriched['Price'].div(guests)
    enriched['Price_per_Room'] = enriched['Price'].div(rooms)

    # Review-based quality features.
    enriched['Quality_Score'] = rating.mul(value_score)
    enriched['Popularity'] = np.log1p(enriched['Number of Reviews']).mul(rating)

    # Monetary features.
    nightly = enriched['Price'].fillna(0)
    cleaning = enriched['Cleaning Fee'].fillna(0)
    enriched['Total_Cost'] = nightly.add(cleaning)
    enriched['Cost_per_Person'] = enriched['Total_Cost'].div(guests)

    return enriched

def preprocess_data(df, selected_features=None):
    """Build the scaled feature matrix and log-price target for modelling.

    Steps:
      1. Keep rows whose ``Price`` is present and strictly positive.
      2. Trim price outliers outside the 1st-99th percentile range.
      3. Select the feature columns, impute missing values with the column
         median and scale with ``RobustScaler``.
      4. Use ``log1p(Price)`` as the target.

    Parameters
    ----------
    df : pandas.DataFrame
        Listings frame containing ``Price`` and every selected feature column.
    selected_features : sequence of str, optional
        Feature columns to use. Defaults to a list that excludes every column
        computed from ``Price`` (``Price_per_Person``, ``Price_per_Room``,
        ``Total_Cost``, ``Cost_per_Person``). Because ``Price`` is the target,
        including them would let the models reconstruct it directly
        (target leakage).

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        ``X_scaled`` (imputed and scaled features, indexed like the kept rows)
        and ``y`` (``log1p`` of ``Price`` for the same rows).

    Raises
    ------
    ValueError
        If no rows remain after the price filters.
    """
    if selected_features is None:
        # Only features that are available without knowing the price.
        selected_features = [
            'Accommodates', 'Bathrooms', 'Bedrooms', 'Beds',
            'Cleaning Fee', 'Security Deposit', 'Number of Reviews',
            'Review Scores Rating', 'Review Scores Value',
            'Total_Rooms', 'Quality_Score', 'Popularity'
        ]
    else:
        selected_features = list(selected_features)

    # Conservative price cleaning: valid prices only, then trim the extreme 1% tails.
    df = df[df['Price'].notna() & (df['Price'] > 0)]
    price_low = df['Price'].quantile(0.01)
    price_high = df['Price'].quantile(0.99)
    df = df[(df['Price'] >= price_low) & (df['Price'] <= price_high)]

    if df.empty:
        raise ValueError("No rows with a valid positive 'Price' remain after filtering.")

    X = df[selected_features]
    y = np.log1p(df['Price'])  # Log transform of the price target

    # NOTE(review): the imputer and scaler are fitted on all rows here, i.e. before
    # the caller splits into train/test, so test-set statistics influence the
    # transform. Fitting them on the training split only would need an interface change.
    imputer = SimpleImputer(strategy='median')
    X_imputed = pd.DataFrame(
        imputer.fit_transform(X),
        columns=X.columns,
        index=X.index
    )

    # Robust scaling
    scaler = RobustScaler()
    X_scaled = pd.DataFrame(
        scaler.fit_transform(X_imputed),
        columns=X_imputed.columns,
        index=X_imputed.index
    )

    return X_scaled, y

def create_hybrid_model(input_dim):
    """Create the two regressors that make up the hybrid ensemble.

    Parameters
    ----------
    input_dim : int
        Number of input features fed to the neural network.

    Returns
    -------
    tuple
        ``(nn_model, xgb_model)``: a compiled Keras ``Sequential`` network
        (Huber loss, MAE metric, Adam optimiser) and an unfitted ``XGBRegressor``.
    """
    # Hidden stack described as (units, batch-norm?, dropout rate or None).
    hidden_spec = [
        (256, True, 0.4),
        (128, True, 0.3),
        (64, True, None),
        (32, False, None),
    ]

    stack = [Input(shape=(input_dim,))]
    for units, with_batchnorm, drop_rate in hidden_spec:
        stack.append(Dense(units, activation='relu'))
        if with_batchnorm:
            stack.append(BatchNormalization())
        if drop_rate is not None:
            stack.append(Dropout(drop_rate))
    stack.append(Dense(1))  # single linear output for regression

    nn_model = Sequential(stack)
    nn_model.compile(
        optimizer=Adam(learning_rate=0.0005),
        loss='huber',  # more robust than MSE
        metrics=['mae'],
    )

    # Gradient-boosted trees configuration.
    xgb_params = {
        'n_estimators': 500,
        'learning_rate': 0.01,
        'max_depth': 6,
        'min_child_weight': 1,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'gamma': 0.1,
        'reg_alpha': 0.1,
        'reg_lambda': 1,
        'random_state': 42,
    }
    xgb_model = XGBRegressor(**xgb_params)

    return nn_model, xgb_model

# 4. Training and prediction
# Seed NumPy and TensorFlow so weight initialisation, dropout and shuffling
# are repeatable across "Restart & Run All" executions.
np.random.seed(42)
tf.random.set_seed(42)

# Prepare data
airbnbDB = create_advanced_features(airbnbDB)
X_processed, y_processed = preprocess_data(airbnbDB)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X_processed, y_processed,
    test_size=0.2,
    random_state=42
)

# Create and train models
nn_model, xgb_model = create_hybrid_model(X_processed.shape[1])

# Train XGBoost
xgb_model.fit(X_train, y_train)

# Train the neural network, stopping once validation loss stops improving
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)

history = nn_model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=20,         # reduced from 100 to 20
    batch_size=64,     # increased from 32 to 64
    callbacks=[early_stopping],
    verbose=1
)

# 5. Combined predictions
xgb_pred = xgb_model.predict(X_test)
nn_pred = nn_model.predict(X_test).flatten()

# Weighted average of both models (still on the log1p(price) scale)
final_pred = 0.6 * xgb_pred + 0.4 * nn_pred

# 6. Evaluation
# The target is log1p(Price), so predictions are inverted with expm1 before
# reporting an error in currency units. An RMSE computed directly on
# y_test / final_pred is in log units and must not be shown with a "$".
y_test_price = np.expm1(y_test)
final_pred_price = np.expm1(final_pred)

print("\nMétricas de evaluación del modelo híbrido:")
print(f"R²: {r2_score(y_test, final_pred):.3f}")
print(f"RMSE (escala log): {np.sqrt(mean_squared_error(y_test, final_pred)):.3f}")
print(f"RMSE: ${np.sqrt(mean_squared_error(y_test_price, final_pred_price)):.2f}")

# 7. Visualisation
def plot_results(y_test, final_pred, output_path='prediction_results.png'):
    """Save a two-panel diagnostic figure comparing predictions with true values.

    Left panel: scatter of predicted vs. actual values with the identity line.
    Right panel: histogram (with KDE) of the prediction errors
    ``final_pred - y_test``.

    Parameters
    ----------
    y_test : array-like
        True target values.
    final_pred : array-like
        Predicted values, same length as ``y_test``.
    output_path : str, optional
        File the figure is written to. Defaults to ``'prediction_results.png'``.

    Raises
    ------
    ValueError
        If the inputs are empty or their lengths differ.

    Notes
    -----
    The axis labels are in currency units, so both arguments should be on the
    price scale rather than on a transformed (e.g. logarithmic) scale.
    """
    # Work on plain 1-D arrays so that pandas index labels cannot affect how
    # actual and predicted values are paired.
    actual = np.asarray(y_test, dtype=float).ravel()
    predicted = np.asarray(final_pred, dtype=float).ravel()
    if actual.size == 0:
        raise ValueError("y_test is empty; nothing to plot.")
    if actual.shape != predicted.shape:
        raise ValueError(
            f"y_test and final_pred must have the same length "
            f"(got {actual.size} and {predicted.size})."
        )

    # Use the non-interactive Agg backend to avoid backend errors.
    plt.switch_backend('Agg')

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))
    try:
        # Scatter plot with the perfect-prediction diagonal.
        sns.scatterplot(x=actual, y=predicted, alpha=0.5, ax=ax1)
        lowest, highest = actual.min(), actual.max()
        ax1.plot([lowest, highest], [lowest, highest], 'r--', lw=2)
        ax1.set_xlabel('Precio Real ($)')
        ax1.set_ylabel('Precio Predicho ($)')
        ax1.set_title('Predicciones vs Valores Reales')

        # Error distribution.
        errors = predicted - actual
        sns.histplot(errors, kde=True, ax=ax2)
        ax2.set_xlabel('Error de Predicción ($)')
        ax2.set_ylabel('Frecuencia')
        ax2.set_title('Distribución de Errores')

        fig.tight_layout()

        # Save the figure instead of showing it.
        fig.savefig(output_path)
    finally:
        # Always release the figure, even if plotting or saving failed.
        plt.close(fig)

# Plot on the original price scale: y_test and final_pred hold log1p(price),
# while plot_results labels its axes in currency units.
plot_results(np.expm1(y_test), np.expm1(final_pred))


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:\Users\Usuario\AppData\Local\Programs\Python\Python312\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "C:\Users\Usuario\AppData\Local\Programs\Python\Python312\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "C:\Users\Usuario\AppData\Local\Programs\Python\Python312\Lib\site-packages\ipykernel\kernelapp

AttributeError: _ARRAY_API not found

Columnas disponibles: ['ID', 'Listing Url', 'Scrape ID', 'Last Scraped', 'Name', 'Summary', 'Space', 'Description', 'Experiences Offered', 'Neighborhood Overview', 'Notes', 'Transit', 'Access', 'Interaction', 'House Rules', 'Thumbnail Url', 'Medium Url', 'Picture Url', 'XL Picture Url', 'Host ID', 'Host URL', 'Host Name', 'Host Since', 'Host Location', 'Host About', 'Host Response Time', 'Host Response Rate', 'Host Acceptance Rate', 'Host Thumbnail Url', 'Host Picture Url', 'Host Neighbourhood', 'Host Listings Count', 'Host Total Listings Count', 'Host Verifications', 'Street', 'Neighbourhood', 'Neighbourhood Cleansed', 'Neighbourhood Group Cleansed', 'City', 'State', 'Zipcode', 'Market', 'Smart Location', 'Country Code', 'Country', 'Latitude', 'Longitude', 'Property Type', 'Room Type', 'Accommodates', 'Bathrooms', 'Bedrooms', 'Beds', 'Bed Type', 'Amenities', 'Square Feet', 'Price', 'Weekly Price', 'Monthly Price', 'Security Deposit', 'Cleaning Fee', 'Guests Included', 'Extra People', 