In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

# ------------------------------------------------------------------------------
# 1. Cargar el dataset
# ------------------------------------------------------------------------------
# Ajusta la ruta de tu archivo CSV según corresponda (ej. Windows):
df = pd.read_csv(r"C:\Users\Juan Diego\Downloads\propiedades_filtradas.csv")

# ------------------------------------------------------------------------------
# 2. Función para crear gráficos (Actual vs Predicho, Residuales)
# ------------------------------------------------------------------------------
def plot_results(y_true, y_pred, experiment_name):
    """
    Genera y guarda dos gráficos:
      1) Dispersión: Valores reales vs. valores predichos
      2) Residuales vs. valores predichos
    """
    # Gráfico 1: Actual vs Predicho
    plt.figure(figsize=(7,5))
    plt.scatter(y_true, y_pred, alpha=0.7)
    plt.plot([min(y_true), max(y_true)], [min(y_true), max(y_true)], 'r--', label='Línea ideal')
    plt.title(f'{experiment_name} - Actual vs. Predicho')
    plt.xlabel('Valor Real')
    plt.ylabel('Valor Predicho')
    plt.legend()
    plt.tight_layout()
    plt.savefig(f'{experiment_name}_actual_vs_pred.png')
    plt.show()

    # Gráfico 2: Residuales
    residuals = y_true - y_pred
    plt.figure(figsize=(7,5))
    plt.scatter(y_pred, residuals, alpha=0.7)
    plt.axhline(y=0, color='r', linestyle='--')
    plt.title(f'{experiment_name} - Residuales')
    plt.xlabel('Predicción')
    plt.ylabel('Residual')
    plt.tight_layout()
    plt.savefig(f'{experiment_name}_residuals.png')
    plt.show()

# ------------------------------------------------------------------------------
# 3. Experimento 1 (Baseline) - Usar todas las variables excepto 'precio'
# ------------------------------------------------------------------------------
# Features: id, camaras, area, habitaciones, banos, tamano, parqueos
X1 = df.drop(columns=["precio"])
y1 = df["precio"]

X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, 
                                                        test_size=0.2, 
                                                        random_state=42)

model1 = LinearRegression()
model1.fit(X1_train, y1_train)
y1_pred = model1.predict(X1_test)

r2_1 = r2_score(y1_test, y1_pred)
mse_1 = mean_squared_error(y1_test, y1_pred)

print("=== Experimento 1 (Baseline: todas las variables) ===")
print(f"R^2:  {r2_1:.3f}")
print(f"MSE:  {mse_1:.3f}")

# Graficar resultados
plot_results(y1_test, y1_pred, "Experimento1_Baseline")

# ------------------------------------------------------------------------------
# 4. Experimento 2 - Remover la variable 'id'
# ------------------------------------------------------------------------------
# Features: camaras, area, habitaciones, banos, tamano, parqueos
X2 = df.drop(columns=["precio", "id"])
y2 = df["precio"]

X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, 
                                                        test_size=0.2, 
                                                        random_state=42)

model2 = LinearRegression()
model2.fit(X2_train, y2_train)
y2_pred = model2.predict(X2_test)

r2_2 = r2_score(y2_test, y2_pred)
mse_2 = mean_squared_error(y2_test, y2_pred)

print("\n=== Experimento 2 (Sin 'id') ===")
print(f"R^2:  {r2_2:.3f}")
print(f"MSE:  {mse_2:.3f}")

# Graficar resultados
plot_results(y2_test, y2_pred, "Experimento2_SinID")

# ------------------------------------------------------------------------------
# 5. Experimento 3 - Transformación logarítmica del objetivo
# ------------------------------------------------------------------------------
# Features: id, camaras, area, habitaciones, banos, tamano, parqueos (mismas que Exp1)
X3 = df.drop(columns=["precio"])
y3 = np.log1p(df["precio"])  # log1p para evitar problemas con precio=0

X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, 
                                                        test_size=0.2, 
                                                        random_state=42)

model3 = LinearRegression()
model3.fit(X3_train, y3_train)
y3_pred_log = model3.predict(X3_test)

# Convertir las predicciones al espacio original
y3_pred = np.expm1(y3_pred_log)
y3_test_orig = np.expm1(y3_test)

r2_3 = r2_score(y3_test_orig, y3_pred)
mse_3 = mean_squared_error(y3_test_orig, y3_pred)

print("\n=== Experimento 3 (Transformación logarítmica del precio) ===")
print(f"R^2:  {r2_3:.3f}")
print(f"MSE:  {mse_3:.3f}")

# Graficar resultados
plot_results(y3_test_orig, y3_pred, "Experimento3_LogTransform")

# ------------------------------------------------------------------------------
# 6. Comparación final de métricas (R^2 y MSE)
# ------------------------------------------------------------------------------
experiments = ["Exp1_Baseline", "Exp2_SinID", "Exp3_LogTransform"]
r2_scores = [r2_1, r2_2, r2_3]
mse_values = [mse_1, mse_2, mse_3]

# Gráfico de R^2
plt.figure(figsize=(7,5))
sns.barplot(x=experiments, y=r2_scores, palette="Blues")
plt.title("Comparación de R^2 entre Experimentos")
plt.ylabel("R^2 Score")
plt.tight_layout()
plt.savefig("Comparacion_R2.png")
plt.show()

# Gráfico de MSE
plt.figure(figsize=(7,5))
sns.barplot(x=experiments, y=mse_values, palette="Greens")
plt.title("Comparación de MSE entre Experimentos")
plt.ylabel("MSE")
plt.tight_layout()
plt.savefig("Comparacion_MSE.png")
plt.show()

ValueError: could not convert string to float: 'Carretera a El Salvador'