# EDA - California Housing
Este notebook realiza un Análisis Exploratorio de Datos (EDA) y evalúa los modelos entrenados (lineal y polinomial).
Se generan: estadísticos descriptivos, matriz de correlación con su heatmap, scatter plots de las variables más correlacionadas con el target y una comparación `Real vs Predicho` con análisis de residuos.

In [None]:
# Imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
# The bare 'seaborn' style name was deprecated in Matplotlib 3.6 and removed in
# 3.8 (renamed to 'seaborn-v0_8'); use whichever name this install provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

In [None]:
# Name of the regression target column.
TARGET = 'MedHouseVal'

# Feature columns used in the project, in the order the feature matrix is built.
FEATURES = [
    'MedInc',
    'HouseAge',
    'AveRooms',
    'AveBedrms',
    'Population',
    'Latitude',
    'Longitude',
]

In [None]:
# Load the local copy of the dataset (stored under data/) and preview the
# first rows as the cell output.
df = pd.read_csv(filepath_or_buffer='../data/california_housing.csv')
df.head(n=5)

In [None]:
# Basic structure and summary statistics.
print(df.shape)
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in display() only appended a stray "None" to the cell output.
df.info()
# Final expression of the cell: summary statistics for the feature and target
# columns, transposed so that each row is one variable.
df[FEATURES + [TARGET]].describe().T

In [None]:
# Correlation matrix over the features and the target, drawn as a heatmap.
corr = df[[*FEATURES, TARGET]].corr()

# Correlation of every column with the target, strongest positive first.
corr_target = corr[TARGET].sort_values(ascending=False)
print('Correlación con target (orden descendente):', corr_target, sep='\n')

plt.figure(figsize=(10, 8))
sns.heatmap(
    corr,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    vmin=-1,  # fix the colour scale to the full correlation range
    vmax=1,
)
plt.title('Heatmap - Correlaciones (features + target)')
plt.show()

In [None]:
# Scatter plots of the three features most strongly correlated with the target.
# Rank by absolute correlation so that strong negative relationships are not
# pushed to the bottom of the list, and exclude the target by name instead of
# relying on it being the first entry of the sorted series.
candidates = [name for name in corr_target.index if name != TARGET]
top_features = sorted(
    candidates,
    key=lambda name: abs(corr_target[name]),
    reverse=True,
)[:3]

plt.figure(figsize=(14, 4))
for i, f in enumerate(top_features, 1):
    # One panel per selected feature; the column count follows the selection
    # so the grid stays valid when fewer than three features are available.
    plt.subplot(1, len(top_features), i)
    sns.scatterplot(x=df[f], y=df[TARGET], alpha=0.4)
    plt.xlabel(f)
    plt.ylabel(TARGET)
    plt.title(f'{f} vs {TARGET}')
plt.tight_layout()
plt.show()

In [None]:
# Reproducible train/test split (per the original note, the same one used in
# src/train.py): 20% held out, fixed random seed.
X, y = df[FEATURES].values, df[TARGET].values
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)
print('Train/Test shapes:', X_train.shape, X_test.shape)

In [None]:
# Load the persisted models and the degree-2 polynomial transformer.
model_linear = joblib.load('../models/model_linear.joblib')
model_poly2 = joblib.load('../models/model_poly_2.joblib')
poly2 = joblib.load('../models/poly_2_transform.joblib')

# The degree-3 artifacts are optional. Only a missing file means "not
# available"; any other failure while loading is a real problem and is allowed
# to surface instead of being silently reported as "no degree-3 model".
try:
    model_poly3 = joblib.load('../models/model_poly_3.joblib')
    poly3 = joblib.load('../models/poly_3_transform.joblib')
except FileNotFoundError:
    # Reset both names so the model and its transformer are never half-loaded.
    model_poly3 = None
    poly3 = None
print('Modelos cargados; poly3 disponible =', model_poly3 is not None)

In [None]:
# Predictions and metrics on the held-out test set.
def _r2_and_rmse(y_true, y_pred):
    """Return the (R2, RMSE) pair for the given true and predicted values."""
    r2 = r2_score(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    return r2, rmse


# Linear model: predicts directly from the raw feature matrix.
y_pred_lr = model_linear.predict(X_test)
lr_r2, lr_rmse = _r2_and_rmse(y_test, y_pred_lr)
print(f'Linear: R2={lr_r2:.4f}, RMSE={lr_rmse:.4f}')

# Degree-2 polynomial model: features go through the saved transformer first.
X_test_p2 = poly2.transform(X_test)
y_pred_p2 = model_poly2.predict(X_test_p2)
p2_r2, p2_rmse = _r2_and_rmse(y_test, y_pred_p2)
print(f'Poly degree 2: R2={p2_r2:.4f}, RMSE={p2_rmse:.4f}')

# Degree-3 polynomial model, only when both of its artifacts were loaded.
if not (model_poly3 is None or poly3 is None):
    X_test_p3 = poly3.transform(X_test)
    y_pred_p3 = model_poly3.predict(X_test_p3)
    p3_r2, p3_rmse = _r2_and_rmse(y_test, y_pred_p3)
    print(f'Poly degree 3: R2={p3_r2:.4f}, RMSE={p3_rmse:.4f}')

In [None]:
# Plots: actual vs. predicted values and residuals for each evaluated model.
def plot_pred_resid(y_true, y_pred, title):
    """Show actual-vs-predicted and residual scatter plots side by side.

    Args:
        y_true: Observed target values. Must provide ``.min()`` / ``.max()``
            and support element-wise subtraction with ``y_pred``.
        y_pred: Predicted values, aligned element-wise with ``y_true``.
        title: Label appended to the title of both panels.
    """
    _, (ax_fit, ax_resid) = plt.subplots(1, 2, figsize=(12, 5))

    # Left panel: predictions against observations, with the identity line
    # (a perfect prediction) drawn across the observed range.
    sns.scatterplot(x=y_true, y=y_pred, alpha=0.4, ax=ax_fit)
    lo, hi = y_true.min(), y_true.max()
    ax_fit.plot([lo, hi], [lo, hi], 'r--')
    ax_fit.set_xlabel('Real')
    ax_fit.set_ylabel('Predicho')
    ax_fit.set_title(f'Real vs Predicho - {title}')

    # Right panel: residuals (observed minus predicted) against predictions,
    # with a zero reference line.
    resid = y_true - y_pred
    sns.scatterplot(x=y_pred, y=resid, alpha=0.4, ax=ax_resid)
    ax_resid.axhline(0, color='red', linestyle='--')
    ax_resid.set_xlabel('Predicho')
    ax_resid.set_ylabel('Residual')
    ax_resid.set_title(f'Residuals - {title}')

    plt.tight_layout()
    plt.show()

# Draw the diagnostic plots for every model that has predictions available.
_plot_jobs = [(y_pred_lr, 'Linear'), (y_pred_p2, 'Poly degree 2')]
if model_poly3 is not None:
    # The degree-3 predictions only exist when that model was loaded.
    _plot_jobs.append((y_pred_p3, 'Poly degree 3'))
for _pred, _label in _plot_jobs:
    plot_pred_resid(y_test, _pred, _label)

## Conclusiones preliminares
- Compare las métricas R2 y RMSE impresas arriba.
- Observe el heatmap y los scatter plots para seleccionar features importantes o transformaciones futuras.
- Si hay patrones en los residuos, considere feature engineering o regularización.