# In [None]:
# ACTIVIDAD 4.1 - k-NN + ANÁLISIS

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score, mean_squared_error

# Global plotting defaults: seaborn "whitegrid" style, then a 9x6 default
# figure size applied on top of it.
sns.set(style="whitegrid")
plt.rcParams.update({"figure.figsize": (9, 6)})

# LOAD THE DATASET
# The file is whitespace-delimited and is read without a header row, so the
# column names are supplied explicitly.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data"
columnas = [
    "CRIM","ZN","INDUS","CHAS","NOX","RM","AGE","DIS","RAD",
    "TAX","PTRATIO","B","LSTAT","MEDV"
]
# sep=r"\s+" is the supported replacement for delim_whitespace=True, which is
# deprecated since pandas 2.2 and emits a FutureWarning.
df = pd.read_csv(url, sep=r"\s+", names=columnas)

print("Dimensiones:", df.shape)
display(df.head())

# DATA ANALYSIS

# Missing values per column.
nulos_por_columna = df.isna().sum()
print("\n--- ¿Hay datos nulos? ---")
print(nulos_por_columna)

# Data type of each column.
print("\n--- Tipos de datos ---")
print(df.dtypes)

# Histograms of the attributes, 20 bins each, on a shared 15x12 figure.
df.hist(figsize=(15, 12), bins=20)
plt.suptitle("Histogramas de los atributos", fontsize=16)
plt.show()

# FULL CORRELATION MATRIX
# Computed once and reused for both the heatmap and the MEDV ranking below,
# instead of calling df.corr() twice.
corr_matrix = df.corr()

plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, annot=False, cmap="coolwarm")
plt.title("Matriz de correlación completa")
plt.show()

# Correlation of every column with MEDV, sorted from highest to lowest.
corr_medv = corr_matrix["MEDV"].sort_values(ascending=False)
print("\n--- Correlación con MEDV ---")
display(corr_medv)

# TRAIN/TEST SPLIT

# Target is MEDV; the features are every other column.
y = df["MEDV"]
X = df.drop("MEDV", axis=1)

# Hold out 20% of the rows as the test set.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Split the remainder into train/validation: 25% of the remaining 80%
# goes to validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    test_size=0.25,
    random_state=42,
)

print(f"\nTamaños -> Train:{len(X_train)}, Val:{len(X_val)}, Test:{len(X_test)}")

# FUNCIÓN PARA PROBAR DISTINTOS k

def evaluar_k(Xtr, ytr, Xval, yval, k_list, scaler=None):
    """Fit one k-NN regressor per value of k and score it on the validation set.

    Args:
        Xtr: Training features (DataFrame or any 2-D array-like).
        ytr: Training targets.
        Xval: Validation features, laid out like ``Xtr``.
        yval: Validation targets.
        k_list: Iterable of ``n_neighbors`` values to try.
        scaler: Optional transformer exposing ``fit_transform`` and
            ``transform``. It is fitted on ``Xtr`` only (the passed object is
            modified in place) and then applied to ``Xval``. When ``None`` the
            raw feature values are used.

    Returns:
        A list of ``(k, r2, model)`` tuples in the order of ``k_list``, where
        ``r2`` is the R² on the validation set and ``model`` is the fitted
        ``KNeighborsRegressor``.
    """
    if scaler is not None:
        # Fit on the training split only so the validation data does not
        # influence the scaling parameters.
        Xtr_s = scaler.fit_transform(Xtr)
        Xval_s = scaler.transform(Xval)
    else:
        # np.asarray accepts DataFrames, ndarrays and nested lists alike,
        # whereas ``.values`` exists only on pandas objects.
        Xtr_s = np.asarray(Xtr)
        Xval_s = np.asarray(Xval)

    resultados = []
    for k in k_list:
        knn = KNeighborsRegressor(n_neighbors=k)
        knn.fit(Xtr_s, ytr)
        r2 = r2_score(yval, knn.predict(Xval_s))
        resultados.append((k, r2, knn))

    return resultados

# Candidate neighbour counts: k = 1 .. 30.
k_values = range(1, 31)

# WITHOUT normalization: no scaler is passed.
res_sin = evaluar_k(X_train, y_train, X_val, y_val, k_values, scaler=None)

# WITH normalization: a MinMaxScaler is handed to evaluar_k.
scaler = MinMaxScaler()
res_norm = evaluar_k(
    X_train, y_train, X_val, y_val, k_values, scaler=scaler
)

# PLOT: R2 VS K

# One results table per variant, with columns k / R2 / fitted model.
df_sin, df_norm = (
    pd.DataFrame(res, columns=["k", "R2", "model"])
    for res in (res_sin, res_norm)
)

plt.figure(figsize=(10,5))
# Draw both validation curves on the same axes, unscaled first.
for _curva, _etiqueta in zip(
    (df_sin, df_norm),
    ("Sin normalizar", "Normalizado (MinMax)"),
):
    plt.plot(_curva["k"], _curva["R2"], marker='o', label=_etiqueta)

plt.title("R² vs k (validación)")
plt.xlabel("k")
plt.ylabel("R²")
plt.legend()
plt.grid(True)
plt.show()

# BEST k FOR BOTH CASES

# For each results table, pick the row with the highest validation R2.
best_sin, best_norm = (
    tabla_k.loc[tabla_k["R2"].idxmax()] for tabla_k in (df_sin, df_norm)
)

print("\nMejor sin normalizar:", best_sin)
print("\nMejor con normalización:", best_norm)

# TRAIN THE FINAL MODELS ON TRAIN_FULL AND EVALUATE ON TEST

# WITHOUT normalization: raw feature values, k chosen on the validation set.
knn_final_sin = KNeighborsRegressor(n_neighbors=int(best_sin["k"])).fit(
    X_train_full.values, y_train_full.values
)
pred_test_sin = knn_final_sin.predict(X_test.values)

# WITH normalization: the scaler is fitted on train_full only and then
# applied to the test features.
sc_final = MinMaxScaler()
X_train_full_scaled = sc_final.fit_transform(X_train_full)
X_test_scaled = sc_final.transform(X_test)

knn_final_norm = KNeighborsRegressor(n_neighbors=int(best_norm["k"])).fit(
    X_train_full_scaled, y_train_full.values
)
pred_test_norm = knn_final_norm.predict(X_test_scaled)

# Test-set metrics: R2 and RMSE (square root of the mean squared error).
r2_sin = r2_score(y_test, pred_test_sin)
rmse_sin = np.sqrt(mean_squared_error(y_test, pred_test_sin))

r2_norm = r2_score(y_test, pred_test_norm)
rmse_norm = np.sqrt(mean_squared_error(y_test, pred_test_norm))

print("\n--- MÉTRICAS EN TEST ---")
print(f"Sin normalizar -> R2={r2_sin:.4f}, RMSE={rmse_sin:.4f}")
print(f"Normalizado     -> R2={r2_norm:.4f}, RMSE={rmse_norm:.4f}")

# PLOTS: ACTUAL VS PREDICTED

plt.figure(figsize=(12,5))

# Endpoints of the dashed diagonal reference line (prediction == actual).
_diagonal = [y_test.min(), y_test.max()]

# Left panel: unscaled model; right panel: MinMax-scaled model.
_paneles = (
    (pred_test_sin, f"Sin normalizar (k={int(best_sin['k'])})"),
    (pred_test_norm, f"Normalizado (k={int(best_norm['k'])})"),
)
for _posicion, (_prediccion, _titulo) in enumerate(_paneles, start=1):
    plt.subplot(1, 2, _posicion)
    sns.scatterplot(x=y_test, y=_prediccion)
    plt.plot(_diagonal, _diagonal, '--')
    plt.title(_titulo)
    plt.xlabel("Real")
    plt.ylabel("Predicho")

plt.tight_layout()
plt.show()

# COMPARISON TABLE

# Actual test targets side by side with the predictions of both final models.
tabla = pd.DataFrame(
    {
        "MEDV_real": y_test.to_numpy(),
        "Pred_sin_norm": pred_test_sin,
        "Pred_norm": pred_test_norm,
    }
)

print("\nPrimeras filas de la comparación:")
display(tabla.head(n=12))
