# Red Neuronal con Embeddings para Variables Categóricas


Este notebook reemplaza el uso de `OneHotEncoder` por capas de `Embedding` para procesar variables categóricas (como `ocean_proximity`), lo que permite:

- Reducir dimensionalidad.
- Capturar relaciones semánticas entre categorías.
- Mejorar el rendimiento de la red neuronal al tratar con datos mixtos.


In [1]:

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Usando dispositivo:", device)


Usando dispositivo: cpu


In [2]:

housing = pd.read_csv("housing.csv")

# Separar variable categórica
cat_col = "ocean_proximity"
housing[cat_col] = housing[cat_col].astype("category")
housing["cat_code"] = housing[cat_col].cat.codes

# Variables numéricas
X_num = housing.drop(["median_house_value", "ocean_proximity", "cat_code"], axis=1)
X_cat = housing["cat_code"]
y = housing["median_house_value"]

# División
X_train_num, X_test_num, X_train_cat, X_test_cat, y_train, y_test = train_test_split(
    X_num, X_cat, y, test_size=0.2, random_state=42)

X_train_num, X_valid_num, X_train_cat, X_valid_cat, y_train, y_valid = train_test_split(
    X_train_num, X_train_cat, y_train, test_size=0.2, random_state=42)


In [3]:

y_scaler = StandardScaler()
y_train_scaled = y_scaler.fit_transform(y_train.values.reshape(-1, 1)).astype(np.float32)
y_valid_scaled = y_scaler.transform(y_valid.values.reshape(-1, 1)).astype(np.float32)
y_test_scaled = y_scaler.transform(y_test.values.reshape(-1, 1)).astype(np.float32)


In [4]:

scaler = StandardScaler()
X_train_num_scaled = scaler.fit_transform(X_train_num).astype(np.float32)
X_valid_num_scaled = scaler.transform(X_valid_num).astype(np.float32)
X_test_num_scaled = scaler.transform(X_test_num).astype(np.float32)

X_train_cat_tensor = torch.tensor(X_train_cat.values).long()
X_valid_cat_tensor = torch.tensor(X_valid_cat.values).long()
X_test_cat_tensor = torch.tensor(X_test_cat.values).long()

X_train_num_tensor = torch.tensor(X_train_num_scaled)
X_valid_num_tensor = torch.tensor(X_valid_num_scaled).to(device)
X_test_num_tensor = torch.tensor(X_test_num_scaled).to(device)

y_train_tensor = torch.tensor(y_train_scaled)
y_valid_tensor = torch.tensor(y_valid_scaled).to(device)
y_test_tensor = torch.tensor(y_test_scaled).to(device)


In [5]:

train_loader = DataLoader(
    TensorDataset(X_train_num_tensor, X_train_cat_tensor, y_train_tensor),
    batch_size=64, shuffle=True)


In [6]:

class HousingEmbeddingNet(nn.Module):
    def __init__(self, input_dim_num, num_categories, emb_dim=4):
        super(HousingEmbeddingNet, self).__init__()
        self.embedding = nn.Embedding(num_categories, emb_dim)
        self.net = nn.Sequential(
            nn.Linear(input_dim_num + emb_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, x_num, x_cat):
        emb = self.embedding(x_cat)
        x = torch.cat([x_num, emb], dim=1)
        return self.net(x)

num_features = X_train_num_tensor.shape[1]
num_categories = housing["cat_code"].nunique()
model = HousingEmbeddingNet(num_features, num_categories).to(device)


In [7]:

criterion = nn.SmoothL1Loss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

epochs = 100
best_rmse = float('inf')
patience = 10
trigger = 0

for epoch in range(epochs):
    model.train()
    for x_num_batch, x_cat_batch, y_batch in train_loader:
        x_num_batch = x_num_batch.to(device)
        x_cat_batch = x_cat_batch.to(device)
        y_batch = y_batch.to(device)

        optimizer.zero_grad()
        preds = model(x_num_batch, x_cat_batch)
        loss = criterion(preds, y_batch)
        loss.backward()
        optimizer.step()

    model.eval()
    with torch.no_grad():
        val_preds = model(X_valid_num_tensor, X_valid_cat_tensor.to(device))
        rmse_val = np.sqrt(mean_squared_error(
            y_scaler.inverse_transform(y_valid_tensor.cpu().numpy()),
            y_scaler.inverse_transform(val_preds.cpu().numpy())
        ))

    print(f"Epoch {epoch+1}, Val RMSE: {rmse_val:.2f}")

    if rmse_val < best_rmse:
        best_rmse = rmse_val
        best_model_state = model.state_dict()
        trigger = 0
    else:
        trigger += 1
        if trigger >= patience:
            print("Early stopping.")
            break


Epoch 1, Val RMSE: 65140.16
Epoch 2, Val RMSE: 62786.90
Epoch 3, Val RMSE: 61285.09
Epoch 4, Val RMSE: 60782.86
Epoch 5, Val RMSE: 59809.74
Epoch 6, Val RMSE: 59219.83
Epoch 7, Val RMSE: 58613.42
Epoch 8, Val RMSE: 58376.60
Epoch 9, Val RMSE: 57730.92
Epoch 10, Val RMSE: 57573.97
Epoch 11, Val RMSE: 57444.37
Epoch 12, Val RMSE: 57260.55
Epoch 13, Val RMSE: 57086.36
Epoch 14, Val RMSE: 56613.69
Epoch 15, Val RMSE: 56132.92
Epoch 16, Val RMSE: 56187.09
Epoch 17, Val RMSE: 55869.54
Epoch 18, Val RMSE: 55945.42
Epoch 19, Val RMSE: 55499.78
Epoch 20, Val RMSE: 55541.74
Epoch 21, Val RMSE: 55362.74
Epoch 22, Val RMSE: 55583.70
Epoch 23, Val RMSE: 56579.63
Epoch 24, Val RMSE: 55684.44
Epoch 25, Val RMSE: 55566.45
Epoch 26, Val RMSE: 55724.01
Epoch 27, Val RMSE: 54660.84
Epoch 28, Val RMSE: 55307.77
Epoch 29, Val RMSE: 54841.50
Epoch 30, Val RMSE: 54701.05
Epoch 31, Val RMSE: 54410.20
Epoch 32, Val RMSE: 54607.57
Epoch 33, Val RMSE: 55447.40
Epoch 34, Val RMSE: 54566.95
Epoch 35, Val RMSE: 549

In [8]:

model.load_state_dict(best_model_state)
model.eval()
with torch.no_grad():
    y_test_pred_scaled = model(X_test_num_tensor, X_test_cat_tensor.to(device)).cpu().numpy()
    y_test_pred = y_scaler.inverse_transform(y_test_pred_scaled.reshape(-1, 1))
    rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
    print("✅ RMSE test:", rmse_test)

plt.figure(figsize=(8,6))
plt.scatter(y_test, y_test_pred, alpha=0.4)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')
plt.xlabel("Valor real ($)")
plt.ylabel("Predicción ($)")
plt.title("Predicción con Embeddings")
plt.grid(True)
plt.show()


ValueError: Input contains NaN.