In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np

data = pd.read_csv('/home/mw/.cache/kagglehub/datasets/yasserh/housing-prices-dataset/versions/1/Housing.csv')

X = data.iloc[:, :-1].values

categorical_columns = data.select_dtypes(include=['object']).columns
for col in categorical_columns:
    if data[col].nunique() <= 2:
        data[col] = LabelEncoder().fit_transform(data[col])
    else:
        one_hot = pd.get_dummies(data[col], prefix=col)
        data = pd.concat([data.drop(col, axis=1), one_hot], axis=1)

boolean_columns = data.select_dtypes(include='bool').columns
data[boolean_columns] = data[boolean_columns].astype(int)

numeric_columns = data.select_dtypes(include=['float64']).columns
data[numeric_columns] = (data[numeric_columns] - data[numeric_columns].mean()) / data[numeric_columns].std()

target_column = "price"
features = data.drop(target_column, axis=1)
target = data[target_column]

features_tensor = torch.tensor(features.values, dtype=torch.float32)
target_tensor = torch.tensor(target.values, dtype=torch.float32).unsqueeze(1)

X_train_tensor = torch.tensor(features.values, dtype=torch.float32)
X_test_tensor = torch.tensor(target.values, dtype=torch.float32)

class VAE(nn.Module):
    def __init__(self, input_dim, latent_dim):
        super(VAE, self).__init__()
        self.fc1 = nn.Linear(input_dim, 512)
        self.fc21 = nn.Linear(512, latent_dim)
        self.fc22 = nn.Linear(512, latent_dim)
        self.fc3 = nn.Linear(latent_dim, 512)
        self.fc4 = nn.Linear(512, input_dim)

    def encode(self, x):
        h1 = F.relu(self.fc1(x))
        return self.fc21(h1), self.fc22(h1)

    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5*logvar)
        eps = torch.randn_like(std)
        return mu + eps*std

    def decode(self, z):
        h3 = F.relu(self.fc3(z))
        return torch.sigmoid(self.fc4(h3))

    def forward(self, x):
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        return self.decode(z), mu, logvar

def loss_function(recon_x, x, mu, logvar):
    BCE = F.mse_loss(recon_x, x, reduction='sum')
    return BCE + 0.5 * torch.sum(torch.exp(logvar) + mu.pow(2) - 1 - logvar)

input_dim = features.shape[1]
latent_dim = 32
vae = VAE(input_dim, latent_dim)
optimizer = optim.Adam(vae.parameters(), lr=1e-3)

epochs = 50
for epoch in range(epochs):
    vae.train()
    train_loss = 0
    for batch_idx in range(0, len(X_train_tensor), 64):
        data = X_train_tensor[batch_idx:batch_idx+64]
        
        optimizer.zero_grad()
        
        recon_batch, mu, logvar = vae(data)
        
        loss = loss_function(recon_batch, data, mu, logvar)
        loss.backward()
        
        train_loss += loss.item()
        optimizer.step()

    print(f"Epoch [{epoch+1}/{epochs}] | Loss: {train_loss/len(X_train_tensor):.4f}")

vae.eval()
with torch.no_grad():
    z = torch.randn(64, latent_dim)
    generated_data = vae.decode(z).cpu().numpy()

generated_df = pd.DataFrame(generated_data, columns=features.columns)  # Use features.columns here
generated_df.to_csv("generated_housing_data.csv", index=False)

print("Generated data saved to 'generated_housing_data.csv'.")


Epoch [1/50] | Loss: nan
Epoch [2/50] | Loss: nan
Epoch [3/50] | Loss: nan
Epoch [4/50] | Loss: nan
Epoch [5/50] | Loss: nan
Epoch [6/50] | Loss: nan
Epoch [7/50] | Loss: nan
Epoch [8/50] | Loss: nan
Epoch [9/50] | Loss: nan
Epoch [10/50] | Loss: nan
Epoch [11/50] | Loss: nan
Epoch [12/50] | Loss: nan
Epoch [13/50] | Loss: nan
Epoch [14/50] | Loss: nan
Epoch [15/50] | Loss: nan
Epoch [16/50] | Loss: nan
Epoch [17/50] | Loss: nan
Epoch [18/50] | Loss: nan
Epoch [19/50] | Loss: nan
Epoch [20/50] | Loss: nan
Epoch [21/50] | Loss: nan
Epoch [22/50] | Loss: nan
Epoch [23/50] | Loss: nan
Epoch [24/50] | Loss: nan
Epoch [25/50] | Loss: nan
Epoch [26/50] | Loss: nan
Epoch [27/50] | Loss: nan
Epoch [28/50] | Loss: nan
Epoch [29/50] | Loss: nan
Epoch [30/50] | Loss: nan
Epoch [31/50] | Loss: nan
Epoch [32/50] | Loss: nan
Epoch [33/50] | Loss: nan
Epoch [34/50] | Loss: nan
Epoch [35/50] | Loss: nan
Epoch [36/50] | Loss: nan
Epoch [37/50] | Loss: nan
Epoch [38/50] | Loss: nan
Epoch [39/50] | Loss: