In [43]:
import pandas as pd
import numpy as np
import math

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F

In [15]:
# Check if MPS is available to use GPU from mac
print(torch.__version__)
print(torch.backends.mps.is_available())
print(torch.backends.mps.is_built())

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

2.6.0
True
True
Using device: mps


# Data

In [None]:
train = pd.read_csv('train_cleaned.csv')
test = pd.read_csv('test_cleaned.csv')

train = train.astype('float32')
test = test.astype('float32')

def log_sqm(value):
    return np.log(value)

#train['square_meters'] = train['square_meters'].apply(log_sqm)
#train['y'] = train['y'].apply(log_sqm)


X_train = torch.tensor(train.drop('y', axis=1).values, dtype=torch.float32)
y_train = torch.tensor(train['y'].values, dtype=torch.float32)

In [38]:
print(X_train.shape, y_train.shape)

torch.Size([8000, 54]) torch.Size([8000])


# Encoder 

In [39]:
class Encoder(nn.Module):
    def __init__(self, input_dim, hidden_dim, latent_dim):
        super(Encoder, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, latent_dim)

    def forward(self, x):
        h = F.relu(self.fc1(x))
        z = self.fc2(h)
        return z

# Decoder module for the autoencoder
class Decoder(nn.Module):
    def __init__(self, latent_dim, hidden_dim, output_dim):
        super(Decoder, self).__init__()
        self.fc1 = nn.Linear(latent_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, z):
        h = F.relu(self.fc1(z))
        # Use sigmoid if input values are normalized between 0 and 1 (e.g., images)
        x_recon = torch.sigmoid(self.fc2(h))
        return x_recon
    

class Autoencoder(nn.Module):
    def __init__(self, input_dim, hidden_dim, latent_dim):
        super(Autoencoder, self).__init__()
        self.encoder = Encoder(input_dim, hidden_dim, latent_dim)
        self.decoder = Decoder(latent_dim, hidden_dim, input_dim)

    def forward(self, x):
        z = self.encoder(x)
        x_recon = self.decoder(z)
        return x_recon, z

In [44]:
input_dim = 54        
hidden_dim = 42
latent_dim = 32
batch_size = 32
num_epochs = 100
learning_rate = 1e-3

autoencoder = Autoencoder(input_dim, hidden_dim, latent_dim)

train_dataset = TensorDataset(X_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Initialize the autoencoder, optimizer, and loss function
autoencoder = Autoencoder(input_dim, hidden_dim, latent_dim)
optimizer = optim.Adam(autoencoder.parameters(), lr=learning_rate)
loss_fn = nn.MSELoss()  # Using Mean Squared Error for reconstruction loss

# Training loop for the autoencoder
autoencoder.train()
for epoch in range(num_epochs):
    running_loss = 0.0
    for batch in train_loader:
        x = batch[0]  # Get the input data
        optimizer.zero_grad()
        
        # Forward pass: get the reconstructed output and latent representation
        x_recon, _ = autoencoder(x)
        
        # Compute the reconstruction loss
        loss = loss_fn(x_recon, x)
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item()
    
    avg_loss = running_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

# Optionally, freeze the encoder weights if you don't plan to fine-tune them further:
for param in autoencoder.encoder.parameters():
    param.requires_grad = False



Epoch 1/100, Loss: 339215.6626
Epoch 2/100, Loss: 339214.4643
Epoch 3/100, Loss: 339214.4609
Epoch 4/100, Loss: 339214.4646
Epoch 5/100, Loss: 339214.4697
Epoch 6/100, Loss: 339214.4739
Epoch 7/100, Loss: 339214.4448
Epoch 8/100, Loss: 339214.4588
Epoch 9/100, Loss: 339214.4483
Epoch 10/100, Loss: 339214.4576
Epoch 11/100, Loss: 339214.4396
Epoch 12/100, Loss: 339214.4373
Epoch 13/100, Loss: 339214.4224
Epoch 14/100, Loss: 339214.4550
Epoch 15/100, Loss: 339214.4312
Epoch 16/100, Loss: 339214.4530
Epoch 17/100, Loss: 339214.4298
Epoch 18/100, Loss: 339214.4299
Epoch 19/100, Loss: 339214.4331
Epoch 20/100, Loss: 339214.4268
Epoch 21/100, Loss: 339214.4409
Epoch 22/100, Loss: 339214.4345
Epoch 23/100, Loss: 339214.4227
Epoch 24/100, Loss: 339214.4279
Epoch 25/100, Loss: 339214.4131
Epoch 26/100, Loss: 339214.4136
Epoch 27/100, Loss: 339214.4209
Epoch 28/100, Loss: 339214.4049
Epoch 29/100, Loss: 339214.4215
Epoch 30/100, Loss: 339214.3817
Epoch 31/100, Loss: 339214.4342
Epoch 32/100, Los

In [45]:
dataset = TensorDataset(X_train, y_train)
loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

# Extract latent representations for all samples
x_latent_train = []
y_latent_train = []

with torch.no_grad():
    for x_batch, labels in loader:
        z = autoencoder.encoder(x_batch)
        x_latent_train.append(z)
        y_latent_train.append(labels)

# Concatenate all latent vectors and labels
x_latent_train = torch.cat(x_latent_train, dim=0)
y_latent_train = torch.cat(y_latent_train, dim=0)

# Simple FNN with regularization layers.

In [60]:
class RealEstateNN(nn.Module):
    def __init__(self, input_size):
        super(RealEstateNN, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_size, 32),
            nn.BatchNorm1d(32),
            nn.ELU(alpha=1.0),
            nn.Dropout(0.2),
            
            nn.Linear(32, 16),
            nn.BatchNorm1d(16),
            nn.ELU(alpha=1.0),
            nn.Dropout(0.2),

            nn.Linear(16, 16),
            nn.ELU(alpha=1.0),

            nn.Linear(16, 8),
            nn.ELU(alpha=1.0),

            nn.Linear(8, 8),
            nn.ELU(alpha=1.0),

            nn.Linear(8, 1)
        )

    def forward(self, x):
        return self.model(x)

In [61]:
def train_fit(model, dataloader, criterion, optimizer, num_epochs=10, device=device):
    model.to(device)
    model.train()
    
    for epoch in range(num_epochs):
        total_loss = 0
        for batch in dataloader:
            inputs, labels = batch
            inputs, labels = inputs.to(device), labels.to(device)
            
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
        
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss / len(dataloader):.4f}")
    print('model trained!')

In [62]:
input_size = 32
hidden_size = 32  
output_size = 1
learning_rate = 0.001
num_epochs = 50
batch_size = 8

dataset = TensorDataset(x_latent_train, y_latent_train)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [63]:
model = RealEstateNN(input_size).to(device)
criterion = nn.L1Loss()

optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [65]:
train_fit(model, dataloader, criterion, optimizer, num_epochs)

Epoch [1/50], Loss: 271194.9497
Epoch [2/50], Loss: 270907.8470
Epoch [3/50], Loss: 271114.2637
Epoch [4/50], Loss: 271149.9064
Epoch [5/50], Loss: 271330.8259
Epoch [6/50], Loss: 270791.3718
Epoch [7/50], Loss: 271128.6424
Epoch [8/50], Loss: 271369.7264
Epoch [9/50], Loss: 270900.1098
Epoch [10/50], Loss: 270822.9375
Epoch [11/50], Loss: 271112.0617
Epoch [12/50], Loss: 271120.1389
Epoch [13/50], Loss: 270943.0328
Epoch [14/50], Loss: 271210.0474
Epoch [15/50], Loss: 271036.3113
Epoch [16/50], Loss: 271053.8863
Epoch [17/50], Loss: 270980.3007
Epoch [18/50], Loss: 271064.0744
Epoch [19/50], Loss: 270991.4795
Epoch [20/50], Loss: 270922.6334
Epoch [21/50], Loss: 270724.6489
Epoch [22/50], Loss: 270486.5048
Epoch [23/50], Loss: 271196.4955
Epoch [24/50], Loss: 271018.4057
Epoch [25/50], Loss: 270682.7486
Epoch [26/50], Loss: 270960.7660
Epoch [27/50], Loss: 270929.1258
Epoch [28/50], Loss: 271069.5726
Epoch [29/50], Loss: 270704.4115
Epoch [30/50], Loss: 270918.1881
Epoch [31/50], Loss