In [41]:
from preprocess_data import DataPreprocessor
# Imports
import optuna
import numpy as np
import helpers


data_preprocessor = DataPreprocessor()

X_train, X_test, Y_train, Y_test = data_preprocessor.preprocess_data(lot_frontage_threshold=13)

to_remove = ['ohe__Utilities_NoSeWa', 'ohe__Neighborhood_Blueste', 'ohe__Condition1_RRNe', 'ohe__Condition2_PosA', 'ohe__Condition2_RRAe', 'ohe__Condition2_RRAn', 'ohe__Condition2_RRNn', 'ohe__RoofMatl_Membran', 'ohe__RoofMatl_Metal', 'ohe__RoofMatl_Roll', 'ohe__Exterior1st_AsphShn', 'ohe__Exterior1st_CBlock', 'ohe__Exterior1st_ImStucc', 'ohe__Exterior1st_Stone', 'ohe__Exterior2nd_CBlock', 'ohe__Exterior2nd_Other', 'ohe__Electrical_Mix', 'ohe__MiscFeature_TenC'] #Removed based on <= 1 total non-zero appearances

X_train = X_train.drop(to_remove, axis=1)
X_test = X_test.drop(to_remove, axis=1)

Calculated Global Median Ratio: 0.7235 (from 951 samples)
Calculating for group level: 3way (['MSZoning', 'BldgType', 'LotShape'])
 -> Found 39 groups for 3way
Calculating for group level: 2way_ZS (['MSZoning', 'LotShape'])
 -> Found 16 groups for 2way_ZS
Calculating for group level: 2way_ZB (['MSZoning', 'BldgType'])
 -> Found 19 groups for 2way_ZB
Calculating for group level: 2way_BS (['BldgType', 'LotShape'])
 -> Found 14 groups for 2way_BS
Calculating for group level: 1way_Z (['MSZoning'])
 -> Found 5 groups for 1way_Z
Calculating for group level: 1way_B (['BldgType'])
 -> Found 5 groups for 1way_B
Calculating for group level: 1way_S (['LotShape'])
 -> Found 4 groups for 1way_S


In [42]:
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

log_then_minmax = Pipeline([
    ('log_transform', FunctionTransformer(np.log1p)), # Example log transform
    ('min_max_scaler', MinMaxScaler())
])

ordinal_then_minmax_pipeline = Pipeline([
    ('ordinal_encode', OrdinalEncoder(
        categories=helpers.get_ordinal_cats_ordered(), # Make sure this returns the correct list of lists for categories
        handle_unknown='use_encoded_value',
        unknown_value=-1 # Or np.nan, but -1 works fine with MinMaxScaler
    )),
    ('minmax_scale_ordinal', MinMaxScaler()) # Scale the 0,1,2... output of OrdinalEncoder to [0,1]
])

model_pipeline = ColumnTransformer(
        transformers=[
            ('log_num', log_then_minmax, helpers.get_log_minmax_cols()),
            ('ord', ordinal_then_minmax_pipeline, helpers.get_categorical_cols_ordinal()),
            ('num', MinMaxScaler(), helpers.get_minmax_cols())
        ],
        remainder='passthrough',
        sparse_threshold=1
    )

In [43]:
from feature_engineering import FeatureEngineering
from torch.utils.data import DataLoader, TensorDataset
# Deep learning model:
import torch
from torch import nn

feature_engineer = FeatureEngineering()

feature_engineer.fit(X_train, Y_train)
X_train = feature_engineer.transform(X_train)
X_test = feature_engineer.transform(X_test)
model_pipeline.fit(X_train, Y_train)
X_train = model_pipeline.transform(X_train)
X_test = model_pipeline.transform(X_test)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# float64 acceptable for EDA, float32 preferred for training.
X_train = torch.tensor(X_train.values, device=device, dtype=torch.float32)
X_test = torch.tensor(X_test.values, device=device, dtype=torch.float32)
Y_train = torch.tensor(Y_train.values, device=device, dtype=torch.float32)
Y_test = torch.tensor(Y_test.values, device=device, dtype=torch.float32)


#model = nn.Sequential(
#    nn.Linear(X_train.shape[1], 128),
#    nn.ReLU(),
#    nn.Linear(128, 64),
#    nn.ReLU(),
#    nn.Linear(64, 16),
#    nn.ReLU(),
#    nn.Linear(16, 1)
#)

try:
    train_dataset = TensorDataset(X_train, Y_train)
    val_dataset = TensorDataset(X_test, Y_test)
    print(f"Train dataset length: {len(train_dataset)}")
    print(f"Validation dataset length: {len(val_dataset)}")
except Exception as e:
    print(f"Error creating TensorDataset: {e}")
    # Likely length mismatch between X and y tensors if error here

batch_size = 32 # Batch size lowered, due to sample size being less than ideal.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True) # Shuffle training data
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False) # No need to shuffle validation

loss_func = nn.MSELoss()
#optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

#model.to(device)


Fitting FeatureEngineering: Learning max_actual_garage_age_...
Learned max_actual_garage_age_: 107.0
Train dataset length: 1168
Validation dataset length: 292


In [44]:
num_epochs = 2000 # Hyperparameter: How many times to iterate over the dataset

def get_choice(activation_choice_name):
    """Helper function to get an activation function instance."""
    if activation_choice_name == 'sigmoid':
        return nn.Sigmoid()
    elif activation_choice_name == 'tanh':
        return nn.Tanh()
    elif activation_choice_name == 'leaky_relu':
        return nn.LeakyReLU()
    elif activation_choice_name == 'elu':
        return nn.ELU()
    elif activation_choice_name == 'silu': # Swish
        return nn.SiLU()
    else: # Default to ReLU
        return nn.ReLU()

def objective(trial):
    adam_lr = trial.suggest_float("adam_lr", 0.09, 0.1, log=True)
    adam_beta1 = trial.suggest_float("adam_beta1", 0.8, 0.95)
    adam_beta2 = trial.suggest_float("adam_beta2", 0.9, 0.999)
    adam_epsilon = trial.suggest_float("adam_epsilon", 1e-9, 1e-7, log=True)
    activation_choice_0 = trial.suggest_categorical('activation_0', ['relu', 'leaky_relu', 'elu', 'silu', 'tanh'])
    activation_choice_1 = trial.suggest_categorical('activation_1', ['relu', 'leaky_relu', 'elu', 'silu', 'tanh'])
    activation_choice_2 = trial.suggest_categorical('activation_2', ['relu', 'leaky_relu', 'elu', 'silu', 'tanh'])
    
    activation_0 = get_choice(activation_choice_0)
    activation_1 = get_choice(activation_choice_1)
    activation_2 = get_choice(activation_choice_2)
    
    input_features = X_train.shape[1]
    
    neurons_h1 = trial.suggest_categorical('neurons_h1', [
        max(32, int(input_features * 0.5)), 
        max(32, int(input_features * 0.75)),
        max(32, int(input_features * 1.0)),
        max(32, int(input_features * 1.25)),
        max(32, int(input_features * 1.5))
    ])
    
    # Neurons for Hidden Layer 2 (H2) - descending from H1
    # Using suggest_int with dynamic high bound based on neurons_h1
    # Ensure low is less than or equal to high.
    low_h2 = max(16, int(neurons_h1 * 0.25))
    high_h2 = neurons_h1
    if low_h2 > high_h2: # Ensure low is not greater than high
        low_h2 = high_h2 
    step_h2 = max(1, (high_h2 - low_h2) // 4) if high_h2 > low_h2 else 1 # Ensure step is at least 1
    if high_h2 == low_h2: # If low and high are same, suggest_int might error with step > 1
        neurons_h2 = high_h2
    else:
        neurons_h2 = trial.suggest_int('neurons_h2', low=low_h2, high=high_h2, step=max(1, step_h2 // 4 * 4 if step_h2 > 4 else step_h2)) # Ensure step is reasonable

    # Neurons for Hidden Layer 3 (H3) - descending from H2
    # Using suggest_int with dynamic high bound based on neurons_h2
    # Ensure low is less than or equal to high.
    low_h3 = max(8, int(neurons_h2 * 0.25))
    high_h3 = neurons_h2
    if low_h3 > high_h3: # Ensure low is not greater than high
        low_h3 = high_h3
    step_h3 = max(1, (high_h3 - low_h3) // 4) if high_h3 > low_h3 else 1 # Ensure step is at least 1
    if high_h3 == low_h3:
        neurons_h3 = high_h3
    else:
        neurons_h3 = trial.suggest_int('neurons_h3', low=low_h3, high=high_h3, step=max(1, step_h3 // 2 * 2 if step_h3 > 2 else step_h3))
    
    model = nn.Sequential(
        nn.Linear(input_features, neurons_h1),
        activation_0,
        nn.Linear(neurons_h1, neurons_h2),
        activation_1,
        nn.Linear(neurons_h2, neurons_h3),
        activation_2,
        nn.Linear(neurons_h3, 1)
    )
    
    _optimizer = torch.optim.Adam(
        model.parameters(),
        lr=adam_lr,
        betas=(adam_beta1, adam_beta2),
        eps=adam_epsilon
    )
    
    model.to(device)
        
    for epoch in range(num_epochs):
        # --- Training Phase ---
        model.train() # Set model to training mode (enables dropout, batchnorm updates)
        running_train_loss = 0.0
        for batch_idx, (features, targets) in enumerate(train_loader):
            # Move batch data to the target device (GPU or CPU)
            features, targets = features.to(device), targets.to(device)
            _optimizer.zero_grad()
            outputs = model(features)
            # Reshape the targets tensor to match the outputs shape ([batch_size, 1])
            targets_reshaped = targets.unsqueeze(1)
            loss = loss_func(outputs, targets_reshaped)
            loss.backward()
            _optimizer.step()
    
            running_train_loss += loss.item() * features.size(0)
    
        epoch_train_loss = running_train_loss / len(train_loader.dataset)
    
        # --- Validation Phase ---
        model.eval()
        running_val_loss = 0.0
        with torch.no_grad():
            for features, targets in val_loader:
                features, targets = features.to(device), targets.to(device)
                outputs = model(features)
                targets_reshaped = targets.unsqueeze(1)
                loss = loss_func(outputs, targets_reshaped)
                running_val_loss += loss.item() * features.size(0)
    
        epoch_val_loss = running_val_loss / len(val_loader.dataset)
        
        trial.report(epoch_val_loss, epoch) # Report intermediate value for pruning
        if trial.should_prune():
            print(f"Trial {trial.number} pruned at epoch {epoch+1}.")
            raise optuna.exceptions.TrialPruned()
        
        return epoch_val_loss

study = optuna.create_study(
    direction='minimize', 
    pruner=optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=150, interval_steps=10) 
    # n_startup_trials: Don't prune first 5 trials.
    # n_warmup_steps: Don't prune a trial before it has completed 50 epochs.
    # interval_steps: Check for pruning every 10 epochs after warmup.
)

# 2. Run the optimization.
#    Optuna will call your 'objective' function 'n_trials' times.
#    Each time, it passes a 'trial' object to your function.
study.optimize(objective, n_trials=4000)

# 3. Get the best results.
print("Study statistics: ")
print("  Number of finished trials: ", len(study.trials))
print("  Number of pruned trials: ", len(study.get_trials(states=[optuna.trial.TrialState.PRUNED])))
print("  Number of complete trials: ", len(study.get_trials(states=[optuna.trial.TrialState.COMPLETE])))

print("\nBest trial:")
best_trial = study.best_trial

print("  Value (Min Validation Loss): ", best_trial.value)

print("  Best hyperparameters: ")
for key, value in best_trial.params.items():
    print(f"    {key}: {value}")

    # Print progress (e.g., every epoch or every few epochs)
#    print(f'Epoch {epoch+1}/{num_epochs} | Train Loss: {epoch_train_loss:.6f} | Val Loss: {epoch_val_loss:.6f}')

[I 2025-05-19 19:08:17,486] A new study created in memory with name: no-name-0048cf34-29cc-45a9-b3b4-2b6e8e7d1ee4
[I 2025-05-19 19:08:17,592] Trial 0 finished with value: 39582686502.57534 and parameters: {'adam_lr': 0.09201692600298994, 'adam_beta1': 0.8427096218722957, 'adam_beta2': 0.9629893503364455, 'adam_epsilon': 2.27531665360591e-09, 'activation_0': 'tanh', 'activation_1': 'leaky_relu', 'activation_2': 'tanh', 'neurons_h1': 244, 'neurons_h2': 237, 'neurons_h3': 59}. Best is trial 0 with value: 39582686502.57534.
[I 2025-05-19 19:08:17,674] Trial 1 finished with value: 39608907972.38356 and parameters: {'adam_lr': 0.0977789298783441, 'adam_beta1': 0.8420989041700085, 'adam_beta2': 0.9324824391312059, 'adam_epsilon': 7.524598477471212e-09, 'activation_0': 'tanh', 'activation_1': 'silu', 'activation_2': 'tanh', 'neurons_h1': 183, 'neurons_h2': 45, 'neurons_h3': 35}. Best is trial 0 with value: 39582686502.57534.
[I 2025-05-19 19:08:17,742] Trial 2 finished with value: 7472840914.4

Study statistics: 
  Number of finished trials:  4000
  Number of pruned trials:  0
  Number of complete trials:  4000

Best trial:
  Value (Min Validation Loss):  6273713804.2739725
  Best hyperparameters: 
    adam_lr: 0.09306966029414018
    adam_beta1: 0.8071539541931217
    adam_beta2: 0.971864442892929
    adam_epsilon: 2.395189253804147e-09
    activation_0: elu
    activation_1: leaky_relu
    activation_2: leaky_relu
    neurons_h1: 244
    neurons_h2: 105
    neurons_h3: 26


num_epochs = 2000 # Hyperparameter: How many times to iterate over the dataset

print("\nStarting Training...")
for epoch in range(num_epochs):
    # --- Training Phase ---
    model.train() # Set model to training mode (enables dropout, batchnorm updates)
    running_train_loss = 0.0
    for batch_idx, (features, targets) in enumerate(train_loader):
        # Move batch data to the target device (GPU or CPU)
        features, targets = features.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(features)
        # Reshape the targets tensor to match the outputs shape ([batch_size, 1])
        targets_reshaped = targets.unsqueeze(1)
        loss = loss_func(outputs, targets_reshaped)
        loss.backward()
        optimizer.step()

        running_train_loss += loss.item() * features.size(0)

    epoch_train_loss = running_train_loss / len(train_loader.dataset)

    # --- Validation Phase ---
    model.eval() # Set model to evaluation mode (disables dropout, batchnorm updates)
    running_val_loss = 0.0
    with torch.no_grad(): # No need to calculate gradients during validation
        for features, targets in val_loader:
            features, targets = features.to(device), targets.to(device)
            outputs = model(features)
            targets_reshaped = targets.unsqueeze(1)
            loss = loss_func(outputs, targets_reshaped)
            running_val_loss += loss.item() * features.size(0)

    epoch_val_loss = running_val_loss / len(val_loader.dataset)

    # Print progress (e.g., every epoch or every few epochs)
    print(f'Epoch {epoch+1}/{num_epochs} | Train Loss: {epoch_train_loss:.6f} | Val Loss: {epoch_val_loss:.6f}')

Model mathematically represented, using Adam optimizer, default hyperparameters:
$$
\begin{align}
% Input
\mathbf{a}^{(0)} &= \mathbf{x}, \quad \text{where } \mathbf{x} \in \mathbb{R}^{260} \\
% Layer 1
\mathbf{z}^{(1)} &= W^{(1)}\mathbf{a}^{(0)} + \mathbf{b}^{(1)}, \quad \text{where } W^{(1)} \in \mathbb{R}^{128 \times 260}, \mathbf{b}^{(1)} \in \mathbb{R}^{128} \\
\mathbf{a}^{(1)} &= \text{ReLU}(\mathbf{z}^{(1)}), \quad \mathbf{a}^{(1)} \in \mathbb{R}^{128} \\
% Layer 2
\mathbf{z}^{(2)} &= W^{(2)}\mathbf{a}^{(1)} + \mathbf{b}^{(2)}, \quad \text{where } W^{(2)} \in \mathbb{R}^{64 \times 128}, \mathbf{b}^{(2)} \in \mathbb{R}^{64} \\
\mathbf{a}^{(2)} &= \text{ReLU}(\mathbf{z}^{(2)}), \quad \mathbf{a}^{(2)} \in \mathbb{R}^{64} \\
% Layer 3
\mathbf{z}^{(3)} &= W^{(3)}\mathbf{a}^{(2)} + \mathbf{b}^{(3)}, \quad \text{where } W^{(3)} \in \mathbb{R}^{16 \times 64}, \mathbf{b}^{(3)} \in \mathbb{R}^{16} \\
\mathbf{a}^{(3)} &= \text{ReLU}(\mathbf{z}^{(3)}), \quad \mathbf{a}^{(3)} \in \mathbb{R}^{16} \\
% Layer 4 (Output)
\mathbf{z}^{(4)} &= W^{(4)}\mathbf{a}^{(3)} + b^{(4)}, \quad \text{where } W^{(4)} \in \mathbb{R}^{1 \times 16}, \mathbf{b}^{(4)} \in \mathbb{R} \\
\hat{y} &= \mathbf{z}^{(4)}, \quad \hat{y} \in \mathbb{R} \\
% Loss
\mathcal{L} &= (\hat{y} - y_{\text{target}})^2
\end{align}
$$