In [7]:
from preprocess_data import DataPreprocessor
# Imports
import pandas as pd
import numpy as np
import helpers
from sklearn.model_selection import train_test_split


data_preprocessor = DataPreprocessor()

X_train, X_test, Y_train, Y_test = data_preprocessor.preprocess_data(lot_frontage_threshold=13)

#to_remove = ['ohe__MSSubClass_40', 'ohe__Utilities_NoSeWa', 'ohe__LotConfig_FR3', 'ohe__Neighborhood_Blueste', 'ohe__Condition1_RRNe', 'ohe__Condition1_RRNn', 'ohe__Condition2_Feedr', 'ohe__Condition2_PosA', 'ohe__Condition2_PosN', 'ohe__Condition2_RRAe', 'ohe__Condition2_RRAn', 'ohe__Condition2_RRNn', 'ohe__RoofStyle_Mansard', 'ohe__RoofStyle_Shed', 'ohe__RoofMatl_Membran', 'ohe__RoofMatl_Metal', 'ohe__RoofMatl_Roll', 'ohe__RoofMatl_WdShake', 'ohe__RoofMatl_WdShngl', 'ohe__Exterior1st_AsphShn', 'ohe__Exterior1st_BrkComm', 'ohe__Exterior1st_CBlock', 'ohe__Exterior1st_ImStucc', 'ohe__Exterior1st_Stone', 'ohe__Exterior2nd_AsphShn', 'ohe__Exterior2nd_CBlock', 'ohe__Exterior2nd_Other', 'ohe__Exterior2nd_Stone', 'ohe__Foundation_Stone', 'ohe__Foundation_Wood', 'ohe__Heating_OthW', 'ohe__Heating_Wall', 'ohe__Electrical_FuseP', 'ohe__Electrical_Mix', 'ohe__MiscFeature_Othr', 'ohe__MiscFeature_TenC', 'ohe__SaleType_CWD', 'ohe__SaleType_Con', 'ohe__SaleType_ConLI', 'ohe__SaleType_ConLw', 'ohe__SaleType_Oth', 'ohe__SaleCondition_AdjLand']

#X_train = X_train.drop(to_remove, axis=1)
#X_test = X_test.drop(to_remove, axis=1)

# Test data to be processed separately, here.
#test_df['Electrical'] = test_df['Electrical'].fillna(mode_electrical)
#test_df = helpers.init_fill_na(test_df)

Calculated Global Median Ratio: 0.7235 (from 951 samples)
Calculating for group level: 3way (['MSZoning', 'BldgType', 'LotShape'])
 -> Found 39 groups for 3way
Calculating for group level: 2way_ZS (['MSZoning', 'LotShape'])
 -> Found 16 groups for 2way_ZS
Calculating for group level: 2way_ZB (['MSZoning', 'BldgType'])
 -> Found 19 groups for 2way_ZB
Calculating for group level: 2way_BS (['BldgType', 'LotShape'])
 -> Found 14 groups for 2way_BS
Calculating for group level: 1way_Z (['MSZoning'])
 -> Found 5 groups for 1way_Z
Calculating for group level: 1way_B (['BldgType'])
 -> Found 5 groups for 1way_B
Calculating for group level: 1way_S (['LotShape'])
 -> Found 4 groups for 1way_S


In [8]:
from sklearn.pipeline import Pipeline

# Update the pipeline
model_pipeline = Pipeline(steps=[
    ('preprocessor', helpers.generate_preprocessor(
        ordinal_cats_ordered=helpers.get_ordinal_cats_ordered(),
        categorical_cols_ordinal=helpers.get_categorical_cols_ordinal(),
        numerical_cols=helpers.get_numeric_cols()))
])

In [9]:
from scipy.sparse import issparse
from torch.utils.data import DataLoader, TensorDataset
# Deep learning model:
import torch
from torch import nn

X_train = model_pipeline.fit_transform(X_train)
X_test = model_pipeline.transform(X_test)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# float64 acceptable for EDA, float32 preferred for training.
X_train = torch.tensor(X_train, device=device, dtype=torch.float32)
X_test = torch.tensor(X_test, device=device, dtype=torch.float32)
Y_train = torch.tensor(Y_train.values, device=device, dtype=torch.float32)
Y_test = torch.tensor(Y_test.values, device=device, dtype=torch.float32)

model = nn.Sequential(
    nn.Linear(X_train.shape[1], 128),
    nn.ReLU(),
    nn.Linear(128, 64),
    nn.ReLU(),
    nn.Linear(64, 16),
    nn.ReLU(),
    nn.Linear(16, 1)
)

try:
    train_dataset = TensorDataset(X_train, Y_train)
    val_dataset = TensorDataset(X_test, Y_test)
    print(f"Train dataset length: {len(train_dataset)}")
    print(f"Validation dataset length: {len(val_dataset)}")
except Exception as e:
    print(f"Error creating TensorDataset: {e}")
    # Likely length mismatch between X and y tensors if error here

batch_size = 64 # Hyperparameter: How many samples per batch
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True) # Shuffle training data
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False) # No need to shuffle validation

loss_func = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

model.to(device)

num_epochs = 500 # Hyperparameter: How many times to iterate over the dataset

print("\nStarting Training...")
for epoch in range(num_epochs):
    # --- Training Phase ---
    model.train() # Set model to training mode (enables dropout, batchnorm updates)
    running_train_loss = 0.0
    for batch_idx, (features, targets) in enumerate(train_loader):
        # Move batch data to the target device (GPU or CPU)
        features, targets = features.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(features)
        # Reshape the targets tensor to match the outputs shape ([batch_size, 1])
        targets_reshaped = targets.unsqueeze(1)
        loss = loss_func(outputs, targets_reshaped)
        loss.backward()
        optimizer.step()

        running_train_loss += loss.item() * features.size(0)

    epoch_train_loss = running_train_loss / len(train_loader.dataset)

    # --- Validation Phase ---
    model.eval() # Set model to evaluation mode (disables dropout, batchnorm updates)
    running_val_loss = 0.0
    with torch.no_grad(): # No need to calculate gradients during validation
        for features, targets in val_loader:
            features, targets = features.to(device), targets.to(device)
            outputs = model(features)
            targets_reshaped = targets.unsqueeze(1)
            loss = loss_func(outputs, targets_reshaped)
            running_val_loss += loss.item() * features.size(0)

    epoch_val_loss = running_val_loss / len(val_loader.dataset)

    # Print progress (e.g., every epoch or every few epochs)
    print(f'Epoch {epoch+1}/{num_epochs} | Train Loss: {epoch_train_loss:.6f} | Val Loss: {epoch_val_loss:.6f}')

Train dataset length: 1168
Validation dataset length: 292

Starting Training...
Epoch 1/500 | Train Loss: 38885370922.082191 | Val Loss: 39653252951.671234
Epoch 2/500 | Train Loss: 38882968898.630135 | Val Loss: 39647411775.123291
Epoch 3/500 | Train Loss: 38869442616.109589 | Val Loss: 39619934404.383560
Epoch 4/500 | Train Loss: 38817453434.739723 | Val Loss: 39530671454.684929
Epoch 5/500 | Train Loss: 38672402572.273972 | Val Loss: 39302922520.547943
Epoch 6/500 | Train Loss: 38331230797.150688 | Val Loss: 38816165270.794518
Epoch 7/500 | Train Loss: 37657501976.547943 | Val Loss: 37899009318.575340
Epoch 8/500 | Train Loss: 36456105282.630135 | Val Loss: 36357935104.000000
Epoch 9/500 | Train Loss: 34500718143.123291 | Val Loss: 33958050381.150684
Epoch 10/500 | Train Loss: 31601833703.452053 | Val Loss: 30546922650.301369
Epoch 11/500 | Train Loss: 27649055309.150684 | Val Loss: 26122463680.876713
Epoch 12/500 | Train Loss: 22735532424.767124 | Val Loss: 20837007219.726028
Epoch