In [89]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import helpers
from sklearn.model_selection import train_test_split

# Notebook display and plotting defaults.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100
plt.style.use('ggplot')

# Load the training and test CSVs.
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

# Separate predictors from the target, then hold out 20% of the labelled rows
# for validation (fixed random_state so the split is repeatable).
target_col = "SalePrice"
features = train_df.drop(columns=[target_col])
target = train_df[target_col]
X_train, X_test, Y_train, Y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

# Missing values were cross-referenced against the ones the data description says
# to expect (checks done in EDA_Analysis); 'Electrical' has one unexpected NaN.
#
# Impute it with the most frequent category, learned from the training split only
# so nothing leaks from the validation/test rows.
# Series.mode() returns a Series (index 0..k-1). Passing that Series to fillna()
# aligns on index labels and would only fill a NaN sitting at label 0, so the
# scalar is extracted with .iloc[0] before filling.
mode_electrical = X_train['Electrical'].mode().iloc[0]
X_train['Electrical'] = X_train['Electrical'].fillna(mode_electrical)
X_test['Electrical'] = X_test['Electrical'].fillna(mode_electrical)
test_df['Electrical'] = test_df['Electrical'].fillna(mode_electrical)

# The remaining missing-data imputation lives in helpers.py; run the same
# init_fill_na step over the training, validation and test frames.
X_train, X_test, test_df = [
    helpers.init_fill_na(frame) for frame in (X_train, X_test, test_df)
]

In [90]:
# Feature engineering:

# Square-root transform of the lot area, added to every frame.
X_train["SqrtLotArea"] = np.sqrt(X_train["LotArea"])
X_test["SqrtLotArea"] = np.sqrt(X_test["LotArea"])
test_df["SqrtLotArea"] = np.sqrt(test_df["LotArea"])

try:
    # Only learning medians based on X_train as to not leak data to X_test / test_df.
    indexed_scales_dict, global_val = helpers.learn_scaling_factors(X_train)
    # Apply the learned rules to each frame. The literal 13 is forwarded to the
    # helper unchanged; its meaning is defined in helpers.py (TODO confirm).
    X_train["LotFrontage"] = helpers.fill_na_lotfrontage(X_train, indexed_scales_dict, 13, global_val)
    X_test["LotFrontage"] = helpers.fill_na_lotfrontage(X_test, indexed_scales_dict, 13, global_val)
    test_df["LotFrontage"] = helpers.fill_na_lotfrontage(test_df, indexed_scales_dict, 13, global_val)
except ValueError as e:
    print(f"Error learning rules: {e}")
    # Do not continue: ScaleFactor below would be computed from unimputed
    # LotFrontage values and carry NaNs into the model.
    raise

# LotArea is made redundant by SqrtLotArea. Drop it from the frames that are
# actually modelled (the raw train_df is left untouched; X_train/X_test are
# separate objects, so deleting the column from train_df never affected them).
# Done after the LotFrontage imputation so all three frames are imputed with
# the same columns present.
X_train = X_train.drop(columns=["LotArea"])
X_test = X_test.drop(columns=["LotArea"])
test_df = test_df.drop(columns=["LotArea"])

# Ratio of frontage to the square root of the lot area.
X_train["ScaleFactor"] = X_train["LotFrontage"] / X_train["SqrtLotArea"]
X_test["ScaleFactor"] = X_test["LotFrontage"] / X_test["SqrtLotArea"]
test_df["ScaleFactor"] = test_df["LotFrontage"] / test_df["SqrtLotArea"]

Calculated Global Median Ratio: 0.7235 (from 951 samples)
Calculating for group level: 3way (['MSZoning', 'BldgType', 'LotShape'])
 -> Found 39 groups for 3way
Calculating for group level: 2way_ZS (['MSZoning', 'LotShape'])
 -> Found 16 groups for 2way_ZS
Calculating for group level: 2way_ZB (['MSZoning', 'BldgType'])
 -> Found 19 groups for 2way_ZB
Calculating for group level: 2way_BS (['BldgType', 'LotShape'])
 -> Found 14 groups for 2way_BS
Calculating for group level: 1way_Z (['MSZoning'])
 -> Found 5 groups for 1way_Z
Calculating for group level: 1way_B (['BldgType'])
 -> Found 5 groups for 1way_B
Calculating for group level: 1way_S (['LotShape'])
 -> Found 4 groups for 1way_S
Finished learning rules.


In [91]:
from sklearn.pipeline import Pipeline

# Numeric feature list for the preprocessor: the helper's defaults plus the two
# engineered columns, with the raw LotArea entry taken out.
num_cols = [*helpers.get_numeric_cols(), "SqrtLotArea", "ScaleFactor"]
num_cols.remove("LotArea")

# Build the preprocessing step first, then wrap it as the pipeline's only stage.
preprocessor = helpers.generate_preprocessor(
    drop='first',
    sparse_output=True,
    ordinal_cats_ordered=helpers.get_ordinal_cats_ordered(),
    categorical_cols_ordinal=helpers.get_categorical_cols_ordinal(),
    numerical_cols=num_cols,
    categorical_cols_nominal=helpers.get_categorical_cols_nominal(),
)
model_pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

In [92]:
from scipy.sparse import issparse
from torch.utils.data import DataLoader, TensorDataset
# Deep learning model:
import torch
from torch import nn

# Fit the preprocessing on the training split only, then apply it to validation.
X_train = model_pipeline.fit_transform(X_train)
X_test = model_pipeline.transform(X_test)

# The preprocessor is configured with sparse_output=True, so the pipeline may
# hand back a SciPy sparse matrix, which torch.tensor() does not accept.
# Densify when that happens; dense output passes through unchanged.
if issparse(X_train):
    X_train = X_train.toarray()
if issparse(X_test):
    X_test = X_test.toarray()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# float64 acceptable for EDA, float32 preferred for training.
X_train = torch.tensor(X_train, device=device, dtype=torch.float32)
X_test = torch.tensor(X_test, device=device, dtype=torch.float32)
Y_train = torch.tensor(Y_train.values, device=device, dtype=torch.float32)
Y_test = torch.tensor(Y_test.values, device=device, dtype=torch.float32)

# Seed torch's global RNG before any weights are created so the layer
# initialisation (and anything else drawing from the default generator, such as
# shuffled batching) is repeatable across kernel restarts. 42 matches the
# random_state used for the train/validation split.
torch.manual_seed(42)

# Fully connected regressor: input width taken from the transformed feature
# matrix, three hidden ReLU layers (128 -> 64 -> 16), one output value per row.
model = nn.Sequential(
    nn.Linear(X_train.shape[1], 128),
    nn.ReLU(),
    nn.Linear(128, 64),
    nn.ReLU(),
    nn.Linear(64, 16),
    nn.ReLU(),
    nn.Linear(16, 1)
)

# Pair each feature tensor with its target tensor.
try:
    train_dataset = TensorDataset(X_train, Y_train)
    val_dataset = TensorDataset(X_test, Y_test)
    print(f"Train dataset length: {len(train_dataset)}")
    print(f"Validation dataset length: {len(val_dataset)}")
except Exception as e:
    # Likely length mismatch between X and y tensors if error here
    print(f"Error creating TensorDataset: {e}")
    # Re-raise: the loaders below need both datasets, so carrying on would only
    # fail later with a less helpful NameError.
    raise

batch_size = 64 # Hyperparameter: How many samples per batch
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True) # Shuffle training data
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False) # No need to shuffle validation

loss_func = nn.MSELoss()

# Put the model on the target device before handing its parameters to the
# optimizer, so the optimizer is built against the parameters as they will be
# used during training.
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 500 # Hyperparameter: How many times to iterate over the dataset

print("\nStarting Training...")
for epoch in range(num_epochs):
    # ---- Optimisation pass over the training batches ----
    model.train()
    running_train_loss = 0.0
    for batch_x, batch_y in train_loader:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)
        optimizer.zero_grad()
        # Targets are 1-D; add a trailing axis so they line up with the
        # (batch, 1) predictions coming out of the final Linear layer.
        batch_loss = loss_func(model(batch_x), batch_y.unsqueeze(1))
        batch_loss.backward()
        optimizer.step()
        # Weight the batch loss by the batch size so the epoch figure is a
        # per-sample average even when the last batch is smaller.
        running_train_loss += batch_loss.item() * batch_x.size(0)

    epoch_train_loss = running_train_loss / len(train_loader.dataset)

    # ---- Evaluation pass over the held-out batches (no gradient tracking) ----
    model.eval()
    running_val_loss = 0.0
    with torch.no_grad():
        for batch_x, batch_y in val_loader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            batch_loss = loss_func(model(batch_x), batch_y.unsqueeze(1))
            running_val_loss += batch_loss.item() * batch_x.size(0)

    epoch_val_loss = running_val_loss / len(val_loader.dataset)

    # One progress line per epoch.
    print(f'Epoch {epoch+1}/{num_epochs} | Train Loss: {epoch_train_loss:.6f} | Val Loss: {epoch_val_loss:.6f}')

Train dataset length: 1168
Validation dataset length: 292

Starting Training...
Epoch 1/500 | Train Loss: 38774767167.123291 | Val Loss: 39358063882.520546
Epoch 2/500 | Train Loss: 38044874976.438354 | Val Loss: 37925816965.260277




Epoch 3/500 | Train Loss: 34874688890.739723 | Val Loss: 32776287666.849316
Epoch 4/500 | Train Loss: 26920410911.561646 | Val Loss: 21604932088.986301
Epoch 5/500 | Train Loss: 19673199349.479450 | Val Loss: 14392968781.150684
Epoch 6/500 | Train Loss: 18398497749.917809 | Val Loss: 13103486821.698629
Epoch 7/500 | Train Loss: 17156744696.986301 | Val Loss: 15499385491.287672
Epoch 8/500 | Train Loss: 17304412174.027397 | Val Loss: 14522042830.904110
Epoch 9/500 | Train Loss: 16806715588.383562 | Val Loss: 14001244917.479452
Epoch 10/500 | Train Loss: 16578250611.726027 | Val Loss: 13593247758.027397
Epoch 11/500 | Train Loss: 16397799592.328768 | Val Loss: 13172866833.534246
Epoch 12/500 | Train Loss: 16431502434.191780 | Val Loss: 12390083836.493151
Epoch 13/500 | Train Loss: 16034704888.986301 | Val Loss: 12704850032.219177
Epoch 14/500 | Train Loss: 15867553427.287672 | Val Loss: 12943505253.698629
Epoch 15/500 | Train Loss: 15733159487.123287 | Val Loss: 12883215710.684931
Epoch 