In [5]:


from data_management.preprocess_data import DataPreprocessor
# Imports
import numpy as np
from utils import helpers

# Instantiate the project's preprocessing helper (imported from data_management.preprocess_data).
data_preprocessor = DataPreprocessor()

# Unpack the four objects returned by preprocess_data(); later cells treat them as the
# train/test feature frames and train/test targets.
# NOTE(review): the meaning of lot_frontage_threshold=13 is not visible in this notebook —
# confirm against DataPreprocessor.preprocess_data before changing it.
X_train, X_test, Y_train, Y_test = data_preprocessor.preprocess_data(lot_frontage_threshold=13)


Calculated Global Median Ratio: 0.7235 (from 951 samples)
Calculating for group level: 3way (['MSZoning', 'BldgType', 'LotShape'])
 -> Found 39 groups for 3way
Calculating for group level: 2way_ZS (['MSZoning', 'LotShape'])
 -> Found 16 groups for 2way_ZS
Calculating for group level: 2way_ZB (['MSZoning', 'BldgType'])
 -> Found 19 groups for 2way_ZB
Calculating for group level: 2way_BS (['BldgType', 'LotShape'])
 -> Found 14 groups for 2way_BS
Calculating for group level: 1way_Z (['MSZoning'])
 -> Found 5 groups for 1way_Z
Calculating for group level: 1way_B (['BldgType'])
 -> Found 5 groups for 1way_B
Calculating for group level: 1way_S (['LotShape'])
 -> Found 4 groups for 1way_S


In [6]:
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Numeric columns: apply log1p first, then rescale with MinMaxScaler.
log_then_minmax = Pipeline(
    steps=[
        ('log_transform', FunctionTransformer(np.log1p)),
        ('min_max_scaler', MinMaxScaler()),
    ]
)

# Ordered categoricals: integer-encode using the category order supplied by
# helpers.get_ordinal_cats_ordered(), then rescale the integer codes with MinMaxScaler.
# Categories not seen during fit are encoded as -1.
ordinal_then_minmax_pipeline = Pipeline(
    steps=[
        (
            'ordinal_encode',
            OrdinalEncoder(
                categories=helpers.get_ordinal_cats_ordered(),
                handle_unknown='use_encoded_value',
                unknown_value=-1,
            ),
        ),
        ('minmax_scale_ordinal', MinMaxScaler()),
    ]
)

# Route each column group to its sub-pipeline. Columns that are not listed in any
# group are passed through unchanged (remainder='passthrough').
model_pipeline = ColumnTransformer(
    transformers=[
        ('log_num', log_then_minmax, helpers.get_log_minmax_cols()),
        ('ord', ordinal_then_minmax_pipeline, helpers.get_categorical_cols_ordinal()),
        ('num', MinMaxScaler(), helpers.get_minmax_cols()),
    ],
    remainder='passthrough',
    sparse_threshold=1,
)

In [7]:
from torch.utils.data import DataLoader, TensorDataset
# Deep learning model:
import torch
from torch import nn

# Request DataFrame output explicitly. The feature-pruning step further down calls
# DataFrame.drop() with prefixed column names (e.g. "log_num__MasVnrArea"), which
# fails on the plain ndarray that ColumnTransformer.transform() returns by default.
# Setting it here removes the dependency on any out-of-view global scikit-learn
# configuration (requires scikit-learn >= 1.2).
model_pipeline.set_output(transform="pandas")

# Fit on the training split only, then apply the fitted transform to both splits.
model_pipeline.fit(X_train, Y_train)
X_train = model_pipeline.transform(X_train)
X_test = model_pipeline.transform(X_test)

# Feature columns to remove after the sensitivity analysis.
# Names use the ColumnTransformer prefixes ("log_num__", "ord__", "num__", "remainder__");
# the list is passed to DataFrame.drop() on both splits right after this definition.
# Several names are listed more than once, and some of the inline "Mean" annotations
# were carried over from an earlier run (see the "check current" notes), so treat the
# per-line values as indicative only.
to_remove_1 = [
    # Starting from the bottom of your list and going up to importance 0.00010
    "remainder__ohe__Exterior1st_VinylSd",   # Mean: -0.00004
    "remainder__ohe__Foundation_Slab",       # Mean: -0.00001
    "remainder__ohe__LandContour_Low",       # Mean: -0.00001
    "ord__BsmtCond",                         # Mean: -0.00001
    "remainder__ohe__MSSubClass_90",         # Mean: -0.00001
    "ord__GarageFinish",                     # Mean: -0.00001
    "remainder__ohe__BldgType_Duplex",       # Mean: -0.00000
    "remainder__ohe__RoofStyle_Hip",         # Mean: -0.00000
    "ord__ExterQual",                        # Mean: -0.00000
    "log_num__MasVnrArea",                   # Mean: -0.00000
    "remainder__ohe__MSSubClass_75",         # Mean: -0.00000
    "remainder__ohe__Neighborhood_OldTown",  # Mean: -0.00000
    "remainder__ohe__BedroomAbvGr_4",        # Mean:  0.00000 (from previous list, check current)
    "remainder__ohe__Neighborhood_IDOTRR",   # Mean:  0.00000
    "remainder__ohe__SaleType_CWD",          # Mean:  0.00000
    "remainder__ohe__Exterior1st_MetalSd",   # Mean:  0.00001 (from previous list, check current)
    "remainder__ohe__FullBath_2",            # Mean:  0.00001 (from previous list, check current)
    "remainder__ohe__RoofStyle_Gable",       # Mean:  0.00001 (from previous list, check current)
    "log_num__MiscVal",                      # Mean:  0.00001
    "remainder__ohe__RoofStyle_Gambrel",     # Mean:  0.00001
    "remainder__ohe__Exterior2nd_Brk Cmn",   # Mean:  0.00001
    "remainder__ohe__HalfBath_2",            # Mean:  0.00001
    "remainder__ohe__Neighborhood_Gilbert",  # Mean:  0.00001
    "ord__ExterCond",                        # Mean:  0.00001
    "remainder__ohe__SaleType_ConLI",        # Mean:  0.00001
    "remainder__ohe__Exterior1st_Stucco",    # Mean:  0.00001
    "remainder__ohe__BsmtFinType2_GLQ",      # Mean:  0.00001
    "remainder__ohe__Exterior2nd_Wd Shng",   # Mean:  0.00001
    "remainder__ohe__BsmtFinType2_LwQ",      # Mean:  0.00001
    "remainder__ohe__BsmtHalfBath_1",        # Mean:  0.00001
    "num__L1_I_PR",                          # Mean:  0.00001
    "num__L1_I_HPI_MA3",                     # Mean:  0.00001
    "remainder__ohe__Electrical_FuseP",      # Mean:  0.00001
    "remainder__ohe__Foundation_Stone",      # Mean:  0.00001
    "remainder__ohe__MSSubClass_40",         # Mean:  0.00001
    "remainder__ohe__Exterior2nd_Stone",     # Mean:  0.00001
    "remainder__ohe__BsmtFullBath_3",        # Mean:  0.00001 (from previous list, check current)
    "remainder__ohe__SaleType_Con",          # Mean:  0.00002
    "remainder__ohe__Exterior1st_CemntBd",   # Mean:  0.00002
    "remainder__ohe__BsmtFinType2_Rec",      # Mean:  0.00002
    "remainder__ohe__Foundation_Wood",       # Mean:  0.00002
    "remainder__ohe__Exterior2nd_BrkFace",   # Mean:  0.00002
    "remainder__ohe__BsmtExposure_Mn",       # Mean:  0.00002
    "remainder__ohe__BldgType_Twnhs",          # Mean:  0.00002
    "remainder__ohe__Neighborhood_SawyerW",  # Mean:  0.00002
    "remainder__ohe__MiscFeature_Othr",      # Mean:  0.00002
    "remainder__ohe__BedroomAbvGr_6",        # Mean:  0.00002
    "remainder__ohe__Exterior2nd_Stucco",    # Mean:  0.00002
    "remainder__ohe__Neighborhood_NPkVill",  # Mean:  0.00002
    "log_num__EnclosedPorch",                # Mean:  0.00003
    "remainder__ohe__MSSubClass_85",         # Mean:  0.00003
    "remainder__ohe__HouseStyle_2.5Fin",     # Mean:  0.00003
    "remainder__ohe__Neighborhood_Timber",   # Mean:  0.00003
    "remainder__ohe__BsmtFinType2_GLQ",      # Mean:  0.00003 (Appears again, likely meant a different feature or copy-paste from previous list)
    "remainder__ohe__Condition1_RRNn",       # Mean:  0.00003
    "remainder__ohe__BldgType_TwnhsE",       # Mean:  0.00003
    "remainder__ohe__BsmtFinType1_GLQ",      # Mean:  0.00003
    "remainder__ohe__Condition1_RRAe",       # Mean:  0.00003
    "remainder__ohe__Neighborhood_BrDale",   # Mean:  0.00003
    "remainder__ohe__BsmtHalfBath_1",        # Mean:  0.00003 (Appears again)
    "remainder__ohe__BsmtHalfBath_2",        # Mean:  0.00003
    "remainder__ohe__Street_Pave",           # Mean:  0.00004
    "remainder__ohe__Exterior1st_WdShing",   # Mean:  0.00004
    "remainder__ohe__BedroomAbvGr_8",        # Mean:  0.00004
    "remainder__ohe__Alley_None",            # Mean:  0.00004
    "num__L1_I_PR",                          # Mean:  0.00004
    "ord__Fence",                            # Mean:  0.00004
    "ord__FireplaceQu",                      # Mean:  0.00004
    "remainder__ohe__Electrical_SBrkr",      # Mean:  0.00005
    "remainder__ohe__BsmtExposure_Mn",       # Mean:  0.00005 (Appears again)
    "remainder__ohe__Exterior2nd_Stucco",    # Mean:  0.00005 (Appears again)
    "remainder__ohe__BsmtFinType2_Rec",      # Mean:  0.00005 (Appears again)
    "remainder__ohe__Condition1_RRAe",       # Mean:  0.00005 (Appears again)
    "remainder__ohe__SaleCondition_AdjLand", # Mean:  0.00005
    "remainder__ohe__Exterior2nd_Stone",     # Mean:  0.00005
    "num__L1_I_HPI",                         # Mean:  0.00005
    "remainder__ohe__KitchenAbvGr_3",        # Mean:  0.00005
    "remainder__ohe__Exterior1st_Plywood",   # Mean:  0.00005
    "remainder__ohe__Neighborhood_NWAmes",   # Mean:  0.00005
    "remainder__ohe__Electrical_FuseF",      # Mean:  0.00005
    "remainder__ohe__Exterior1st_MetalSd",   # Mean:  0.00005
    "remainder__MoSold_sin",                 # Mean:  0.00006
    "remainder__ohe__BsmtFinType1_BLQ",      # Mean:  0.00006 (Appears again)
    "ord__HeatingQC",                        # Mean:  0.00006
    "remainder__ohe__BsmtFinType1_Rec",      # Mean:  0.00006 (Appears again)
    "remainder__ohe__Exterior1st_Stucco",    # Mean:  0.00006 (Appears again)
    "num__L1_I_HPI_MA3",                     # Mean:  0.00006 (from previous list, check current. Your new list has it at 0.00020)
    "remainder__ohe__Exterior2nd_AsphShn",   # Mean:  0.00006
    "remainder__ohe__Exterior1st_CemntBd",   # Mean:  0.00006
    "log_num__BsmtUnfSF",                    # Mean:  0.00006
    "remainder__ohe__BedroomAbvGr_5",        # Mean:  0.00007
    "num__YrSold",                           # Mean:  0.00007
    "remainder__ohe__Neighborhood_CollgCr",  # Mean:  0.00007
    "remainder__ohe__LotConfig_Inside",      # Mean:  0.00007
    "remainder__ohe__BldgType_TwnhsE",       # Mean:  0.00007
    "remainder__HasPool",                    # Mean:  0.00007
    "remainder__ohe__Heating_Grav",          # Mean:  0.00007
    "remainder__ohe__SaleCondition_Family",  # Mean:  0.00007
    "num__L1_I_PR_MA3",                      # Mean:  0.00007
    "remainder__ohe__MSSubClass_70",         # Mean:  0.00008
    "remainder__ohe__Exterior2nd_Wd Shng",   # Mean:  0.00008
    "num__L1_I_UR_MA3",                      # Mean:  0.00008
    "remainder__ohe__BldgType_Duplex",       # Mean:  0.00008
    "remainder__ohe__MSSubClass_75",         # Mean:  0.00008
    "remainder__ohe__Exterior1st_HdBoard",   # Mean:  0.00009
    "remainder__ohe__SaleType_Oth",          # Mean:  0.00009
    "remainder__ohe__KitchenAbvGr_3",        # Mean:  0.00009 (Appears again)
    "log_num__LowQualFinSF",                 # Mean:  0.00009
    "remainder__ohe__Neighborhood_Sawyer",   # Mean:  0.00009
    "num__L1_I_PR_MA6",                      # Mean:  0.00009
    "remainder__ohe__Neighborhood_Mitchel",  # Mean:  0.00009 (Appears again)
    "ord__HeatingQC",                        # Mean:  0.00009 (Appears again)
    "remainder__ohe__SaleType_WD",           # Mean:  0.00010
    "remainder__ohe__Exterior1st_Plywood",   # Mean:  0.00010 (Appears again)
    "remainder__ohe__MSSubClass_80",         # Mean:  0.00010
    "remainder__ohe__Exterior2nd_VinylSd",   # Mean:  0.00010
    "remainder__ohe__Neighborhood_Veenker",  # Mean:  0.00010
    "remainder__ohe__Exterior2nd_CmentBd",   # Mean:  0.00010
    "remainder__ohe__BsmtFinType1_LwQ",
    "remainder__ohe__SaleType_ConLw",
    "ord__LotShape",
    "ord__LandSlope",
    "remainder__ohe__MSSubClass_180",
    "remainder__ohe__Heating_OthW",
    "remainder__ohe__MSSubClass_120",
    "remainder__ohe__Exterior2nd_MetalSd",
    "remainder__ohe__Neighborhood_NAmes",
    "remainder__ohe__BedroomAbvGr_2",
    "remainder__ohe__BedroomAbvGr_1",
    "remainder__ohe__Exterior2nd_ImStucc",
    "remainder__ohe__BsmtFinType2_BLQ",
    "remainder__ohe__RoofStyle_Mansard",
    "ord__GarageCond",
    "remainder__ohe__LotConfig_FR3"
]


# Prune the low-importance columns from both splits.
X_train = X_train.drop(columns=to_remove_1)
X_test = X_test.drop(columns=to_remove_1)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Convert every split to a float32 tensor on the selected device
# (float64 is acceptable for EDA, float32 is preferred for training).
X_train, X_test, Y_train, Y_test = (
    torch.tensor(split.values, device=device, dtype=torch.float32)
    for split in (X_train, X_test, Y_train, Y_test)
)


# Pair each feature tensor with its target tensor. A failure here most likely means
# the X and y tensors have different lengths; report it and re-raise so the cell
# stops at the real cause instead of failing later with a NameError on train_dataset.
try:
    train_dataset = TensorDataset(X_train, Y_train)
    val_dataset = TensorDataset(X_test, Y_test)
except Exception as e:
    print(f"Error creating TensorDataset: {e}")
    raise

print(f"Train dataset length: {len(train_dataset)}")
print(f"Validation dataset length: {len(val_dataset)}")

# Mean-squared-error loss shared by the tuning and training loops below.
loss_func = nn.MSELoss()

# A batch size of 48 was tried earlier; 64 is the value currently in use.
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)  # reshuffle every epoch
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)     # order is irrelevant for validation

Train dataset length: 1168
Validation dataset length: 292


import optuna
# Upper bound on training epochs per Optuna trial; objective() reads this
# module-level value in its epoch loop.
num_epochs = 400 # Hyperparameter: How many times to iterate over the dataset

def get_activation_instance(activation_choice_name):
    """Return a newly constructed activation module for the given name.

    Recognised names are 'sigmoid', 'tanh', 'leaky_relu', 'elu' and 'silu'.
    Any other value (including 'relu') yields ``nn.ReLU()``.
    """
    named_activations = (
        ('sigmoid', nn.Sigmoid),
        ('tanh', nn.Tanh),
        ('leaky_relu', nn.LeakyReLU),
        ('elu', nn.ELU),
        ('silu', nn.SiLU),  # also known as Swish
    )
    for known_name, activation_cls in named_activations:
        if activation_choice_name == known_name:
            return activation_cls()
    # Anything unrecognised falls back to ReLU.
    return nn.ReLU()

def objective(trial):
    """Optuna objective: train one sampled network and return its best validation loss.

    Samples Adam hyperparameters, the widths of three hidden layers and one
    activation per hidden layer from ``trial``, builds an ``nn.Sequential`` model
    with a single output unit, and trains it for up to ``num_epochs`` epochs.

    Reads these module-level names: ``X_train`` (for the input width),
    ``train_loader``, ``val_loader``, ``loss_func``, ``device`` and ``num_epochs``.

    Args:
        trial: the Optuna trial used for ``suggest_*``, ``report`` and ``should_prune``.

    Returns:
        The lowest per-epoch validation loss observed during the trial, or
        ``float('inf')`` as soon as a NaN/Inf loss is seen in training or validation.

    Raises:
        optuna.exceptions.TrialPruned: when ``trial.should_prune()`` returns True.
    """
    # --- Adam Optimizer Hyperparameters ---
    # Learning rate is sampled on a log scale.
    adam_lr = trial.suggest_float("adam_lr", 0.001, 0.015, log=True) 
    
    # Betas are sampled on a linear scale.
    adam_beta1 = trial.suggest_float("adam_beta1", 0.8, 0.95) 
    adam_beta2 = trial.suggest_float("adam_beta2", 0.93, 0.999) # Slightly wider upper for beta2
    
    # Epsilon is sampled on a log scale.
    adam_epsilon = trial.suggest_float("adam_epsilon", 1e-9, 1e-7, log=True)

    # --- Architecture Hyperparameters ---
    # Fixed depth: Input -> H1 -> H2 -> H3 -> Output
    
    input_features = X_train.shape[1] # X_train is read from module scope

    # Neurons for Hidden Layer 1 (H1)
    neurons_h1_choices = [32, 48, 64, 80, 96, 128, 160, 192, 240] 
    # Keep a width if it is at most 1.5x the input width, or if it is <= 64.
    # Because 32, 48 and 64 always pass the second test, the filtered list is never
    # empty, so the fallback just below is purely defensive.
    neurons_h1_choices_filtered = [n for n in neurons_h1_choices if n <= input_features * 1.5 or n <= 64] # Ensure some smaller options
    if not neurons_h1_choices_filtered: # Fallback if input_features is tiny
        neurons_h1_choices_filtered = [32, 48, 64]
    neurons_h1 = trial.suggest_categorical('neurons_h1', neurons_h1_choices_filtered)
    
    # Neurons for Hidden Layer 2 (H2): between max(16, 25% of H1) and H1 itself
    low_h2 = max(16, int(neurons_h1 * 0.25))
    high_h2 = neurons_h1
    if low_h2 > high_h2: low_h2 = high_h2 
    # Step is roughly a quarter of the range, never below 1
    step_h2 = max(1, (high_h2 - low_h2) // 4) if high_h2 > low_h2 else 1 
    if high_h2 == low_h2: 
        # Degenerate range: nothing to sample, so 'neurons_h2' is not registered on the trial
        neurons_h2 = high_h2
    else:
        # Defensive re-checks: keep the step >= 1 and fall back to 1 if the step
        # would exceed the range
        step_h2_final = max(1, step_h2) if high_h2 > low_h2 else 1
        if high_h2 < low_h2 + step_h2_final and high_h2 > low_h2 : # If range is too small for step
            step_h2_final = 1
        neurons_h2 = trial.suggest_int('neurons_h2', low=low_h2, high=high_h2, step=step_h2_final)


    # Neurons for Hidden Layer 3 (H3): between max(8, 25% of H2) and H2 itself,
    # using the same range/step logic as H2
    low_h3 = max(8, int(neurons_h2 * 0.25))
    high_h3 = neurons_h2
    if low_h3 > high_h3: low_h3 = high_h3
    step_h3 = max(1, (high_h3 - low_h3) // 4) if high_h3 > low_h3 else 1
    if high_h3 == low_h3:
        # Degenerate range: 'neurons_h3' is not registered on the trial
        neurons_h3 = high_h3
    else:
        step_h3_final = max(1, step_h3) if high_h3 > low_h3 else 1
        if high_h3 < low_h3 + step_h3_final and high_h3 > low_h3:
            step_h3_final = 1
        neurons_h3 = trial.suggest_int('neurons_h3', low=low_h3, high=high_h3, step=step_h3_final)


    # One independently sampled activation per hidden layer
    activation_choice_h1_name = trial.suggest_categorical('activation_h1', ['relu', 'leaky_relu', 'elu', 'silu', 'tanh'])
    activation_choice_h2_name = trial.suggest_categorical('activation_h2', ['relu', 'leaky_relu', 'elu', 'silu', 'tanh'])
    activation_choice_h3_name = trial.suggest_categorical('activation_h3', ['relu', 'leaky_relu', 'elu', 'silu', 'tanh'])

    # Build the model: three Linear+activation pairs followed by a single-unit linear output
    model_layers = []
    model_layers.extend([nn.Linear(input_features, neurons_h1), get_activation_instance(activation_choice_h1_name)])
    model_layers.extend([nn.Linear(neurons_h1, neurons_h2), get_activation_instance(activation_choice_h2_name)])
    model_layers.extend([nn.Linear(neurons_h2, neurons_h3), get_activation_instance(activation_choice_h3_name)])
    model_layers.append(nn.Linear(neurons_h3, 1)) 
    
    model = nn.Sequential(*model_layers)
    model.to(device) 
    
    _optimizer = torch.optim.Adam(
        model.parameters(),
        lr=adam_lr,
        betas=(adam_beta1, adam_beta2),
        eps=adam_epsilon
    )
    
    # In-trial early stopping state (separate from Optuna's pruner)
    best_val_loss_this_trial = float('inf') 
    epochs_without_improvement = 0
    patience_for_trial_early_stop = 75 # Stop after this many epochs without a new best validation loss

    for epoch in range(num_epochs): # num_epochs is read from module scope
        # --- Training phase ---
        model.train() 
        # NOTE: accumulated below but never read afterwards in this function
        running_train_loss = 0.0
        for batch_idx, (features, targets) in enumerate(train_loader):
            features, targets = features.to(device), targets.to(device)
            _optimizer.zero_grad()
            outputs = model(features)
            # Give 1-D targets a trailing dimension so they match the (batch, 1) model output
            targets_reshaped = targets.unsqueeze(1) if targets.ndim == 1 else targets
            loss = loss_func(outputs, targets_reshaped)
            
            # A diverged trial is abandoned immediately with the worst possible score
            if torch.isnan(loss) or torch.isinf(loss): 
                print(f"Trial {trial.number}, Epoch {epoch+1}: NaN or Inf loss detected in training. Returning high value.")
                return float('inf') 
                
            loss.backward()
            _optimizer.step()
            # Weight the batch loss by batch size
            running_train_loss += loss.item() * features.size(0)

        # --- Validation phase (no gradient tracking) ---
        model.eval() 
        running_val_loss = 0.0
        with torch.no_grad():
            for features, targets in val_loader:
                features, targets = features.to(device), targets.to(device)
                outputs = model(features)
                targets_reshaped = targets.unsqueeze(1) if targets.ndim == 1 else targets
                loss = loss_func(outputs, targets_reshaped)

                if torch.isnan(loss) or torch.isinf(loss): 
                    print(f"Trial {trial.number}, Epoch {epoch+1}: NaN or Inf loss detected in validation. Returning high value.")
                    return float('inf') 

                # Weight by batch size so the division below gives a per-sample average
                running_val_loss += loss.item() * features.size(0)
        
        current_epoch_val_loss = running_val_loss / len(val_loader.dataset)

        # Track the best validation loss and how long it has been since it improved
        if current_epoch_val_loss < best_val_loss_this_trial:
            best_val_loss_this_trial = current_epoch_val_loss
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1

        # Report this epoch's loss (step = zero-based epoch index) and let the study's pruner decide
        trial.report(current_epoch_val_loss, epoch) 
        if trial.should_prune():
            print(f"Trial {trial.number} pruned at epoch {epoch+1} by Optuna pruner.")
            raise optuna.exceptions.TrialPruned()
            
        # In-trial early stop: unlike pruning, the trial still completes and returns its best loss
        if epochs_without_improvement >= patience_for_trial_early_stop:
            print(f"Trial {trial.number} stopped early at epoch {epoch+1} (in-trial patience {patience_for_trial_early_stop}).")
            break 
            
    return best_val_loss_this_trial 

# Median-pruner settings:
#   n_startup_trials=5  -> the first 5 trials are never pruned
#   n_warmup_steps=150  -> within a trial, no pruning before step 150
#                          (steps are the epoch indices reported by objective)
#   interval_steps=10   -> after warm-up, pruning is evaluated every 10 steps
median_pruner = optuna.pruners.MedianPruner(
    n_startup_trials=5,
    n_warmup_steps=150,
    interval_steps=10,
)

# 1. Create a study that minimises the value returned by the objective.
study = optuna.create_study(direction='minimize', pruner=median_pruner)

# 2. Run the optimization: Optuna calls objective() once per trial, handing it
#    a fresh trial object each time.
study.optimize(objective, n_trials=500)

# 3. Summarise the study: trial counts by final state, then the winning trial.
pruned_trials = study.get_trials(states=[optuna.trial.TrialState.PRUNED])
complete_trials = study.get_trials(states=[optuna.trial.TrialState.COMPLETE])

print("Study statistics: ")
print("  Number of finished trials: ", len(study.trials))
print("  Number of pruned trials: ", len(pruned_trials))
print("  Number of complete trials: ", len(complete_trials))

print("\nBest trial:")
best_trial = study.best_trial

print("  Value (Min Validation Loss): ", best_trial.value)

print("  Best hyperparameters: ")
for param_name, param_value in best_trial.params.items():
    print(f"    {param_name}: {param_value}")

In [8]:
num_epochs = 400 # Hyperparameter: maximum epochs per (re)start

best_validation_loss = float('inf')   # best validation loss seen across ALL restarts
best_model_path = "best_model_weights.pth"
patience_epochs = 150                 # early-stopping patience within one restart
totalEpochs = 0                       # epochs completed across all restarts
print("\nStarting Training...")

# Restart from a freshly built model until either the best validation loss reaches
# the 0.015 target or the overall budget of 1200 epochs is used up. The condition is
# only evaluated between restarts, so a restart always runs to its own stopping point.
while best_validation_loss > float(0.015) and totalEpochs < 1200:
    patience_counter = 0
    model, optimizer = helpers.get_model_and_optim(X_train.shape[1])
    model.to(device)
    for epoch in range(num_epochs):
        # --- Training Phase ---
        model.train() # Set model to training mode (enables dropout, batchnorm updates)
        running_train_loss = 0.0
        for features, targets in train_loader:
            # Move batch data to the target device (GPU or CPU)
            features, targets = features.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(features)
            # Reshape the targets tensor to match the outputs shape ([batch_size, 1])
            targets_reshaped = targets.unsqueeze(1)
            loss = loss_func(outputs, targets_reshaped)
            loss.backward()
            optimizer.step()

            # Weight by batch size so the division below yields a per-sample average
            running_train_loss += loss.item() * features.size(0)

        # Count one completed epoch. This used to be incremented once per mini-batch,
        # which made the 1200 "epoch" budget in the while-condition count batches.
        totalEpochs += 1

        epoch_train_loss = running_train_loss / len(train_loader.dataset)

        # --- Validation Phase ---
        model.eval() # Set model to evaluation mode (disables dropout, batchnorm updates)
        running_val_loss = 0.0

        with torch.no_grad(): # No need to calculate gradients during validation
            for features, targets in val_loader:
                features, targets = features.to(device), targets.to(device)
                outputs = model(features)
                targets_reshaped = targets.unsqueeze(1)
                loss = loss_func(outputs, targets_reshaped)
                running_val_loss += loss.item() * features.size(0)

        epoch_val_loss = running_val_loss / len(val_loader.dataset)

        print(f'Epoch {epoch+1}/{num_epochs} | Train Loss: {epoch_train_loss:.6f} | Val Loss: {epoch_val_loss:.6f}')
        if epoch_val_loss < best_validation_loss:
            # New global best: checkpoint the weights and reset the patience counter
            print(f"Validation loss improved from {best_validation_loss:.6f} to {epoch_val_loss:.6f}. Saving model to {best_model_path}")
            best_validation_loss = epoch_val_loss
            torch.save(model.state_dict(), best_model_path) # Save the model's weights
            patience_counter = 0
        else:
            patience_counter += 1

        if patience_counter >= patience_epochs:
            print(f"Early stopping triggered after {patience_epochs} epochs without improvement.")
            break

print(f"The bestest best validation loss: {best_validation_loss:.6f}")



Starting Training...
Epoch 1/400 | Train Loss: 90.206425 | Val Loss: 5.843169
Validation loss improved from inf to 5.843169. Saving model to best_model_weights.pth
Epoch 2/400 | Train Loss: 4.095268 | Val Loss: 2.293256
Validation loss improved from 5.843169 to 2.293256. Saving model to best_model_weights.pth
Epoch 3/400 | Train Loss: 0.704974 | Val Loss: 0.444810
Validation loss improved from 2.293256 to 0.444810. Saving model to best_model_weights.pth
Epoch 4/400 | Train Loss: 0.241751 | Val Loss: 0.191608
Validation loss improved from 0.444810 to 0.191608. Saving model to best_model_weights.pth
Epoch 5/400 | Train Loss: 0.147256 | Val Loss: 0.123218
Validation loss improved from 0.191608 to 0.123218. Saving model to best_model_weights.pth
Epoch 6/400 | Train Loss: 0.095979 | Val Loss: 0.083716
Validation loss improved from 0.123218 to 0.083716. Saving model to best_model_weights.pth
Epoch 7/400 | Train Loss: 0.064601 | Val Loss: 0.058673
Validation loss improved from 0.083716 to 0.

The model, represented mathematically (trained with the Adam optimizer using default hyperparameters):
$$
\begin{align}
% Input
\mathbf{a}^{(0)} &= \mathbf{x}, \quad \text{where } \mathbf{x} \in \mathbb{R}^{260} \\
% Layer 1
\mathbf{z}^{(1)} &= W^{(1)}\mathbf{a}^{(0)} + \mathbf{b}^{(1)}, \quad \text{where } W^{(1)} \in \mathbb{R}^{128 \times 260}, \mathbf{b}^{(1)} \in \mathbb{R}^{128} \\
\mathbf{a}^{(1)} &= \text{ReLU}(\mathbf{z}^{(1)}), \quad \mathbf{a}^{(1)} \in \mathbb{R}^{128} \\
% Layer 2
\mathbf{z}^{(2)} &= W^{(2)}\mathbf{a}^{(1)} + \mathbf{b}^{(2)}, \quad \text{where } W^{(2)} \in \mathbb{R}^{64 \times 128}, \mathbf{b}^{(2)} \in \mathbb{R}^{64} \\
\mathbf{a}^{(2)} &= \text{ReLU}(\mathbf{z}^{(2)}), \quad \mathbf{a}^{(2)} \in \mathbb{R}^{64} \\
% Layer 3
\mathbf{z}^{(3)} &= W^{(3)}\mathbf{a}^{(2)} + \mathbf{b}^{(3)}, \quad \text{where } W^{(3)} \in \mathbb{R}^{16 \times 64}, \mathbf{b}^{(3)} \in \mathbb{R}^{16} \\
\mathbf{a}^{(3)} &= \text{ReLU}(\mathbf{z}^{(3)}), \quad \mathbf{a}^{(3)} \in \mathbb{R}^{16} \\
% Layer 4 (Output)
z^{(4)} &= W^{(4)}\mathbf{a}^{(3)} + b^{(4)}, \quad \text{where } W^{(4)} \in \mathbb{R}^{1 \times 16}, b^{(4)} \in \mathbb{R} \\
\hat{y} &= z^{(4)}, \quad \hat{y} \in \mathbb{R} \\
% Loss
\mathcal{L} &= (\hat{y} - y_{\text{target}})^2
\end{align}
$$

try:
    # best_model_path holds a state_dict (it was written with
    # torch.save(model.state_dict(), ...)), not a pickled model object, so it must be
    # loaded into an existing model instance with a matching architecture.
    weights = torch.load(best_model_path, map_location=torch.device('cpu'))
    # Copy the saved parameters into the model. The previous `model.path = weights`
    # only set an unrelated attribute and left the model's weights untouched.
    model.load_state_dict(weights)
    print("Successfully loaded model weights.")
    # You can now inspect the model structure
    print("\nModel structure:")
    print(model)

except Exception as e:
    # Best-effort inspection cell: report the problem without aborting the notebook.
    print(f"Error loading .pth file: {e}")