In [1]:
import torch
from utils import helpers
from data_management import preprocess_data
from sensitivity_analysis import permutation_importance_analysis
from utils import estimator_wrapper
import numpy as np

# Instantiate the project-local preprocessor from data_management.preprocess_data.
data_preprocessor = preprocess_data.DataPreprocessor()

# preprocess_data returns four objects, unpacked here as train/test features
# and (presumably) train/test targets.
# NOTE(review): the meaning of the argument 13 is not visible from this cell
# (possibly a seed or split id) — confirm and give it a named constant.
# NOTE(review): `T_test` looks like a typo for `Y_test`; left unchanged because
# later cells may reference it under this name.
X_train, X_test, Y_train, T_test = data_preprocessor.preprocess_data(13)
print(X_train.shape)

Calculated Global Median Ratio: 0.7235 (from 951 samples)
Calculating for group level: 3way (['MSZoning', 'BldgType', 'LotShape'])
 -> Found 39 groups for 3way
Calculating for group level: 2way_ZS (['MSZoning', 'LotShape'])
 -> Found 16 groups for 2way_ZS
Calculating for group level: 2way_ZB (['MSZoning', 'BldgType'])
 -> Found 19 groups for 2way_ZB
Calculating for group level: 2way_BS (['BldgType', 'LotShape'])
 -> Found 14 groups for 2way_BS
Calculating for group level: 1way_Z (['MSZoning'])
 -> Found 5 groups for 1way_Z
Calculating for group level: 1way_B (['BldgType'])
 -> Found 5 groups for 1way_B
Calculating for group level: 1way_S (['LotShape'])
 -> Found 4 groups for 1way_S
(1168, 238)


In [2]:

from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Numeric columns routed through this pipeline get log1p applied, then min-max scaling.
log_then_minmax = Pipeline([
    ('log_transform', FunctionTransformer(np.log1p)),
    ('min_max_scaler', MinMaxScaler())
])

# Ordered categoricals: encode to integer ranks using the project-defined
# category order, then min-max scale the ranks.
ordinal_then_minmax_pipeline = Pipeline([
    ('ordinal_encode', OrdinalEncoder(
        # Must be a list of category lists, one per ordinal column, in the
        # same order as helpers.get_categorical_cols_ordinal().
        categories=helpers.get_ordinal_cats_ordered(),
        handle_unknown='use_encoded_value',
        # Categories unseen at fit time are encoded as -1.
        # NOTE(review): such values fall outside the range the scaler was
        # fitted on, so they will scale to below 0 — confirm this is intended.
        unknown_value=-1
    )),
    ('minmax_scale_ordinal', MinMaxScaler())
])

# Columns not listed in any transformer are passed through unchanged.
# set_output(transform="pandas") is requested explicitly: the column-name based
# `.drop(...)` that follows in this notebook needs DataFrames, and without this
# call the output type depends on a global scikit-learn setting configured
# elsewhere (hidden state).
model_pipeline = ColumnTransformer(
    transformers=[
        ('log_num', log_then_minmax, helpers.get_log_minmax_cols()),
        ('ord', ordinal_then_minmax_pipeline, helpers.get_categorical_cols_ordinal()),
        ('num', MinMaxScaler(), helpers.get_minmax_cols())
    ],
    remainder='passthrough',
    sparse_threshold=1
).set_output(transform="pandas")

# Fit on the training split only, then apply the fitted transform to the test split.
# NOTE: X_train / X_test are overwritten, so this cell is not idempotent —
# re-run the preprocessing cell before re-running this one.
X_train = model_pipeline.fit_transform(X_train, Y_train)
X_test = model_pipeline.transform(X_test)

# Features to discard following the sensitivity analysis.
# Names are the transformed column names: "<transformer>__<column>", where the
# transformer prefix is one of log_num / ord / num / remainder.
to_remove_1 = [
    "num__L1_I_PR_MA6",
    "remainder__ohe__SaleType_CWD",
    "remainder__ohe__LandContour_Low",
    "remainder__ohe__Condition1_PosA",
    "remainder__ohe__BsmtFinType2_BLQ",
    "log_num__LowQualFinSF",
    "remainder__ohe__RoofMatl_WdShake",
    "remainder__ohe__Neighborhood_NPkVill",
    "remainder__ohe__HouseStyle_SFoyer",
    "remainder__ohe__Street_Pave",
    "remainder__ohe__MSSubClass_80",
    "remainder__ohe__Neighborhood_CollgCr",
    "ord__PoolQC",
    "remainder__ohe__Exterior1st_Stucco",
    "log_num__WoodDeckSF",
    "remainder__ohe__Neighborhood_IDOTRR",
    "remainder__HasPool",
    "ord__HeatingQC",
    "remainder__ohe__Exterior1st_VinylSd",
    "remainder__ohe__LotConfig_Inside",
    "num__L1_I_HPI_MA3",
    "remainder__ohe__BsmtFinType1_BLQ",
    "num__L1_I_PR_MA3",
    "remainder__ohe__Exterior2nd_HdBoard",
    "remainder__ohe__MSSubClass_85",
    "remainder__ohe__Condition1_RRAe",
    "remainder__ohe__Exterior2nd_Stucco",
    "log_num__BsmtUnfSF",
    "remainder__ohe__SaleCondition_AdjLand",
    "remainder__ohe__BsmtFinType1_Rec",
    "ord__LotShape",
    "remainder__ohe__SaleType_WD",
    "remainder__ohe__SaleCondition_Family",
    "ord__GarageFinish",
    "remainder__ohe__Neighborhood_SawyerW",
    "remainder__ohe__BldgType_Twnhs",
    "remainder__ohe__BldgType_Duplex",
    "remainder__ohe__BsmtExposure_Mn",
    "remainder__ohe__BsmtFinType1_GLQ",
    "remainder__ohe__HouseStyle_2.5Unf",
    "remainder__ohe__Condition1_RRNn",
    "num__L1_I_HPI_MA6",
    "remainder__ohe__Exterior2nd_Plywood",
    "remainder__ohe__KitchenAbvGr_3",
    "ord__LandSlope",
    "remainder__ohe__HouseStyle_2.5Fin",
    "remainder__ohe__MSSubClass_70",
    "num__L1_I_HPI",
    "remainder__ohe__Exterior1st_MetalSd",
    "remainder__ohe__Exterior1st_HdBoard",
    "remainder__ohe__Heating_Grav",
    "ord__Fence",
    "remainder__ohe__Foundation_Wood",
    "remainder__ohe__MiscFeature_Othr",
    "remainder__ohe__Neighborhood_Mitchel",
    "remainder__ohe__MSSubClass_90",
    "remainder__ohe__BsmtHalfBath_2",
    "remainder__ohe__Exterior2nd_BrkFace",
    "remainder__ohe__BsmtFinType2_Rec",
    "remainder__ohe__Foundation_Stone",
    "remainder__ohe__Exterior1st_Plywood",
    "log_num__EnclosedPorch",
    "remainder__ohe__Exterior1st_WdShing",
    "remainder__ohe__Alley_None",
    "remainder__ohe__Neighborhood_NWAmes",
    "remainder__ohe__Exterior2nd_Stone",
    "remainder__ohe__BsmtHalfBath_1",
    "remainder__ohe__Exterior1st_CemntBd",
    "remainder__ohe__SaleType_Con",
    "remainder__ohe__Electrical_FuseF",
    "remainder__ohe__Exterior2nd_VinylSd",
    "remainder__ohe__Exterior2nd_AsphShn",
    "remainder__ohe__Neighborhood_BrDale",
    "remainder__ohe__Exterior2nd_Wd Shng",   # the embedded space is part of the column name
    "remainder__ohe__BsmtFinType2_GLQ",
    "remainder__ohe__Neighborhood_Gilbert",
    "ord__ExterQual",
    "ord__ExterCond",
    "remainder__ohe__Heating_OthW",
    "remainder__ohe__MSSubClass_120",
    "remainder__ohe__Electrical_FuseP",
    "remainder__ohe__HalfBath_2",
    "remainder__ohe__SaleType_ConLw",
    "remainder__ohe__SaleType_ConLI",
    "remainder__ohe__BsmtFinType2_LwQ",
    "remainder__ohe__Exterior2nd_Brk Cmn",   # the embedded space is part of the column name
    "remainder__ohe__MSSubClass_40",
    "remainder__ohe__BsmtFullBath_3",
    "remainder__ohe__MSSubClass_180",
]

# Remove the same columns from both splits so their feature sets stay aligned.
# A missing column name raises KeyError.
X_train, X_test = (
    split.drop(columns=to_remove_1) for split in (X_train, X_test)
)

print(X_train.shape)

# Resolve the target device first so the checkpoint can be mapped onto it while loading.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the network (and its optimizer) sized to the current number of features.
model, optim = helpers.get_model_and_optim(X_train.shape[1])

# Load the checkpoint exactly once. map_location lets a checkpoint saved on a
# CUDA machine be loaded on a CPU-only host; the previous extra, unmapped
# torch.load could fail there and its result was never used.
# NOTE: torch.load unpickles the file — only load checkpoints from a trusted source.
state_dict = torch.load("best_model_weights.pth", map_location=device)
params = state_dict  # alias kept in case a later cell still refers to `params`

model.load_state_dict(state_dict)
model.to(device)
model.eval()  # evaluation mode: no training-time behaviour during the analysis

# Wrap the torch model in the project's estimator wrapper for the analysis code.
wrapper = estimator_wrapper.EstimatorWrapper(model, device)

# Run the project's permutation-importance analysis on the wrapped model.
# NOTE(review): importances are computed on the training split here — confirm
# that this is intended rather than a held-out split.
perm_analysis = permutation_importance_analysis.PermutationImportanceAnalysis(wrapper)
perm_analysis.fit(X_train, Y_train)

# Print one short report per feature, in the order the results frame provides.
importances_df = perm_analysis.get_importances_df()
report_columns = (
    importances_df['feature'],
    importances_df['importance_mean'],
    importances_df['importance_std'],
)
for feature_name, mean_importance, std_importance in zip(*report_columns):
    print(f"Feature: {feature_name}")
    print(f"  Mean Importance: {mean_importance:.5f}")
    print(f"  Std Importance:  {std_importance:.5f}")
    print("-" * 30)


(1168, 151)
Calculating Permutation Importance...
Feature: remainder__ohe__GarageType_None
  Mean Importance: 0.03389
  Std Importance:  0.00108
------------------------------
Feature: remainder__ohe__MSZoning_RL
  Mean Importance: 0.01415
  Std Importance:  0.00068
------------------------------
Feature: remainder__ohe__MSZoning_RM
  Mean Importance: 0.00868
  Std Importance:  0.00017
------------------------------
Feature: remainder__ohe__GarageType_Attchd
  Mean Importance: 0.00758
  Std Importance:  0.00035
------------------------------
Feature: log_num__BsmtFinSF1
  Mean Importance: 0.00654
  Std Importance:  0.00035
------------------------------
Feature: remainder__ohe__GarageType_Detchd
  Mean Importance: 0.00628
  Std Importance:  0.00028
------------------------------
Feature: log_num__BsmtFinSF2
  Mean Importance: 0.00576
  Std Importance:  0.00031
------------------------------
Feature: remainder__ohe__MasVnrType_None
  Mean Importance: 0.00572
  Std Importance:  0.00029
-