In [7]:
import torch
from utils import helpers
from data_management import preprocess_data
from sensitivity_analysis import permutation_importance_analysis
from utils import estimator_wrapper
import numpy as np

# Build the project's preprocessor and obtain the train/test splits used by
# the rest of the notebook.
data_preprocessor = preprocess_data.DataPreprocessor()

# NOTE(review): the meaning of the positional argument 13 is not visible from
# this notebook (seed? split id?) — TODO confirm and consider a named constant.
# NOTE(review): the fourth return value is bound to `T_test`, while the other
# three follow the X_/Y_ naming scheme — presumably this is the test target
# (`Y_test`); verify before renaming, since later cells may reference `T_test`.
X_train, X_test, Y_train, T_test = data_preprocessor.preprocess_data(13)
# Quick sanity check on the shape of the training features.
print(X_train.shape)

Calculated Global Median Ratio: 0.7235 (from 951 samples)
Calculating for group level: 3way (['MSZoning', 'BldgType', 'LotShape'])
 -> Found 39 groups for 3way
Calculating for group level: 2way_ZS (['MSZoning', 'LotShape'])
 -> Found 16 groups for 2way_ZS
Calculating for group level: 2way_ZB (['MSZoning', 'BldgType'])
 -> Found 19 groups for 2way_ZB
Calculating for group level: 2way_BS (['BldgType', 'LotShape'])
 -> Found 14 groups for 2way_BS
Calculating for group level: 1way_Z (['MSZoning'])
 -> Found 5 groups for 1way_Z
Calculating for group level: 1way_B (['BldgType'])
 -> Found 5 groups for 1way_B
Calculating for group level: 1way_S (['LotShape'])
 -> Found 4 groups for 1way_S
(1168, 238)


In [8]:

from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# --- Column-wise preprocessing ----------------------------------------------
# Three column groups each get their own scaling recipe; every column not named
# in a group is passed through unchanged.

# Numeric columns routed through log1p first, then rescaled by MinMaxScaler.
log_then_minmax = Pipeline(steps=[
    ('log_transform', FunctionTransformer(np.log1p)),
    ('min_max_scaler', MinMaxScaler()),
])

# Ordered categoricals: integer-encode using the explicit category order
# supplied by the helpers module, then rescale the integer codes.
# Categories that are not in the supplied lists are encoded as -1.
# NOTE(review): MinMaxScaler does not clip by default, so a -1 that first
# appears at transform time would land below the fitted minimum — confirm this
# is acceptable for the model.
ordinal_encoder = OrdinalEncoder(
    categories=helpers.get_ordinal_cats_ordered(),
    handle_unknown='use_encoded_value',
    unknown_value=-1,
)
ordinal_then_minmax_pipeline = Pipeline(steps=[
    ('ordinal_encode', ordinal_encoder),
    ('minmax_scale_ordinal', MinMaxScaler()),
])

# (name, transformer, columns) triples; the names become the prefixes seen in
# the feature names used further down (log_num__, ord__, num__).
column_groups = [
    ('log_num', log_then_minmax, helpers.get_log_minmax_cols()),
    ('ord', ordinal_then_minmax_pipeline, helpers.get_categorical_cols_ordinal()),
    ('num', MinMaxScaler(), helpers.get_minmax_cols()),
]

model_pipeline = ColumnTransformer(
    transformers=column_groups,
    remainder='passthrough',
    sparse_threshold=1,
)

# Fit on the training split only, then apply the fitted transform to both
# splits. X_train / X_test are rebound to the transformed data here.
model_pipeline.fit(X_train, Y_train)
X_train, X_test = model_pipeline.transform(X_train), model_pipeline.transform(X_test)

# to remove after sensitivty analysis
to_remove_1 = [
    # Starting from the bottom of your list and going up to importance 0.00010
    "remainder__ohe__Exterior1st_VinylSd",   # Mean: -0.00004
    "remainder__ohe__Foundation_Slab",       # Mean: -0.00001
    "remainder__ohe__LandContour_Low",       # Mean: -0.00001
    "ord__BsmtCond",                         # Mean: -0.00001
    "remainder__ohe__MSSubClass_90",         # Mean: -0.00001
    "ord__GarageFinish",                     # Mean: -0.00001
    "remainder__ohe__BldgType_Duplex",       # Mean: -0.00000
    "remainder__ohe__RoofStyle_Hip",         # Mean: -0.00000
    "ord__ExterQual",                        # Mean: -0.00000
    "log_num__MasVnrArea",                   # Mean: -0.00000
    "remainder__ohe__MSSubClass_75",         # Mean: -0.00000
    "remainder__ohe__Neighborhood_OldTown",  # Mean: -0.00000
    "remainder__ohe__BedroomAbvGr_4",        # Mean:  0.00000 (from previous list, check current)
    "remainder__ohe__Neighborhood_IDOTRR",   # Mean:  0.00000
    "remainder__ohe__SaleType_CWD",          # Mean:  0.00000
    "remainder__ohe__Exterior1st_MetalSd",   # Mean:  0.00001 (from previous list, check current)
    "remainder__ohe__FullBath_2",            # Mean:  0.00001 (from previous list, check current)
    "remainder__ohe__RoofStyle_Gable",       # Mean:  0.00001 (from previous list, check current)
    "log_num__MiscVal",                      # Mean:  0.00001
    "remainder__ohe__RoofStyle_Gambrel",     # Mean:  0.00001
    "remainder__ohe__Exterior2nd_Brk Cmn",   # Mean:  0.00001
    "remainder__ohe__HalfBath_2",            # Mean:  0.00001
    "remainder__ohe__Neighborhood_Gilbert",  # Mean:  0.00001
    "ord__ExterCond",                        # Mean:  0.00001
    "remainder__ohe__SaleType_ConLI",        # Mean:  0.00001
    "remainder__ohe__Exterior1st_Stucco",    # Mean:  0.00001
    "remainder__ohe__BsmtFinType2_GLQ",      # Mean:  0.00001
    "remainder__ohe__Exterior2nd_Wd Shng",   # Mean:  0.00001
    "remainder__ohe__BsmtFinType2_LwQ",      # Mean:  0.00001
    "remainder__ohe__BsmtHalfBath_1",        # Mean:  0.00001
    "num__L1_I_PR",                          # Mean:  0.00001
    "num__L1_I_HPI_MA3",                     # Mean:  0.00001
    "remainder__ohe__Electrical_FuseP",      # Mean:  0.00001
    "remainder__ohe__Foundation_Stone",      # Mean:  0.00001
    "remainder__ohe__MSSubClass_40",         # Mean:  0.00001
    "remainder__ohe__Exterior2nd_Stone",     # Mean:  0.00001
    "remainder__ohe__BsmtFullBath_3",        # Mean:  0.00001 (from previous list, check current)
    "remainder__ohe__SaleType_Con",          # Mean:  0.00002
    "remainder__ohe__Exterior1st_CemntBd",   # Mean:  0.00002
    "remainder__ohe__BsmtFinType2_Rec",      # Mean:  0.00002
    "remainder__ohe__Foundation_Wood",       # Mean:  0.00002
    "remainder__ohe__Exterior2nd_BrkFace",   # Mean:  0.00002
    "remainder__ohe__BsmtExposure_Mn",       # Mean:  0.00002
    "remainder__ohe__BldgType_Twnhs",          # Mean:  0.00002
    "remainder__ohe__Neighborhood_SawyerW",  # Mean:  0.00002
    "remainder__ohe__MiscFeature_Othr",      # Mean:  0.00002
    "remainder__ohe__BedroomAbvGr_6",        # Mean:  0.00002
    "remainder__ohe__Exterior2nd_Stucco",    # Mean:  0.00002
    "remainder__ohe__Neighborhood_NPkVill",  # Mean:  0.00002
    "log_num__EnclosedPorch",                # Mean:  0.00003
    "remainder__ohe__MSSubClass_85",         # Mean:  0.00003
    "remainder__ohe__HouseStyle_2.5Fin",     # Mean:  0.00003
    "remainder__ohe__Neighborhood_Timber",   # Mean:  0.00003
    "remainder__ohe__BsmtFinType2_GLQ",      # Mean:  0.00003 (Appears again, likely meant a different feature or copy-paste from previous list)
    "remainder__ohe__Condition1_RRNn",       # Mean:  0.00003
    "remainder__ohe__BldgType_TwnhsE",       # Mean:  0.00003
    "remainder__ohe__BsmtFinType1_GLQ",      # Mean:  0.00003
    "remainder__ohe__Condition1_RRAe",       # Mean:  0.00003
    "remainder__ohe__Neighborhood_BrDale",   # Mean:  0.00003
    "remainder__ohe__BsmtHalfBath_1",        # Mean:  0.00003 (Appears again)
    "remainder__ohe__BsmtHalfBath_2",        # Mean:  0.00003
    "remainder__ohe__Street_Pave",           # Mean:  0.00004
    "remainder__ohe__Exterior1st_WdShing",   # Mean:  0.00004
    "remainder__ohe__BedroomAbvGr_8",        # Mean:  0.00004
    "remainder__ohe__Alley_None",            # Mean:  0.00004
    "num__L1_I_PR",                          # Mean:  0.00004
    "ord__Fence",                            # Mean:  0.00004
    "ord__FireplaceQu",                      # Mean:  0.00004
    "remainder__ohe__Electrical_SBrkr",      # Mean:  0.00005
    "remainder__ohe__BsmtExposure_Mn",       # Mean:  0.00005 (Appears again)
    "remainder__ohe__Exterior2nd_Stucco",    # Mean:  0.00005 (Appears again)
    "remainder__ohe__BsmtFinType2_Rec",      # Mean:  0.00005 (Appears again)
    "remainder__ohe__Condition1_RRAe",       # Mean:  0.00005 (Appears again)
    "remainder__ohe__SaleCondition_AdjLand", # Mean:  0.00005
    "remainder__ohe__Exterior2nd_Stone",     # Mean:  0.00005
    "num__L1_I_HPI",                         # Mean:  0.00005
    "remainder__ohe__KitchenAbvGr_3",        # Mean:  0.00005
    "remainder__ohe__Exterior1st_Plywood",   # Mean:  0.00005
    "remainder__ohe__Neighborhood_NWAmes",   # Mean:  0.00005
    "remainder__ohe__Electrical_FuseF",      # Mean:  0.00005
    "remainder__ohe__Exterior1st_MetalSd",   # Mean:  0.00005
    "remainder__MoSold_sin",                 # Mean:  0.00006
    "remainder__ohe__BsmtFinType1_BLQ",      # Mean:  0.00006 (Appears again)
    "ord__HeatingQC",                        # Mean:  0.00006
    "remainder__ohe__BsmtFinType1_Rec",      # Mean:  0.00006 (Appears again)
    "remainder__ohe__Exterior1st_Stucco",    # Mean:  0.00006 (Appears again)
    "num__L1_I_HPI_MA3",                     # Mean:  0.00006 (from previous list, check current. Your new list has it at 0.00020)
    "remainder__ohe__Exterior2nd_AsphShn",   # Mean:  0.00006
    "remainder__ohe__Exterior1st_CemntBd",   # Mean:  0.00006
    "log_num__BsmtUnfSF",                    # Mean:  0.00006
    "remainder__ohe__BedroomAbvGr_5",        # Mean:  0.00007
    "num__YrSold",                           # Mean:  0.00007
    "remainder__ohe__Neighborhood_CollgCr",  # Mean:  0.00007
    "remainder__ohe__LotConfig_Inside",      # Mean:  0.00007
    "remainder__ohe__BldgType_TwnhsE",       # Mean:  0.00007
    "remainder__HasPool",                    # Mean:  0.00007
    "remainder__ohe__Heating_Grav",          # Mean:  0.00007
    "remainder__ohe__SaleCondition_Family",  # Mean:  0.00007
    "num__L1_I_PR_MA3",                      # Mean:  0.00007
    "remainder__ohe__MSSubClass_70",         # Mean:  0.00008
    "remainder__ohe__Exterior2nd_Wd Shng",   # Mean:  0.00008
    "num__L1_I_UR_MA3",                      # Mean:  0.00008
    "remainder__ohe__BldgType_Duplex",       # Mean:  0.00008
    "remainder__ohe__MSSubClass_75",         # Mean:  0.00008
    "remainder__ohe__Exterior1st_HdBoard",   # Mean:  0.00009
    "remainder__ohe__SaleType_Oth",          # Mean:  0.00009
    "remainder__ohe__KitchenAbvGr_3",        # Mean:  0.00009 (Appears again)
    "log_num__LowQualFinSF",                 # Mean:  0.00009
    "remainder__ohe__Neighborhood_Sawyer",   # Mean:  0.00009
    "num__L1_I_PR_MA6",                      # Mean:  0.00009
    "remainder__ohe__Neighborhood_Mitchel",  # Mean:  0.00009 (Appears again)
    "ord__HeatingQC",                        # Mean:  0.00009 (Appears again)
    "remainder__ohe__SaleType_WD",           # Mean:  0.00010
    "remainder__ohe__Exterior1st_Plywood",   # Mean:  0.00010 (Appears again)
    "remainder__ohe__MSSubClass_80",         # Mean:  0.00010
    "remainder__ohe__Exterior2nd_VinylSd",   # Mean:  0.00010
    "remainder__ohe__Neighborhood_Veenker",  # Mean:  0.00010
    "remainder__ohe__Exterior2nd_CmentBd",   # Mean:  0.00010
    "remainder__ohe__BsmtFinType1_LwQ",
    "remainder__ohe__SaleType_ConLw",
    "ord__LotShape",
    "ord__LandSlope",
    "remainder__ohe__MSSubClass_180",
    "remainder__ohe__Heating_OthW",
    "remainder__ohe__MSSubClass_120",
    "remainder__ohe__Exterior2nd_MetalSd",
    "remainder__ohe__Neighborhood_NAmes",
    "remainder__ohe__BedroomAbvGr_2",
    "remainder__ohe__BedroomAbvGr_1",
    "remainder__ohe__Exterior2nd_ImStucc",
    "remainder__ohe__BsmtFinType2_BLQ",
    "remainder__ohe__RoofStyle_Mansard",
    "ord__GarageCond",
    "remainder__ohe__LotConfig_FR3"
]

X_train = X_train.drop(to_remove_1, axis=1)
X_test = X_test.drop(to_remove_1, axis=1)

print(X_train.shape)

# --- Load the trained model and run permutation importance -------------------

# Pick the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Read the checkpoint once, remapping tensors onto `device`. The cell
# previously also called torch.load() without map_location, which re-read the
# file and can fail on a CPU-only host when the tensors were saved from CUDA.
# NOTE(review): torch.load unpickles the file — only load trusted checkpoints,
# and consider weights_only=True if the installed torch version supports it.
state_dict = torch.load("best_model_weights.pth", map_location=device)
params = state_dict  # name kept in case later cells still reference `params`

# The network's input width must match the number of remaining feature columns.
model, optim = helpers.get_model_and_optim(X_train.shape[1])
model.load_state_dict(state_dict)
model.to(device)  # ensure the model lives on the selected device
model.eval()      # inference mode for evaluation-time layers (dropout, batch norm)

# Wrap the torch model so the permutation-importance helper can drive it.
wrapper = estimator_wrapper.EstimatorWrapper(model, device)

perm_analysis = permutation_importance_analysis.PermutationImportanceAnalysis(wrapper)

perm_analysis.fit(X_train, Y_train)

# Print one short report per feature: name, mean importance, std of importance.
importances_df = perm_analysis.get_importances_df()

# Walk the three columns in lockstep; the row index is not needed.
for feature_name, mean_importance, std_importance in zip(
    importances_df['feature'],
    importances_df['importance_mean'],
    importances_df['importance_std'],
):
    print(f"Feature: {feature_name}")
    print(f"  Mean Importance: {mean_importance:.5f}")
    print(f"  Std Importance:  {std_importance:.5f}")
    print("-" * 30)


(1168, 123)
Calculating Permutation Importance...
Feature: remainder__ohe__GarageType_None
  Mean Importance: 0.04701
  Std Importance:  0.00138
------------------------------
Feature: remainder__ohe__MSZoning_RL
  Mean Importance: 0.03012
  Std Importance:  0.00104
------------------------------
Feature: remainder__ohe__MSZoning_RM
  Mean Importance: 0.01851
  Std Importance:  0.00058
------------------------------
Feature: remainder__ohe__GarageType_Attchd
  Mean Importance: 0.01084
  Std Importance:  0.00049
------------------------------
Feature: remainder__ohe__BsmtFinType2_Unf
  Mean Importance: 0.00838
  Std Importance:  0.00035
------------------------------
Feature: remainder__ohe__MSZoning_FV
  Mean Importance: 0.00837
  Std Importance:  0.00031
------------------------------
Feature: log_num__BsmtFinSF1
  Mean Importance: 0.00822
  Std Importance:  0.00045
------------------------------
Feature: log_num__BsmtFinSF2
  Mean Importance: 0.00742
  Std Importance:  0.00038
------