In [None]:
import pandas as pd
from dmgpred.cleaning import clean_single
from dmgpred.featurize import featurize_single
from joblib import load
from sklearn.inspection import permutation_importance

## Feature importance based on mean decrease in impurity

In [None]:
# Notebook configuration: every file location used below derives from these.
# The two directory constants end with "/" and file names are appended to them
# directly, so the derived paths must not insert another separator (the earlier
# f"{DATA_PATH}/..." form produced "../data//...").
DATA_PATH = "../data/"
OUTPUT_PATH = "../output/"
TEST_VALUES_PATH = f"{DATA_PATH}test_values.csv"
TRAIN_VALUES_PATH = f"{DATA_PATH}train_values.csv"
TRAIN_LABELS_PATH = f"{DATA_PATH}train_labels.csv"
SUBMISSION_PATH = f"{OUTPUT_PATH}Mandalorians_prediction.csv"
# Column used as the DataFrame index when the CSV files are read.
INDEX_COL = "building_id"

In [None]:
# Load the persisted model and take its "clf" step; the impurity-based
# importances and the fitted feature names are both read from that step.
# NOTE(review): joblib.load unpickles the file, so only load artifacts that
# were produced by this project.
model = load(f"{OUTPUT_PATH}trained_model.pkl")
classifier = model.named_steps["clf"]
feature_names = classifier.feature_names_in_

# One value per fitted feature, largest first, shown as a bar chart.
importances = pd.Series(
    data=classifier.feature_importances_,
    index=feature_names,
).sort_values(ascending=False)
importances.plot.bar()

## Feature importance based on feature permutation

In [None]:
# Read the training features and run them through the project's cleaning and
# featurizing helpers in a single chain; the labels are read unchanged.
# Both frames are indexed by INDEX_COL.
X_train = (
    pd.read_csv(TRAIN_VALUES_PATH, index_col=INDEX_COL)
    .pipe(clean_single)
    .pipe(featurize_single)
)
y_train = pd.read_csv(TRAIN_LABELS_PATH, index_col=INDEX_COL)

In [None]:
# Permutation importance of the fitted classifier on the featurized training
# data. The classifier must receive exactly the columns it was fitted on, in
# the fitted order, so a missing column is reported up front instead of being
# silently dropped and failing later inside the estimator.
train_feature_names = X_train.columns.tolist()
available_features = set(train_feature_names)
missing_features = [
    name for name in feature_names if name not in available_features
]
if missing_features:
    raise ValueError(
        "X_train is missing features the classifier was fitted on: "
        f"{missing_features}"
    )
# Same names, in the classifier's fitted order.
feature_names_aligned = [name for name in feature_names]

# NOTE(review): n_repeats=1 shuffles each feature only once, so the "mean"
# below is a single draw with no spread estimate -- confirm this is intended.
# NOTE(review): scores are computed on the training data; a held-out set would
# reflect generalization better -- confirm this is intended.
# NOTE(review): assumes y_train rows line up with X_train after cleaning and
# featurizing -- verify that neither helper drops or reorders rows.
perm_importances = permutation_importance(
    classifier,
    X_train[feature_names_aligned],
    y_train.to_numpy().ravel(),
    random_state=42,
    n_repeats=1,
    n_jobs=4,
)
# Mean importance per feature, largest first, shown as a bar chart.
perm_importances = pd.Series(
    perm_importances.importances_mean, index=feature_names_aligned
).sort_values(ascending=False)
perm_importances.plot.bar()