Feature Importance Snippets

Below are snippets for extracting feature importances from several popular frameworks. Each snippet writes a feature_importance.json file that maps feature names to importance scores.

scikit-learn

Classification

import json

import pandas as pd
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier

X, y = make_hastie_10_2(n_samples=1000, random_state=0)
X = pd.DataFrame(X, columns=[f"Feature {i}" for i in range(X.shape[1])])
X_train, X_test = X[:800], X[800:]
y_train, y_test = y[:800], y[800:]
clf = GradientBoostingClassifier(random_state=0).fit(X_train, y_train)

with open("feature_importance.json", "w") as fp:
    # .tolist() converts numpy floats to Python floats so json can serialize them
    feature_importance = dict(zip(clf.feature_names_in_, clf.feature_importances_.tolist()))
    json.dump(feature_importance, fp, indent=4)
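
To sanity-check the output, you can load the JSON back and list features from most to least important. A minimal sketch, reusing the feature_importance.json written above:

import json

with open("feature_importance.json") as fp:
    feature_importance = json.load(fp)

# Sort features by importance, highest first
for name, score in sorted(feature_importance.items(), key=lambda kv: kv[1], reverse=True):
    print(f"{name}: {score:.4f}")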

Regression

import json

import pandas as pd
from sklearn.datasets import make_friedman1
from sklearn.ensemble import RandomForestRegressor

X, y = make_friedman1(n_samples=1000, random_state=0)
X = pd.DataFrame(X, columns=[f"Feature {i}" for i in range(X.shape[1])])
X_train, X_test = X[:800], X[800:]
y_train, y_test = y[:800], y[800:]
est = RandomForestRegressor(random_state=0).fit(X_train, y_train)

with open("feature_importance.json", "w") as fp:
    feature_importance = dict(zip(est.feature_names_in_, est.feature_importances_.tolist()))
    json.dump(feature_importance, fp, indent=4)

Permutation Importance

Note: we take the mean of the absolute values of the raw importances returned by permutation_importance, rather than the built-in importances_mean, because the built-in mean can contain negative values (shuffling a feature can occasionally improve the score). The check after this snippet illustrates the difference.

import json

import numpy as np
import pandas as pd
from sklearn.datasets import make_hastie_10_2
from sklearn.linear_model import LogisticRegression
from sklearn.inspection import permutation_importance

X, y = make_hastie_10_2(n_samples=1000, random_state=0)
X = pd.DataFrame(X, columns=[f"Feature {i}" for i in range(X.shape[1])])
X_train, X_test = X[:800], X[800:]
y_train, y_test = y[:800], y[800:]
clf = LogisticRegression(random_state=0).fit(X_train, y_train)

with open("feature_importance.json", "w") as fp:
    result = permutation_importance(clf, X_test, y_test, random_state=0)
    result = np.mean(np.abs(result.importances), axis=1) # Take mean of absolute values, to avoid any negative values
    feature_importance = dict(zip(clf.feature_names_in_, result.tolist()))
    json.dump(feature_importance, fp, indent=4)
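
To see the difference the note above describes, you can inspect the result object directly. A small sketch, reusing clf, X_test, and y_test from the snippet; n_repeats defaults to 5, and raising it stabilizes the estimate:

result = permutation_importance(clf, X_test, y_test, n_repeats=10, random_state=0)
print(result.importances.shape)       # (n_features, n_repeats): one column per shuffle
print(result.importances_mean.min())  # the built-in mean can be negative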

Pipelines

import json

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

df = pd.read_csv("https://raw.githubusercontent.com/IBM/watson-openscale-samples/main/Cloud%20Pak%20for%20Data/WML/assets/data/credit_risk/credit_risk_training.csv")

label_column = "Risk"
feature_columns = list(df.columns)
feature_columns.remove(label_column)
categorical_columns = list(
    df[feature_columns].select_dtypes(include=["object", "bool"]))
numerical_columns = [
    col for col in feature_columns if col not in categorical_columns]

X_train, X_test, y_train, y_test = train_test_split(
    df[feature_columns], df[label_column], test_size=0.33, random_state=0)


numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])

categorical_transformer = Pipeline(
    steps=[("one_hot", OneHotEncoder(handle_unknown="ignore"))])

preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numerical_columns),
    ("cat", categorical_transformer, categorical_columns)])

pipeline = Pipeline(steps=[("preprocessor", preprocessor),
                           ("classifier", GradientBoostingClassifier(random_state=0))])

pipeline.fit(X_train, y_train)

with open("feature_importance.json", "w") as fp:
    result = permutation_importance(pipeline, X_test, y_test, n_jobs=-1, random_state=0)
    result = np.mean(np.abs(result.importances), axis=1) # Take mean of absolute values, to avoid any negative values
    feature_importance = dict(zip(pipeline.feature_names_in_, result.tolist()))
    json.dump(feature_importance, fp, indent=4)
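
Permutation importance is computed on the raw input columns, so the names come straight from the pipeline. If you instead want the classifier's impurity-based importances, they are indexed by the transformed (one-hot expanded) features; a sketch assuming scikit-learn >= 1.0, where a fitted ColumnTransformer provides get_feature_names_out:

# Map impurity-based importances to the names produced by the preprocessor
transformed_names = pipeline.named_steps["preprocessor"].get_feature_names_out()
classifier = pipeline.named_steps["classifier"]
impurity_importance = dict(
    zip(transformed_names.tolist(), classifier.feature_importances_.tolist()))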

Spark

Pipeline whose model exposes featureImportances

import json

import pandas as pd
from sklearn.datasets import make_hastie_10_2
from pyspark.sql import SparkSession
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorAssembler
from pyspark.sql.types import BooleanType, StringType
from pyspark.ml import Pipeline

spark = SparkSession.builder.appName("Dummy").getOrCreate()

X, y = make_hastie_10_2(n_samples=1000, random_state=0)
X = pd.DataFrame(X, columns=[f"Feature {i}" for i in range(X.shape[1])])
X = X.assign(**{"label_column": y})

training_df = spark.createDataFrame(X)
label_column = "label_column"

feature_columns = list(training_df.columns)
feature_columns.remove(label_column)
categorical_columns = [f.name for f in training_df.schema.fields if isinstance(f.dataType, (BooleanType, StringType)) and f.name in feature_columns]
numerical_columns = [col for col in feature_columns if col not in categorical_columns]


model_train, model_test = training_df.randomSplit([0.9, 0.1], seed=272)
stages = []
for col in categorical_columns:
    stages.append(StringIndexer(inputCol=col, outputCol=f"{col}_modified", handleInvalid="keep"))

label_indexer = StringIndexer(inputCol=label_column, outputCol="label").fit(model_train)
stages.append(label_indexer)

input_cols = ["{}_modified".format(col) if col in categorical_columns else col for col in feature_columns]
assembler = VectorAssembler(inputCols=input_cols, outputCol="features", handleInvalid="keep")
stages.append(assembler)


classifier = RandomForestClassifier(seed=0)
stages.append(classifier)

label_converter = IndexToString(inputCol="prediction", outputCol="predicted_label", labels=label_indexer.labels)
stages.append(label_converter)

pipeline = Pipeline(stages=stages)
model = pipeline.fit(model_train)

with open("feature_importance.json", "w") as fp:
    # stages[-2] is the classifier; toArray() yields JSON-serializable floats
    importances = model.stages[-2].featureImportances.toArray().tolist()
    feature_importance = dict(zip(feature_columns, importances))
    json.dump(feature_importance, fp, indent=4)
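
Indexing stages by position (stages[-2]) breaks if the pipeline layout changes; a sketch that locates the fitted classifier by type instead (RandomForestClassificationModel is the fitted counterpart of RandomForestClassifier):

from pyspark.ml.classification import RandomForestClassificationModel

# Find the fitted classifier stage regardless of its position
rf_model = next(s for s in model.stages if isinstance(s, RandomForestClassificationModel))
feature_importance = dict(zip(feature_columns, rf_model.featureImportances.toArray().tolist()))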

XGBoost

Regressor

import json

import pandas as pd
import xgboost as xgb
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

housing = fetch_california_housing()

X = pd.DataFrame(housing.data, columns=housing.feature_names)
y = pd.Series(housing.target)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.9, random_state=0)

regressor = xgb.XGBRegressor()

regressor.fit(X_train, y_train)

with open("feature_importance.json", "w") as fp:
    feature_importance = dict(
        zip(regressor.feature_names_in_, regressor.feature_importances_.tolist()))
    json.dump(feature_importance, fp, indent=4)
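
feature_importances_ reflects the estimator's configured importance_type (gain for tree boosters unless overridden); the underlying booster can report other views. A short sketch using the regressor fitted above; note that get_score omits features never used in a split:

# "weight" counts how often a feature is used to split;
# "gain" averages the loss reduction from those splits
booster = regressor.get_booster()
weight_importance = booster.get_score(importance_type="weight")
gain_importance = booster.get_score(importance_type="gain")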