Feature Importance Snippets
Prem Piyush Goyal edited this page Apr 28, 2023
·
4 revisions
The snippets below show how to extract feature importances from several popular machine-learning frameworks and write them to a JSON file.
import json

import pandas as pd
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier

# Synthetic binary-classification dataset with named columns, so the fitted
# estimator exposes `feature_names_in_`.
X, y = make_hastie_10_2(n_samples=1000, random_state=0)
X = pd.DataFrame(X, columns=[f"Feature {i}" for i in range(X.shape[1])])
X_train, X_test = X[:800], X[800:]
y_train, y_test = y[:800], y[800:]

clf = GradientBoostingClassifier(random_state=0).fit(X_train, y_train)

with open("feature_importance.json", "w") as fp:
    # Map each feature name to its impurity-based importance.
    # `.tolist()` converts the numpy scalars to builtin floats so the file
    # contains plain JSON numbers, consistent with the XGBoost snippet below.
    feature_importance = dict(
        zip(clf.feature_names_in_, clf.feature_importances_.tolist()))
    json.dump(feature_importance, fp, indent=4)
import json
import pandas as pd
from sklearn.datasets import make_friedman1
from sklearn.ensemble import RandomForestRegressor

# Synthetic regression dataset with human-readable column names.
data, target = make_friedman1(n_samples=1000, random_state=0)
frame = pd.DataFrame(
    data, columns=[f"Feature {i}" for i in range(data.shape[1])])
X_train, X_test = frame[:800], frame[800:]
y_train, y_test = target[:800], target[800:]

est = RandomForestRegressor(random_state=0)
est.fit(X_train, y_train)

with open("feature_importance.json", "w") as fp:
    # One JSON entry per feature: name -> impurity-based importance.
    importance_by_name = {
        name: score
        for name, score in zip(est.feature_names_in_, est.feature_importances_)
    }
    json.dump(importance_by_name, fp, indent=4)
- We use the mean of the absolute values of the importances returned by the `permutation_importance` method, because the built-in mean (`importances_mean`) may contain negative values.
import json
import numpy as np
import pandas as pd
from sklearn.datasets import make_hastie_10_2
from sklearn.linear_model import LogisticRegression
from sklearn.inspection import permutation_importance

# Synthetic classification dataset with named columns.
X, y = make_hastie_10_2(n_samples=1000, random_state=0)
X = pd.DataFrame(X, columns=[f"Feature {i}" for i in range(X.shape[1])])
X_train, X_test = X[:800], X[800:]
y_train, y_test = y[:800], y[800:]

model = LogisticRegression(random_state=0).fit(X_train, y_train)

with open("feature_importance.json", "w") as fp:
    # Permutation importances can be negative per repeat, so average the
    # absolute values across repeats instead of using `importances_mean`.
    perm = permutation_importance(model, X_test, y_test, random_state=0)
    mean_abs = np.abs(perm.importances).mean(axis=1)
    json.dump(dict(zip(model.feature_names_in_, mean_abs)), fp, indent=4)
import json

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
# Fix: StandardScaler belongs to sklearn.preprocessing. Importing it from
# sklearn.discriminant_analysis only worked because that module happens to
# import it internally — not a public, stable location.
from sklearn.preprocessing import OneHotEncoder, StandardScaler

df = pd.read_csv("https://raw.githubusercontent.com/IBM/watson-openscale-samples/main/Cloud%20Pak%20for%20Data/WML/assets/data/credit_risk/credit_risk_training.csv")

label_column = "Risk"
feature_columns = [col for col in df.columns if col != label_column]
# Treat object/bool columns as categorical; everything else is numeric.
categorical_columns = list(
    df[feature_columns].select_dtypes(include=["object", "bool"]))
numerical_columns = [
    col for col in feature_columns if col not in categorical_columns]

X_train, X_test, y_train, y_test = train_test_split(
    df[feature_columns], df[label_column], test_size=0.33, random_state=0)

# Scale numeric columns and one-hot encode categoricals ahead of the model.
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])
categorical_transformer = Pipeline(
    steps=[("one_hot", OneHotEncoder(handle_unknown="ignore"))])
preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numerical_columns),
    ("cat", categorical_transformer, categorical_columns)])
pipeline = Pipeline(steps=[("preprocessor", preprocessor),
                           ("classifier", GradientBoostingClassifier(random_state=0))])
pipeline.fit(X_train, y_train)

with open("feature_importance.json", "w") as fp:
    # Permutation importance is computed on the raw (pre-transform) columns,
    # so the scores line up with the original feature names. Use the mean of
    # the absolute values because per-repeat importances can be negative.
    result = permutation_importance(
        pipeline, X_test, y_test, n_jobs=-1, random_state=0)
    result = np.mean(np.abs(result.importances), axis=1)
    feature_importance = dict(zip(pipeline.feature_names_in_, result))
    json.dump(feature_importance, fp, indent=4)
import json
import pandas as pd
from sklearn.datasets import make_hastie_10_2
from pyspark.sql import SparkSession
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorAssembler
from pyspark.sql.types import BooleanType, StringType
from pyspark.ml import Pipeline

spark = SparkSession.builder.appName("Dummy").getOrCreate()

# Build a synthetic dataset in pandas, then hand it over to Spark.
X, y = make_hastie_10_2(n_samples=1000, random_state=0)
X = pd.DataFrame(X, columns=[f"Feature {i}" for i in range(X.shape[1])])
X = X.assign(**{"label_column": y})
training_df = spark.createDataFrame(X)

label_column = "label_column"
feature_columns = [col for col in training_df.columns if col != label_column]
# String/boolean columns need indexing before vector assembly.
categorical_columns = [
    f.name for f in training_df.schema.fields
    if isinstance(f.dataType, (BooleanType, StringType))
    and f.name in feature_columns]
numerical_columns = [
    col for col in feature_columns if col not in categorical_columns]

model_train, model_test = training_df.randomSplit([0.9, 0.1], seed=272)

# Pipeline stages: index categoricals, index the label, assemble the feature
# vector, train the forest, then map predictions back to label strings.
stages = [
    StringIndexer(inputCol=col, outputCol=f"{col}_modified",
                  handleInvalid="keep")
    for col in categorical_columns]
label_indexer = StringIndexer(
    inputCol=label_column, outputCol="label").fit(model_train)
stages.append(label_indexer)
input_cols = [
    f"{col}_modified" if col in categorical_columns else col
    for col in feature_columns]
stages.append(VectorAssembler(
    inputCols=input_cols, outputCol="features", handleInvalid="keep"))
stages.append(RandomForestClassifier(seed=0))
stages.append(IndexToString(
    inputCol="prediction", outputCol="predicted_label",
    labels=label_indexer.labels))

model = Pipeline(stages=stages).fit(model_train)

with open("feature_importance.json", "w") as fp:
    # The classifier sits second-to-last in the fitted pipeline (the label
    # converter is last), so its importances are at stages[-2].
    importances = model.stages[-2].featureImportances
    json.dump(dict(zip(feature_columns, importances)), fp, indent=4)
import json
import pandas as pd
import xgboost as xgb
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

# California-housing regression data as a named-column DataFrame.
housing = fetch_california_housing()
features = pd.DataFrame(housing.data, columns=housing.feature_names)
target = pd.Series(housing.target)
X_train, X_test, y_train, y_test = train_test_split(
    features, target, train_size=0.9, random_state=0)

regressor = xgb.XGBRegressor()
regressor.fit(X_train, y_train)

with open("feature_importance.json", "w") as fp:
    # Pair every feature name with its importance; `.tolist()` yields plain
    # Python floats so the mapping serializes to ordinary JSON numbers.
    scores = regressor.feature_importances_.tolist()
    json.dump(dict(zip(regressor.feature_names_in_, scores)), fp, indent=4)