In [1]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import OrdinalEncoder

from featimp import (
    get_corr_importances,
    get_chi2_crosstab_importances,
    get_chi2_importances,
    get_anova_importances,
    get_mutual_info_importances,
    get_ml_importances,
    get_permutation_importances,
    get_feature_importances,
    )

# Notebook-wide display setup: seaborn's default theme, all warnings silenced,
# and a light-to-green colormap used to shade the styled tables in later cells.
sns.set()
warnings.filterwarnings("ignore")
cm = sns.light_palette(color="green", as_cmap=True)

# Data Preparations

In [2]:
# Build the demo frame from scikit-learn's diabetes dataset.
diabetes_data = load_diabetes()
df_diabetes = pd.DataFrame(data=diabetes_data.data, columns=diabetes_data.feature_names)

# `sex` takes two values in this dataset, one positive and one negative.
# Label by sign rather than by matching the float's string representation (which
# depends on how the value happens to be printed), and assign the result back to
# the column instead of calling `replace(..., inplace=True)` on a column
# selection, which does not update the frame under pandas copy-on-write.
df_diabetes['sex'] = np.where(df_diabetes['sex'] > 0, 'M', 'F')

# Pure-noise categorical column. The generator is seeded so the column -- and
# every importance score computed from it -- is the same on each fresh run.
df_diabetes['random_cat_col'] = np.random.default_rng(0).choice(
    ['A', 'B', 'C'], size=len(df_diabetes), p=[0.9, 0.05, 0.05]
)

# Two targets derived from the same values: the raw number for regression and
# four equal-width bins of it for multi-class classification.
df_diabetes['target_reg'] = diabetes_data['target']
df_diabetes['target_clf'] = pd.cut(diabetes_data['target'], 4)
df_diabetes.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,random_cat_col,target_reg,target_clf
0,0.038076,M,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646,A,151.0,"(105.25, 185.5]"
1,-0.001882,F,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204,A,75.0,"(24.679, 105.25]"
2,0.085299,M,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593,A,141.0,"(105.25, 185.5]"
3,-0.089063,F,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362,C,206.0,"(185.5, 265.75]"
4,0.005383,F,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641,A,135.0,"(105.25, 185.5]"


In [3]:
# Split the predictor columns by dtype: object columns are treated as
# categorical, float columns as numeric.
cat_features = df_diabetes.select_dtypes('object').columns.tolist()
num_features = df_diabetes.select_dtypes('float').columns.tolist()
# The regression target is a float column too, but it is not a predictor.
num_features.remove("target_reg")
features = [*num_features, *cat_features]
print("cat_features:", cat_features)
print("num_features:", num_features)

cat_features: ['sex', 'random_cat_col']
num_features: ['age', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']


# Correlation Importances

In [4]:
# Score the numeric features against the regression target and shade the table.
corr_df = get_corr_importances(
    data=df_diabetes,
    num_features=num_features,
    target='target_reg',
)
corr_df.style.background_gradient(cmap=cm)

Unnamed: 0,Corr
bmi,0.58645
s5,0.565883
bp,0.441484
s4,0.430453
s6,0.382483
s1,0.212022
age,0.187889
s2,0.174054
s3,-0.394789


# Chi2 Crosstab Importances

In [11]:
# Score the categorical features against the binned (classification) target.
# This call takes the un-encoded frame, unlike the chi2 cell further down.
cat_corr_df = get_chi2_crosstab_importances(
    data=df_diabetes,
    cat_features=cat_features,
    target='target_clf',
)
cat_corr_df.style.background_gradient(cmap=cm)

Unnamed: 0,Chi_Square_Crosstab
sex,0.0
random_cat_col,0.0


# Chi2 Importances

In [10]:
# Ordinal-encode the categorical features together with the binned target so
# both are numeric before they are handed to get_chi2_importances.
df_diabetes_enc = pd.DataFrame(
    OrdinalEncoder().fit_transform(df_diabetes[[*cat_features, 'target_clf']]),
    columns=[*cat_features, 'target_clf'],
)
chi2_df = get_chi2_importances(
    data=df_diabetes_enc,
    features=cat_features,
    target='target_clf',
)
chi2_df.style.background_gradient(cmap=cm)

Unnamed: 0,Neg_Chi_Square
sex,-0.743798
random_cat_col,-1.0262


# ANOVA Importances

In [14]:
# Fully numeric copy of the data: ordinal-encoded categorical features and
# binned target, followed by the untouched numeric features and the raw
# regression target.
df_diabetes_enc = pd.concat(
    [
        pd.DataFrame(
            OrdinalEncoder().fit_transform(df_diabetes[[*cat_features, 'target_clf']]),
            columns=[*cat_features, 'target_clf'],
        ),
        df_diabetes[num_features],
    ],
    axis=1,
).assign(target_reg=df_diabetes['target_reg'])
df_diabetes_enc.head()

Unnamed: 0,sex,random_cat_col,target_clf,age,bmi,bp,s1,s2,s3,s4,s5,s6,target_reg
0,1.0,0.0,1.0,0.038076,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646,151.0
1,0.0,0.0,0.0,-0.001882,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204,75.0
2,1.0,0.0,1.0,0.085299,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593,141.0
3,0.0,0.0,2.0,-0.089063,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362,206.0
4,0.0,0.0,1.0,0.005383,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641,135.0


In [8]:
# ANOVA scores for the numeric features against the encoded classification target.
anova_df = get_anova_importances(
    data=df_diabetes_enc,
    features=num_features,
    target='target_clf',
)
anova_df.style.background_gradient(cmap=cm)

Unnamed: 0,ANOVA
bmi,216.274934
s5,195.29013
bp,102.341164
s4,88.822664
s3,73.436993
s6,71.660065
s1,21.119417
age,14.437754
s2,13.22727


In [15]:
# ANOVA scores the other way round: encoded categorical features against the
# numeric regression target.
anova_df = get_anova_importances(
    data=df_diabetes_enc,
    features=cat_features,
    target='target_reg',
)
anova_df.style.background_gradient(cmap=cm)

Unnamed: 0,ANOVA
sex,0.817423
random_cat_col,0.029448


# Mutual Information Importances

In [12]:
# Rebuild the fully numeric frame so this section does not depend on an earlier
# one: encoded categoricals and binned target, then numeric features, then the
# raw regression target.
df_diabetes_enc = pd.concat(
    [
        pd.DataFrame(
            OrdinalEncoder().fit_transform(df_diabetes[[*cat_features, 'target_clf']]),
            columns=[*cat_features, 'target_clf'],
        ),
        df_diabetes[num_features],
    ],
    axis=1,
).assign(target_reg=df_diabetes['target_reg'])
df_diabetes_enc.head()

Unnamed: 0,sex,random_cat_col,target_clf,age,bmi,bp,s1,s2,s3,s4,s5,s6,target_reg
0,1.0,0.0,1.0,0.038076,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646,151.0
1,0.0,0.0,0.0,-0.001882,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204,75.0
2,1.0,0.0,1.0,0.085299,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593,141.0
3,0.0,0.0,2.0,-0.089063,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362,206.0
4,0.0,0.0,1.0,0.005383,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641,135.0


In [13]:
# Mutual-information scores for the encoded categorical features against the
# encoded classification target.
mi_scores_df = get_mutual_info_importances(
    data=df_diabetes_enc,
    features=cat_features,
    target='target_clf',
)
mi_scores_df.style.background_gradient(cmap=cm)

Unnamed: 0,MI Scores
sex,0.02996
random_cat_col,0.0


# Machine Learning Importances

In [4]:
# Rebuild the fully numeric frame for the model-based importances: encoded
# categoricals and binned target, then numeric features, then the raw
# regression target.
df_diabetes_enc = pd.concat(
    [
        pd.DataFrame(
            OrdinalEncoder().fit_transform(df_diabetes[[*cat_features, 'target_clf']]),
            columns=[*cat_features, 'target_clf'],
        ),
        df_diabetes[num_features],
    ],
    axis=1,
).assign(target_reg=df_diabetes['target_reg'])
df_diabetes_enc.head()

Unnamed: 0,sex,random_cat_col,target_clf,age,bmi,bp,s1,s2,s3,s4,s5,s6,target_reg
0,1.0,0.0,1.0,0.038076,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646,151.0
1,0.0,0.0,0.0,-0.001882,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204,75.0
2,1.0,0.0,1.0,0.085299,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593,141.0
3,0.0,0.0,2.0,-0.089063,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362,206.0
4,0.0,0.0,1.0,0.005383,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641,135.0


In [5]:
# Model-based importances for the classification target using the 'LGBM' model
# option, with 10 folds of type 'skf'.
ml_importance_df = get_ml_importances(
    data=df_diabetes_enc,
    num_features=num_features,
    cat_features=cat_features,
    target='target_clf',
    fold_type='skf',
    nfold=10,
    model_name='LGBM',
    task='clf_multiable',
    random_state=0,
    early_stopping_rounds=100,
)
ml_importance_df.style.background_gradient(cmap=cm)

Unnamed: 0,LGBM Imp.
bmi,191.8
s5,170.6
bp,161.7
s6,160.2
s3,133.2
s2,118.3
age,101.2
s1,92.8
sex,30.8
s4,26.6


In [6]:
# Same classification setup as the LGBM run, switching only the model option
# to 'CATBOOST'.
ml_importance_df = get_ml_importances(
    data=df_diabetes_enc,
    num_features=num_features,
    cat_features=cat_features,
    target='target_clf',
    fold_type='skf',
    nfold=10,
    model_name='CATBOOST',
    task='clf_multiable',
    random_state=0,
    early_stopping_rounds=100,
)
ml_importance_df.style.background_gradient(cmap=cm)

Unnamed: 0,CATBOOST Imp.
bmi,19.467726
s5,17.187104
bp,10.967325
s6,10.084667
s3,8.734228
age,6.552651
random_cat_col,6.489186
sex,6.100396
s2,5.542121
s1,4.986113


In [7]:
# Model-based importances for the regression target using the 'LGBM' model
# option, with 10 folds of type 'kf'.
ml_importance_df = get_ml_importances(
    data=df_diabetes_enc,
    num_features=num_features,
    cat_features=cat_features,
    target='target_reg',
    fold_type='kf',
    nfold=10,
    model_name='LGBM',
    task='reg',
    random_state=0,
    early_stopping_rounds=100,
)
ml_importance_df.style.background_gradient(cmap=cm)

Unnamed: 0,LGBM Imp.
bmi,63.7
s5,52.6
bp,49.5
age,36.4
s6,34.3
s3,33.8
s2,28.6
s1,28.1
sex,15.3
s4,9.3


In [8]:
# Same regression setup as the LGBM run, switching only the model option to
# 'CATBOOST'.
ml_importance_df = get_ml_importances(
    data=df_diabetes_enc,
    num_features=num_features,
    cat_features=cat_features,
    target='target_reg',
    fold_type='kf',
    nfold=10,
    model_name='CATBOOST',
    task='reg',
    random_state=0,
    early_stopping_rounds=100,
)
ml_importance_df.style.background_gradient(cmap=cm)

Unnamed: 0,CATBOOST Imp.
bmi,26.915529
s5,23.87156
bp,11.311633
s6,6.401471
age,5.725008
s3,5.700651
sex,5.261912
s4,4.994234
s2,4.064067
s1,3.006027


# Permutation Importances

In [13]:
# Rebuild the fully numeric frame for the permutation importances: encoded
# categoricals and binned target, then numeric features, then the raw
# regression target.
df_diabetes_enc = pd.concat(
    [
        pd.DataFrame(
            OrdinalEncoder().fit_transform(df_diabetes[[*cat_features, 'target_clf']]),
            columns=[*cat_features, 'target_clf'],
        ),
        df_diabetes[num_features],
    ],
    axis=1,
).assign(target_reg=df_diabetes['target_reg'])
df_diabetes_enc.head()

Unnamed: 0,sex,random_cat_col,target_clf,age,bmi,bp,s1,s2,s3,s4,s5,s6,target_reg
0,1.0,0.0,1.0,0.038076,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646,151.0
1,0.0,0.0,0.0,-0.001882,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204,75.0
2,1.0,0.0,1.0,0.085299,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593,141.0
3,0.0,0.0,2.0,-0.089063,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362,206.0
4,0.0,0.0,1.0,0.005383,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641,135.0


In [20]:
# Permutation importances for the classification target: a seeded
# RandomForestClassifier as the base model, scored with 'f1_macro', using
# 10 folds of type 'skf' and 30 permutation repeats.
permutation_importance_df = get_permutation_importances(
    data=df_diabetes_enc,
    features=features,
    target='target_clf',
    fold_type='skf',
    nfold=10,
    score='f1_macro',
    model_base=RandomForestClassifier(random_state=0),
    random_state=0,
    n_repeats=30,
)
permutation_importance_df.style.background_gradient(cmap=cm)

Unnamed: 0,PI mean,PI std
bmi,0.066768,0.054883
s5,0.06484,0.05665
bp,0.024554,0.038514
s3,0.0196,0.046757
s4,0.017601,0.034553
sex,0.015422,0.020577
s6,0.0113,0.047009
s2,0.004398,0.039534
random_cat_col,0.002033,0.008549
s1,-0.003461,0.041008


In [21]:
# Permutation importances for the regression target: a seeded
# RandomForestRegressor as the base model, scored with
# 'neg_mean_squared_error', using 10 folds of type 'kf' and 30 permutation
# repeats.
permutation_importance_df = get_permutation_importances(
    data=df_diabetes_enc,
    features=features,
    target='target_reg',
    fold_type='kf',
    nfold=10,
    score='neg_mean_squared_error',
    model_base=RandomForestRegressor(random_state=0),
    random_state=0,
    n_repeats=30,
)
permutation_importance_df.style.background_gradient(cmap=cm)

Unnamed: 0,PI mean,PI std
s5,1280.513798,532.720683
bmi,1247.791547,525.334353
bp,205.11425,249.412983
sex,32.049594,75.344128
s6,21.391434,130.214613
s3,15.135266,109.23031
random_cat_col,5.163639,18.953468
s4,-3.970418,52.749463
s1,-14.122122,69.584769
age,-17.160692,100.371133


# All Feature Importances

In [4]:
# One combined table for the classification target: method='all', with the
# ML model option left at its default. Takes the un-encoded frame.
fi_df = get_feature_importances(
    data=df_diabetes,
    num_features=num_features,
    cat_features=cat_features,
    target='target_clf',
    task='clf_multiable',
    method='all',
)
fi_df.style.background_gradient(cmap=cm)

Unnamed: 0,Chi_Square_Crosstab,Neg_Chi_Square,ANOVA,MI Scores,LGBM Imp.,PI mean,PI std,Rank
bmi,,,216.274934,,155.1,0.055647,0.044177,0.5
s5,,,195.29013,,139.3,0.051065,0.057382,0.454173
bp,,,102.341164,,129.8,0.008784,0.041906,0.263009
sex,0.0,-0.743798,,0.0,26.8,0.009843,0.018465,0.246504
s3,,,73.436993,,118.1,0.004823,0.04202,0.216701
s6,,,71.660065,,133.9,-0.006157,0.042642,0.205388
s2,,,13.22727,,92.7,0.001826,0.034333,0.132094
s4,,,88.822664,,22.6,0.004127,0.032856,0.123169
age,,,14.437754,,83.4,-0.010223,0.033209,0.093143
s1,,,21.119417,,73.6,-0.011799,0.046015,0.084015


In [5]:
# Combined classification table again, this time selecting 'CATBOOST' as the
# ML model option.
fi_df = get_feature_importances(
    data=df_diabetes,
    num_features=num_features,
    cat_features=cat_features,
    target='target_clf',
    task='clf_multiable',
    ml_model_name='CATBOOST',
    method='all',
)
fi_df.style.background_gradient(cmap=cm)

Unnamed: 0,Chi_Square_Crosstab,Neg_Chi_Square,ANOVA,MI Scores,CATBOOST Imp.,PI mean,PI std,Rank
bmi,,,216.274934,,20.875226,0.074789,0.050668,0.5
s5,,,195.29013,,18.602812,0.05465,0.055484,0.417308
sex,0.0,-0.743798,,0.02734,6.012535,0.012536,0.021909,0.217749
bp,,,102.341164,,10.981627,0.026724,0.039149,0.204678
random_cat_col,0.0,-2.798769,,0.034552,5.655815,-0.004545,0.00811,0.17823
s3,,,73.436993,,8.616563,0.0252,0.047472,0.153648
s6,,,71.660065,,8.815415,0.017033,0.042963,0.137058
s4,,,88.822664,,5.160295,0.009987,0.039452,0.099092
s2,,,13.22727,,4.586831,0.00827,0.043885,0.027591
s1,,,21.119417,,4.521221,0.001693,0.038055,0.019584


In [4]:
# One combined table for the regression target: method='all', with the ML
# model option left at its default.
fi_df = get_feature_importances(
    data=df_diabetes,
    num_features=num_features,
    cat_features=cat_features,
    target='target_reg',
    task='reg',
    method='all',
)
fi_df.style.background_gradient(cmap=cm)

Unnamed: 0,Corr,ANOVA,LGBM Imp.,PI mean,PI std,Rank
bmi,0.58645,,56.2,9.672992,3.664179,0.75
s5,0.565883,,47.4,9.644869,3.950961,0.70456
bp,0.441484,,42.6,1.720011,2.109985,0.453293
s6,0.382483,,27.5,0.002761,1.305385,0.327577
sex,,0.817423,12.9,0.351298,0.564636,0.322723
age,0.187889,,28.6,-0.280145,0.887938,0.275866
s2,0.174054,,23.3,-0.028917,0.748511,0.25483
s4,0.430453,,7.1,0.053208,0.496548,0.249497
s1,0.212022,,21.5,-0.331727,0.675928,0.248858
s3,-0.394789,,27.7,0.347834,0.971806,0.139064


In [5]:
# Combined regression table again, this time selecting 'CATBOOST' as the ML
# model option.
fi_df = get_feature_importances(
    data=df_diabetes,
    num_features=num_features,
    cat_features=cat_features,
    target='target_reg',
    task='reg',
    ml_model_name='CATBOOST',
    method='all',
)
fi_df.style.background_gradient(cmap=cm)

Unnamed: 0,Corr,ANOVA,CATBOOST Imp.,PI mean,PI std,Rank
bmi,0.58645,,26.242357,9.570208,3.804563,0.747399
s5,0.565883,,24.215704,9.675716,3.972392,0.723123
bp,0.441484,,11.443582,1.111112,2.005078,0.343969
sex,,0.817423,5.361172,0.370069,0.52808,0.297701
s6,0.382483,,6.703249,0.045953,1.179232,0.252073
s4,0.430453,,5.211297,0.066261,0.491222,0.248867
age,0.187889,,5.038524,-0.467018,0.949063,0.172078
s2,0.174054,,4.020709,-0.155082,0.792254,0.165375
s1,0.212022,,3.386964,-0.308417,0.669489,0.164503
s3,-0.394789,,5.55061,-0.05851,0.984329,0.039159


In [4]:
# Restrict the combined classification table to an explicit subset of methods
# instead of 'all'.
selected_methods = ['chi2_crosstab', 'anova', 'ml']
fi_df = get_feature_importances(
    data=df_diabetes,
    num_features=num_features,
    cat_features=cat_features,
    target='target_clf',
    task='clf_multiable',
    method=selected_methods,
)
fi_df.style.background_gradient(cmap=cm)

Unnamed: 0,Chi_Square_Crosstab,ANOVA,LGBM Imp.,Rank
bmi,,216.274934,179.8,0.666667
s5,,195.29013,161.1,0.597159
bp,,102.341164,153.8,0.430883
s6,,71.660065,152.6,0.378266
s3,,73.436993,134.5,0.34725
random_cat_col,0.025817,,2.0,0.333333
s2,,13.22727,109.2,0.200975
age,,14.437754,102.0,0.189464
s4,,88.822664,25.2,0.167596
s1,,21.119417,81.9,0.16275


In [6]:
# Another explicit method subset for the classification target, this time
# including permutation importance ('pi') in place of the ML model.
selected_methods = ['chi2_crosstab', 'pi', 'anova']
fi_df = get_feature_importances(
    data=df_diabetes,
    num_features=num_features,
    cat_features=cat_features,
    target='target_clf',
    task='clf_multiable',
    method=selected_methods,
)
fi_df.style.background_gradient(cmap=cm)

Unnamed: 0,Chi_Square_Crosstab,PI mean,PI std,ANOVA,Rank
bmi,,0.089876,0.04788,216.274934,0.666667
s5,,0.063195,0.057227,195.29013,0.536622
random_cat_col,0.025817,-0.003158,0.00964,,0.333333
s3,,0.037803,0.043488,73.436993,0.245603
bp,,0.023798,0.044219,102.341164,0.242875
s4,,0.021099,0.032474,88.822664,0.211012
s6,,0.011098,0.042057,71.660065,0.147006
sex,0.0,0.01398,0.022119,,0.061407
s2,,0.012867,0.038025,13.22727,0.057417
age,,0.005275,0.031961,14.437754,0.032203
