Advances in Financial Machine Learning (AFML), Chapter 8: Feature Importance - code snippets

8.2 MDI (Mean Decrease Impurity) Feature Importance

In [1]:
import pandas as pd 
import numpy as np

def get_mdi_importance(fit, feature_names):
    """Aggregate per-tree impurity importances of a fitted ensemble.

    Parameters
    ----------
    fit
        Fitted ensemble exposing ``estimators_``; every estimator must expose
        ``feature_importances_``.
    feature_names
        Labels assigned to the importance columns (one per feature).

    Returns
    -------
    pd.DataFrame
        Indexed by feature, with columns ``'mean'`` (average importance over
        the trees) and ``'std'`` (standard deviation divided by the square
        root of the number of trees). Both columns are divided by the sum of
        the means, so the ``'mean'`` column adds up to 1.
    """
    rows = [est.feature_importances_ for est in fit.estimators_]
    per_tree = pd.DataFrame(rows, index=list(range(len(rows))))
    per_tree.columns = feature_names

    # Mask exact zeros as NaN so that mean/std skip them instead of
    # averaging them in.
    per_tree = per_tree.replace(0, np.nan)

    n_trees = len(per_tree)
    average = per_tree.mean(axis=0)
    std_error = per_tree.std(axis=0) / np.sqrt(n_trees)
    summary = pd.DataFrame({'mean': average, 'std': std_error})

    # Normalize so the mean importances sum to one.
    summary /= summary['mean'].sum()
    return summary

8.3 MDA (Mean Decrease Accuracy) Feature Importance

In [2]:
from utils import PurgedKFold
import time

def get_mda_importance(clf, X, y, cv, sample_weight:pd.Series, t1:pd.Series, pct_embargo:float, scoring='neg_log_loss'):
    """Permutation (Mean Decrease Accuracy) importance from out-of-sample scores.

    For each fold produced by ``PurgedKFold(n_splits=cv, t1=t1,
    pct_embargo=pct_embargo)`` the classifier is fitted on the training rows
    and scored on the test rows; the test score is then recomputed once per
    feature with that feature's test values shuffled. The importance of a
    feature is the relative score loss caused by shuffling it.

    Parameters
    ----------
    clf
        Classifier with ``fit(X, y, sample_weight=...)``, ``predict``,
        ``predict_proba`` and ``classes_``. It is re-fitted in place per fold.
    X : pd.DataFrame
        Feature matrix (positionally indexed with ``iloc``).
    y : pd.Series
        Labels aligned with ``X``.
    cv : int
        Number of splits handed to ``PurgedKFold``.
    sample_weight : pd.Series
        Per-observation weights used for both fitting and scoring.
    t1 : pd.Series
        Passed through to ``PurgedKFold``.
    pct_embargo : float
        Passed through to ``PurgedKFold``.
    scoring : {'neg_log_loss', 'accuracy'}
        Metric used for the baseline and the permuted scores.

    Returns
    -------
    tuple
        ``(imp, mean_score)`` where ``imp`` is a DataFrame indexed by feature
        with columns ``'mean'`` and ``'std'`` (standard deviation across
        folds divided by the square root of the number of folds) and
        ``mean_score`` is the average unpermuted test score over the folds.

    Raises
    ------
    ValueError
        If ``scoring`` is not one of the supported metrics.
    """
    if scoring not in ['neg_log_loss', 'accuracy']:
        raise ValueError(f"scoring must be 'neg_log_loss' or 'accuracy', got {scoring}")
    from sklearn.metrics import log_loss, accuracy_score

    def _oos_score(features, labels, weights):
        # Score the currently fitted classifier with the requested metric.
        if scoring == 'neg_log_loss':
            return -log_loss(labels, clf.predict_proba(features), sample_weight=weights, labels=clf.classes_)
        return accuracy_score(labels, clf.predict(features), sample_weight=weights)

    cv_gen = PurgedKFold(n_splits=cv, t1=t1, pct_embargo=pct_embargo)
    # Explicit float dtype: containers created without a dtype start out as
    # object dtype, which makes the arithmetic and mean/std below fragile.
    scr0 = pd.Series(dtype=float)
    scr1 = pd.DataFrame(columns=X.columns, dtype=float)
    for i, (train_idx, test_idx) in enumerate(cv_gen.split(X, y)):
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]
        weights_train = sample_weight.iloc[train_idx]
        weights_test = sample_weight.iloc[test_idx].values
        clf.fit(X_train, y_train, sample_weight=weights_train)
        scr0.loc[i] = _oos_score(X_test, y_test, weights_test)
        for j in X.columns:
            # Shuffle a NumPy copy of the column and write it into a copy of
            # the test frame, so the original data is never modified in place.
            shuffled_col = X_test[j].values.copy()
            np.random.shuffle(shuffled_col)
            X1_ = X_test.copy()
            X1_[j] = shuffled_col
            scr1.loc[i, j] = _oos_score(X1_, y_test, weights_test)

    # baseline score minus permuted score, per fold and feature
    imp = (-scr1).add(scr0, axis=0)
    if scoring == 'neg_log_loss':
        imp = imp / -scr1  # (original - permuted) / permuted
    else:
        # NOTE: a permuted accuracy of exactly 1.0 makes this denominator zero.
        imp = imp / (1.0 - scr1)
    imp = pd.concat({'mean': imp.mean(axis=0).rename('mean'), 'std': imp.std(axis=0).rename('std') / np.sqrt(len(imp))}, axis=1)
    return imp, scr0.mean()

8.4 SFI (Single Feature Importance)

In [10]:
from utils import cv_score

def get_sfi(feature_names, clf, normalizd_X, y:pd.Series,sample_weight:pd.Series, scoring, cv_gen)->pd.DataFrame:
    """Single Feature Importance: cross-validated score of each feature alone.

    Parameters
    ----------
    feature_names
        Iterable of column labels of ``normalizd_X`` to evaluate.
    clf
        Classifier forwarded to ``cv_score``.
    normalizd_X : pd.DataFrame
        Feature matrix; one single-column frame per feature is scored.
    y : pd.Series
        Labels forwarded to ``cv_score``.
    sample_weight : pd.Series
        Weights forwarded to ``cv_score``.
    scoring
        Scoring identifier forwarded to ``cv_score``.
    cv_gen
        Cross-validation generator forwarded to ``cv_score``.

    Returns
    -------
    pd.DataFrame
        Float frame indexed by feature with columns ``'mean'`` (average of
        the scores returned by ``cv_score``) and ``'std'`` (their standard
        deviation divided by the square root of the number of scores).
    """
    # Create the frame as float up front: a frame built without a dtype is
    # object-typed, and cell-wise assignment keeps it that way, which breaks
    # numeric consumers of the result.
    imp = pd.DataFrame(columns=['mean', 'std'], dtype=float)
    for feature in feature_names:
        scores = cv_score(clf, X=normalizd_X[[feature]], y=y, sample_weight=sample_weight, scoring=scoring, cv_gen=cv_gen)
        imp.loc[feature, 'mean'] = scores.mean(axis=0)
        imp.loc[feature, 'std'] = scores.std(axis=0) / np.sqrt(len(scores))
    return imp

8.5 Computation of Orthogonal Features

In [4]:
def get_orthogonal_features(X:pd.DataFrame, threshold:float=0.95)->pd.DataFrame:
    """Project standardized features onto their leading principal components.

    The columns of ``X`` are standardized (mean removed, divided by the
    sample standard deviation), the eigen-decomposition of ``Z'Z`` is
    computed, and the components are kept in order of decreasing eigenvalue
    up to and including the first one at which the cumulative share of the
    eigenvalue sum reaches ``threshold``.

    Parameters
    ----------
    X : pd.DataFrame
        Observations in rows, features in columns.
    threshold : float
        Cumulative eigenvalue share at which to stop adding components.

    Returns
    -------
    pd.DataFrame
        Orthogonal features, indexed like ``X``, with one column per retained
        component labelled ``'PC_1'``, ``'PC_2'``, ...
    """
    def get_eigen(dot:pd.DataFrame, threshold:float=0.95):
        # `dot` is Z'Z and therefore symmetric: eigh is the matching solver
        # and returns real eigenvalues and eigenvectors.
        eigen_values, eigen_vectors = np.linalg.eigh(dot.to_numpy())

        # order eigenpairs by decreasing eigenvalue
        idx = eigen_values.argsort()[::-1]
        eigen_values = eigen_values[idx]
        eigen_vectors = eigen_vectors[:, idx]

        labels = ['PC_' + str(i + 1) for i in range(len(eigen_values))]
        eigen_values = pd.Series(eigen_values, index=labels)
        eigen_vectors = pd.DataFrame(eigen_vectors, index=dot.index, columns=labels)

        # keep components up to the first one reaching the threshold
        cum_var = eigen_values.cumsum() / eigen_values.sum()
        dim = cum_var.values.searchsorted(threshold)
        return eigen_values.iloc[:dim + 1], eigen_vectors.iloc[:, :dim + 1]

    Z = X.subtract(X.mean(), axis=1).div(X.std(), axis=1)
    dot = pd.DataFrame(np.dot(Z.T, Z), index=X.columns, columns=X.columns)
    _, eigen_vectors = get_eigen(dot, threshold)
    return pd.DataFrame(np.dot(Z, eigen_vectors), index=X.index, columns=eigen_vectors.columns)

8.7 Creating A Synthetic Dataset

In [5]:
from datetime import datetime

def get_test_data(n_features=40, n_informative=10, n_redundant=10, n_samples=10000):
    """Generate a synthetic classification data set with labelled feature types.

    Features come from ``sklearn.datasets.make_classification`` with
    ``random_state=0`` and ``shuffle=False``. Columns are named ``I_<k>`` for
    the first ``n_informative`` positions, ``R_<k>`` for the next
    ``n_redundant`` positions and ``N_<k>`` for the remaining ones, where
    ``<k>`` is the column position.

    Returns
    -------
    tuple
        ``(X, cont)``: ``X`` is the feature frame on a daily DatetimeIndex
        ending at the current time; ``cont`` has the columns ``'bin'``
        (labels), ``'w'`` (equal weights ``1 / n_rows``) and ``'t1'`` (a copy
        of the index).
    """
    from sklearn.datasets import make_classification

    features, labels = make_classification(
        n_samples=n_samples,
        n_features=n_features,
        n_informative=n_informative,
        n_redundant=n_redundant,
        random_state=0,
        shuffle=False,
    )
    timestamps = pd.date_range(periods=n_samples, freq='1D', end=datetime.today())

    X = pd.DataFrame(features, index=timestamps)
    cont = pd.Series(labels, index=timestamps).to_frame('bin')

    # Column labels encode the feature type by position.
    noise_start = n_informative + n_redundant
    names = [f'I_{k}' for k in range(n_informative)]
    names += [f'R_{k}' for k in range(n_informative, noise_start)]
    names += [f'N_{k}' for k in range(noise_start, n_features)]
    X.columns = names

    cont['w'] = 1. / cont.shape[0]
    cont['t1'] = pd.Series(cont.index, index=cont.index)
    return X, cont

# Build the synthetic data set with the default sizes and print the first
# rows of the feature frame and of the companion frame returned with it.
test_data, cont=get_test_data()
print(test_data.head())
print(cont.head())

                                 I_0       I_1       I_2       I_3       I_4  \
1997-12-26 21:22:15.218328  2.843740  0.456554  0.171107 -4.511382  0.278990   
1997-12-27 21:22:15.218328  3.561541 -1.566097  3.342813 -1.938909  2.075749   
1997-12-28 21:22:15.218328  7.699248 -3.030124 -0.859302 -0.033351  1.113719   
1997-12-29 21:22:15.218328 -0.149801 -3.182187  2.695894  1.359997  2.992416   
1997-12-30 21:22:15.218328 -2.157903  0.046380  0.697217 -1.012036  1.856002   

                                 I_5       I_6       I_7       I_8       I_9  \
1997-12-26 21:22:15.218328 -3.474726  2.955550  2.698865  1.542440  2.198168   
1997-12-27 21:22:15.218328 -3.486711  0.494908  0.309615  1.059439 -0.792433   
1997-12-28 21:22:15.218328 -0.877844  2.344033  4.089113  2.287786  0.611413   
1997-12-29 21:22:15.218328 -0.417971 -1.214058  1.268313 -3.720913 -2.580578   
1997-12-30 21:22:15.218328 -2.311465  2.715493  0.444433 -1.921790 -2.472372   

                            ...      N

8.8 Calling Feature Importance for any method

In [8]:
def get_feature_importance(trans_x, cont, n_estimators=1000, cv=10, max_samples=1, num_threads=24, pct_embargo=0.1, scoring='accuracy', method='SFI', min_weight_leaf=0, **kwargs):
    '''
    Feature importance of a bagged decision-tree ensemble by MDI, MDA or SFI.

    A BaggingClassifier of entropy-criterion decision trees (max_features=1,
    balanced class weights) is fitted on the full sample to obtain the
    out-of-bag score; the importance table and the out-of-sample score are
    then computed with the selected method using purged K-fold splits.

    Parameters
    ----------
    trans_x : pd.DataFrame
        Feature matrix; its columns are the feature names.
    cont : pd.DataFrame
        Must provide the columns 'bin' (labels), 'w' (sample weights) and
        't1' (passed to PurgedKFold).
    n_estimators : int
        Number of trees in the bagging ensemble.
    cv : int
        Number of purged K-fold splits.
    max_samples : int or float
        Forwarded to BaggingClassifier. NOTE: scikit-learn reads an int as an
        absolute number of samples and a float as a fraction, so the integer
        default 1 draws a single sample per tree; pass 1.0 to bootstrap from
        the whole training set.
    num_threads : int
        Accepted for compatibility; not used by this function.
    pct_embargo : float
        Forwarded to PurgedKFold.
    scoring : str
        Scoring identifier forwarded to the CV scorer / MDA routine.
    method : {'MDI', 'MDA', 'SFI'}
        Importance method to apply.
    min_weight_leaf : float
        Forwarded as min_weight_fraction_leaf of the decision tree.
    **kwargs
        Ignored; lets callers pass a shared option dictionary.

    Returns
    -------
    tuple
        (imp, oob_score, oos): the importance table, the out-of-bag score of
        the full-sample fit, and the mean out-of-sample CV score.

    Raises
    ------
    ValueError
        If method is not 'MDI', 'MDA' or 'SFI'.
    '''
    # Validate before the expensive fit; an unknown method would otherwise
    # only surface afterwards as an unbound-variable error at the return.
    if method not in ('MDI', 'MDA', 'SFI'):
        raise ValueError(f"method must be 'MDI', 'MDA' or 'SFI', got {method!r}")

    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import BaggingClassifier

    clf = DecisionTreeClassifier(criterion='entropy', max_features=1, class_weight='balanced', min_weight_fraction_leaf=min_weight_leaf)
    clf = BaggingClassifier(estimator=clf, n_estimators=n_estimators, max_features=1., max_samples=max_samples, oob_score=True)
    fit = clf.fit(X=trans_x, y=cont['bin'], sample_weight=cont['w'].values)
    oob_score = fit.oob_score_
    if method == 'MDI':
        imp = get_mdi_importance(fit, feature_names=trans_x.columns)
        oos = cv_score(clf, X=trans_x, y=cont['bin'], sample_weight=cont['w'], scoring=scoring, cv_gen=PurgedKFold(n_splits=cv, t1=cont['t1'], pct_embargo=pct_embargo)).mean()
    elif method == 'MDA':
        imp, oos = get_mda_importance(clf, X=trans_x, y=cont['bin'], cv=cv, sample_weight=cont['w'], t1=cont['t1'], pct_embargo=pct_embargo, scoring=scoring)
    else:  # 'SFI'
        cv_gen = PurgedKFold(n_splits=cv, t1=cont['t1'], pct_embargo=pct_embargo)
        oos = cv_score(clf, X=trans_x, y=cont['bin'], sample_weight=cont['w'], scoring=scoring, cv_gen=cv_gen).mean()
        imp = get_sfi(trans_x.columns, clf, trans_x, y=cont['bin'], sample_weight=cont['w'], scoring=scoring, cv_gen=cv_gen)
    return imp, oob_score, oos

8.9 Calling All components

In [9]:
from itertools import product

def test_feature_importance(n_features=40, n_informative=10, n_redundant=10, n_estimators=1000, n_samples=10000, cv=10):
    """Run the importance methods on synthetic data and save plots and a summary.

    A synthetic data set is generated with ``get_test_data``; for every
    combination of the settings in the job grid, ``get_feature_importance``
    is called, a bar chart is written to
    ``feature_importance_<method>_<scoring>.png`` and the share of absolute
    importance per feature type (first letter of the feature name) is
    recorded. The collected rows are written to ``feature_importance.csv``.

    Parameters
    ----------
    n_features, n_informative, n_redundant, n_samples : int
        Forwarded to ``get_test_data``.
    n_estimators : int
        Number of trees, forwarded to ``get_feature_importance``.
    cv : int
        Number of CV splits, forwarded to ``get_feature_importance``.
    """
    trans_x, cont = get_test_data(n_features=n_features, n_informative=n_informative, n_redundant=n_redundant, n_samples=n_samples)
    dict0 = {'min_weight_leaf': [0.], 'scoring': ['accuracy'], 'method': ['MDA', 'SFI'], 'max_samples': [1.]}
    # one job per combination of the grid values
    jobs = (dict(zip(dict0.keys(), values)) for values in product(*dict0.values()))
    kargs = {'n_estimators': n_estimators, 'tag': 'test_func', 'cv': cv}

    def plot_feature_importance(imp, oob, oos, method, scoring, **kargs):
        # Horizontal bar chart of mean importance with std as error bars,
        # saved to disk; extra keyword arguments are accepted and ignored.
        import matplotlib.pyplot as plt
        plt.figure(figsize=(10, imp.shape[0]/5))
        imp = imp.sort_values('mean', ascending=True)
        ax = imp['mean'].plot(xerr=imp['std'], error_kw={'ecolor': 'r'}, color='b', kind='barh')
        if method == 'MDI':
            plt.xlim([0, imp.sum(axis=1).max()])
            # reference line at the uniform importance 1 / number of features
            plt.axvline(1/len(imp), color='r', linestyle='--')
        for i, j in zip(ax.patches, imp.index):
            ax.text(i.get_width()/2, i.get_y()+i.get_height()/2, j, ha='center', va='center', fontsize=10)
        plt.title(f"{method} Feature Importance\nOOB: {oob:.4f}, OOS: {oos:.4f}", fontsize=15)
        plt.savefig(f"feature_importance_{method}_{scoring}.png", dpi=300)
        plt.clf()
        plt.close()

    out = []
    for job in jobs:
        # Merge the job into the shared options first so that n_estimators
        # and cv actually reach get_feature_importance.
        kargs.update(job)
        imp, oob, oos = get_feature_importance(trans_x=trans_x, cont=cont, **kargs)
        # Guarantee a numeric frame for plotting and the arithmetic below.
        imp = imp.astype(float)
        plot_feature_importance(imp, oob, oos, **kargs)
        df0 = imp[['mean']]/imp[['mean']].abs().sum()
        df0['type'] = [i[0] for i in df0.index]
        # Select the 'mean' column so the result is a flat {type: share}
        # mapping and 'I', 'R', 'N' become columns of the summary.
        df0 = df0.groupby('type')['mean'].sum().to_dict()
        df0.update({'oob': oob, 'oos': oos})
        df0.update(job)
        out.append(df0)
    out = pd.DataFrame(out).sort_values(['method', 'scoring', 'max_samples', 'min_weight_leaf'])
    # A list is required to select several columns; a tuple is looked up as
    # one single key.
    out = out[['method', 'scoring', 'max_samples', 'I', 'R', 'N', 'oob', 'oos']]
    out.to_csv('feature_importance.csv')


# Run the feature-importance experiment; every argument is spelled out and
# equals the function's default value.
test_feature_importance(n_features=40, n_informative=10, n_redundant=10, n_estimators=1000, n_samples=10000, cv=10)



    

  max_t1_idx = self.t1.index.searchsorted(self.t1[test_indices].max())
  max_t1_idx = self.t1.index.searchsorted(self.t1[test_indices].max())
  max_t1_idx = self.t1.index.searchsorted(self.t1[test_indices].max())
  max_t1_idx = self.t1.index.searchsorted(self.t1[test_indices].max())
  max_t1_idx = self.t1.index.searchsorted(self.t1[test_indices].max())
  max_t1_idx = self.t1.index.searchsorted(self.t1[test_indices].max())
  max_t1_idx = self.t1.index.searchsorted(self.t1[test_indices].max())
  max_t1_idx = self.t1.index.searchsorted(self.t1[test_indices].max())
  max_t1_idx = self.t1.index.searchsorted(self.t1[test_indices].max())
  max_t1_idx = self.t1.index.searchsorted(self.t1[test_indices].max())
  max_t1_idx = self.t1.index.searchsorted(self.t1[test_indices].max())
  max_t1_idx = self.t1.index.searchsorted(self.t1[test_indices].max())
  max_t1_idx = self.t1.index.searchsorted(self.t1[test_indices].max())
  max_t1_idx = self.t1.index.searchsorted(self.t1[test_indices].max())
  max_

TypeError: get_sfi() got an unexpected keyword argument 'sample_weight'