In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the training split from the local data folder and preview its first three rows.
train = pd.read_csv(filepath_or_buffer='./data/adult_train.csv')
train.head(n=3)

Unnamed: 0,age,workclass,fnlwgt,education.num,marital.status,race,sex,hours.per.week,native.country,income
0,48,0,80430,7,0,0,0,30,1,1
1,18,0,166889,10,0,1,0,35,1,1
2,33,-1,295621,13,1,1,1,25,1,0


In [3]:
# Read the held-out split from the local data folder and preview its first three rows.
test = pd.read_csv(filepath_or_buffer='./data/adult_test.csv')
test.head(n=3)

Unnamed: 0,age,workclass,fnlwgt,education.num,marital.status,race,sex,hours.per.week,native.country,income
0,19,0,410543,9,0,0,0,40,1,1
1,58,0,238438,10,1,0,1,42,1,1
2,46,1,216414,10,1,0,1,40,1,0


In [4]:
# Separate the 'income' target from the predictor columns, for each split in turn.
y_train = train['income']
X_train = train.drop(columns='income')

y_test = test['income']
X_test = test.drop(columns='income')

<h1>CLASS</h1>

In [54]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, f_classif, VarianceThreshold, mutual_info_classif, SelectFromModel
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import  RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, average_precision_score
import warnings

class Classification_Evaluator:
    """Compare several feature-selection strategies on one classification dataset.

    A uniformly random ``dummy`` column is appended to a private copy of the
    training features and serves as a noise baseline: the mutual-information
    filter and the random-forest filter keep only the features that score
    strictly higher than that column.  An L1-penalised logistic regression and
    backward/forward sequential selection over five classifiers provide three
    more opinions.  ``parse_summary`` joins everything into one table and counts,
    per feature, how many strategies retained it.

    Typical use::

        eva = Classification_Evaluator()
        eva.run_evaluation(X_train, y_train, X_test, y_test)
        eva.summary

    Attributes set by ``run_evaluation``:
        X_train, y_train, X_test, y_test: the data being evaluated
            (``X_train`` is a copy that carries the extra dummy column).
        wrapper_threshold: minimum number of models that must pick a feature
            for it to survive the sequential-selection steps.
        report: one-column frame listing every feature name (dummy included).
        info_features, lasso_features, rf_features, bfs_features, ffs_features:
            per-strategy frames keyed by ``features``.
        summary: the joined table plus a ``chosen`` count, dummy row removed.
    """

    # Name of the synthetic noise column used as the selection baseline.
    DUMMY_FEATURE = 'dummy'

    def __init__(self, random_state=79):
        """Create an evaluator.

        Args:
            random_state: seed shared by the dummy column and by every
                estimator that accepts one, so repeated runs agree.
        """
        self.random_state = random_state
        self.wrapper_threshold = 2
        self.X_train = None
        self.y_train = None
        self.X_test = None
        self.y_test = None
        self.report = None
        self.summary = None

    def _dummy_score(self, importances, column):
        """Return the score in ``column`` that belongs to the dummy feature."""
        is_dummy = importances['features'] == self.DUMMY_FEATURE
        return importances.loc[is_dummy, column].values[0]

    def information_filter(self):
        """Keep the features whose mutual information with the target beats the dummy's.

        Stores the surviving rows (columns ``features`` and ``info_value``) in
        ``self.info_features``.
        """
        scores = mutual_info_classif(
            self.X_train, self.y_train, random_state=self.random_state
        )
        importances = pd.DataFrame(
            {'features': self.X_train.columns, 'info_value': scores}
        )
        # The dummy's own score is the bar; the strict '>' also drops the dummy itself.
        threshold = self._dummy_score(importances, 'info_value')
        self.info_features = importances[importances['info_value'] > threshold]

    def lasso_embedded(self):
        """Keep the features retained by SelectFromModel over an L1 logistic regression.

        Stores the retained features and their fitted coefficients (columns
        ``features`` and ``lasso``) in ``self.lasso_features``.  No dummy
        threshold is applied here, so the dummy column can be among them.
        """
        lr = LogisticRegression(
            C=1, penalty='l1', solver='liblinear', random_state=self.random_state
        )
        lasso = SelectFromModel(lr).fit(self.X_train, self.y_train)
        importances = pd.DataFrame(
            {
                'features': self.X_train.columns,
                'lasso': lasso.estimator_.coef_.reshape(-1),
            }
        )
        kept = importances['features'].isin(lasso.get_feature_names_out())
        self.lasso_features = importances[kept].reset_index(drop=True)

    def rf_embedded(self):
        """Keep the features whose random-forest importance beats the dummy's.

        Stores the surviving rows (columns ``features`` and ``rf``) in
        ``self.rf_features``.
        """
        rf = RandomForestClassifier(n_estimators=100, random_state=self.random_state)
        rf.fit(self.X_train, self.y_train)
        importances = pd.DataFrame(
            {'features': self.X_train.columns, 'rf': rf.feature_importances_}
        )
        threshold = self._dummy_score(importances, 'rf')
        self.rf_features = importances[importances['rf'] > threshold]

    def _sequential_select(self, model, model_name, forward):
        """Run one sequential feature selection and flag the chosen features with 1.0."""
        selector = SequentialFeatureSelector(
            model, k_features='best', forward=forward, n_jobs=-1
        )
        selector.fit(self.X_train, self.y_train)
        chosen = list(selector.k_feature_names_)
        return pd.DataFrame({'features': chosen, model_name: np.ones(len(chosen))})

    def bfs_wrapper(self, model, model_name):
        """Use Backward Feature Selection to select the best set of features.

        Args:
            model: unfitted estimator handed to the sequential selector.
            model_name (str): name of the indicator column in the result.

        Returns:
            feature_names (pd.DataFrame): one row per selected feature, with
                columns ``features`` and ``model_name`` (always 1.0).
        """
        return self._sequential_select(model, model_name, forward=False)

    def ffs_wrapper(self, model, model_name):
        """Use Forward Feature Selection to select the best set of features.

        Args:
            model: unfitted estimator handed to the sequential selector.
            model_name (str): name of the indicator column in the result.

        Returns:
            feature_names (pd.DataFrame): one row per selected feature, with
                columns ``features`` and ``model_name`` (always 1.0).
        """
        return self._sequential_select(model, model_name, forward=True)

    def _wrapper_models(self):
        """Return fresh ``(estimator, short_name)`` pairs for the wrapper methods."""
        seed = self.random_state
        return [
            (LogisticRegression(max_iter=200, random_state=seed), 'lr'),
            (GaussianNB(), 'nb'),
            (DecisionTreeClassifier(random_state=seed), 'dt'),
            (RandomForestClassifier(n_estimators=10, random_state=seed), 'rf'),
            (LGBMClassifier(n_estimators=10, random_state=seed, verbose=-1), 'lgbm'),
        ]

    def _compute_wrapper(self, select, score_name):
        """Count how many models pick each feature and keep those at/above the threshold.

        Args:
            select: ``self.bfs_wrapper`` or ``self.ffs_wrapper``.
            score_name (str): name of the vote-count column in the result.

        Returns:
            pd.DataFrame with columns ``features`` and ``score_name``.
        """
        # Use the evaluator's own training frame rather than a notebook global,
        # so the feature list always matches the data handed to run_evaluation.
        wrapper_summary = pd.DataFrame({'features': self.X_train.columns})
        for model, name in self._wrapper_models():
            wrapper_summary = wrapper_summary.merge(
                select(model, name), how='left', on='features'
            )
        # Unselected features are NaN after the left joins; sum() skips them,
        # so the row total equals the number of models that chose the feature.
        wrapper_summary[score_name] = (
            wrapper_summary.set_index('features').sum(axis=1).values
        )
        enough_votes = wrapper_summary[score_name] >= self.wrapper_threshold
        return (
            wrapper_summary.loc[enough_votes, ['features', score_name]]
            .reset_index(drop=True)
        )

    def compute_bfs_wrapper(self):
        """Run backward selection with every wrapper model; store result in ``self.bfs_features``."""
        self.bfs_features = self._compute_wrapper(self.bfs_wrapper, 'bfs')

    def compute_ffs_wrapper(self):
        """Run forward selection with every wrapper model; store result in ``self.ffs_features``."""
        self.ffs_features = self._compute_wrapper(self.ffs_wrapper, 'ffs')

    def parse_summary(self):
        """Join every strategy's result onto the feature list and count the votes.

        Stores the table in ``self.summary``.  ``chosen`` is the number of
        strategy columns that are non-null for the feature; the dummy row is
        removed after the count.
        """
        summary = self.report
        per_strategy = (
            self.info_features,
            self.lasso_features,
            self.rf_features,
            self.bfs_features,
            self.ffs_features,
        )
        for selected in per_strategy:
            summary = summary.merge(selected, how='left', on='features')
        summary['chosen'] = summary.set_index('features').notna().sum(axis=1).values
        self.summary = summary[summary['features'] != self.DUMMY_FEATURE]

    def run_evaluation(self, X_train, y_train, X_test, y_test, wrapper_threshold=2):
        """Run every selection strategy and print the combined summary.

        Args:
            X_train (pd.DataFrame): training features; left unmodified.
            y_train: training target.
            X_test, y_test: stored on the instance; not used by any strategy here.
            wrapper_threshold (int): minimum number of wrapper models that must
                select a feature for it to be kept by the bfs/ffs steps.
        """
        # Work on a copy so the caller's frame does not silently gain a 'dummy' column.
        self.X_train = X_train.copy()
        self.y_train = y_train
        self.X_test = X_test
        self.y_test = y_test
        self.wrapper_threshold = wrapper_threshold

        # Seeded noise baseline: the same seed gives the same thresholds on every run.
        rng = np.random.default_rng(self.random_state)
        self.X_train[self.DUMMY_FEATURE] = rng.random(self.X_train.shape[0])

        self.report = pd.DataFrame({'features': self.X_train.columns})

        self.information_filter()
        self.lasso_embedded()
        self.rf_embedded()
        self.compute_bfs_wrapper()
        self.compute_ffs_wrapper()
        self.parse_summary()
        print(self.summary)

<h2>Sandbox</h2>

In [55]:
# Build an evaluator and run every selection strategy on the adult-income split;
# run_evaluation prints the resulting summary table itself.
eva = Classification_Evaluator()
eva.run_evaluation(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

         features  info_value     lasso        rf  bfs  ffs  chosen
0             age    0.069497 -0.029078       NaN  3.0  3.0       4
1       workclass         NaN -0.164057       NaN  NaN  3.0       2
2          fnlwgt    0.027057       NaN  0.183483  NaN  NaN       2
3   education.num    0.068587 -0.372961       NaN  5.0  5.0       4
4  marital.status    0.105473 -2.266892       NaN  5.0  5.0       4
5            race         NaN  0.144219       NaN  NaN  NaN       1
6             sex    0.031221 -0.209407       NaN  2.0  NaN       3
7  hours.per.week    0.040722 -0.032432       NaN  3.0  3.0       4
8  native.country    0.004726 -0.276219       NaN  2.0  2.0       4


In [None]:
# Left-join each strategy's surviving features onto the full feature list so the
# per-strategy scores can be inspected side by side (dummy row still included).
(
    eva.report
    .merge(right=eva.info_features, on='features', how='left')
    .merge(right=eva.lasso_features, on='features', how='left')
    .merge(right=eva.rf_features, on='features', how='left')
    .merge(right=eva.bfs_features, on='features', how='left')
    .merge(right=eva.ffs_features, on='features', how='left')
)

Unnamed: 0,features,info_value,lasso,rf,bfs,ffs
0,age,0.074455,-0.029007,,3.0,3.0
1,workclass,0.002439,-0.165206,,2.0,3.0
2,fnlwgt,0.028112,,0.183108,,
3,education.num,0.06704,-0.372747,,5.0,5.0
4,marital.status,0.106396,-2.263822,,5.0,5.0
5,race,0.007573,0.144999,,,
6,sex,0.028074,-0.209469,,,
7,hours.per.week,0.043479,-0.032357,,3.0,3.0
8,native.country,0.002385,-0.273757,,2.0,
9,dummy,,-0.094079,,2.0,3.0


In [37]:
# `temp` is not assigned by any cell shown in this notebook, so this cell only
# worked from leftover kernel state. Rebuild it here from the evaluator's
# per-strategy results so the cell runs on a fresh kernel.
temp = eva.report
for selected_features in (
    eva.info_features,
    eva.lasso_features,
    eva.rf_features,
    eva.bfs_features,
    eva.ffs_features,
):
    temp = temp.merge(selected_features, how='left', on='features')

# Per feature: how many strategy columns are non-null, i.e. how many kept it.
temp.set_index('features').notna().sum(axis=1).values

array([4, 4, 2, 4, 4, 2, 2, 4, 3, 3], dtype=int64)

In [33]:
# One row per feature, one column per selection strategy; NaN marks a feature
# that the strategy in that column did not keep.
(
    eva.report
    .merge(right=eva.info_features, on='features', how='left')
    .merge(right=eva.lasso_features, on='features', how='left')
    .merge(right=eva.rf_features, on='features', how='left')
    .merge(right=eva.bfs_features, on='features', how='left')
    .merge(right=eva.ffs_features, on='features', how='left')
)

Unnamed: 0,features,info_value,lasso,rf,bfs,ffs
0,age,0.074455,-0.029007,,3.0,3.0
1,workclass,0.002439,-0.165206,,2.0,3.0
2,fnlwgt,0.028112,,0.183108,,
3,education.num,0.06704,-0.372747,,5.0,5.0
4,marital.status,0.106396,-2.263822,,5.0,5.0
5,race,0.007573,0.144999,,,
6,sex,0.028074,-0.209469,,,
7,hours.per.week,0.043479,-0.032357,,3.0,3.0
8,native.country,0.002385,-0.273757,,2.0,
9,dummy,,-0.094079,,2.0,3.0
