In [21]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import scipy.optimize
import seaborn as sn
import statistics
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import chi2, SelectKBest, GenericUnivariateSelect
from sklearn.svm import SVC
from sklearn.feature_selection import RFE, f_classif
from collections import Counter
import xgboost
# xgboost for feature importance on a regression problem
from sklearn.datasets import make_regression
from matplotlib import pyplot
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from xgboost import XGBRegressor, XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.inspection import permutation_importance
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
#import shap
from sklearn.model_selection import train_test_split
from random import seed
import pandas as pd 
import os 
from boruta import BorutaPy
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
import json


In [22]:
def read_dataset(dataset_name, data_dir='AML_2_dane'):
    """Load the training matrix and labels of one dataset from disk.

    Reads ``<data_dir>/<dataset_name>_train.data`` and
    ``<data_dir>/<dataset_name>_train.labels`` with ``np.genfromtxt``.

    Parameters
    ----------
    dataset_name : str
        Prefix of the data files (the part before ``_train.data``).
    data_dir : str, optional
        Directory that contains the files. Defaults to ``'AML_2_dane'``,
        the location that used to be hard-coded.

    Returns
    -------
    X : np.ndarray
        Contents of the ``.data`` file.
    Y : np.ndarray
        Contents of the ``.labels`` file after the ``(label + 1) / 2``
        transform (float values).
    """
    prefix = os.path.join(data_dir, f'{dataset_name}_')
    X = np.genfromtxt(prefix + 'train.data')
    Y = np.genfromtxt(prefix + 'train.labels')
    # Maps -1 -> 0 and +1 -> 1.
    # NOTE(review): assumes the label file is encoded as -1/+1 - confirm.
    Y = (Y + 1) / 2
    return X, Y
#X, Y = read_dataset('digits')

In [23]:
def scale(X_train, X_test):
    """Min-max scale both splits using statistics of the training split only.

    The scaler is fitted on ``X_train`` and then applied to both arrays, so
    no information from ``X_test`` enters the fitted ranges.

    Returns
    -------
    (X_train_scaled, X_test_scaled)
    """
    min_max = MinMaxScaler().fit(X_train)
    return min_max.transform(X_train), min_max.transform(X_test)

In [24]:
def fs_univariate(X_train, y_train, args):
    """Score every feature with the chi-squared statistic.

    Parameters
    ----------
    X_train : array-like
        Training features, passed straight to ``sklearn``'s ``chi2``.
    y_train : array-like
        Training labels.
    args : any
        Unused; kept so the signature matches the other ``fs_*`` scorers
        dispatched by ``return_fi``.

    Returns
    -------
    np.ndarray
        One chi-squared score per feature (higher = more relevant).
        NaN scores are replaced by 0.0.
    """
    scores, _ = chi2(X_train, y_train)
    # chi2 can produce NaN (e.g. 0/0 for an all-zero column). NaN is ordered
    # as the largest value by np.argpartition / np.sort, so leaving it in
    # would put uninformative features at the top of any top-k selection.
    # 0.0 is the minimum possible chi-squared value, so they rank last.
    return np.where(np.isnan(scores), 0.0, scores)

In [25]:
def fs_boruta(X_train, y_train, classif_args, boruta_args):
    """Rank features with Boruta wrapped around a random forest.

    Parameters
    ----------
    X_train, y_train : array-like
        Training data handed to ``BorutaPy.fit``.
    classif_args : dict
        Keyword arguments for ``RandomForestClassifier``.
    boruta_args : dict
        Keyword arguments for ``BorutaPy``.

    Returns
    -------
    np.ndarray
        The negated ``ranking_`` of the fitted selector.
    """
    forest = RandomForestClassifier(**classif_args)
    selector = BorutaPy(forest, **boruta_args)
    selector.fit(X_train, y_train)

    # Negate so that the result can be treated as "larger is better", like the
    # other scorers. NOTE(review): relies on BorutaPy giving the best features
    # the smallest rank - confirm against the boruta documentation.
    return -selector.ranking_


In [26]:
def fs_MCFS(X_train, y_train, mcfs_args):
    """Compute MCFS feature scores via ``return_MCFS``.

    Every entry of ``mcfs_args`` except ``'topk'`` is forwarded as a keyword
    argument; ``'topk'`` is dropped because the cut-off is applied by the
    caller, not here.

    Returns whatever ``return_MCFS`` returns.
    """
    forwarded = dict(mcfs_args.items())
    forwarded.pop('topk', None)
    return return_MCFS(X_train, y_train, **forwarded)

In [27]:
def fs_random_forest(X_train, y_train, rf_args):
    """Score features with a random forest's ``feature_importances_``.

    Every entry of ``rf_args`` except ``'topk'`` is passed to the
    ``RandomForestClassifier`` constructor; ``'topk'`` is ignored here
    because the cut-off is applied by the caller.

    Returns
    -------
    np.ndarray
        ``feature_importances_`` of the forest fitted on the training data.
    """
    forest_kwargs = {}
    for name, setting in rf_args.items():
        if name != 'topk':
            forest_kwargs[name] = setting
    forest = RandomForestClassifier(**forest_kwargs).fit(X_train, y_train)
    return forest.feature_importances_

In [28]:
def select_top_k_indices(feature_importance, topk):
    """Return the indices of the ``topk`` largest importance values.

    Parameters
    ----------
    feature_importance : array-like
        One score per feature, larger meaning more important.
    topk : int
        Number of indices to return. Must be non-negative.

    Returns
    -------
    np.ndarray
        ``topk`` integer indices, in no particular order (the result of a
        partial sort, not a full sort).

    Raises
    ------
    ValueError
        If ``topk`` is negative, or (raised by ``np.argpartition``) if it
        exceeds the number of features.
    """
    if topk < 0:
        raise ValueError(f"topk must be non-negative, got {topk}")
    if topk == 0:
        # Without this guard the slice below becomes [-0:] == [0:], which
        # returns *every* index instead of none.
        return np.empty(0, dtype=np.intp)
    return np.argpartition(feature_importance, -topk)[-topk:]

In [29]:
def vote_fs(chi2_fi, boruta_fi, MCFS_fi, random_forest_fi, topk):
    """Keep the features that at least two of the four scorers put in their top-k.

    Parameters
    ----------
    chi2_fi, boruta_fi, MCFS_fi, random_forest_fi : array-like
        Per-feature scores from each method (larger = better).
    topk : int
        Size of each method's individual shortlist.

    Returns
    -------
    np.ndarray
        Integer indices that appear in more than one shortlist, in order of
        first appearance. The array always has an integer dtype, so it can be
        used for indexing even when it is empty.
    """
    votes = Counter()
    for scores in (chi2_fi, boruta_fi, MCFS_fi, random_forest_fi):
        votes.update(select_top_k_indices(scores, topk).tolist())
    agreed = [idx for idx, n_votes in votes.items() if n_votes > 1]
    # An explicit integer dtype matters for the empty case: np.array([]) is
    # float64 and cannot be used as an index array.
    return np.array(agreed, dtype=np.intp)

In [30]:
%load_ext autoreload
%autoreload 2
from MCFS import return_MCFS

def return_fi(X_train, y_train, method, method_args):
    """Dispatch to the feature-scoring function selected by ``method``.

    Parameters
    ----------
    X_train, y_train : array-like
        Training data forwarded to the scorer.
    method : str
        One of ``'chi2'``, ``'boruta'``, ``'MCFS'``, ``'RandomForest'``.
    method_args : dict
        Scorer-specific arguments. For ``'boruta'`` it must contain the keys
        ``'classif_args'`` and ``'boruta_args'``; for the other methods it is
        passed through unchanged.

    Returns
    -------
    The per-feature scores produced by the chosen scorer.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names (previously this
        silently returned ``None``).
    """
    if method == 'chi2':
        return fs_univariate(X_train, y_train, method_args)
    if method == 'boruta':
        return fs_boruta(X_train, y_train,
                         method_args['classif_args'], method_args['boruta_args'])
    if method == 'MCFS':
        return fs_MCFS(X_train, y_train, method_args)
    if method == 'RandomForest':
        return fs_random_forest(X_train, y_train, method_args)
    raise ValueError(f"unknown feature selection method: {method!r}")





The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [31]:
def return_aml_score(balanced_accuracy, m_features_chosen, dataset='artificial'):
    """Balanced accuracy minus a dataset-specific penalty for feature count.

    The penalty is ``max(0, 0.01 * (rate * m_features_chosen - allowance))``
    with ``(rate, allowance)`` equal to ``(0.2, 1)`` for ``'artificial'`` and
    ``(0.005, 0.25)`` for ``'digits'``.

    Raises
    ------
    ValueError
        For any other ``dataset`` value.
    """
    penalty_terms = (
        ('artificial', 0.2, 1),
        ('digits', 0.005, 0.25),
    )
    for known_dataset, rate, allowance in penalty_terms:
        if dataset == known_dataset:
            penalty = max(0, 0.01*(rate*m_features_chosen-allowance))
            return balanced_accuracy - penalty
    raise ValueError(f"wrong dataset: {dataset}")


# Sanity checks: no penalty at 5 features, 0.03 penalty at 20 ('artificial').
assert return_aml_score(0.9, 5) == 0.9
assert return_aml_score(0.9, 20) == 0.87


In [32]:
def return_metrics(model, X_test, y_test, dataset):
    """Evaluate a fitted model on a test split.

    Returns
    -------
    (accuracy, balanced_accuracy, aml_score)
        ``aml_score`` is ``return_aml_score`` applied to the balanced
        accuracy, with the number of columns of ``X_test`` taken as the
        number of chosen features.
    """
    predictions = model.predict(X_test)
    n_features_used = X_test.shape[1]
    acc = accuracy_score(y_test, predictions)
    bal_acc = balanced_accuracy_score(y_test, predictions)
    return acc, bal_acc, return_aml_score(bal_acc, n_features_used, dataset)

In [33]:
def return_best_ccp_alpha(X, Y):
    """Pick the random-forest ``ccp_alpha`` with the best 5-fold CV score.

    Fifteen evenly spaced candidates between 0 and 1.0 (inclusive) are tried
    with ``GridSearchCV`` on a default ``RandomForestClassifier``.

    Returns the winning ``ccp_alpha`` value.
    """
    print('return_best_ccp_alpha')
    search = GridSearchCV(
        RandomForestClassifier(),
        {'ccp_alpha': np.linspace(0, 1.0, 15)},
        cv=5,
    )
    search.fit(X, Y)
    return search.best_params_['ccp_alpha']

In [34]:
def cv_gen(X, Y, n_splits=5, random_state=None):
    """Yield shuffled K-fold train/test splits of ``X`` and ``Y``.

    Parameters
    ----------
    X, Y : np.ndarray
        Features and labels; both are indexed with the integer index arrays
        produced by ``KFold.split``.
    n_splits : int, optional
        Number of folds (default 5, the previously hard-coded value).
    random_state : int or None, optional
        Seed for the shuffle. The default ``None`` keeps the previous
        behaviour (a different shuffle on every call); pass an integer to
        make the folds reproducible.

    Yields
    ------
    (x_train, x_test, y_train, y_test)
    """
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for train_idx, test_idx in splitter.split(X, Y):
        yield X[train_idx], X[test_idx], Y[train_idx], Y[test_idx]

In [35]:
def _make_xgboost():
    """Build a fresh XGBoost classifier (takes no arguments)."""
    return XGBClassifier(eval_metric='logloss', use_label_encoder=False)


def _make_random_forest(ccp_alpha):
    """Build a fresh random forest with the given pruning strength."""
    return RandomForestClassifier(ccp_alpha=ccp_alpha)


# Model factories keyed by name. Note the differing call signatures:
# 'xgboost' takes no arguments, 'random_forest' takes ``ccp_alpha``.
models = dict(
    xgboost=_make_xgboost,
    random_forest=_make_random_forest,
)

In [36]:
from frozendict import frozendict
# Pre-computed random-forest ccp_alpha per dataset; datasets missing here are
# tuned on the fly by return_best_ccp_alpha.
best_ccp_alphas = {'artificial': 0.0}


def grid_search(dataset, config, percentiles=(0.1,)):
    """Cross-validate every feature-selection config with every model.

    For each of the folds produced by ``cv_gen`` and each entry of
    ``config['config']``, feature scores are computed on the (min-max scaled)
    training split, the top features are kept for each percentile, and every
    model in ``models`` is fitted and evaluated on the reduced data.

    After each (fold, config) pair the accumulated results are written to
    ``results_<dataset>_loxxxxtile.json`` as ``{"results": [...]}``. Each list
    element is a copy of the config entry extended with ``"fold"`` (fold
    index) and ``"results"`` (one dict per percentile/model with the keys
    ``percentile``, ``model_name``, ``accuracy``, ``balanced_accuracy``,
    ``aml_score``).

    Parameters
    ----------
    dataset : str
        Dataset name understood by ``read_dataset`` / ``return_aml_score``.
    config : dict
        Must contain a ``'config'`` list whose entries have ``'method'`` and
        ``'method_args'``. The entries are NOT modified.
    percentiles : iterable of float, optional
        Percentages (0-100) of the feature count to keep.

    Returns
    -------
    None
    """
    X, Y = read_dataset(dataset)
    if dataset in best_ccp_alphas:
        best_ccp_alpha = best_ccp_alphas[dataset]
    else:
        best_ccp_alpha = return_best_ccp_alpha(X, Y)
    print(best_ccp_alpha)

    n_features = X.shape[1]
    n_configs = len(config['config'])
    results = []
    for idx_fold, (x_train, x_test, y_train, y_test) in enumerate(cv_gen(X, Y)):
        print(idx_fold)
        x_train, x_test = scale(x_train, x_test)
        for cfg_idx, args in enumerate(config['config']):
            print(f'{cfg_idx}/{n_configs}', end='\r')
            # MCFS configs asking for more features than exist are skipped.
            if args['method'] == 'MCFS' and args['method_args']['m'] > n_features:
                continue
            fi = return_fi(x_train, y_train, args['method'], args['method_args'])
            # Collected in a fresh list per (fold, config). Storing the list on
            # the shared `args` dict would be overwritten by the next fold and
            # would leak into later grid_search calls using the same config.
            fold_results = []
            for percentile in percentiles:
                # The integer truncation can yield 0 for small feature counts;
                # always keep at least one feature so the models have input.
                topk = max(1, int(n_features*percentile/100))
                print(topk)
                indices = select_top_k_indices(fi, topk)
                x_train_cut = x_train[:, indices]
                x_test_cut = x_test[:, indices]
                for model_name, make_model in models.items():
                    # Only the random-forest factory takes an argument.
                    if model_name == 'xgboost':
                        model = make_model()
                    else:
                        model = make_model(best_ccp_alpha)
                    model.fit(x_train_cut, y_train)
                    accuracy, balanced_accuracy, aml_score = return_metrics(
                        model, x_test_cut, y_test, dataset=dataset)
                    fold_results.append({"percentile": percentile,
                                         "model_name": model_name,
                                         "accuracy": accuracy,
                                         "balanced_accuracy": balanced_accuracy,
                                         "aml_score": aml_score
                                         })
            results.append({**args, "fold": idx_fold, "results": fold_results})
            # Rewritten after every config so partial results survive a crash.
            with open(f'results_{dataset}_loxxxxtile.json', 'w') as file:
                json.dump({"results": results}, file)
            print()

# Load the feature-selection configurations.
with open('fs_config.json', 'rb') as file:
    config = json.load(file)

# Keep every non-MCFS config; of the MCFS configs keep only u=1, v=1, m=5.
_MCFS_KEPT_SETTING = (('u', 1), ('v', 1), ('m', 5))
config['config'] = [
    conf
    for conf in config['config']
    if conf['method'] != 'MCFS'
    or all(conf['method_args'][key] == wanted for key, wanted in _MCFS_KEPT_SETTING)
]

# Run the full search on both datasets.
grid_search('artificial', config)
grid_search('digits', config)


0.0
0
0/6
1/6
2/6
3/6
4/6
5/6
1
0/6
1/6
2/6
3/6
4/6
5/6
2
0/6
1/6
2/6
3/6
4/6
5/6
3
0/6
1/6
2/6
3/6
4/6
5/6
4
0/6
1/6
2/6
3/6
4/6
5/6
return_best_ccp_alpha
0.0
0
0/6
1/6
2/6
3/6
4/6
5/6
1
0/6
1/6
2/6
3/6
4/6
5/6
2
0/6
1/6
2/6
3/6
4/6
5/6
3
0/6
1/6
2/6
3/6
4/6
5/6
4
0/6
1/6
2/6
3/6
4/6
5/6
