In [1]:
from UCIDataset import UCIDataset
from EvolutionaryWrapperFeatureSelection import EvolutionaryWrapperFeatureSelection
from SurrogateAssistedWrapperFeatureSelection import SurrogateAssistedWrapperFeatureSelection
from sklearn.datasets import make_classification
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Synthetic Data
Feature selection is performed on synthetic data whose informative features are known. In the example below, a binary classification dataset of 5000 instances and 100 features is created, of which only the first two features (indices [0, 1]) are informative and the rest are random noise. Three wrapper feature selection methods are used to identify the informative features using a Decision Tree classifier:
<ul>
<li>SAGA</li>
<li>CHC</li>
<li>GA</li>
</ul>

In [2]:
def createDataset(n_samples, n_features, n_classes, n_informative, classifier):
    """Generate a synthetic classification problem and wrap it in a ``UCIDataset``.

    The data comes from ``sklearn.datasets.make_classification`` with no
    redundant or repeated features, a fixed ``random_state=1`` and
    ``shuffle=False``; per the scikit-learn documentation, without shuffling
    the informative features are the leading columns, which is why the
    summary reports ``[0, ..., n_informative - 1]`` as informative.

    Parameters
    ----------
    n_samples : int
        Number of instances to generate.
    n_features : int
        Total number of feature columns.
    n_classes : int
        Number of target classes.
    n_informative : int
        Number of informative features.
    classifier : estimator
        Forwarded as the first argument of ``UCIDataset.divideDataset``.

    Returns
    -------
    UCIDataset
        The wrapped dataset after ``divideDataset`` has been called on it.

    Side effects
    ------------
    Prints the problem dimensions, the informative feature indexes and the
    value of ``dataset.getTestAccuracy()`` expressed as a percentage.
    """
    features, targets = make_classification(
        n_samples=n_samples,
        n_features=n_features,
        n_informative=n_informative,
        n_redundant=0,
        n_repeated=0,
        n_classes=n_classes,
        shuffle=False,
        random_state=1,
    )

    # Features keep their integer column names; the target goes last.
    frame = pd.DataFrame(features)
    frame['labels'] = pd.Series(targets)

    # label=-1 and header=None are handed to UCIDataset as-is
    # (presumably "label is the last column, no header row" - confirm in UCIDataset).
    dataset = UCIDataset(frame, 'df', -1, divide_dataset=False, header=None)

    # Options forwarded verbatim to divideDataset; their semantics live in UCIDataset.
    split_options = dict(
        normalize=True,
        shuffle=True,
        all_features=True,
        all_instances=True,
        evaluate=True,
        partial_sample=False,
        folds=10,
    )
    dataset.divideDataset(classifier, **split_options)

    print('Number of classes:', n_classes)
    print('Number of instances:', n_samples)
    print('Number of features:', n_features)
    print('Informative features:', list(range(0, n_informative)))

    baseline_pct = np.round(100 * dataset.getTestAccuracy(), 2)
    print('Test accuracy before feature selection: %', baseline_pct)

    return dataset

In [69]:
# Experiment setup shared by the SAGA, CHC and GA cells below.
dataset = createDataset(
    n_samples=5000,
    n_features=100,
    n_classes=2,
    n_informative=2,
    classifier=DecisionTreeClassifier(random_state=0),
)
alpha = 0.88               # forwarded as `alpha` to every selection routine
verbose = 0                # forwarded as `verbose` to every selection routine
evaluation = 'validation'  # forwarded as `evaluation` to every selection routine

Number of classes: 2
Number of instances: 5000
Number of features: 100
Informative features: [0, 1]
Test accuracy before feature selection: % 82.27


# SAGA

In [70]:
# Run SAGA on the shared dataset with the settings from the setup cell.
log, population = SurrogateAssistedWrapperFeatureSelection.SAGA(
    dataset,
    populationSize=40,
    a=16,
    reductionRate=0.5,
    step=10,
    alpha=alpha,
    verbose=verbose,
    evaluation=evaluation,
    noChange=10,
    timeout=np.inf,
)

# The last row of the log supplies both the reported solution and its timing.
feature_subset = np.array(log.iloc[-1]['best_solution'])
elapsed_time = np.round(log.iloc[-1]['time'], 2)

# Re-evaluate the chosen subset on the 'test' split; element [0] of the
# returned value is reported as the accuracy, scaled to a percentage.
accuracy = np.round(
    100 * EvolutionaryWrapperFeatureSelection.evaluate(
        feature_subset, 'feature_selection', 'test', dataset, 1
    )[0],
    2,
)

print()
print('Test accuracy: %', accuracy)
print('Solution found in: ', elapsed_time, 'sec')
# Positions where the subset mask equals 1.
print('Selected features indexes: ', np.nonzero(feature_subset == 1)[0])


Test accuracy: % 81.92
Solution found in:  5.03 sec
Selected features indexes:  [ 0  1  2  7 12 15 21 23 26 27 30 32 34 35 37 42 48 54 58 63 66 69 72 74
 77 80 86 90 95 97 99]


# CHC

In [71]:
# Run CHC on the shared dataset with the settings from the setup cell.
log, population, d = EvolutionaryWrapperFeatureSelection.CHC(
    dataset,
    populationSize=40,
    d=False,
    divergence=0.35,
    alpha=alpha,
    evaluation=evaluation,
    maxNochange=10,
    verbose=verbose,
    timeout=np.inf,
)

# The last row of the log supplies both the reported solution and its timing.
feature_subset = np.array(log.iloc[-1]['best_solution'])
elapsed_time = np.round(log.iloc[-1]['time'], 2)

# Re-evaluate the chosen subset on the 'test' split; element [0] of the
# returned value is reported as the accuracy, scaled to a percentage.
accuracy = np.round(
    100 * EvolutionaryWrapperFeatureSelection.evaluate(
        feature_subset, 'feature_selection', 'test', dataset, 1
    )[0],
    2,
)

print()
print('Test accuracy: %', accuracy)
print('Solution found in: ', elapsed_time, 'sec')
# Positions where the subset mask equals 1.
print('Selected features indexes: ', np.nonzero(feature_subset == 1)[0])


Test accuracy: % 80.76
Solution found in:  33.08 sec
Selected features indexes:  [ 0  1  3  4  5  7 10 11 14 16 21 25 26 30 31 34 35 41 43 49 56 58 59 60
 64 68 69 72 73 75 76 78 79 83 87 91 92 94]


# GA

In [None]:
# Run the plain GA on the shared dataset with the settings from the setup cell.
log, population = EvolutionaryWrapperFeatureSelection.GA(dataset,
                                                         populationSize=40,
                                                         crossOverP=0.9,
                                                         mutationP=0.1,
                                                         alpha=alpha,
                                                         evaluation=evaluation,
                                                         maxNochange=10,
                                                         verbose=verbose,
                                                         timeout=np.inf)

# Take the reported solution and its timing from the same (last) log row,
# exactly as the SAGA and CHC cells do, so the three timings are comparable.
final_entry = log.iloc[-1]
feature_subset = np.array(final_entry['best_solution'])

# Previously read from log.iloc[-10], which mismatched the solution row above
# and raised IndexError whenever the log held fewer than 10 rows.
elapsed_time = np.round(final_entry['time'], 2)

# Re-evaluate the chosen subset on the 'test' split; element [0] of the
# returned value is reported as the accuracy, scaled to a percentage.
accuracy = np.round(100*EvolutionaryWrapperFeatureSelection.evaluate(feature_subset, 'feature_selection', 'test', dataset, 1)[0], 2)

print()
print('Test accuracy: %', accuracy)
print('Solution found in: ', elapsed_time, 'sec')
print('Selected features indexes: ', np.where(feature_subset == 1)[0])