In [1]:
from UCIDataset import UCIDataset
from EvolutionaryWrapperFeatureSelection import EvolutionaryWrapperFeatureSelection
from SurrogateAssistedWrapperFeatureSelection import SurrogateAssistedWrapperFeatureSelection
from sklearn.datasets import make_classification
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
import numpy as np

# Synthetic Data
Feature selection is performed on synthetic data whose informative features are known. In the example below, a binary classification dataset of 1000 instances is created in which only the first two features (indexes [0, 1]) are informative and the rest are random noise. Three wrapper feature selection methods are used to identify the informative features using a Decision Tree classifier:
<ul>
<li>SAGA</li>
<li>CHC</li>
<li>GA</li>
</ul>

In [60]:
def createDataset(n_samples, n_features, n_classes, n_informative, classifier, random_state=1):
    """Generate a synthetic classification problem and wrap it in a ``UCIDataset``.

    The data come from ``sklearn.datasets.make_classification`` with no
    redundant and no repeated features. A summary of the problem and the
    test accuracy reported by the dataset object is printed.

    Parameters
    ----------
    n_samples : int
        Number of instances to generate.
    n_features : int
        Total number of feature columns.
    n_classes : int
        Number of target classes.
    n_informative : int
        Number of informative features.
    classifier : estimator
        Classifier forwarded to ``UCIDataset.divideDataset``.
    random_state : int, optional
        Seed forwarded to ``make_classification``. Defaults to 1, the value
        that was previously hard-coded, so existing calls are unaffected.

    Returns
    -------
    UCIDataset
        The dataset object after ``divideDataset`` has been called on it.
    """
    # shuffle=False keeps the generated columns in order; with n_redundant=0
    # and n_repeated=0 the informative features are therefore columns
    # 0 .. n_informative-1, which is what the summary below reports.
    X, y = make_classification(n_samples=n_samples, n_features=n_features, n_classes=n_classes,
                               n_informative=n_informative, n_redundant=0, n_repeated=0,
                               random_state=random_state, shuffle=False)

    # Features first, class labels appended as the last column.
    df = pd.DataFrame(X)
    df['labels'] = pd.Series(y)
    # NOTE(review): -1 presumably designates the last column as the label and
    # header=None that the frame has no header row -- confirm against UCIDataset.
    label = -1
    header = None

    # The split is deferred (divide_dataset=False) so it can be triggered
    # explicitly with the caller's classifier right below.
    dataset = UCIDataset(df, 'df', label, divide_dataset=False, header=header)
    dataset.divideDataset(classifier,
                          normalize=True,
                          shuffle=True,
                          all_features=True,
                          all_instances=True,
                          evaluate=True,
                          partial_sample=False)

    print('Number of classes:', n_classes)
    print('Number of instances:', n_samples)
    print('Number of features:', n_features)
    print('Informative features:', list(range(0, n_informative)))
    # getTestAccuracy() is scaled by 100 and rounded to report a percentage.
    accuracy = np.round(100* dataset.getTestAccuracy(), 2)
    print('Test accuracy before feature selection: %', accuracy)

    return dataset

In [61]:
# 1000 instances, 100 features, 2 classes, 2 informative features,
# evaluated with a seeded decision tree.
dataset = createDataset(
    n_samples=1000,
    n_features=100,
    n_classes=2,
    n_informative=2,
    classifier=DecisionTreeClassifier(random_state=0),
)

Number of classes: 2
Number of instances: 1000
Number of features: 100
Informative features: [0, 1]
Test accuracy before feature selection: % 79.5


# SAGA

In [62]:
# Run SAGA on the synthetic dataset; timeout=np.inf means no time limit is
# passed to the search.
log, population = SurrogateAssistedWrapperFeatureSelection.SAGA(
    dataset,
    populationSize=40,
    a=16,
    reductionRate=0.5,
    step=10,
    verbose=0,
    evaluation='validation',
    noChange=100,
    timeout=np.inf,
)

# The last row of the log supplies the reported solution and the run time.
feature_subset = np.array(log.iloc[-1]['best_solution'])
elapsed_time = np.round(log.iloc[-1]['time'], 2)

# evaluate() is called in 'test' mode and its first returned element is
# reported as a percentage.
# NOTE(review): presumably that element is the accuracy -- confirm.
accuracy = np.round(
    100 * EvolutionaryWrapperFeatureSelection.evaluate(
        feature_subset, 'feature_selection', 'test', dataset, 1
    )[0],
    2,
)

print('Test accuracy: %', accuracy)
print('Computation time: ', elapsed_time, 'sec')
# Positions where the solution vector equals 1 are the selected features.
print('Selected features indexes: ', np.where(feature_subset == 1)[0])

Test accuracy: % 85.0
Computation time:  9.11 sec
Selected features indexes:  [ 0  1 22]


# CHC

In [73]:
# Run CHC on the synthetic dataset; timeout=np.inf means no time limit is
# passed to the search.
log, population = EvolutionaryWrapperFeatureSelection.CHC(
    dataset,
    populationSize=40,
    d=False,
    divergence=0.35,
    evaluation='validation',
    maxNochange=100,
    verbose=0,
    timeout=np.inf,
)

# The last row of the log supplies the reported solution and the run time.
feature_subset = np.array(log.iloc[-1]['best_solution'])
elapsed_time = np.round(log.iloc[-1]['time'], 2)

# evaluate() is called in 'test' mode and its first returned element is
# reported as a percentage.
# NOTE(review): presumably that element is the accuracy -- confirm.
accuracy = np.round(
    100 * EvolutionaryWrapperFeatureSelection.evaluate(
        feature_subset, 'feature_selection', 'test', dataset, 1
    )[0],
    2,
)

print('Test accuracy: %', accuracy)
print('Computation time: ', elapsed_time, 'sec')
# Positions where the solution vector equals 1 are the selected features.
print('Selected features indexes: ', np.where(feature_subset == 1)[0])

Test accuracy: % 84.0
Computation time:  19.45 sec
Selected features indexes:  [ 0  1  4  7 11 22]


# GA

In [64]:
# Run the GA on the synthetic dataset; timeout=np.inf means no time limit is
# passed to the search.
log, population = EvolutionaryWrapperFeatureSelection.GA(
    dataset,
    populationSize=40,
    crossOverP=0.9,
    mutationP=0.1,
    evaluation='validation',
    maxNochange=100,
    verbose=0,
    timeout=np.inf,
)

# The last row of the log supplies the reported solution and the run time.
feature_subset = np.array(log.iloc[-1]['best_solution'])
elapsed_time = np.round(log.iloc[-1]['time'], 2)

# evaluate() is called in 'test' mode and its first returned element is
# reported as a percentage.
# NOTE(review): presumably that element is the accuracy -- confirm.
accuracy = np.round(
    100 * EvolutionaryWrapperFeatureSelection.evaluate(
        feature_subset, 'feature_selection', 'test', dataset, 1
    )[0],
    2,
)

print('Test accuracy: %', accuracy)
print('Computation time: ', elapsed_time, 'sec')
# Positions where the solution vector equals 1 are the selected features.
print('Selected features indexes: ', np.where(feature_subset == 1)[0])

Test accuracy: % 82.0
Computation time:  69.26 sec
Selected features indexes:  [ 0  1  3  4 20 21]
