In [1]:
from UCIDataset import UCIDataset
from EvolutionaryWrapperFeatureSelection import EvolutionaryWrapperFeatureSelection
from SurrogateAssistedWrapperFeatureSelection import SurrogateAssistedWrapperFeatureSelection
from sklearn.datasets import make_classification
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Synthetic Data
Feature selection is performed on synthetic data whose informative features are known. In the example below, a binary classification dataset of 1000 instances is created in which only the first two features (indices [0, 1]) are informative and the rest are random noise. Three wrapper feature selection methods are used to identify the informative features with a Decision Tree classifier:
<ul>
<li>SAGA</li>
<li>CHC</li>
<li>GA</li>
</ul>

In [2]:
def createDataset(n_samples, n_features, n_classes, n_informative, classifier, random_state=1):
    """Generate a synthetic classification problem and wrap it in a ``UCIDataset``.

    The data comes from ``sklearn.datasets.make_classification`` with no
    redundant or repeated features and ``shuffle=False``. Without shuffling,
    scikit-learn places the informative features first, so the informative
    columns are exactly indexes ``0 .. n_informative - 1`` and every remaining
    column is noise.

    Parameters
    ----------
    n_samples : int
        Number of instances to generate.
    n_features : int
        Total number of feature columns.
    n_classes : int
        Number of target classes.
    n_informative : int
        Number of informative features (the leading columns).
    classifier : estimator
        Classifier instance handed to ``UCIDataset.divideDataset``.
    random_state : int, optional
        Seed passed to ``make_classification``. Defaults to 1, the value that
        was previously hard-coded, so existing calls produce the same data.

    Returns
    -------
    UCIDataset
        The dataset object after ``divideDataset`` has been called on it.
    """
    X, y = make_classification(n_samples=n_samples, n_features=n_features, n_classes=n_classes,
                               n_informative=n_informative, n_redundant=0, n_repeated=0,
                               random_state=random_state, shuffle=False)

    # Features keep their integer column names; the target is appended as the last column.
    df = pd.DataFrame(X)
    df['labels'] = pd.Series(y)

    # The third positional argument (-1) is the label position; presumably it selects
    # the last column, i.e. 'labels' -- TODO confirm against UCIDataset.
    dataset = UCIDataset(df, 'df', -1, divide_dataset=False, header=None)
    dataset.divideDataset(classifier,
                          normalize=True,
                          shuffle=True,
                          all_features=True,
                          all_instances=True,
                          evaluate=True,
                          partial_sample=False,
                          folds=10)

    print('Number of classes:', n_classes)
    print('Number of instances:', n_samples)
    print('Number of features:', n_features)
    print('Informative features:', list(range(n_informative)))
    # getTestAccuracy() is scaled by 100 for display; presumably it returns a fraction in [0, 1].
    accuracy = np.round(100 * dataset.getTestAccuracy(), 2)
    print('Test accuracy before feature selection: %', accuracy)

    return dataset

In [3]:
# Settings passed unchanged to each feature-selection run further down.
evaluation = 'validation'
verbose = 1
alpha = 0.88  # forwarded as `alpha` to SAGA/CHC/GA; its meaning is defined by those methods

# 1000 instances, 100 features, 2 classes, 2 informative features, evaluated with a decision tree.
dataset = createDataset(n_samples=1000,
                        n_features=100,
                        n_classes=2,
                        n_informative=2,
                        classifier=DecisionTreeClassifier(random_state=0))

Number of classes: 2
Number of instances: 1000
Number of features: 100
Informative features: [0, 1]
Test accuracy before feature selection: % 81.0


# SAGA

In [4]:
# Run the SAGA wrapper feature selection on the synthetic dataset.
log, population = SurrogateAssistedWrapperFeatureSelection.SAGA(dataset,
                                                                populationSize=4,
                                                                a=16,
                                                                reductionRate=1,
                                                                step=10,
                                                                alpha=alpha,
                                                                verbose=verbose,
                                                                evaluation=evaluation,
                                                                noChange=100,
                                                                timeout=np.inf)

# Read the solution and the elapsed time from the last row of the log.
feature_subset = np.array(log.iloc[-1]['best_solution'])
elapsed_time = np.round(log.iloc[-1]['time'], 2)

# Score the selected subset on the test split; the first element of the result is scaled to percent.
accuracy = np.round(100*EvolutionaryWrapperFeatureSelection.evaluate(feature_subset, 'feature_selection', 'test', dataset, 1)[0], 2)

# Start the summary on a fresh line: the verbose progress output may leave the
# cursor on an unfinished line, which would otherwise be mixed into the summary.
print()
print('Test accuracy: %', accuracy)
print('Computation time: ', elapsed_time, 'sec')
# Entries equal to 1 in the solution vector mark the selected feature indexes.
print('Selected features indexes: ', np.where(feature_subset == 1)[0])

Current Approx Sample Size: 37
Current Population Size: 4
[10, 0.04687619209289551, -156.32, 'NA', 1, [1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0], -153.68, 49, 0]
The best individual is saved -153.68
Number of features in selected individual:  40
[20, 0.07776236534118652, -139.76, 'NA', 2, [0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1], -140.64, 46, 0]
The best individual is saved -140.64
Number of features in selected individual:  33
The approximation converged!Gen =  10 
Current App

# CHC

In [4]:
# Run the CHC wrapper feature selection on the synthetic dataset.
log, population = EvolutionaryWrapperFeatureSelection.CHC(dataset,
                                                          populationSize=40,
                                                          d=False,
                                                          divergence=0.35,
                                                          alpha=alpha,
                                                          evaluation=evaluation,
                                                          maxNochange=100,
                                                          verbose=verbose,
                                                          timeout=np.inf)

# Read the solution and the elapsed time from the last row of the log.
feature_subset = np.array(log.iloc[-1]['best_solution'])
elapsed_time = np.round(log.iloc[-1]['time'], 2)

# Score the selected subset on the test split; the first element of the result is scaled to percent.
accuracy = np.round(100*EvolutionaryWrapperFeatureSelection.evaluate(feature_subset, 'feature_selection', 'test', dataset, 1)[0], 2)

# Start the summary on a fresh line: the verbose "Gen = ..." progress output is
# rewritten in place, so without this newline the first summary line overwrites
# a stale progress line and the reported accuracy becomes unreadable.
print()
print('Test accuracy: %', accuracy)
print('Computation time: ', elapsed_time, 'sec')
# Entries equal to 1 in the solution vector mark the selected feature indexes.
print('Selected features indexes: ', np.where(feature_subset == 1)[0])

Test accuracy: % 77.5.0 , Gen =  384 , Gen =  199   71.0 , Gen =  350 
Computation time:  36.75 sec
Selected features indexes:  [1 2 3 7 9]


# GA

In [5]:
# Run the GA wrapper feature selection on the synthetic dataset.
log, population = EvolutionaryWrapperFeatureSelection.GA(dataset,
                                                         populationSize=40,
                                                         crossOverP=0.9,
                                                         mutationP=0.1,
                                                         alpha=alpha,
                                                         evaluation=evaluation,
                                                         maxNochange=100,
                                                         verbose=verbose,
                                                         timeout=np.inf)

# Read the solution and the elapsed time from the last row of the log.
feature_subset = np.array(log.iloc[-1]['best_solution'])
elapsed_time = np.round(log.iloc[-1]['time'], 2)

# Score the selected subset on the test split; the first element of the result is scaled to percent.
accuracy = np.round(100*EvolutionaryWrapperFeatureSelection.evaluate(feature_subset, 'feature_selection', 'test', dataset, 1)[0], 2)

# Start the summary on a fresh line: the verbose "Gen = ..." progress output is
# rewritten in place, so without this newline the first summary line overwrites
# a stale progress line and the reported accuracy becomes unreadable.
print()
print('Test accuracy: %', accuracy)
print('Computation time: ', elapsed_time, 'sec')
# Entries equal to 1 in the solution vector mark the selected feature indexes.
print('Selected features indexes: ', np.where(feature_subset == 1)[0])

Test accuracy: % 84.529.76 , Gen =  113 
Computation time:  33.72 sec
Selected features indexes:  [ 1  3  6  7 14 20 21 22 23 24 26 32 35 40 42 43 47 48 55 64 67 70 73 81]
