In [1]:
from Dataset import Dataset
from Evolution import Evolution
from sklearn.tree import DecisionTreeClassifier 
from sklearn.datasets import make_classification
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import logging

logging.getLogger("imported_module").setLevel(logging.CRITICAL)
np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)  

# Instructions

In this notebook, feature selection can be carried out for a number of UCI datasets. To test a particular dataset, uncomment the corresponding line in the next cell. Four wrapper feature selection methods are used to identify the informative features using a Decision Tree classifier:

* ## CHC$_{QX}$: 
The hyper-parameter choices of CHC$_{QX}$ are based on the paper “Fast Genetic Algorithm For Feature Selection - A Qualitative Approximation Approach”. The values are set to $q=10$ and $f=10$.

* ## PSO$_{QX}$: 
The hyper-parameter choices of PSO$_{QX}$ are based on the paper “Fast Genetic Algorithm For Feature Selection - A Qualitative Approximation Approach”. The values are set to $q=10$ and $f=10$.

* ## CHC: 
The implementation of the CHC algorithm follows the paper: “The CHC Adaptive Search Algorithm: How to Have Safe Search When Engaging in Nontraditional Genetic Recombination”. The population size is 50, the diversity parameter is set to $(d = \frac{k}{4})$, where $k$ is the length of the individual (number of features), while the divergence rate is $(div = 0.35)$.

* ## PSO:
The global version of PSO is used, with a topology connecting all particles to one another. The following options are used: \{c1: 1.49618, c2: 1.49618, w: 0.7298\}, while the number of particles is set to 50.

In [11]:
# ---- Dataset selection ------------------------------------------------------
# Keep exactly one `file_name, sep, label` assignment active; the alternatives
# stay commented out. The three values are forwarded positionally to Dataset.
work_dir = 'UCI Datasets/Shuffled/'

#file_name, sep, label = 'covtype.data', ',', -1
#file_name, sep, label = 'adult.data', ',', -1
#file_name, sep, label = 'census-income.data', ',', -1
#file_name, sep, label = 'dota2Train.csv', ',', 0
#file_name, sep, label = 'diabetic_data.csv', ',', -1
file_name, sep, label, header = 'bank-full.csv', ',', -1, 0
#file_name, sep, label = 'connect-4.data', ',', -1

# NOTE(review): `header` is unpacked above (0 for bank-full.csv) but the loader
# below is still called with header=None — confirm which value is intended.

# ---- Classifier used by the wrapper methods ---------------------------------
#classifier = KNeighborsClassifier(n_neighbors=1, n_jobs=-1)
classifier = DecisionTreeClassifier(random_state=0)
#classifier = RandomForestClassifier(n_estimators=11, random_state=0, n_jobs=-1)

# ---- Load the file, then split it explicitly --------------------------------
# The constructor is told not to divide the data (divide_dataset=False); the
# split is requested in the separate call right after it.
dataset = Dataset(f'{work_dir}{file_name}', sep, label, divide_dataset=False, header=None)
dataset.divide_dataset(
    classifier,
    normalize=True,
    shuffle=False,
    all_features=True,
    all_instances=True,
    evaluate=True,
    partial_sample=False,
)

# ---- Settings shared by the search cells below ------------------------------
task, target_dataset = 'feature_selection', 'validation'
population_size = 50
# One position per column of the training matrix.
ind_size = dataset.X_train.shape[1]

## CHC$_{QX}$

In [12]:
# Run the CHC_QX search on the prepared dataset.
# The positional values 10, 10 presumably correspond to q=10 and f=10 from the
# notebook's instructions — TODO confirm the parameter order (and the meaning
# of the trailing 2) against Evolution.CHCqx.
log, baseline_full_data = Evolution.CHCqx(
    dataset,
    10,
    10,
    2,
    population_size,
    verbose=1,
)

Meta-model sample size: 6781
Best Individual =  0.8998 , Gen =  20 

In [13]:
# Read the last row of the search log: its 'ind' entry is taken as the selected
# feature mask and its 'time' entry as the elapsed time.
final_row = log.iloc[-1]
feature_subset = np.array(final_row['ind'])
elapsed_time = np.round(final_row['time'], 2)

# Score the mask on the 'test' split; the first element of the result is
# reported as a percentage.
test_score = Evolution.evaluate(feature_subset, 'feature_selection', 'test', dataset)[0]
accuracy = np.round(test_score * 100, 2)

print()
print('Test accuracy: %', accuracy)
print('Solution found in: ', elapsed_time, 'sec')
# Positions whose mask value equals 1 are the selected features.
print('Selected features indexes: ', np.nonzero(feature_subset == 1)[0])


Test accuracy: % 89.19
Solution found in:  6.85 sec
Selected features indexes:  [ 6  8 14 15]


## PSO$_{QX}$

In [15]:
# Swarm options: c1/c2/w as listed in the notebook's instructions; 'k' is tied
# to the population size and 'p' is fixed at 2.
options = dict(c1=1.49618, c2=1.49618, w=0.7298, k=population_size, p=2)

# Run the PSO_QX search. The positional 10, 10 presumably map to q=10 and f=10
# from the instructions — TODO confirm against Evolution.PSOqx.
log, baseline_full_data = Evolution.PSOqx(
    dataset,
    options,
    10,
    10,
    2,
    population_size,
    verbose=1,
)

Meta-model sample size: 6781
Best Individual =  0.8996 , Step =  10 

In [16]:
# Read the last row of the search log: its 'ind' entry is taken as the selected
# feature mask and its 'time' entry as the elapsed time.
final_row = log.iloc[-1]
feature_subset = np.array(final_row['ind'])
elapsed_time = np.round(final_row['time'], 2)

# Score the mask on the 'test' split; the first element of the result is
# reported as a percentage.
test_score = Evolution.evaluate(feature_subset, 'feature_selection', 'test', dataset)[0]
accuracy = np.round(test_score * 100, 2)

print()
print('Test accuracy: %', accuracy)
print('Solution found in: ', elapsed_time, 'sec')
# Positions whose mask value equals 1 are the selected features.
print('Selected features indexes: ', np.nonzero(feature_subset == 1)[0])


Test accuracy: % 89.17
Solution found in:  4.53 sec
Selected features indexes:  [ 1  3 15]


## CHC

In [17]:
# Individual length = number of columns in the training matrix.
ind_size = dataset.X_train.shape[1]
# Initial difference threshold d = k / 4 (integer division), as described in
# the notebook's instructions.
d = ind_size // 4

# Toolbox first, then the initial population — this order is kept as-is because
# either helper may draw from the random state.
toolbox = Evolution.create_toolbox(task, target_dataset, dataset, baseline_full_data)
population = Evolution.create_population(population_size, ind_size)

# Run plain CHC; the call hands back the log together with the updated
# population and threshold, which overwrite the inputs above.
log, population, d = Evolution.CHC(
    dataset,
    toolbox,
    d,
    population,
    verbose=1,
    max_no_change=10,
)

Best Individual =  0.8998 , Gen =  20 

In [18]:
# Read the last row of the CHC log: here the feature mask is stored under
# 'best_solution' (not 'ind'), and the elapsed time under 'time'.
final_row = log.iloc[-1]
feature_subset = np.array(final_row['best_solution'])
elapsed_time = np.round(final_row['time'], 2)

# Score the mask on the 'test' split; the first element of the result is
# reported as a percentage.
test_score = Evolution.evaluate(feature_subset, 'feature_selection', 'test', dataset)[0]
accuracy = np.round(test_score * 100, 2)

print()
print('Test accuracy: %', accuracy)
print('Solution found in: ', elapsed_time, 'sec')
# Positions whose mask value equals 1 are the selected features.
print('Selected features indexes: ', np.nonzero(feature_subset == 1)[0])


Test accuracy: % 89.19
Solution found in:  13.78 sec
Selected features indexes:  [ 4  6  8 14 15]


## PSO

In [19]:
# Swarm options: c1/c2/w as listed in the notebook's instructions; 'k' is tied
# to the population size and 'p' is fixed at 2.
options = dict(c1=1.49618, c2=1.49618, w=0.7298, k=population_size, p=2)

# Run plain PSO with `steps_no_change=10`.
log = Evolution.PSO(
    dataset,
    options,
    population_size,
    steps_no_change=10,
    verbose=1,
)

Best Individual =  -0.8995 , Step =  17 

In [20]:
# Read the last row of the search log: its 'ind' entry is taken as the selected
# feature mask and its 'time' entry as the elapsed time.
final_row = log.iloc[-1]
feature_subset = np.array(final_row['ind'])
elapsed_time = np.round(final_row['time'], 2)

# Score the mask on the 'test' split; the first element of the result is
# reported as a percentage.
test_score = Evolution.evaluate(feature_subset, 'feature_selection', 'test', dataset)[0]
accuracy = np.round(test_score * 100, 2)

print()
print('Test accuracy: %', accuracy)
print('Solution found in: ', elapsed_time, 'sec')
# Positions whose mask value equals 1 are the selected features.
print('Selected features indexes: ', np.nonzero(feature_subset == 1)[0])


Test accuracy: % 89.25
Solution found in:  18.01 sec
Selected features indexes:  [ 2  3  4  6 15]
