In [3]:
#-*- coding: utf-8 -*-

from sklearn.feature_selection import SelectKBest, chi2, f_classif, RFE, VarianceThreshold, SelectFromModel
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression

In [6]:
import pandas as pd
import numpy as np

import time

In [14]:
# read mushroom data from csv
mushroom = pd.read_csv('mushrooms.csv')
mushroom_target = mushroom['class']

# transform string data to binary
X = pd.get_dummies(mushroom, prefix=list(mushroom))

# set target
Y = X['class_p'].values

# drop target from train data
X.drop('class_e', inplace=True, axis=1)
X.drop('class_p', inplace=True, axis=1)

# data preprocessing
missing = raw_input('Process data? y/n ')
if missing != 'n':
    # drop data that was orginally binary values such as bruises
    X.drop('bruises_t', inplace=True, axis=1)
    X.drop('gill-size_b', inplace=True, axis=1)
    X.drop('stalk-shape_e', inplace=True, axis=1)
    X.drop('veil-type_p', inplace=True, axis=1)
    #missing values
    X.drop('stalk-root_?', inplace=True, axis=1)
    
X = X.values

Process data? y/n y


In [33]:
# maximum number of features
num = raw_input('Max number of features? [default=32] ')
if not num: num = 32
else: num = int(num)
    
# percent used in low variance feature selection
percent = raw_input('Percent? [default=.81] ')
if not percent: percent = .81
else: percent = float(percent)
# number of steps to iterate in RFE
num_of_steps = raw_input('Number of steps? [default=1] ')
if not num_of_steps: num_of_steps = 1
else: num_of_steps = int(num_of_steps)

Max number of features? [default=32] 
Percent? [default=.81] 
Number of steps? [default=1] 


In [16]:
# univariate feature extraction
start_time = time.time()
test = SelectKBest(score_func=chi2, k=num)
fit = test.fit(X, Y)
np.set_printoptions(precision=3)
X_univariate_chi = fit.transform(X)
X_univariate_chi.shape
print "univariate time elapsed: {} ".format(time.time() - start_time)

(8124, 32)
univariate time elapsed: 0.0211451053619 


In [35]:
# univariate feature extraction
start_time = time.time()
test = SelectKBest(score_func=f_classif, k=num)
fit = test.fit(X, Y)
np.set_printoptions(precision=3)
X_univariate_f = fit.transform(X)
X_univariate_f.shape
print "univariate time elapsed: {} ".format(time.time() - start_time)

univariate time elapsed: 0.0174260139465 


In [34]:
# low variance
start_time = time.time()
sel = VarianceThreshold(threshold=(percent * (1 - percent)))
X_low_variance = sel.fit_transform(X)
X_low_variance.shape
print "low variance time elapsed: {} ".format(time.time() - start_time)

(8124, 32)
low variance time elapsed: 0.0105011463165 


In [20]:
# tree based feature selection
start_time = time.time()
clf = ExtraTreesClassifier(max_features=num)
clf.fit(X, Y)
model = SelectFromModel(estimator=clf, prefit=True)
X_tree = model.transform(X)
X_tree.shape
print "tree based time elapsed: {} ".format(time.time() - start_time)

tree based time elapsed: 0.117341995239 


In [21]:
# PCA feature extraction
start_time = time.time()
pca = PCA(n_components=num)
fit = pca.fit(X)
# summarize components
# print "Explained Variance: %s" % fit.explained_variance_ratio_
X_PCA = fit.fit_transform(X, Y)
X_PCA.shape
print "PCA time elapsed: {} ".format(time.time() - start_time)

PCA time elapsed: 0.51239490509 


In [22]:
# recursive feature elimination
start_time = time.time()
logreg = LogisticRegression()
rfe = RFE(logreg, num, step=num_of_steps)
fit = rfe.fit(X, Y)
# print("Selected Features: %s") % fit.support_
# print("Feature Ranking: %s") % fit.ranking_
X_RFE = fit.fit_transform(X, Y)
X_RFE.shape
print "RFE time elapsed: {} ".format(time.time() - start_time)

Number of steps? [default=5] 
RFE time elapsed: 1.25409698486 


In [23]:
print ""


