In [16]:
#-*- coding: utf-8 -*-

from sklearn.feature_selection import SelectKBest, chi2, RFE, VarianceThreshold, SelectFromModel
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression

In [17]:
import pandas as pd
import numpy as np

import time

In [18]:
# read mushroom data from csv
mushroom = pd.read_csv('mushrooms.csv')

# transform string data to binary
X = pd.get_dummies(mushroom, prefix=list(mushroom))

# set target
Y = X['class_p'].values

# drop target from train data
X.drop('class_e', inplace=True, axis=1)
X.drop('class_p', inplace=True, axis=1)

# drop data that was orginally binary values such as bruises
X.drop('bruises_t', inplace=True, axis=1)
X.drop('gill-size_b', inplace=True, axis=1)
X.drop('stalk-shape_e', inplace=True, axis=1)
X.drop('veil-type_p', inplace=True, axis=1)
X = X.values

In [27]:
# maximum number of features
num = 20

In [21]:
# univariate feature extraction
start_time = time.time()
test = SelectKBest(score_func=chi2, k=num)
fit = test.fit(X, Y)
np.set_printoptions(precision=3)
X_univariate = fit.transform(X)
print X_univariate.shape
print "univariate time elapsed: {} ".format(time.time() - start_time)

(8124, 20)
univariate time elapsed: 0.0102970600128 


In [30]:
# percent used in low variance feature selection
percent = .71

# low variance
start_time = time.time()
sel = VarianceThreshold(threshold=(percent * (1 - percent)))
X_low_variance = sel.fit_transform(X)
print X_low_variance.shape
print "low variance time elapsed: {} ".format(time.time() - start_time)

(8124, 20)
low variance time elapsed: 0.0107748508453 


In [34]:
# tree based feature selection
start_time = time.time()
clf = ExtraTreesClassifier(max_features=num)
clf.fit(X, Y)
model = SelectFromModel(estimator=clf, prefit=True)
X_tree = model.transform(X)
X_tree.shape
print "tree based time elapsed: {} ".format(time.time() - start_time)

(8124, 22)
tree based time elapsed: 0.108003139496 


In [33]:
# PCA feature extraction
start_time = time.time()
pca = PCA(n_components=num)
fit = pca.fit(X)
# summarize components
# print "Explained Variance: %s" % fit.explained_variance_ratio_
X_PCA = fit.fit_transform(X, Y)
X_PCA.shape
print "PCA time elapsed: {} ".format(time.time() - start_time)

(8124, 20)
PCA time elapsed: 0.363460063934 


In [32]:
# number of steps to iterate in RFE
num_of_steps = 5

# recursive feature elimination
start_time = time.time()
logreg = LogisticRegression()
rfe = RFE(logreg, num, step=num_of_steps)
fit = rfe.fit(X, Y)
# print("Selected Features: %s") % fit.support_
# print("Feature Ranking: %s") % fit.ranking_
X_RFE = fit.fit_transform(X, Y)
X_RFE.shape
print "RFE time elapsed: {} ".format(time.time() - start_time)

(8124, 20)
RFE time elapsed: 1.17895007133 


In [None]:
print ""