# Import necessary dependencies and settings

In [1]:
import numpy as np
import pandas as pd
np.set_printoptions(suppress=True)
pt = np.get_printoptions()['threshold']

# Threshold based methods

## Limiting features in bag of word based models

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(min_df=0.1, max_df=0.85, max_features=2000)
cv

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.85, max_features=2000, min_df=0.1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

## Variance based thresholding

In [3]:
df = pd.read_csv('datasets/Pokemon.csv')
poke_gen = pd.get_dummies(df['Generation'])
poke_gen.head()

Unnamed: 0,Gen 1,Gen 2,Gen 3,Gen 4,Gen 5,Gen 6
0,1,0,0,0,0,0
1,1,0,0,0,0,0
2,1,0,0,0,0,0
3,1,0,0,0,0,0
4,1,0,0,0,0,0


In [4]:
from sklearn.feature_selection import VarianceThreshold

vt = VarianceThreshold(threshold=.15)
vt.fit(poke_gen)

VarianceThreshold(threshold=0.15)

In [5]:
pd.DataFrame({'variance': vt.variances_,
              'select_feature': vt.get_support()},
            index=poke_gen.columns).T

Unnamed: 0,Gen 1,Gen 2,Gen 3,Gen 4,Gen 5,Gen 6
select_feature,True,False,True,False,True,False
variance,0.164444,0.114944,0.16,0.128373,0.163711,0.0919937


In [6]:
poke_gen_subset = poke_gen.iloc[:,vt.get_support()].head()
poke_gen_subset

Unnamed: 0,Gen 1,Gen 3,Gen 5
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0


# Recursive Feature Elimination

In [14]:
from sklearn.feature_selection import RFE

lr = LogisticRegression()
rfe = RFE(estimator=lr, n_features_to_select=15, step=1)
rfe.fit(bc_X, bc_y)

RFE(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
  n_features_to_select=15, step=1, verbose=0)

In [15]:
select_features_rfe = rfe.get_support()
feature_names_rfe = bc_data.feature_names[select_features_rfe]
print(feature_names_rfe)

['mean radius' 'mean texture' 'mean perimeter' 'mean smoothness'
 'mean concavity' 'mean concave points' 'mean symmetry' 'texture error'
 'worst radius' 'worst texture' 'worst smoothness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']


In [16]:
set(feature_names_kbest) & set(feature_names_rfe)

{'mean concavity',
 'mean perimeter',
 'mean radius',
 'mean texture',
 'worst concave points',
 'worst concavity',
 'worst radius',
 'worst texture'}

# Model based selection

In [17]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()
rfc.fit(bc_X, bc_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [18]:
importance_scores = rfc.feature_importances_
feature_importances = [(feature, score) for feature, score in zip(bc_data.feature_names, importance_scores)]
sorted(feature_importances, key=lambda x: -x[1])[:10]

[('worst concave points', 0.22465186401289805),
 ('worst area', 0.22183657032316897),
 ('mean concave points', 0.18192574025833769),
 ('worst perimeter', 0.099521838900566054),
 ('worst radius', 0.084068507192381611),
 ('worst texture', 0.02243708745933972),
 ('mean perimeter', 0.020073882937172081),
 ('worst smoothness', 0.014608966775322443),
 ('mean radius', 0.01374196961657885),
 ('worst concavity', 0.011340255118074721)]

# Feature extraction using dimensionality reduction

In [19]:
# center the feature set
bc_XC = bc_X - bc_X.mean(axis=0)

# decompose using SVD
U, S, VT = np.linalg.svd(bc_XC)

# get principal components
PC = VT.T

# get first 3 principal components
PC3 = PC[:, 0:3]
PC3.shape

(30, 3)

In [20]:
# reduce feature set dimensionality 
np.round(bc_XC.dot(PC3), 2)

array([[-1160.14,  -293.92,   -48.58],
       [-1269.12,    15.63,    35.39],
       [ -995.79,    39.16,     1.71],
       ..., 
       [ -314.5 ,    47.55,    10.44],
       [-1124.86,    34.13,    19.74],
       [  771.53,   -88.64,   -23.89]])

In [21]:
from sklearn.decomposition import PCA
pca = PCA(n_components=3)
pca.fit(bc_X)

PCA(copy=True, iterated_power='auto', n_components=3, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [22]:
pca.explained_variance_ratio_

array([ 0.98204467,  0.01617649,  0.00155751])

In [23]:
bc_pca = pca.transform(bc_X)
np.round(bc_pca, 2)

array([[ 1160.14,  -293.92,    48.58],
       [ 1269.12,    15.63,   -35.39],
       [  995.79,    39.16,    -1.71],
       ..., 
       [  314.5 ,    47.55,   -10.44],
       [ 1124.86,    34.13,   -19.74],
       [ -771.53,   -88.64,    23.89]])

In [24]:
np.average(cross_val_score(lr, bc_pca, bc_y, scoring='accuracy', cv=5))

0.92808003078106949