In [3]:
# Importing libraries
import numpy as np
import pandas as pd
%matplotlib inline 
import matplotlib.pyplot as plt
import seaborn as sns

# for feature selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import SelectFpr

In [4]:
location = r"C:\Users\Vaibhav\Desktop\BA\Datasets\iris.csv"

In [5]:
word = 'sepal length (cm)'
"_".join(word.split(" ")[:2])

'sepal_length'

In [6]:
# load the training data from iris.txt
df_training = pd.read_csv(location )
df_training.head()
#df_training.drop(["Unnamed: 0"], axis = 1, inplace = True)

Unnamed: 0,Species,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,0,5.1,3.5,1.4,0.2
1,0,4.9,3.0,1.4,0.2
2,0,4.7,3.2,1.3,0.2
3,0,4.6,3.1,1.5,0.2
4,0,5.0,3.6,1.4,0.2


In [7]:
flower_dim = ["_".join(word.split(" ")[:2]) for word in list(df_training.columns[1:])]
flower_dim
df_training.columns = ["class"]+flower_dim
df_training.head(2)

Unnamed: 0,class,sepal_length,sepal_width,petal_length,petal_width
0,0,5.1,3.5,1.4,0.2
1,0,4.9,3.0,1.4,0.2


In [8]:
# copy the predictor variables into X and responses in y
X = df_training[['sepal_length','sepal_width', 'petal_length', 'petal_width']]
y = df_training['class']

In [9]:
df_training.var()

class           0.671141
sepal_length    0.685694
sepal_width     0.188004
petal_length    3.113179
petal_width     0.582414
dtype: float64

In [10]:
print(X.shape)
print(y.shape)

(150, 4)
(150,)


#### sklearn.feature_selection.SelectKBest(score_func=<function f_classif>, k=10)

Select features according to the k highest scores.

** score_func **

Function taking two arrays X and y, and returning a pair of arrays (scores, pvalues) 

- For regression: f_regression, mutual_info_regression
- For classification: chi2, f_classif, mutual_info_classif

In [11]:
# Instantiate the method
selector = SelectKBest(chi2, k = 3)

In [12]:
# fit the method
# Run score function on (X, y) and get the appropriate features.
X_fit = selector.fit(X, y)

In [13]:
df_training.head()

Unnamed: 0,class,sepal_length,sepal_width,petal_length,petal_width
0,0,5.1,3.5,1.4,0.2
1,0,4.9,3.0,1.4,0.2
2,0,4.7,3.2,1.3,0.2
3,0,4.6,3.1,1.5,0.2
4,0,5.0,3.6,1.4,0.2


In [14]:
# print Scores of features.
X_fit.scores_

array([ 10.81782088,   3.59449902, 116.16984746,  67.24482759])

In [15]:
print("selected index:", selector.get_support(True))

selected index: [0 2 3]


In [16]:
# transform
# X_new : numpy array of shape [n_samples, n_features_new]
# Transformed array.
X_new = X_fit.transform(X)

In [17]:
X_new[:5]

array([[5.1, 1.4, 0.2],
       [4.9, 1.4, 0.2],
       [4.7, 1.3, 0.2],
       [4.6, 1.5, 0.2],
       [5. , 1.4, 0.2]])

In [18]:
X.columns

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width'], dtype='object')

In [19]:
X.columns[selector.get_support(indices=True)]

Index(['sepal_length', 'petal_length', 'petal_width'], dtype='object')

In [20]:
# 1st way to get the list
vector_names = list(X.columns[selector.get_support(indices=True)])
print(vector_names)

['sepal_length', 'petal_length', 'petal_width']


In [21]:
# 2nd way
X.columns[selector.get_support(indices=True)].tolist()

['sepal_length', 'petal_length', 'petal_width']

In [22]:
# func list
func_list = ['f_classif', 'chi2', 'f_regression', 'mutual_info_regression', \
             'SelectPercentile']

In [23]:
# Instantiate the method
selector = SelectKBest(SelectPercentile, k = 3)

X_fit    = selector.fit(X, y)
X_fit.scores_

array(SelectPercentile(percentile=array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int64),
                 s...
       [7.4, 2.8, 6.1, 1.9],
       [7.9, 3.8, 6.4, 2. ],
       [6.4, 2.8, 5.6, 2.2],
       [6.3, 2.8, 5.1, 1.5],
       [6.1, 2.6, 5.6, 1.4],
       [7.7, 3. , 6.1, 2.3],
       [6.3, 3.4, 5.6, 2.4],
       [6.4, 3.1, 5.5, 1.8],
       [6. , 3. , 4.8, 1.8],
       [6.9, 3.1, 5.4, 2.1],
       [6.7, 3.1, 5.6, 2.4],
       [6.9, 3.1, 5.1, 2.3],
       [5.8, 2.7, 5.1, 1.9],
       [6.8, 3.2, 5.9, 2.3],
       [6.7, 3.3, 5.7, 2

In [24]:
# Instantiate the method
selector = SelectKBest(f_classif, k = 3)

X_fit    = selector.fit(X, y)
X_fit.scores_

array([ 119.26450218,   47.3644614 , 1179.0343277 ,  959.32440573])

In [25]:
X = X_fit.transform(X) #so this deletes sepal width column with lowest F value

----

### Implementing Grid Search

In [26]:
# instantiate the KNN model
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)

In [27]:
# Instantiate the method
selector = SelectKBest(f_classif, k = 3)

In [28]:
#creating a pipeline
from sklearn.pipeline import Pipeline
pipe = Pipeline([('select_k_best', selector), 
                 ('knn', knn)])

In [29]:
pipe.get_params()

{'memory': None,
 'steps': [('select_k_best',
   SelectKBest(k=3, score_func=<function f_classif at 0x000002602967D268>)),
  ('knn',
   KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                        metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                        weights='uniform'))],
 'verbose': False,
 'select_k_best': SelectKBest(k=3, score_func=<function f_classif at 0x000002602967D268>),
 'knn': KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                      metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                      weights='uniform'),
 'select_k_best__k': 3,
 'select_k_best__score_func': <function sklearn.feature_selection.univariate_selection.f_classif(X, y)>,
 'knn__algorithm': 'auto',
 'knn__leaf_size': 30,
 'knn__metric': 'minkowski',
 'knn__metric_params': None,
 'knn__n_jobs': None,
 'knn__n_neighbors': 5,
 'knn__p': 2,
 'knn__weights': 'uniform'}

In [30]:
param_grid = [
  {'select_k_best__k': [1, 2, 3, 4],
   'knn__n_neighbors': [3, 5, 7, 9]}
  ]

In [31]:
from sklearn.model_selection import GridSearchCV
gs = GridSearchCV(estimator=pipe, 
                  param_grid=param_grid, 
                  scoring='accuracy', 
                  n_jobs=1, 
                  cv=5,  
                  refit=False)

In [32]:
df_training.head(1)

Unnamed: 0,class,sepal_length,sepal_width,petal_length,petal_width
0,0,5.1,3.5,1.4,0.2


In [33]:
X = df_training.iloc[:, 1:].values
y = df_training['class'].values

In [34]:
# run gridearch
gs = gs.fit(X,y)

In [35]:
print("Best parameters via GridSearch", gs.best_params_)
#so best to select 4 features with 3 neighbors

Best parameters via GridSearch {'knn__n_neighbors': 7, 'select_k_best__k': 4}


In [40]:
gs.best_estimator_.steps

[('select_k_best',
  SelectKBest(k=4, score_func=<function f_classif at 0x000002602967D268>)),
 ('knn',
  KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                       metric_params=None, n_jobs=None, n_neighbors=7, p=2,
                       weights='uniform'))]

In [49]:
gs.best_estimator_.steps[0]

('select_k_best',
 SelectKBest(k=4, score_func=<function f_classif at 0x000002602967D268>))

In [50]:
gs.best_estimator_.steps[0][1]

SelectKBest(k=4, score_func=<function f_classif at 0x000002602967D268>)

In [43]:
gs.best_params_

{'knn__n_neighbors': 7, 'select_k_best__k': 4}

In [44]:
gs.best_estimator_.steps[1][1].n_neighbors

7

In [45]:
# During cross-validation, this feature combination had a CV accuracy of:

print('Best score:', gs.best_score_)

Best score: 0.98


In [46]:
gs.best_params_

{'knn__n_neighbors': 7, 'select_k_best__k': 4}

In [47]:
gs.best_estimator_.steps[1][1].n_neighbors

7