In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

import numpy as np
import pandas as pd
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.metrics import confusion_matrix,accuracy_score

In [None]:
from itertools import combinations

#### Feature selection imports

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif,mutual_info_classif

from sklearn.feature_selection import RFE,RFECV



## Feature Selection

* sklearn Feature Selection
    - Univariate 
    - Recursive Elimination
* Subset: Iterating over a learning method
    - Best Subset
    - Sequential        
        - Forward
        - Backward

### Univariate feature selection in sklearn: SelectKBest


http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html

* Statistical test on a single variable (no relationship among variables)

* Parameters
    * k: number of features to select based on the scores determined by score_func
    * score_func: Scoring function to evaluate features
        - e.g. f_classif computes ANOVA F-statistic (classification only)

* Transform method to select features from the feature array that is input (i.e. X)


In [None]:
# Read the wine data set; the predictor names are every column label except the last.
wine = pd.read_csv("wine.csv")
feats = wine.columns[:-1]
wine.tail()


In [None]:
# Total number of columns in the data set.
wine.shape[1]

In [None]:
# Predictor matrix: every column except the last, as a NumPy array.
X = wine.iloc[:, :-1].values
# Target: the Customer_Segment labels encoded as consecutive integers.
y = LabelEncoder().fit_transform(wine["Customer_Segment"])
X.shape, y.shape

In [None]:
def run_log_regress(X, y):
    """Return the hold-out accuracy of a logistic regression trained on X, y.

    The data are split 70/30 (stratified on y, random_state=0), both parts are
    standardized with statistics estimated on the training part only, and a
    LogisticRegression(random_state=42) fitted on the training part is scored
    on the test part.
    """
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=0.3, random_state=0, stratify=y
    )

    # Fit the scaler on the training rows only, then apply it to both parts.
    scaler = StandardScaler().fit(X_tr)
    X_tr_std = scaler.transform(X_tr)
    X_te_std = scaler.transform(X_te)

    clf = LogisticRegression(random_state=42)
    clf.fit(X_tr_std, y_tr)
    return clf.score(X_te_std, y_te)


In [None]:
# Baseline: hold-out accuracy using every predictor.
run_log_regress(X, y)

In [None]:
# Univariate selection: keep the 6 features with the largest ANOVA F-statistic.
SelBest = SelectKBest(k=6, score_func=f_classif)
SelBest.fit(X, y)

# Show the per-feature scores with 3 digits of precision.
np.set_printoptions(precision=3)
print("The scores: ", SelBest.scores_)

# Reduce X to the selected columns.
features = SelBest.transform(X)
features.shape

In [None]:
# Boolean mask marking the columns that SelectKBest kept.
idxs = SelBest.get_support()
print(idxs)

# Map the mask back onto the predictor names (every column except the last).
feats = wine.columns[:-1]
feats[idxs].tolist()


In [None]:
# Hold-out accuracy using only the features kept by SelectKBest.
run_log_regress(features, y)


### Recursive Feature Elimination

http://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html

* User supplied estimator
* Selects n features by recursively considering smaller and smaller sets of features
    - defaults to selecting half the number of features

In [None]:
# Recursive feature elimination down to 6 features, driven by a logistic regression.
model = LogisticRegression()
rfe = RFE(estimator=model, n_features_to_select=6)
fit = rfe.fit(X, y)

# Report how many features were kept, which ones, and the elimination ranking.
print("Num Features: ", fit.n_features_)
print("Selected Features: ", wine.columns.values[:-1][fit.support_])
print("Feature Ranking: ", fit.ranking_)

In [None]:
# Same elimination, but the number of features (at least 6) is chosen by 10-fold CV.
model = LogisticRegression()
rfecv = RFECV(estimator=model, min_features_to_select=6, cv=10)
fit = rfecv.fit(X, y)

# Report how many features were kept, which ones, and the elimination ranking.
print("Num Features: ", fit.n_features_)
print("Selected Features: ", wine.columns.values[:-1][fit.support_])
print("Feature Ranking: ", fit.ranking_)


### Subset Selection 

* Models with different sets of features(predictors)

* Best subset
* Sequential
    - Forward Selection
    - Backwards
    
* No sklearn class for best subset selection (scikit-learn 0.24+ does provide `SequentialFeatureSelector` for the sequential methods). Code in this section is patterned after code by Sebastian Raschka.
    
* Model evaluation
    - if splitting the data, the best performing model on the test set
    - if using a single dataset (i.e. training data) use a metric that adjusts the error
        - AIC, BIC or adjusted $R^2$

    
#### Best Subset

* Exhaustive Search Algorithm  

* Algorithm  
    1. Let $M_0$ be the null model, which contains no predictors. It predicts the sample mean  
    2. For k = 1,2...,p:  (p is the total number of predictors)
        2.1 Fit all $\binom{p}{k}$ models that contain exactly k predictors  
        2.2 Pick the best and call it $M_k$ where best is  based on a scoring method 
    3. Select the single best from $M_0,...,M_p$ using some model evaluation criteria 
        
* Best subset limited to small number of predictors (p)
    - When p is large, larger search space implies a better chance of finding a model that looks good on the training data but with no predictive power on test data
    - Overfitting and high variance

### Sequential selection

* Sequential feature selection algorithms are a family of greedy search algorithms
    - used to reduce an initial d-dimensional feature space to a k-dimensional feature subspace where k < d
    
* Greedy algorithms make locally optimal choices at each stage of a combinatorial search problem and generally yield a suboptimal solution 


#### Forward selection algorithm

1. Let $M_0$ be the null model, which contains no predictors
2. For k = 0,1,...,p-1:  
    2.1 Consider all p - k models that add one predictor to the ones in $M_k$  
    2.2 Choose the best(based on scoring method) among these p - k models and call it $M_{k+1}$  
3.  Select the single best from $M_1,...,M_p$ using some model evaluation criteria 

* Better computationally than best subset
* Not guaranteed to find the best model out of $2^p$ possible models

In [None]:
class SFS():
    """Sequential forward selection (greedy) wrapped around an estimator.

    Starting from an empty feature set, ``fit`` repeatedly adds the single
    feature that yields the highest score on an internal validation split,
    until every feature has been added.

    Parameters
    ----------
    estimator : object
        Model exposing ``fit(X, y)`` and ``predict(X)``.
    scoring : callable
        Called as ``scoring(y_true, y_pred)``; larger values are better.
    test_size : float
        Passed to ``train_test_split`` for the internal validation split.
    random_state : int
        Seed passed to ``train_test_split`` for the internal validation split.
    show_details : bool
        When True, print the best score and subset after every step.

    Attributes set by ``fit``
    -------------------------
    Models : list of list of int
        ``Models[k]`` holds the column indices of the chosen (k+1)-feature subset.
    scores : list
        Validation score of the corresponding entry in ``Models``.
    """

    def __init__(self, estimator, scoring=accuracy_score,test_size=0.25, random_state=1,show_details=False):
        self.scoring = scoring
        self.estimator = estimator
        self.test_size = test_size
        self.random_state = random_state
        self.show_details = show_details

    def fit(self, X, y):
        """Run the forward search on X, y and record every intermediate subset.

        Returns self.
        """
        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                            test_size=self.test_size,
                                                            random_state=self.random_state)

        num_features = X_train.shape[1]
        self.Models = []
        self.scores = []
        indicies = list(range(num_features))  # features not yet selected
        M = []                                # features selected so far
        while indicies:
            # No candidate is chosen yet, so the first one evaluated is always
            # accepted -- even when its score is 0. (Starting from a score of 0
            # with a strict ">" left best_model as None in that case and made
            # the remove() below fail.)
            best_score = None
            best_model = None
            for i in indicies:
                m = M + [i]
                score = self._calc_score(X_train, y_train, X_test, y_test, m)
                if best_model is None or score > best_score:
                    best_score = score
                    best_model = i
            indicies.remove(best_model)
            # Build a new list so earlier entries of self.Models are not mutated.
            M = M + [best_model]
            if self.show_details: print("Best score: ",round(best_score,3)," Best model: ", M)
            self.Models.append(M)
            self.scores.append(best_score)
        return self

    def _calc_score(self, X_train, y_train, X_test, y_test, indices):
        """Fit the estimator on the given columns and score its test predictions."""
        self.estimator.fit(X_train[:, indices], y_train)
        y_pred = self.estimator.predict(X_test[:, indices])
        score = self.scoring(y_test, y_pred)
        return score

    def best_features(self,names):
        """Return the names of the features in the highest-scoring subset.

        ``names`` must support indexing with a list of positions and expose
        ``tolist()`` (e.g. a NumPy array). On ties the first, i.e. smallest,
        subset wins because ``np.argmax`` returns the first maximum.
        """
        return names[self.Models[np.argmax(self.scores)]].tolist()

    def transform(self,X):
        """Return the columns of X that belong to the highest-scoring subset."""
        return X[:,self.Models[np.argmax(self.scores)]]

    def plot(self):
        """Plot the validation score against the number of selected features."""
        # Use this instance's own results rather than a notebook-level variable.
        k_feat = [len(k) for k in self.Models]
        plt.plot(k_feat, self.scores, marker='o')
        plt.ylim([0.7, 1.02])
        plt.ylabel('Accuracy')
        plt.xlabel('Number of features')
        plt.grid()
        plt.tight_layout()


In [None]:
# Hold out 30% of the samples, stratified by class, for the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y
)

# Standardize with statistics estimated on the training split only.
stdsc = StandardScaler()
stdsc.fit(X_train)
X_train_std = stdsc.transform(X_train)
X_test_std = stdsc.transform(X_test)

In [None]:
# 5-nearest-neighbours classifier used as the model inside the search.
knn = KNeighborsClassifier(n_neighbors=5)

# Forward selection on the standardized training data, printing every step.
sfs = SFS(estimator=knn, show_details=True)
sfs.fit(X_train_std, y_train)

In [None]:
# Keep only the columns of the best-scoring forward-selection subset.
X_train_sel = sfs.transform(X_train_std)
X_train_sel.shape

In [None]:
# Accuracy versus subset size for the forward search.
sfs.plot()

# Names of the predictors in the best-scoring subset.
sfs.best_features(wine.columns.values[:-1])
 

#### Backward selection algorithm

1. Let $M_p$ be the full model, which contains all p predictors
2. For k = p,p-1,...,1:  
    2.1 Consider all k models that contain all but one of the predictors in $M_k$ for a total of k - 1 predictors  
    2.2 Choose the best(based on scoring method) among these k models and call it $M_{k-1}$  
3.  Select the single best from $M_0,...,M_p$ using some model evaluation criteria 

* Like forward stepwise selection, the backward selection approach searches through only 1 + p(p + 1)/2 models
    - Applicable when p is too large for best subset selection
* Not guaranteed to yield the best model containing a subset of the p predictors.
* Backward selection requires that the number of samples n is larger than the number of variables p (so that the full model can be fit).
    - Forward stepwise can be used even when n < p, and so is the only viable subset method when p is very large.


In [None]:
class SBS():
    """Sequential backward selection (greedy) wrapped around an estimator.

    Starting from the full feature set, ``fit`` repeatedly drops the single
    feature whose removal yields the highest score on an internal validation
    split, until only one feature is left.

    Parameters
    ----------
    estimator : object
        Model exposing ``fit(X, y)`` and ``predict(X)``.
    scoring : callable
        Called as ``scoring(y_true, y_pred)``; larger values are better.
    test_size : float
        Passed to ``train_test_split`` for the internal validation split.
    random_state : int
        Seed passed to ``train_test_split`` for the internal validation split.
    show_details : bool
        When True, print the best score and subset after every step.

    Attributes set by ``fit``
    -------------------------
    Models : list of tuple of int
        ``Models[0]`` is the full feature set; every later entry has one
        feature fewer than the one before it.
    scores : list
        Validation score of the corresponding entry in ``Models``.
    """

    def __init__(self, estimator, scoring=accuracy_score,test_size=0.25, random_state=1,show_details=False):
        self.scoring = scoring
        self.estimator = estimator
        self.test_size = test_size
        self.random_state = random_state
        self.show_details = show_details

    def fit(self, X, y):
        """Run the backward search on X, y and record every intermediate subset.

        Returns self.
        """
        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                            test_size=self.test_size,
                                                            random_state=self.random_state)

        num_features = X_train.shape[1]
        indicies = tuple(range(num_features))
        score = self._calc_score(X_train, y_train, X_test, y_test, indicies)
        self.scores = [score]
        self.Models = [indicies]
        if self.show_details: print("Full Model score: ",score)
        while num_features > 1:
            # No candidate is chosen yet, so the first one evaluated is always
            # accepted -- even when its score is 0. (Starting from a score of 0
            # with a strict ">" left best_model as None in that case and broke
            # the next iteration.)
            best_model = None
            best_score = None
            # Every subset obtained by dropping exactly one remaining feature.
            for m in combinations(indicies, r = num_features - 1):
                score = self._calc_score(X_train, y_train, X_test, y_test, m)
                if best_model is None or score > best_score:
                    best_score = score
                    best_model = m
            if self.show_details: print("Best score: ",round(best_score,3)," Best model: ",best_model)
            indicies = best_model
            self.Models.append(best_model)
            self.scores.append(best_score)
            num_features -= 1
        return self

    def _calc_score(self, X_train, y_train, X_test, y_test, indices):
        """Fit the estimator on the given columns and score its test predictions."""
        self.estimator.fit(X_train[:, indices], y_train)
        y_pred = self.estimator.predict(X_test[:, indices])
        score = self.scoring(y_test, y_pred)
        return score

    def best_features(self,names):
        """Return the names of the features in the highest-scoring subset.

        Mirrors ``SFS.best_features``. ``names`` must support indexing with a
        list of positions and expose ``tolist()`` (e.g. a NumPy array). On ties
        the first, i.e. largest, subset wins because ``np.argmax`` returns the
        first maximum.
        """
        # Subsets are stored as tuples; convert to a list so the index is treated
        # as a selection of positions rather than a multi-dimensional index.
        return names[list(self.Models[np.argmax(self.scores)])].tolist()

    def transform(self,X):
        """Return the columns of X that belong to the highest-scoring subset."""
        return X[:, list(self.Models[np.argmax(self.scores)])]

    def plot(self):
        """Plot the validation score against the number of retained features."""
        k_feat = [len(k) for k in self.Models]
        plt.plot(k_feat, self.scores, marker='o')
        plt.ylim([0.7, 1.02])
        plt.ylabel('Accuracy')
        plt.xlabel('Number of features')
        plt.grid()
        plt.tight_layout()

In [None]:
# 5-nearest-neighbours classifier used as the model inside the search.
knn = KNeighborsClassifier(n_neighbors=5)

# Backward selection on the standardized training data, printing every step.
sbs = SBS(estimator=knn, show_details=True)
sbs.fit(X_train_std, y_train)

# Accuracy versus subset size for the backward search.
sbs.plot()

In [None]:
# sbs.Models runs from the full feature set down to a single feature, so the
# third entry from the end is the 3-feature subset whatever the column count.
k3 = list(sbs.Models[-3]) # 3 features
# X was built from every column except the last, so the selected positions
# index into columns[:-1]; columns[1:] would shift every name by one.
print(wine.columns[:-1][k3])

In [None]:
# Reference point: KNN trained on all standardized features.
knn.fit(X_train_std, y_train)
train_acc_all = knn.score(X_train_std, y_train)
print('Training accuracy:', train_acc_all)
test_acc_all = knn.score(X_test_std, y_test)
print('Test accuracy:', test_acc_all)

In [None]:
# Same classifier restricted to the three features chosen by backward selection.
knn.fit(X_train_std[:, k3], y_train)
train_acc_k3 = knn.score(X_train_std[:, k3], y_train)
print('Training accuracy:', train_acc_k3)
test_acc_k3 = knn.score(X_test_std[:, k3], y_test)
print('Test accuracy:', test_acc_k3)

#### References

Raschka, Sebastian & Mirjalili, Vahid (2017). Python Machine Learning, 2nd Edition, Packt Publishing.