# Feature Selection for Machine Learning

Feature selection is a removing unnecessary features.

In [1]:
# Importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

# fix_yahoo_finance is used to fetch data 
import fix_yahoo_finance as yf
yf.pdr_override()

In [2]:
# input
symbol = 'AMD'
start = '2014-01-01'
end = '2018-08-27'

# Read data 
dataset = yf.download(symbol,start,end)

# View columns 
dataset.head()

[*********************100%***********************]  1 of 1 downloaded


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2014-01-02,3.85,3.98,3.84,3.95,3.95,20548400
2014-01-03,3.98,4.0,3.88,4.0,4.0,22887200
2014-01-06,4.01,4.18,3.99,4.13,4.13,42398300
2014-01-07,4.19,4.25,4.11,4.18,4.18,42932100
2014-01-08,4.23,4.26,4.14,4.18,4.18,30678700


In [3]:
dataset['Increase_Decrease'] = np.where(dataset['Volume'].shift(-1) > dataset['Volume'],1,0)
dataset['Buy_Sell_on_Open'] = np.where(dataset['Open'].shift(-1) > dataset['Open'],1,0)
dataset['Buy_Sell'] = np.where(dataset['Adj Close'].shift(-1) > dataset['Adj Close'],1,0)
dataset['Returns'] = dataset['Adj Close'].pct_change()
dataset = dataset.dropna()
dataset.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Increase_Decrease,Buy_Sell_on_Open,Buy_Sell,Returns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2014-01-03,3.98,4.0,3.88,4.0,4.0,22887200,1,1,1,0.012658
2014-01-06,4.01,4.18,3.99,4.13,4.13,42398300,1,1,1,0.0325
2014-01-07,4.19,4.25,4.11,4.18,4.18,42932100,0,1,0,0.012107
2014-01-08,4.23,4.26,4.14,4.18,4.18,30678700,0,0,0,0.0
2014-01-09,4.2,4.23,4.05,4.09,4.09,30667600,0,0,1,-0.021531


In [4]:
features = dataset.drop(['Adj Close', 'Close', 'Returns'], axis=1)

Univariate Selection

In [5]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

array = features.values
X = array.astype(int)
Y = dataset['Adj Close'].values.astype(int)

# Feature extraction
test = SelectKBest(score_func=chi2, k=3)
fit = test.fit(X, Y)

# Summarize scores
np.set_printoptions(precision=3)
print(fit.scores_)

[4.297e+03 4.378e+03 4.241e+03 6.776e+11 8.232e+00 9.965e+00 1.452e+01]


In [6]:
new_features = fit.transform(X)

In [7]:
# Show results
print('Original number of features:', X.shape[1])
print('Reduced number of features:', new_features.shape[1])

Original number of features: 7
Reduced number of features: 3


In [8]:
# Summarize selected features
print(new_features[0:5,:])

[[       3        4 22887200]
 [       4        4 42398300]
 [       4        4 42932100]
 [       4        4 30678700]
 [       4        4 30667600]]


In [9]:
US = pd.DataFrame(fit.scores_, columns = ["Univariate_Selection"], index=features.columns)
US = US.reset_index()

In [10]:
US.sort_values('Univariate_Selection',ascending=0)

Unnamed: 0,index,Univariate_Selection
3,Volume,677576000000.0
1,High,4378.068
0,Open,4296.58
2,Low,4240.549
6,Buy_Sell,14.51569
5,Buy_Sell_on_Open,9.964768
4,Increase_Decrease,8.23209


Recursive Feature Elimination 

In [11]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [12]:
# Feature extraction
model = LogisticRegression()
rfe = RFE(model, 3)
fit = rfe.fit(X, Y)
print("Num Features: %s" % (fit.n_features_))
print("Selected Features: %s" % (fit.support_))
print("Feature Ranking: %s" % (fit.ranking_))

Num Features: 3
Selected Features: [ True  True False  True False False False]
Feature Ranking: [1 1 2 1 3 5 4]


In [13]:
Selected = pd.DataFrame(rfe.support_, columns = ["RFE"], index=features.columns)
Selected = Selected.reset_index()

In [14]:
Selected[Selected['RFE'] == True]

Unnamed: 0,index,RFE
0,Open,True
1,High,True
3,Volume,True


Ridge regression 

In [15]:
from sklearn.linear_model import Ridge

In [16]:
ridge = Ridge(alpha=1.0)
ridge.fit(X,Y)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [17]:
def pretty_print_coefs(coefs, names = None, sort = False):
    if names == None:
        names = ["X%s" % x for x in range(len(coefs))]
    lst = zip(coefs, names)
    if sort:
        lst = sorted(lst,  key = lambda x:-np.abs(x[0]))
    return " + ".join("%s * %s" % (round(coef, 3), name)
                                   for coef, name in lst)

In [18]:
print ("Ridge model:", pretty_print_coefs(ridge.coef_))

Ridge model: 0.021 * X0 + 0.497 * X1 + 0.481 * X2 + 0.0 * X3 + 0.019 * X4 + 0.137 * X5 + -0.001 * X6


Principal Component Analysis

In [19]:
from sklearn.decomposition import PCA

In [20]:
# feature extraction
pca = PCA(n_components=3)
fit = pca.fit(X)
# summarize components
print(("Explained Variance: %s") % fit.explained_variance_ratio_)
print(fit.components_)

Explained Variance: [1.000e+00 3.343e-14 2.281e-16]
[[ 9.465e-08  9.808e-08  9.156e-08  1.000e+00 -2.834e-09  6.781e-10
   3.577e-10]
 [-5.777e-01 -5.813e-01 -5.729e-01  1.641e-07 -1.595e-02 -8.321e-04
  -3.761e-03]
 [ 6.768e-02 -2.406e-02 -3.216e-02 -1.013e-09 -2.190e-01 -6.765e-01
  -6.988e-01]]


Feature Importance

In [21]:
from sklearn.ensemble import ExtraTreesClassifier

In [22]:
# feature extraction
model = ExtraTreesClassifier()
model.fit(X, Y)
print(model.feature_importances_)

[0.216 0.328 0.321 0.099 0.013 0.015 0.009]


In [23]:
ET = pd.DataFrame(model.feature_importances_, columns = ["Extra Trees"], index=features.columns)

In [24]:
ET = ET.reset_index()
ET.sort_values(['Extra Trees'],ascending=0)

Unnamed: 0,index,Extra Trees
1,High,0.328005
2,Low,0.320661
0,Open,0.215932
3,Volume,0.098947
5,Buy_Sell_on_Open,0.014538
4,Increase_Decrease,0.012885
6,Buy_Sell,0.009032


Random Forest Classifier

In [25]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()

clf.fit(X,Y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [26]:
RFC = pd.DataFrame(clf.feature_importances_, columns = ["RFC"], index=features.columns)

In [27]:
RFC = RFC.reset_index()

In [28]:
RFC.sort_values(['RFC'],ascending=0)

Unnamed: 0,index,RFC
2,Low,0.348548
0,Open,0.247975
1,High,0.226145
3,Volume,0.126685
5,Buy_Sell_on_Open,0.019749
6,Buy_Sell,0.016977
4,Increase_Decrease,0.013921


Chi Square on Features 

In [29]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

model = SelectKBest(score_func=chi2, k=5)
fit = model.fit(X, Y)

In [30]:
print(fit.scores_)

[4.297e+03 4.378e+03 4.241e+03 6.776e+11 8.232e+00 9.965e+00 1.452e+01]


In [31]:
chi_sq = pd.DataFrame(fit.scores_, columns = ["Chi_Square"], index=features.columns)

In [32]:
chi_sq = chi_sq.reset_index()

In [33]:
chi_sq.sort_values('Chi_Square',ascending=0)

Unnamed: 0,index,Chi_Square
3,Volume,677576000000.0
1,High,4378.068
0,Open,4296.58
2,Low,4240.549
6,Buy_Sell,14.51569
5,Buy_Sell_on_Open,9.964768
4,Increase_Decrease,8.23209


L1 Feature Selection

In [34]:
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel

In [35]:
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, Y)
model = SelectFromModel(lsvc,prefit=True)

In [36]:
l1 = pd.DataFrame(model.get_support(), columns = ["L1"], index=features.columns)

In [37]:
l1 = l1.reset_index()

In [38]:
l1[l1['L1'] == True]

Unnamed: 0,index,L1
0,Open,True
1,High,True
2,Low,True


Multicollinearity Variance Inflation factor

In [40]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [41]:
def calculate_vif(features):
    vif = pd.DataFrame()
    vif["Features"] = features.columns
    vif["VIF"] = [variance_inflation_factor(features.values, i) for i in range(features.shape[1])]    
    return(vif)

In [42]:
vif = calculate_vif(features)
while vif['VIF'][vif['VIF'] > 10].any():
    remove = vif.sort_values('VIF',ascending=0)['Features'][:1]
    features.drop(remove,axis=1,inplace=True)
    vif = calculate_vif(features)

In [43]:
vif

Unnamed: 0,Features,VIF
0,Low,5.213371
1,Volume,4.09331
2,Increase_Decrease,1.695261
3,Buy_Sell_on_Open,1.754247
4,Buy_Sell,1.793754


In [48]:
from functools import reduce
dfs = [US, RFE, ET, RFC, chi_sq, l1, vif]
final_results = reduce(lambda left,right: pd.merge(left,right,on='index'), dfs)

ValueError: can not merge DataFrame with instance of type <class 'abc.ABCMeta'>