# Feature Engineering

In [38]:
# Feature Extraction with Univariate Statistical Tests (Chi-squared for classification)
from pandas import read_csv
from numpy import set_printoptions
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif,chi2,f_classif
from sklearn.feature_selection import f_regression, mutual_info_regression

In [22]:
# Load the Pima Indians diabetes CSV. Column labels are supplied explicitly
# through `names`, so pandas treats every line of the file as a data row.
filename = 'pima-indians-diabetes.data.csv'
names = [
    'preg', 'plas', 'pres', 'skin',
    'test', 'mass', 'pedi', 'age',
    'class',
]
dataframe = read_csv(filename, names=names)
# Bare last expression -> rich display of the loaded frame
dataframe

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [23]:
# Work on the frame's underlying NumPy array and split it column-wise:
# the first eight columns are the predictors, the ninth is the 'class' label.
array = dataframe.values
X, Y = array[:, :8], array[:, 8]

In [12]:
# Sanity check of the loaded frame as (rows, columns); nine columns are
# expected: the eight predictors plus the 'class' label.
dataframe.shape

(768, 9)

In [None]:
# For classification (categorical Y variable): chi2, f_classif, mutual_info_classif

In [34]:
# Univariate selection with the chi-squared statistic: score every feature
# against the target and keep the 4 highest-scoring ones.
set_printoptions(precision=3)  # show scores with three decimals
test = SelectKBest(k=4, score_func=chi2)
fit = test.fit(X, Y)
# One score per input column, in column order
print(fit.scores_)
# Reduce X to the selected columns only
features = fit.transform(X)

[ 111.52  1411.887   17.605   53.108 2175.565  127.669    5.393  181.304]


In [35]:
# Univariate selection with the ANOVA F-test (f_classif): score every feature
# against the target and keep the 4 highest-scoring ones.
set_printoptions(precision=3)  # show scores with three decimals
test = SelectKBest(k=4, score_func=f_classif)
fit = test.fit(X, Y)
# One score per input column, in column order
print(fit.scores_)
# Reduce X to the selected columns only
features = fit.transform(X)

[ 39.67  213.162   3.257   4.304  13.281  71.772  23.871  46.141]


In [36]:
# feature extraction mutual_info_classif
# mutual_info_classif is a randomized estimator, so unseeded runs print
# slightly different scores each time. Binding random_state with
# functools.partial makes the scores (and the selected features) reproducible.
from functools import partial

test = SelectKBest(score_func=partial(mutual_info_classif, random_state=0), k=4)
fit = test.fit(X, Y)
# summarize scores
set_printoptions(precision=3)
print(fit.scores_)
# keep only the 4 selected columns of X
features = fit.transform(X)

[0.039 0.124 0.038 0.042 0.062 0.072 0.014 0.057]


In [None]:
#For regression (Numeric Y Variable): f_regression, mutual_info_regression

In [39]:
# Univariate selection with the regression F-test (f_regression): score every
# feature against the target and keep the 4 highest-scoring ones.
# NOTE(review): Y is still the 'class' column from the diabetes data, so this
# cell appears to demonstrate the API rather than a true regression target.
set_printoptions(precision=3)  # show scores with three decimals
test = SelectKBest(k=4, score_func=f_regression)
fit = test.fit(X, Y)
# One score per input column, in column order
print(fit.scores_)
# Reduce X to the selected columns only
features = fit.transform(X)

[ 39.67  213.162   3.257   4.304  13.281  71.772  23.871  46.141]


In [40]:
# feature extraction mutual_info_regression
# mutual_info_regression is a randomized estimator, so unseeded runs print
# slightly different scores each time. Binding random_state with
# functools.partial makes the scores (and the selected features) reproducible.
from functools import partial

test = SelectKBest(score_func=partial(mutual_info_regression, random_state=0), k=4)
fit = test.fit(X, Y)
# summarize scores
set_printoptions(precision=3)
print(fit.scores_)
# keep only the 4 selected columns of X
features = fit.transform(X)

[0.018 0.15  0.019 0.    0.069 0.086 0.01  0.047]


# Recursive Feature Elimination

In [16]:
# Feature Extraction with RFE (Recursive Feature Elimination)
from pandas import read_csv
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
# load data
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:,0:8]   # eight predictor columns
Y = array[:,8]     # 'class' label column
# feature extraction
# max_iter is raised above the default of 100 iterations
# (presumably so the solver converges on this unscaled data).
model = LogisticRegression(max_iter=400)
# n_features_to_select must be passed by keyword: current scikit-learn releases
# reject it as a positional argument. 3 is the number of features left after
# elimination finishes.
rfe = RFE(model, n_features_to_select=3)
fit = rfe.fit(X, Y)




In [17]:
# Number of features RFE kept after elimination; this matches the 3 requested
# when the RFE selector was built.
fit.n_features_

3

In [18]:
# Selected features: boolean mask over the input columns, where True marks a
# feature that RFE kept.
fit.support_

array([ True, False, False, False, False,  True,  True, False])

In [19]:
# Feature ranking, one entry per input column: rank 1 is the best rank and is
# assigned to every selected feature; higher numbers mark features that were
# dropped earlier in the elimination.
fit.ranking_

array([1, 2, 4, 6, 5, 1, 1, 3])

# Feature Importance using Decision Tree

In [42]:
# Feature Importance with a Decision Tree classifier (categorical Y values)
from pandas import read_csv
from sklearn.tree import  DecisionTreeClassifier
# load data
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:,0:8]   # eight predictor columns
Y = array[:,8]     # 'class' label column
# feature extraction
# Tree construction breaks ties between equally good splits at random, so an
# unseeded tree reports different importances on every run; fix the seed to
# make the printed importances reproducible.
model = DecisionTreeClassifier(random_state=0)
model.fit(X, Y)
# One importance per input column, in column order
print(model.feature_importances_)

[0.045 0.323 0.076 0.023 0.046 0.24  0.134 0.114]


In [44]:
# Feature Importance with a Decision Tree regressor (numeric Y values)
from pandas import read_csv
from sklearn.tree import  DecisionTreeRegressor
# load data
filename = 'pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
X = array[:,0:8]   # eight predictor columns
Y = array[:,8]     # 'class' label column, treated as a numeric target here
# feature extraction
# Use the regressor this cell imports. The previous version instantiated
# DecisionTreeClassifier, which silently repeated the classifier experiment
# and only ran because an earlier cell had imported that class.
# random_state is fixed so the printed importances are reproducible.
model = DecisionTreeRegressor(random_state=0)
model.fit(X, Y)
# One importance per input column, in column order
print(model.feature_importances_)

[0.048 0.318 0.089 0.015 0.037 0.255 0.125 0.112]
