In [2]:
#Data Feature Selection: Univariate Selection
from pandas import read_csv
from numpy import set_printoptions
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

path = r"../../data/datasets/pima_indians_diabetes.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(path, names=names, skiprows=[0])

array = dataframe.values

X = array[:,0:8]
Y = array[:,8]

test = SelectKBest(score_func=chi2, k=4)
fit = test.fit(X,Y)

set_printoptions(precision=2)
print(fit.scores_)

featured_data = fit.transform(X)
print ("\nFeatured data:\n", featured_data[0:4])

[ 111.52 1411.89   17.61   53.11 2175.57  127.67    5.39  181.3 ]

Featured data:
 [[148.    0.   33.6  50. ]
 [ 85.    0.   26.6  31. ]
 [183.    0.   23.3  32. ]
 [ 89.   94.   28.1  21. ]]


In [20]:
#Data Feature Selection: Recursive Feature Elimination
from pandas import read_csv
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

path = r"../../data/datasets/pima_indians_diabetes.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(path, names=names, skiprows=[0])
array = dataframe.values

X = array[:,0:8]
Y = array[:,8]

model = LogisticRegression(max_iter=1000)
rfe = RFE(model, n_features_to_select=3)
fit = rfe.fit(X, Y)

print("Number of Features: %d")
print("Selected Features: %s")
print("Feature Ranking: %s")

Number of Features: %d
Selected Features: %s
Feature Ranking: %s


In [28]:
#Data Feature Selection: PCA
from pandas import read_csv
from sklearn.decomposition import PCA

path = r"../../data/datasets/pima_indians_diabetes.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(path, names=names, skiprows=[0])
array = dataframe.values

X = array[:,0:8]
Y = array[:,8]

pca = PCA(n_components=3)
fit = pca.fit(X)

print(fit.components_)

[[-2.02e-03  9.78e-02  1.61e-02  6.08e-02  9.93e-01  1.40e-02  5.37e-04
  -3.56e-03]
 [-2.26e-02 -9.72e-01 -1.42e-01  5.79e-02  9.46e-02 -4.70e-02 -8.17e-04
  -1.40e-01]
 [-2.25e-02  1.43e-01 -9.22e-01 -3.07e-01  2.10e-02 -1.32e-01 -6.40e-04
  -1.25e-01]]


In [2]:
#Data Feature Selection: Feature Importance
from pandas import read_csv
from sklearn.ensemble import ExtraTreesClassifier

path = r"../../data/datasets/pima_indians_diabetes.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(path, names=names, skiprows=[0])
array = dataframe.values

X = array[:,0:8]
Y = array[:,8]

model = ExtraTreesClassifier()
model.fit(X, Y)
print(model.feature_importances_)

[0.11275642 0.22704687 0.09683332 0.08395854 0.07496783 0.14338731
 0.12224294 0.13880675]
