In [1]:
import numpy as np
import warnings

from sklearn.feature_selection import VarianceThreshold,SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingClassifier


In [2]:
# Toy design matrix: 3 samples (rows) x 4 features (columns), stored as float32.
X = np.asarray(
    [
        [0.0, 2.0, 0.0, 3.0],
        [0.0, 1.0, 4.0, 3.0],
        [0.1, 1.0, 1.0, 3.0],
    ],
    dtype=np.float32,
)
# One target value per row of X.
Y = np.asarray([1, 2, 1])

In [3]:
# Variance-based feature selection: configure the selector with a variance
# threshold of 0.1 and show its configuration before fitting.
variance = VarianceThreshold(threshold=0.1)
print(variance)
variance.fit(X)
# Report the per-feature variances learned by fit(), a separator, and then X
# reduced to the columns the selector keeps.
print(
    "各个特征属性的方差为:",
    variance.variances_,
    '-----------------',
    variance.transform(X),
    sep="\n",
)

VarianceThreshold(threshold=0.1)
各个特征属性的方差为:
[  2.22222229e-03   2.22222222e-01   2.88888889e+00   0.00000000e+00]
-----------------
[[ 2.  0.]
 [ 1.  4.]
 [ 1.  1.]]


In [4]:
# Univariate selection: score each feature against Y with f_regression and keep
# the two best-scoring columns (k=2).
# With X as defined in the data cell, the fourth column is constant; the
# recorded output shows its score as nan, accompanied by RuntimeWarning residue.
sk1 = SelectKBest(score_func=f_regression, k=2).fit(X, Y)
# Show the fitted selector, the per-feature scores, and the reduced matrix.
print(
    sk1,
    '------------',
    sk1.scores_,
    '------------',
    sk1.transform(X),
    sep="\n",
)

SelectKBest(k=2, score_func=<function f_regression at 0x000002045A8E2268>)
------------
[  0.33333333   0.33333333  16.33333333          nan]
------------
[[ 2.  0.]
 [ 1.  4.]
 [ 1.  1.]]


  corr /= row_norms(X.T)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


In [5]:
# chi2 scoring requires every feature value to be non-negative.
sk2 = SelectKBest(score_func=chi2, k=2).fit(X, Y)
# Show the fitted selector, the chi2 score of each feature, and X reduced to
# the two selected columns.
print(sk2, sk2.scores_, sk2.transform(X), sep="\n")

SelectKBest(k=2, score_func=<function chi2 at 0x000002045A8E21E0>)
[ 0.05   0.125  4.9    0.   ]
[[ 2.  0.]
 [ 1.  4.]
 [ 1.  1.]]


In [6]:
# Feature selection by recursive feature elimination (RFE): the wrapped
# estimator is used to discard features (step=1, i.e. one per round) until
# only the requested number of features remains.
estimator = SVR(kernel='linear')
# n_features_to_select is passed by keyword: it is keyword-only in current
# scikit-learn releases, where the positional form RFE(estimator, 2) raises a
# TypeError. The keyword form is also accepted by older releases.
selector = RFE(estimator, n_features_to_select=2, step=1)
selector = selector.fit(X, Y)
print(selector.support_)     # boolean mask, True for each kept feature
print(selector.n_features_)  # number of features that were selected
print(selector.ranking_)     # per-feature rank; selected features have rank 1
print(selector.transform(X)) # X restricted to the selected columns

[False  True  True False]
2
[2 1 1 3]
[[ 2.  0.]
 [ 1.  4.]
 [ 1.  1.]]


In [24]:
# Toy data for model-based selection: 4 samples x 4 features, two classes
# (labelled 0 and 2).
X2 = np.array([
    [ 5.1,  3.5,  1.4,  0.2],
    [ 4.9,  3. ,  1.4,  0.2],
    [ -6.2,  0.4,  5.4,  2.3],
    [ -5.9,  0. ,  5.1,  1.8]
], dtype=np.float64)
Y2 = np.array([0, 0, 2, 2])
# L1-penalised logistic regression as the base model. The solver is pinned to
# 'liblinear' because the L1 penalty is not supported by the 'lbfgs' solver
# that newer scikit-learn releases use by default; without it the fit raises a
# ValueError there. 'liblinear' supports L1 on both old and new releases.
estimator = LogisticRegression(penalty='l1', C=0.1, solver='liblinear')
# SelectFromModel fits the estimator and keeps the features it deems important
# based on the fitted coefficients; in the recorded run only the first column
# has a non-zero coefficient and only that column is kept.
sfm = SelectFromModel(estimator)
sfm.fit(X2, Y2)
print(sfm.transform(X2))
print("系数:")
# Coefficients of the fitted base model (one row, one entry per feature).
print(sfm.estimator_.coef_)

[[ 5.1]
 [ 4.9]
 [-6.2]
 [-5.9]]
系数:
[[-0.03417754  0.          0.          0.        ]]


In [33]:
# Toy data: 4 samples x 4 features, two classes (labelled 0 and 2).
X2 = np.asarray(
    [
        [5.1, 3.5, 1.4, 0.2],
        [4.9, 3.0, 1.4, 0.2],
        [-6.2, 0.4, 5.4, 2.3],
        [-5.9, 0.0, 5.1, 1.8],
    ],
    dtype=np.float64,
)
Y2 = np.asarray([0, 0, 2, 2])
# Gradient-boosted trees as the base model; the seed is fixed for repeatability.
estimator = GradientBoostingClassifier(random_state=14)
# Author's guidance: when the base model is a GBDT, a threshold of 0 is usually
# the better choice, because any feature with importance > 0 has some influence
# on y and is worth keeping. Dropping features whose importance is above 0
# removes some discriminative power from later training, so the downstream
# model may perform slightly worse.
# This cell nevertheless uses 0.15; in the recorded output that drops the one
# feature whose importance (0.12) falls below the threshold.
sfm = SelectFromModel(estimator, threshold=0.15).fit(X2, Y2)
# Show the reduced matrix, then the importances of the fitted base model.
print(
    sfm.transform(X2),
    "权重因子:",
    sfm.estimator_.feature_importances_,
    sep="\n",
)

[[ 5.1  3.5  0.2]
 [ 4.9  3.   0.2]
 [-6.2  0.4  2.3]
 [-5.9  0.   1.8]]
权重因子:
[ 0.18  0.23  0.12  0.21]


In [36]:
from sklearn.decomposition import PCA
# Toy data for PCA: 4 samples x 6 features.
X2 = np.asarray(
    [
        [5.1, 3.5, 1.4, 0.2, 1, 23],
        [4.9, 3.0, 1.4, 0.2, 2.3, 2.1],
        [-6.2, 0.4, 5.4, 2.3, 2, 23],
        [-5.9, 0.0, 5.1, 1.8, 2, 3],
    ],
    dtype=np.float64,
)
# n_components: the number of dimensions to reduce to. It should not exceed
# min(n_samples, n_features).
# NOTE(review): the original note said a larger value is silently clamped to
# that minimum; current scikit-learn releases appear to reject it instead.
# Confirm against the installed version.
# whiten: whether to additionally whiten the projected data, i.e. rescale the
# projected components (the recorded output has magnitudes close to 1).
pca = PCA(n_components=3, whiten=True).fit(X2)
# Show the per-feature means, the principal axes (one row per component), and
# the projected (whitened) samples.
print(pca.mean_, pca.components_, pca.transform(X2), sep="\n")

[ -0.525   1.725   3.325   1.125   1.825  12.775]
[[ 0.02038178 -0.01698103 -0.01350052 -0.0149724   0.03184796 -0.99893718]
 [ 0.9024592   0.25030511 -0.31422084 -0.15092666 -0.03185873  0.01965141]
 [-0.08872116 -0.06952185 -0.06858116 -0.3074396  -0.94204108 -0.02512755]]
[[-0.98788492  1.06095297  0.94787245]
 [ 1.0554112   0.93712001 -1.00394885]
 [-1.01046952 -0.99473186 -0.99471598]
 [ 0.94294324 -1.00334112  1.05079239]]


In [10]:
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# Toy data for LDA: 6 samples x 4 features. Note that this rebinds X.
X = np.asarray(
    [
        [-1, -1, 3, 1],
        [-2, -1, 2, 4],
        [-3, -2, 4, 5],
        [1, 1, 5, 4],
        [2, 1, 6, -5],
        [3, 2, 1, 5],
    ]
)
# Class label of each sample; three distinct classes (0, 1, 2).
y = np.asarray([1, 1, 2, 2, 0, 1])
# n_components: the number of dimensions to reduce to. It is bounded by the
# number of distinct values in y and must not exceed n_classes - 1 (here
# 3 - 1 = 2).
clf = LinearDiscriminantAnalysis(n_components=2).fit(X, y)
# Project the samples onto the two discriminant axes.
print(clf.transform(X))

[[-3.2688434  -0.38911349]
 [-1.25507558 -1.78088569]
 [ 5.26064254 -0.49688862]
 [ 6.34385833  1.16134391]
 [-4.05800618  3.58297801]
 [-3.02257571 -2.07743411]]


