In [1]:
import pandas as pd
import os
# Column labels for the data file, listed in file order (the file itself has no header row).
adult_columns = [
    "Age",
    "Work-Class",
    "fnlwgt",
    "Education",
    "Education-Num",
    "Marital-Status",
    "Occupation",
    "Relationship",
    "Race",
    "Sex",
    "Capital-gain",
    "Capital-loss",
    "Hours-per-week",
    "Native-Country",
    "Earnings-Raws",
]
adult = pd.read_csv('./Data/Adult/adult.data', header=None, names=adult_columns)

# Boolean flag: True for rows reporting more than 40 hours per week.
adult['LongHours'] = adult['Hours-per-week'] > 40


In [2]:
# Boolean target: True where the raw earnings label equals ' >50K'.
# Note the leading space inside the literal; it must match the stored label exactly.
adult['High_Earnings'] = adult['Earnings-Raws'].eq(' >50K')

# Collect the numeric features into X; the goal of the following cells is to
# find the few features that influence the target the most.
X = adult[
    ["Age", "Education-Num", "Capital-gain", "Capital-loss", "Hours-per-week"]
].values


In [3]:
# SelectKBest keeps the k features with the highest scores under a given scoring function.
from sklearn.feature_selection import SelectKBest, chi2

# Score with the chi-squared statistic and keep the best k = 3 columns.
transformer_chi2 = SelectKBest(k=3, score_func=chi2)
X_chi2 = transformer_chi2.fit_transform(X, adult['High_Earnings'])

# One score per column of X: the higher the score, the stronger that feature's
# measured association with the target adult['High_Earnings'].
transformer_chi2.scores_

array([8.60061182e+03, 2.40142178e+03, 8.21924671e+07, 1.37214589e+06,
       6.47640900e+03])

In [4]:
import numpy as np
from scipy.stats import pearsonr
def pearson(X, Y):
    """Score each column of X by the strength of its Pearson correlation with Y.

    Usable as a ``score_func`` for ``SelectKBest``, which keeps the columns
    with the *largest* scores. For that reason the score is the absolute
    value of r: a strongly negative correlation is just as informative as a
    strongly positive one and must not be ranked last.

    Parameters
    ----------
    X : 2-D array
        One column per feature; columns are read as ``X[:, col]``.
    Y : 1-D array-like
        Target values, one per row of X.

    Returns
    -------
    (scores, ps) : tuple of two 1-D numpy arrays, one entry per column of X
        ``scores`` holds |r|, ``ps`` holds the p-value reported by
        ``scipy.stats.pearsonr`` for that column.
    """
    scores, ps = [], []
    for col in range(X.shape[1]):
        # Correlation between every row's value in this column and Y.
        cur_score, cur_p = pearsonr(X[:, col], Y)
        # The p-value measures significance, not strength: a small p means the
        # observed correlation is unlikely if the two were truly uncorrelated.
        # The coefficient and the p-value should therefore be read together.
        scores.append(abs(cur_score))
        ps.append(cur_p)
    return np.array(scores), np.array(ps)
        

In [5]:
# Same selector as before, but ranked by the custom `pearson` scoring function.
transformer_pearson = SelectKBest(k=3, score_func=pearson)

# The scoring function is invoked here, during fitting.
X_pearson = transformer_pearson.fit_transform(X, adult['High_Earnings'])

transformer_pearson.scores_

array([0.2340371 , 0.33515395, 0.22332882, 0.15052631, 0.22968907])

In [6]:
X_pearson,X_chi2  # the k=3 columns kept by each scoring method, shown side by side

(array([[39, 13, 40],
        [50, 13, 13],
        [38,  9, 40],
        ...,
        [58,  9, 40],
        [22,  9, 20],
        [52,  9, 40]], dtype=int64), array([[   39,  2174,     0],
        [   50,     0,     0],
        [   38,     0,     0],
        ...,
        [   58,     0,     0],
        [   22,     0,     0],
        [   52, 15024,     0]], dtype=int64))

In [7]:
# sklearn.cross_validation was removed in scikit-learn 0.20; cross_val_score now
# lives in sklearn.model_selection. Fall back to the old module only on very old
# installations that predate model_selection.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score
from sklearn.tree import DecisionTreeClassifier

# Fixed random_state so the tree is built the same way on every run.
clf = DecisionTreeClassifier(random_state=14)

# cv=3 is passed explicitly: the default fold count differs between scikit-learn
# releases (3 in older ones, 5 in newer ones), which would silently change the scores.
scores_chi2 = cross_val_score(clf, X_chi2, adult['High_Earnings'],
                              scoring='accuracy', cv=3)
scores_pearson = cross_val_score(clf, X_pearson, adult['High_Earnings'],
                                 scoring='accuracy', cv=3)

# Mean accuracy (in percent) for the chi2-selected and pearson-selected features.
np.mean(scores_chi2)*100, np.mean(scores_pearson)*100



(82.85986761581545, 77.06459415969852)

In [8]:
# Blog post explaining how PCA works:
# http://blog.codinglabs.org/articles/pca-tutorial.html
from sklearn.decomposition import PCA

pca = PCA(n_components=1)

# X_d is X reduced from its 5 columns to a single principal component.
X_d = pca.fit_transform(X)

# Fraction of the total variance captured by the retained component.
print(pca.explained_variance_ratio_)

scores = cross_val_score(clf, X_d, adult['High_Earnings'], scoring='accuracy')
np.mean(scores) * 100
# Note: PCA does not pick original features; it projects the data onto the
# direction of greatest variance. In the recorded run this single component
# scored marginally higher than the three chi2-selected features.

[0.99702849]


82.92435855401915

In [14]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils import as_float_array

In [132]:
class MeanDiscrete(TransformerMixin, BaseEstimator):
    """Transformer that reports, per column, whether each value exceeds the column mean.

    ``fit`` learns one mean per column; ``transform`` returns a boolean array
    of the same shape as its input, True where the value is strictly greater
    than the learned mean of its column.

    ``fit_transform`` is inherited from ``TransformerMixin``. ``BaseEstimator``
    supplies ``get_params``/``set_params`` so the object can be cloned when it
    is used inside a ``Pipeline`` or passed to cross-validation helpers.
    """

    def fit(self, X, Y=None):
        """Learn the per-column means of X.

        Y is accepted and ignored; the default of None lets the method be
        called both as ``fit(X)`` and as ``fit(X, Y)``.

        Returns self so calls can be chained.
        """
        X = as_float_array(X)
        self.mean = X.mean(axis=0)
        return self

    def transform(self, X):
        """Return a boolean array marking the values of X above the fitted column means.

        Raises
        ------
        ValueError
            If X is not 2-D or its column count differs from the fitted data.
        """
        X = as_float_array(X)
        # Explicit check instead of `assert`, which is skipped under `python -O`.
        if X.ndim != 2 or X.shape[1] != self.mean.shape[0]:
            raise ValueError(
                "X must be 2-D with %d columns, got shape %r"
                % (self.mean.shape[0], X.shape)
            )
        return X > self.mean


In [135]:
mean_discrete = MeanDiscrete()
# fit_transform is inherited from TransformerMixin; it runs MeanDiscrete's own fit and transform.
X_mean = mean_discrete.fit_transform(X)

In [136]:
from numpy.testing import assert_array_equal

In [137]:
# Test cell
def test_meandiscrete():
    """Check MeanDiscrete's fitted means and its output on a small 10x2 grid."""
    # Column 0 runs 0, 3, ..., 27 and column 1 runs 2, 5, ..., 29.
    first_column = np.arange(0, 30, 3)
    X_test = np.column_stack([first_column, first_column + 2])

    discretiser = MeanDiscrete()
    discretiser.fit(X_test)
    assert_array_equal(discretiser.mean, np.array([13.5, 15.5]))

    # The first five rows lie below both means, the last five above them.
    X_expected = np.repeat([[0, 0], [1, 1]], 5, axis=0)
    X_transformed = discretiser.transform(X_test)
    assert_array_equal(X_transformed, X_expected)


test_meandiscrete()


In [139]:
# Custom transformer: MeanDiscrete reports, for every column, whether each
# value is greater than that column's mean.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

pipeline = Pipeline(steps=[
    ("meandiscrete", MeanDiscrete()),
    ("classfier", DecisionTreeClassifier(random_state=14)),
])
scores = cross_val_score(pipeline, X, adult['High_Earnings'], scoring='accuracy')

# Giving a parameter a default value lets one function serve callers that pass
# different numbers of arguments, as MeanDiscrete.fit does with Y=None.
np.mean(scores) * 100
# Alternative without a pipeline (it fits the means on all rows before cross-validation splits them):
# np.mean(cross_val_score(clf,MeanDiscrete().fit_transform(X),adult['High_Earnings'] ,scoring = 'accuracy'))*100


80.2708793713566