In [83]:
import pandas as pd
import os
# Load the Adult data file. header=None means the first line of the file is
# read as data, so the column names are supplied explicitly here.
adult = pd.read_csv(
    './Data/Adult/adult.data',
    header=None,
    names=[
        "Age",
        "Work-Class",
        "fnlwgt",
        "Education",
        "Education-Num",
        "Marital-Status",
        "Occupation",
        "Relationship",
        "Race",
        "Sex",
        "Capital-gain",
        "Capital-loss",
        "Hours-per-week",
        "Native-Country",
        "Earnings-Raws",
    ],
)
# Boolean flag: True for rows whose Hours-per-week value is greater than 40.
adult['LongHours'] = adult['Hours-per-week'].gt(40)


In [84]:
# Binary target: True where the raw earnings label equals ' >50K'
# (the leading space is part of the value being compared).
adult['High_Earnings'] = adult['Earnings-Raws'].eq(' >50K')
# Feature matrix X holds the numeric columns only; the goal of the following
# cells is to find the features that influence the target the most.
X = adult[
    [
        "Age",
        "Education-Num",
        "Capital-gain",
        "Capital-loss",
        "Hours-per-week",
    ]
].values


In [85]:
from sklearn.feature_selection import SelectKBest,chi2#选取K种影响结果程度最大的k个特征 
# Univariate selection: score every column of X against the target with the
# chi-squared statistic and keep the k=3 highest-scoring columns.
transformer_chi2 = SelectKBest(chi2, k=3)
X_chi2 = transformer_chi2.fit_transform(X, adult['High_Earnings'])
# One chi2 score per input feature; the higher the score, the more strongly
# the feature is ranked as influencing adult['High_Earnings'].
transformer_chi2.scores_

array([    8600.612,     2401.422, 82192467.142,  1372145.89 ,
           6476.409])

In [86]:
import numpy as np
from scipy.stats import pearsonr
def pearson(X, Y):
    """Score every column of ``X`` by its Pearson correlation with ``Y``.

    Written to be used as the ``score_func`` of ``SelectKBest``, which keeps
    the columns with the *largest* scores. The absolute value of each
    correlation coefficient is therefore returned, so that a strong negative
    correlation ranks as high as an equally strong positive one instead of
    being treated as the least informative feature.

    Parameters
    ----------
    X : 2-D array, indexed as ``X[:, col]``; one column per feature.
    Y : 1-D sequence with one value per row of ``X``.

    Returns
    -------
    (scores, ps) : tuple of two 1-D numpy arrays, one entry per column of
        ``X``. ``scores`` holds ``|r|``; ``ps`` holds the p-value reported by
        ``scipy.stats.pearsonr``. A small p-value means the correlation is
        statistically significant, so the coefficient should be read
        together with its p-value rather than on its own.
    """
    scores, ps = [], []
    for col in range(X.shape[1]):
        # Correlate all rows of the current column with Y.
        cur_score, cur_p = pearsonr(X[:, col], Y)
        # Magnitude, not sign, is what matters when ranking features.
        scores.append(abs(cur_score))
        ps.append(cur_p)
    return np.array(scores), np.array(ps)
        

In [87]:
# Same top-3 selection as before, but ranked with the custom Pearson scorer.
transformer_pearson = SelectKBest(pearson, k=3)
# The score function is invoked here, during fit_transform.
X_pearson = transformer_pearson.fit_transform(X, adult['High_Earnings'])
transformer_pearson.scores_

array([0.234, 0.335, 0.223, 0.151, 0.23 ])

In [88]:
# Show both reduced matrices side by side: each scoring method picked its own
# k=3 most influential features.
(X_pearson, X_chi2)

(array([[39, 13, 40],
        [50, 13, 13],
        [38,  9, 40],
        ...,
        [58,  9, 40],
        [22,  9, 20],
        [52,  9, 40]], dtype=int64), array([[   39,  2174,     0],
        [   50,     0,     0],
        [   38,     0,     0],
        ...,
        [   58,     0,     0],
        [   22,     0,     0],
        [   52, 15024,     0]], dtype=int64))

In [89]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; cross_val_score now lives in sklearn.model_selection. The fallback
# keeps the cell working on installations older than 0.18.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score
from sklearn.tree import DecisionTreeClassifier

# Fixed random_state so the tree, and therefore the scores, are reproducible.
clf = DecisionTreeClassifier(random_state=14)
# cv=3 is passed explicitly: 3 folds was the default of the old
# sklearn.cross_validation module, whereas newer scikit-learn releases default
# to 5 folds. Pinning it keeps results comparable across versions.
scores_chi2 = cross_val_score(clf, X_chi2, adult['High_Earnings'],
                              scoring='accuracy', cv=3)
scores_pearson = cross_val_score(clf, X_pearson, adult['High_Earnings'],
                                 scoring='accuracy', cv=3)
# Mean accuracy (in percent) for the chi2-selected and Pearson-selected features.
np.mean(scores_chi2)*100, np.mean(scores_pearson)*100

(82.85986761581545, 77.06459415969852)

In [91]:
'''
http://blog.codinglabs.org/articles/pca-tutorial.html
讲解PCA原理的博客
'''
from sklearn.decomposition import PCA
# Project the numeric feature matrix onto its first principal component.
# n_components=1, so X_d has a single column; that column is a linear
# combination of the original features, not a subset of them.
pca = PCA(n_components=1)
X_d = pca.fit_transform(X)
# Fraction of the total variance captured by the retained component.
print(pca.explained_variance_ratio_)
# cv=3 is passed explicitly: 3 folds was the default of the old
# sklearn.cross_validation module, whereas newer scikit-learn releases default
# to 5 folds. Pinning it keeps results comparable across versions.
scores = cross_val_score(clf, X_d, adult['High_Earnings'],
                         scoring='accuracy', cv=3)
np.mean(scores)*100
# In the recorded run, the single PCA component scores marginally higher than
# the three chi2-selected features.
# NOTE(review): PCA is applied to unscaled features here, so the component is
# presumably dominated by the largest-magnitude column - consider
# standardizing X before PCA and confirm the conclusion still holds.

[0.997]


82.92435855401915