In [2]:
import os
import pandas as pd
import numpy as np

# Path to the adult dataset CSV inside the current user's Downloads folder.
datapath = os.path.join(
    os.path.expanduser('~'),
    'Downloads',
    'adultdata.csv',
)

# The file is read without a header row, so the column labels are supplied
# explicitly, in file order.
column_names = [
    'Age',
    'work-class',
    'fnlwgt',
    'Education',
    'Education_Num',
    'Marital_status',
    'Occupation',
    'Relationship',
    'Race',
    'Sex',
    'Capital_gain',
    'Capital_loss',
    'Hours_per_week',
    'Native_country',
    'Earnings_raw',
]

adult = pd.read_csv(datapath, header=None, names=column_names)

In [3]:
# Dimensions (rows, columns) of the frame as loaded.
adult.shape

(32562, 15)

In [4]:
# Preview the first rows to check that the supplied column labels line up with the data.
adult.head()

Unnamed: 0,Age,work-class,fnlwgt,Education,Education_Num,Marital_status,Occupation,Relationship,Race,Sex,Capital_gain,Capital_loss,Hours_per_week,Native_country,Earnings_raw
0,39.0,State-gov,77516.0,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50.0,Self-emp-not-inc,83311.0,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,38.0,Private,215646.0,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
3,53.0,Private,234721.0,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
4,28.0,Private,338409.0,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K


In [5]:
# Count missing values in each column.
adult.isnull().sum()

Age               1
work-class        1
fnlwgt            1
Education         1
Education_Num     1
Marital_status    1
Occupation        1
Relationship      1
Race              1
Sex               1
Capital_gain      1
Capital_loss      1
Hours_per_week    1
Native_country    1
Earnings_raw      1
dtype: int64

In [6]:
# Remove rows in which every column is missing. The result is rebound to
# `adult` rather than mutating the frame with inplace=True, so the step is an
# explicit transformation and the cell stays idempotent on re-run.
adult = adult.dropna(how='all')

In [7]:
# Re-check the dimensions after removing fully-empty rows.
adult.shape

(32561, 15)

### Choose the best features

In [8]:
# extract data

# Feature matrix: the five selected columns of `adult`, as a plain NumPy array.
X = adult.loc[:, [
    'Age',
    'Education_Num',
    'Capital_gain',
    'Capital_loss',
    'Hours_per_week',
]].values

In [9]:
# Boolean target vector: True where Earnings_raw equals '>50K'.
y = adult['Earnings_raw'].eq('>50K').values

In [10]:
# Inspect the boolean target vector.
y

array([False, False, False, ..., False, False,  True])

In [11]:
# Inspect the feature matrix.
X

array([[3.9000e+01, 1.3000e+01, 2.1740e+03, 0.0000e+00, 4.0000e+01],
       [5.0000e+01, 1.3000e+01, 0.0000e+00, 0.0000e+00, 1.3000e+01],
       [3.8000e+01, 9.0000e+00, 0.0000e+00, 0.0000e+00, 4.0000e+01],
       ...,
       [5.8000e+01, 9.0000e+00, 0.0000e+00, 0.0000e+00, 4.0000e+01],
       [2.2000e+01, 9.0000e+00, 0.0000e+00, 0.0000e+00, 2.0000e+01],
       [5.2000e+01, 9.0000e+00, 1.5024e+04, 0.0000e+00, 4.0000e+01]])

In [35]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Score each column of X against y with the chi-squared statistic, then keep
# the three highest-scoring columns.
transformer = SelectKBest(chi2, k=3)
transformer.fit(X, y)
Xt_chi2 = transformer.transform(X)

# Per-column scores, in the same order as the columns of X.
print(transformer.scores_)

[8.60061182e+03 2.40142178e+03 8.21924671e+07 1.37214589e+06
 6.47640900e+03]


In [36]:
# Disabled alternative: refit the chi2 selector keeping 4 columns instead of 3.
#transformer = SelectKBest(score_func=chi2, k=4)
#Xt_chi2 = transformer.fit_transform(X,y)

# Because the refit above is commented out, this prints the scores of whichever
# selector is currently bound to `transformer`.
print(transformer.scores_)

[8.60061182e+03 2.40142178e+03 8.21924671e+07 1.37214589e+06
 6.47640900e+03]


In [37]:
# Number of feature columns in X.
X.shape[1]

5

In [38]:
# Use Pearson correlation coefficient

from scipy.stats import pearsonr

def multivariate_pearsonr(X,y):
    """Score each column of ``X`` by its absolute Pearson correlation with ``y``.

    The return shape follows the ``score_func`` convention used by
    ``SelectKBest``: one score and one p-value per column.

    Parameters
    ----------
    X : 2-D array indexable as ``X[:, j]``
        Feature matrix; each column is scored independently.
    y : 1-D array
        Target values, one per row of ``X``.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        Absolute correlation coefficients and the matching p-values, both
        ordered like the columns of ``X``.
    """
    # One pearsonr call per column; each result unpacks to (r, p-value).
    per_column = [pearsonr(X[:, idx], y) for idx in range(X.shape[1])]
    # The sign is discarded: a strong negative correlation is as informative
    # for feature ranking as a strong positive one.
    abs_corr = np.array([abs(r) for r, _ in per_column])
    p_vals = np.array([p for _, p in per_column])
    return (abs_corr, p_vals)

In [39]:
# Rank the columns of X by absolute Pearson correlation with y and keep the
# three highest-scoring ones.
transformer = SelectKBest(multivariate_pearsonr, k=3)
Xt_pearson = transformer.fit(X, y).transform(X)

# Per-column scores, in the same order as the columns of X.
print(transformer.scores_)

[0.2340371  0.33515395 0.22332882 0.15052631 0.22968907]


In [40]:
Xt_pearson # The number of features is set by the k passed to SelectKBest; Xt_pearson was built with k=3

array([[39., 13., 40.],
       [50., 13., 13.],
       [38.,  9., 40.],
       ...,
       [58.,  9., 40.],
       [22.,  9., 20.],
       [52.,  9., 40.]])

In [41]:
Xt_chi2 # The number of features is set by the k passed to SelectKBest; Xt_chi2 was built with k=3 (the k=4 refit is commented out)

array([[   39.,  2174.,     0.],
       [   50.,     0.,     0.],
       [   38.,     0.,     0.],
       ...,
       [   58.,     0.,     0.],
       [   22.,     0.,     0.],
       [   52., 15024.,     0.]])

In [45]:
# Use a classifier to evaluate the selected feature sets

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

# A decision tree with a fixed random_state is evaluated on each reduced
# feature matrix; cv=10 yields ten accuracy scores per feature set.
clf = DecisionTreeClassifier(random_state=1)

scores_pearson = cross_val_score(
    estimator=clf,
    X=Xt_pearson,
    y=y,
    cv=10,
    scoring='accuracy',
)

scores_chi2 = cross_val_score(
    estimator=clf,
    X=Xt_chi2,
    y=y,
    cv=10,
    scoring='accuracy',
)

In [46]:
# Mean cross-validated accuracy for the chi2-selected features.
scores_chi2.mean()

0.8296120753712095

In [47]:
# Mean cross-validated accuracy for the Pearson-selected features.
scores_pearson.mean()

0.7734713797309745

In [50]:
# Create new feature

# Read the ad dataset from the current user's Downloads folder. No header row
# is used, so pandas assigns integer column labels.
adDataPath = os.path.join(
    os.path.expanduser('~'),
    'Downloads',
    'adData.csv',
)
adData = pd.read_csv(filepath_or_buffer=adDataPath, header=None)

In [51]:
# Display the ad data as loaded; columns carry integer labels because it was read with header=None.
adData

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,245,246,247,248,249,250,251,252,253,254
0,125,125,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,57,468,8.2105,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,33,230,6.9696,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,60,468,7.8,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,60,468,7.8,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,60,468,7.8,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,59,460,7.7966,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,60,234,3.9,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,60,468,7.8,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,60,468,7.8,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
