# Load the preprocessed data: descriptive values have already been converted to numbers (e.g. [male, female] -> [0, 1])

[【机器学习实验】scikit-learn的主要模块和基本使用](http://www.jianshu.com/p/1c6efdbce226)

In [1]:
def loadXandY(fileName):
    """Load a comma-separated numeric file and split it into features and target.

    Every column except the last is returned as the feature matrix; the last
    column is returned as the target vector. The number of rows and columns
    read from the file is printed.

    Parameters
    ----------
    fileName : str
        Path to a text file holding only comma-separated numeric values.

    Returns
    -------
    X : numpy.ndarray
        Two-dimensional array with all columns but the last.
    y : numpy.ndarray
        One-dimensional array with the last column.
    """
    import numpy as np

    # ndmin=2 keeps the result two-dimensional even when the file holds a
    # single row; without it loadtxt returns a 1-D array and both the column
    # count and the column slicing below break.
    with open(fileName, 'r') as f:
        dataset = np.loadtxt(f, delimiter=",", ndmin=2)

    n_rows, attrsize = dataset.shape
    print(n_rows, attrsize)

    # separate the data from the target attributes
    X = dataset[:, :-1]
    y = dataset[:, -1]
    return X, y

In [2]:
# Directory holding the numerically encoded data files. The file names below
# are appended by plain string concatenation, so the trailing '/' is required.
# NOTE(review): this is an absolute path on the author's machine; switch to the
# relative alternative below (or your own location) when running elsewhere.
baseDir = 'H:/practice/scikit_class/scikit_learning/uci_adult/adult_data/'
# baseDir = 'adult_data/'

# File used to fit the models in the cells below.
fileName = baseDir+'adult.data.num'

# File used only for evaluating the fitted models.
testFileName = baseDir+'adult.test.num'

# X / y: features and target read from fileName.
# TX / Ty: features and target read from testFileName.
X,y = loadXandY(fileName)
TX,Ty = loadXandY(testFileName)

32561 15
16281 15


# Deal with missing values
[reference](http://scikit-learn.org/stable/modules/preprocessing.html#imputation-of-missing-values)

In [3]:
def simpleImputer(X):
    """Replace missing values (NaN) in ``X`` with the mean of their column.

    The imputer is fitted on ``X`` itself, so the column means are computed
    from the same array that is being transformed.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Feature matrix that may contain NaN entries.

    Returns
    -------
    numpy.ndarray
        The transformed feature matrix with NaN entries filled in.
    """
    import numpy as np

    try:
        # scikit-learn >= 0.20 provides SimpleImputer; the legacy Imputer
        # class was removed in 0.22. SimpleImputer always works column-wise,
        # which matches the axis=0 setting used with the legacy class.
        from sklearn.impute import SimpleImputer
        imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    except ImportError:
        # Older scikit-learn releases only ship the legacy Imputer class.
        from sklearn.preprocessing import Imputer
        imp = Imputer(missing_values='NaN', strategy='mean', axis=0)

    imp.fit(X)
    return imp.transform(X)

In [4]:
# Fill in missing values in both feature matrices. Each call fits its own
# imputer on the array it receives, so TX is filled with column means computed
# from TX rather than from X.
X = simpleImputer(X)
TX = simpleImputer(TX)

数据归一化(Data Normalization)

In [5]:
def simpleProcessing(X, method=None):
    """Optionally rescale the feature matrix ``X``.

    Previously this function computed a normalized and a standardized copy of
    ``X``, discarded both and returned ``X`` unchanged. The default keeps that
    return value (without the wasted computation); pass ``method`` to actually
    get a rescaled matrix.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Feature matrix.
    method : {None, 'normalize', 'standardize'}, optional
        ``None`` (default) returns ``X`` itself, untouched.
        ``'normalize'`` returns ``sklearn.preprocessing.normalize(X)``.
        ``'standardize'`` returns ``sklearn.preprocessing.scale(X)``.

    Returns
    -------
    array-like
        ``X`` itself when ``method`` is ``None``, otherwise the rescaled data.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported values.
    """
    if method is None:
        return X
    if method not in ('normalize', 'standardize'):
        raise ValueError(
            "method must be None, 'normalize' or 'standardize', got %r" % (method,)
        )

    # Imported only when a rescaling is actually requested.
    from sklearn import preprocessing

    if method == 'normalize':
        # normalize the data attributes
        return preprocessing.normalize(X)
    # standardize the data attributes
    return preprocessing.scale(X)

In [6]:
# Run the same preprocessing step on the training and the test features.
# (The test features were previously passed to simpleImputer a second time,
# which skipped this step for TX.)
X = simpleProcessing(X)
TX = simpleProcessing(TX)

特征选择(Feature Selection)

In [7]:
# Fit an ExtraTreesClassifier on the training data and print one importance
# score per feature column of X.
# NOTE(review): no random_state is passed, so the scores will presumably vary
# slightly between runs.
from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier
model = ExtraTreesClassifier()
model.fit(X, y)
# display the relative importance of each attribute
print(model.feature_importances_)

[ 0.15935025  0.04029886  0.16221789  0.03665789  0.08092646  0.06885382
  0.08127987  0.09690376  0.01489237  0.02173822  0.09114213  0.03247581
  0.09440882  0.01885385]


In [8]:
def make_predictions(model, X, y):
    """Print how well a fitted ``model`` predicts ``y`` from ``X``.

    Two summaries are printed, in this order: the classification report and
    the confusion matrix, followed by a line holding a newline character.
    Nothing is returned.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``.
    X : array-like
        Feature matrix to predict on.
    y : array-like
        Expected labels for the rows of ``X``.
    """
    from sklearn import metrics

    y_pred = model.predict(X)
    # Both summaries take the expected labels first and the predictions second.
    for summarize in (metrics.classification_report, metrics.confusion_matrix):
        print(summarize(y, y_pred))
    print('\n')

In [9]:
# Logistic regression
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X, y)
print(model)

# Report on the data the model was fitted on ...
make_predictions(model, X, y)

# ... and on the separately loaded test data.
make_predictions(model, TX, Ty)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
             precision    recall  f1-score   support

        0.0       0.71      0.27      0.39      7841
        1.0       0.81      0.97      0.88     24720

avg / total       0.78      0.80      0.76     32561

[[ 2087  5754]
 [  840 23880]]


             precision    recall  f1-score   support

        0.0       0.71      0.26      0.39      3846
        1.0       0.81      0.97      0.88     12435

avg / total       0.79      0.80      0.76     16281

[[ 1018  2828]
 [  414 12021]]




In [10]:
# Naive Bayes
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(X, y)
print(model)

# Report on the data the model was fitted on ...
make_predictions(model, X, y)

# ... and on the separately loaded test data.
make_predictions(model, TX, Ty)

GaussianNB()
             precision    recall  f1-score   support

        0.0       0.66      0.31      0.42      7841
        1.0       0.81      0.95      0.88     24720

avg / total       0.78      0.80      0.77     32561

[[ 2441  5400]
 [ 1255 23465]]


             precision    recall  f1-score   support

        0.0       0.64      0.31      0.41      3846
        1.0       0.82      0.95      0.88     12435

avg / total       0.77      0.80      0.77     16281

[[ 1176  2670]
 [  663 11772]]




In [11]:
# K-nearest neighbours
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
# fit a k-nearest neighbor model to the data
model = KNeighborsClassifier()
model.fit(X, y)
print(model)

# Report on the data the model was fitted on ...
make_predictions(model, X, y)

# ... and on the separately loaded test data.
make_predictions(model, TX, Ty)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
             precision    recall  f1-score   support

        0.0       0.76      0.47      0.58      7841
        1.0       0.85      0.95      0.90     24720

avg / total       0.83      0.84      0.82     32561

[[ 3650  4191]
 [ 1130 23590]]


             precision    recall  f1-score   support

        0.0       0.55      0.32      0.41      3846
        1.0       0.81      0.92      0.86     12435

avg / total       0.75      0.78      0.75     16281

[[ 1239  2607]
 [ 1028 11407]]




In [12]:
# Decision tree
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
# fit a CART model to the data
model = DecisionTreeClassifier()
model.fit(X, y)
print(model)

# Report on the data the model was fitted on ...
make_predictions(model, X, y)

# ... and on the separately loaded test data.
make_predictions(model, TX, Ty)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
             precision    recall  f1-score   support

        0.0       1.00      1.00      1.00      7841
        1.0       1.00      1.00      1.00     24720

avg / total       1.00      1.00      1.00     32561

[[ 7841     0]
 [    1 24719]]


             precision    recall  f1-score   support

        0.0       0.60      0.62      0.61      3846
        1.0       0.88      0.87      0.88     12435

avg / total       0.82      0.81      0.81     16281

[[ 2378  1468]
 [ 1557 10878]]




In [13]:
# SVM (support vector machine)
from sklearn import metrics
from sklearn.svm import SVC
# fit a SVM model to the data
model = SVC()
model.fit(X, y)
print(model)


# Report on the data the model was fitted on ...
make_predictions(model, X, y)

# ... and on the separately loaded test data.
make_predictions(model, TX, Ty)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
             precision    recall  f1-score   support

        0.0       1.00      0.99      1.00      7841
        1.0       1.00      1.00      1.00     24720

avg / total       1.00      1.00      1.00     32561

[[ 7766    75]
 [    0 24720]]


             precision    recall  f1-score   support

        0.0       0.52      0.00      0.01      3846
        1.0       0.76      1.00      0.87     12435

avg / total       0.71      0.76      0.66     16281

[[   15  3831]
 [   14 12421]]




In [14]:
# Tune algorithm parameters: exhaustive grid search over Ridge's alpha
import numpy as np
from sklearn.linear_model import Ridge
try:
    # scikit-learn >= 0.18 keeps the search classes in model_selection;
    # the old sklearn.grid_search module was removed in 0.20.
    from sklearn.model_selection import GridSearchCV
except ImportError:
    # Fallback for older scikit-learn releases.
    from sklearn.grid_search import GridSearchCV
# prepare a range of alpha values to test
alphas = np.array([1,0.1,0.01,0.001,0.0001,0])
# create and fit a ridge regression model, testing each alpha
model = Ridge()
grid = GridSearchCV(estimator=model, param_grid=dict(alpha=alphas))
grid.fit(X, y)
print(grid)
# summarize the results of the grid search:
# best cross-validated score, then the alpha that achieved it
print(grid.best_score_)
print(grid.best_estimator_.alpha)

GridSearchCV(cv=None, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'alpha': array([  1.00000e+00,   1.00000e-01,   1.00000e-02,   1.00000e-03,
         1.00000e-04,   0.00000e+00])},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)
0.312294629958
1.0


In [15]:
# Tune algorithm parameters: randomized search over Ridge's alpha
import numpy as np
from scipy.stats import uniform as sp_rand
from sklearn.linear_model import Ridge
try:
    # scikit-learn >= 0.18 keeps the search classes in model_selection;
    # the old sklearn.grid_search module was removed in 0.20.
    from sklearn.model_selection import RandomizedSearchCV
except ImportError:
    # Fallback for older scikit-learn releases.
    from sklearn.grid_search import RandomizedSearchCV
# prepare a uniform distribution to sample for the alpha parameter
param_grid = {'alpha': sp_rand()}
# create and fit a ridge regression model, testing random alpha values
model = Ridge()
rsearch = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=100)
rsearch.fit(X, y)
print(rsearch)
# summarize the results of the random parameter search:
# best cross-validated score, then the alpha that achieved it
print(rsearch.best_score_)
print(rsearch.best_estimator_.alpha)

RandomizedSearchCV(cv=None, error_score='raise',
          estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
          fit_params={}, iid=True, n_iter=100, n_jobs=1,
          param_distributions={'alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001DFC7A76518>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          scoring=None, verbose=0)
0.312294629657
0.989903640912
