In [3]:
# 加载数据(Data Loading)
import numpy as np
from urllib import request
# url with dataset
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
# download the file
raw_data = request.urlopen(url)
# load the CSV file as a numpy matrix
dataset = np.loadtxt(raw_data, delimiter=",")
# separate the data from the target attributes
X = dataset[:,0:7]
y = dataset[:,8]

In [6]:
#print(dataset)
print(len(y))
print(len(X))
#print(X)

768
768


In [11]:
# 数据归一化(Data Normalization)
# 大多数机器学习算法中的梯度方法对于数据的缩放和尺度都是很敏感的，
# 在开始跑算法之前，我们应该进行归一化或者标准化的过程，这使得特征数据缩放到0-1范围中。scikit-learn提供了归一化的方法：

from sklearn import preprocessing
# normalize the data attributes
normalized_X = preprocessing.normalize(X)
# standardize the data attributes
standardized_X = preprocessing.scale(X)
print(normalized_X)
print(standardized_X)

[[ 0.03494617  0.86200564  0.41935409 ...,  0.          0.19569858
   0.00365188]
 [ 0.00872683  0.74178025  0.57597054 ...,  0.          0.23213358
   0.00306312]
 [ 0.04093566  0.93640332  0.32748532 ...,  0.          0.11922512
   0.0034386 ]
 ..., 
 [ 0.02727338  0.66001582  0.39273669 ...,  0.61092373  0.14291252
   0.0013364 ]
 [ 0.0070043   0.8825414   0.42025781 ...,  0.          0.21082934
   0.0024445 ]
 [ 0.00804902  0.74855891  0.56343144 ...,  0.          0.24469022
   0.00253544]]
[[ 0.63994726  0.84832379  0.14964075 ..., -0.69289057  0.20401277
   0.46849198]
 [-0.84488505 -1.12339636 -0.16054575 ..., -0.69289057 -0.68442195
  -0.36506078]
 [ 1.23388019  1.94372388 -0.26394125 ..., -0.69289057 -1.10325546
   0.60439732]
 ..., 
 [ 0.3429808   0.00330087  0.14964075 ...,  0.27959377 -0.73518964
  -0.68519336]
 [-0.84488505  0.1597866  -0.47073225 ..., -0.69289057 -0.24020459
  -0.37110101]
 [-0.84488505 -0.8730192   0.04624525 ..., -0.69289057 -0.20212881
  -0.47378505]]


In [12]:
# 特征选择
# 在解决一个实际问题的过程中，选择合适的特征或者构建特征的能力特别重要。这成为特征选择或者特征工程。 
# 特征选择时一个很需要创造力的过程，更多的依赖于直觉和专业知识，并且有很多现成的算法来进行特征的选择。 
# 下面的树算法(Tree algorithms)计算特征的信息量：
from sklearn import metrics
from sklearn.ensemble import ExtraTreesClassifier
model = ExtraTreesClassifier()
model.fit(X, y)
# display the relative importance of each attribute
print(model.feature_importances_)

[ 0.12937825  0.29224411  0.10702516  0.0893193   0.07271615  0.16651331
  0.14280372]


In [25]:
# 逻辑回归
# 大多数问题都可以归结为二元分类问题。这个算法的优点是可以给出数据所在类别的概率。
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X, y)
print(model)
# make predictions
expected = y
predicted = model.predict(X)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
             precision    recall  f1-score   support

        0.0       0.79      0.89      0.84       500
        1.0       0.74      0.55      0.63       268

avg / total       0.77      0.77      0.77       768

[[447  53]
 [120 148]]


In [14]:
# 朴素贝叶斯
# 这也是著名的机器学习算法，该方法的任务是还原训练样本数据的分布密度，其在多类别分类中有很好的效果。
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(X, y)
print(model)
# make predictions
expected = y
predicted = model.predict(X)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

GaussianNB(priors=None)
             precision    recall  f1-score   support

        0.0       0.80      0.86      0.83       500
        1.0       0.69      0.60      0.64       268

avg / total       0.76      0.77      0.76       768

[[429  71]
 [108 160]]


In [15]:
# k近邻
#       k近邻算法常常被用作是分类算法一部分，比如可以用它来评估特征，在特征选择上我们可以用到它。

from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
# fit a k-nearest neighbor model to the data
model = KNeighborsClassifier()
model.fit(X, y)
print(model)
# make predictions
expected = y
predicted = model.predict(X)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')
             precision    recall  f1-score   support

        0.0       0.82      0.90      0.86       500
        1.0       0.77      0.63      0.69       268

avg / total       0.80      0.80      0.80       768

[[448  52]
 [ 98 170]]


In [16]:
# 决策树
#       分类与回归树(Classification and Regression Trees ,CART)算法常用于特征含有类别信息的分类或者回归问题，这种方法非常适用于多分类情况。

from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
# fit a CART model to the data
model = DecisionTreeClassifier()
model.fit(X, y)
print(model)
# make predictions
expected = y
predicted = model.predict(X)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
             precision    recall  f1-score   support

        0.0       1.00      1.00      1.00       500
        1.0       1.00      1.00      1.00       268

avg / total       1.00      1.00      1.00       768

[[500   0]
 [  0 268]]


In [17]:
# 支持向量机

#       SVM是非常流行的机器学习算法，主要用于分类问题，如同逻辑回归问题，它可以使用一对多的方法进行多类别的分类。
from sklearn import metrics
from sklearn.svm import SVC
# fit a SVM model to the data
model = SVC()
model.fit(X, y)
print(model)
# make predictions
expected = y
predicted = model.predict(X)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
             precision    recall  f1-score   support

        0.0       1.00      1.00      1.00       500
        1.0       1.00      1.00      1.00       268

avg / total       1.00      1.00      1.00       768

[[500   0]
 [  0 268]]


In [18]:
# 如何优化算法参数

#       一项更加困难的任务是构建一个有效的方法用于选择正确的参数，我们需要用搜索的方法来确定参数。scikit-learn提供了实现这一目标的函数。 
#       下面的例子是一个进行正则参数选择的程序：
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.grid_search import GridSearchCV
# prepare a range of alpha values to test
alphas = np.array([1,0.1,0.01,0.001,0.0001,0])
# create and fit a ridge regression model, testing each alpha
model = Ridge()
grid = GridSearchCV(estimator=model, param_grid=dict(alpha=alphas))
grid.fit(X, y)
print(grid)
# summarize the results of the grid search
print(grid.best_score_)
print(grid.best_estimator_.alpha)

GridSearchCV(cv=None, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'alpha': array([  1.00000e+00,   1.00000e-01,   1.00000e-02,   1.00000e-03,
         1.00000e-04,   0.00000e+00])},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)
0.28211895568567485
1.0




In [19]:
#  有时随机从给定区间中选择参数是很有效的方法，然后根据这些参数来评估算法的效果进而选择最佳的那个。
import numpy as np
from scipy.stats import uniform as sp_rand
from sklearn.linear_model import Ridge
from sklearn.grid_search import RandomizedSearchCV
# prepare a uniform distribution to sample for the alpha parameter
param_grid = {'alpha': sp_rand()}
# create and fit a ridge regression model, testing random alpha values
model = Ridge()
rsearch = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=100)
rsearch.fit(X, y)
print(rsearch)
# summarize the results of the random parameter search
print(rsearch.best_score_)
print(rsearch.best_estimator_.alpha)

RandomizedSearchCV(cv=None, error_score='raise',
          estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
          fit_params={}, iid=True, n_iter=100, n_jobs=1,
          param_distributions={'alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000000005F3ED30>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          scoring=None, verbose=0)
0.2821187956334405
0.994062646938
