In [1]:
import sklearn as sk

In [2]:
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import tensorflow as tf

In [3]:
# example
from sklearn import neighbors, datasets, preprocessing
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
iris = datasets.load_iris() # 数据集加载
X, y = iris.data[:, :2], iris.target 
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=33) # 训练集测试集切分
scaler = preprocessing.StandardScaler().fit(X_train) # 数据标准化处理
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
knn = neighbors.KNeighborsClassifier(n_neighbors=5) # 初始化一个空的模型
knn.fit(X_train, y_train) # 模型训练
y_pred = knn.predict(X_test) 
accuracy_score(y_test, y_pred) # 计算准确率



0.631578947368421

**`model select / 模型选择`**

In [4]:
# 超参数
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.grid_search import GridSearchCV
# 设定不同超参数值
alphas = np.array([1,0.1,0.01,0.001,0.0001,0])
# 设定模型
model = Ridge()
grid = GridSearchCV(estimator=model, param_grid=dict(alpha=alphas))
grid.fit(X, y)
print(grid)
# 对超参数进行遍历，查看其表现
print(grid.best_score_)
print(grid.best_estimator_.alpha)



GridSearchCV(cv=None, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'alpha': array([1.e+00, 1.e-01, 1.e-02, 1.e-03, 1.e-04, 0.e+00])},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)
0.0
1.0


**`特征工程`**

In [14]:
# 归一化
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
standardized_X = scaler.transform(X_train)
standardized_X_test = scaler.transform(X_test)

In [18]:
# 正则化
# preprocessing.normalize(X, norm='l2')

from sklearn.preprocessing import Normalizer
scaler = Normalizer().fit(X_train)
normalized_X = scaler.transform(X_train)
normalized_X_test = scaler.transform(X_test)

In [19]:
# one-hot decode

In [21]:
data = [[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]]
encoder = preprocessing.OneHotEncoder().fit(data)
encoder.transform(data).toarray()

array([[1., 0., 1., 0., 0., 0., 0., 0., 1.],
       [0., 1., 0., 1., 0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 1., 0., 1., 0., 0.],
       [0., 1., 1., 0., 0., 0., 0., 1., 0.]])

**`评估方法`**

In [5]:
# sklearn.metrics
# 模型自带
knn.score(X_test, y_test)

# 导入metrics包
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.631578947368421

In [6]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00         8
          1       0.42      0.73      0.53        11
          2       0.73      0.42      0.53        19

avg / total       0.70      0.63      0.63        38



**`dataset`**
+ http://scikit-learn.org/stable/modules/classes.html#module-sklearn.datasets

**`模型持久化`**

In [11]:
from sklearn.externals import joblib

In [13]:
# 保存模型
joblib.dump(model, 'dataset/model.pkl')

# 载入模型
model = joblib.load('dataset/model.pkl')