### 1.加载数据集

In [2]:
import numpy as np
from sklearn import datasets

In [3]:
iris = datasets.load_iris()

In [4]:
X = iris.data
y = iris.target

### 2.拆分数据集  
按照比例拆分并打乱数据集，同时保障数据集的分类均匀

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
X_train,X_test,y_train,y_test =  train_test_split(X,y,train_size=0.8,random_state=1,stratify=y)

### 3.预测

In [8]:
from sklearn.neighbors import KNeighborsClassifier

In [10]:
knn_classfier = KNeighborsClassifier(n_neighbors=5)

In [11]:
knn_classfier.fit(X_train,y_train)

In [12]:
y_predict = knn_classfier.predict(X_test)

In [13]:
y_predict

array([2, 0, 1, 0, 0, 0, 2, 2, 2, 1, 0, 1, 2, 1, 2, 0, 2, 1, 1, 2, 1, 1,
       0, 0, 2, 1, 0, 0, 1, 1])

### 4.评价

In [15]:
num = np.sum(y_test == y_predict)
num

np.int64(29)

In [18]:
precision = np.sum(y_test == y_predict) / len(y_test)
precision

np.float64(0.9666666666666667)

In [19]:
from sklearn.metrics import accuracy_score

In [20]:
accuracy_score(y_test,y_predict)

0.9666666666666667

### 5.调整超参数（hyperparameter）

超参数：机器学习之前，人为设定的参数  
如何找到超参数：先试用经验值，然后根据经验值再尝试其他数  
**KNN算法的超参数**：k(临近点的个数)、加权、明氏距离的p

In [27]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris

In [28]:
iris = load_iris()

In [29]:
X = iris.data
y = iris.target

In [30]:
X.shape,y.shape

((150, 4), (150,))

In [31]:
X_train,X_test,y_train,y_test = train_test_split(X,y,train_size=0.7,random_state=1,stratify=y)

### 超参数的选择

In [49]:
from sklearn.neighbors import KNeighborsClassifier

对于KNN算法有三个超参数，朴素的思路是嵌套循环遍历这三个参数，找到最优解

In [33]:
knn_classifier = KNeighborsClassifier(
    n_neighbors=5,
    weights='distance',#权重根据距离定义
    p=2
)

In [34]:
knn_classifier.fit(X_train,y_train)

In [35]:
knn_classifier.score(X_test,y_test)

0.9777777777777777

In [50]:
best_score = -1
best_p=-1
best_n = -1
best_weights = ""
for n in range(1,20):
    for p in range(1,7):
        for weights in ['uniform','distance']:
            
            knn_classifier = KNeighborsClassifier(
                            n_neighbors = n,
                            weights = weights,#权重根据距离定义
                            p = p
                            ) 
            knn_classifier.fit(X_train,y_train)
            score = knn_classifier.score(X_test,y_test)
        
            if score > best_score:
                best_score = score
                best_p = p
                best_n = n
                best_weights = weights
         
print("score:",best_score)
print("p:",best_p)
print("n:",best_n)
print("weights:",best_weights)

score: 0.9777777777777777
p: 1
n: 1
weights: uniform


### sklearn 超参数搜索

In [37]:
from sklearn.model_selection import GridSearchCV

In [43]:
params = {
    'n_neighbors':[n for n in range(1,20)],
    'weights':['uniform','distance'],
    'p':[p for p in range(1,7)]
}

In [44]:
grid = GridSearchCV(
    estimator = KNeighborsClassifier(),
    param_grid=  params,
    n_jobs = -1#并行参数
)

In [45]:
grid.fit(X_train,y_train)

In [46]:
grid.best_params_

{'n_neighbors': 13, 'p': 1, 'weights': 'uniform'}

In [47]:
grid.best_score_

np.float64(0.9619047619047618)

In [48]:
grid.best_estimator_.predict(X_test)

array([2, 0, 0, 1, 1, 1, 2, 1, 2, 0, 0, 2, 0, 1, 0, 1, 2, 1, 1, 2, 2, 0,
       1, 2, 1, 1, 1, 2, 0, 2, 0, 0, 1, 1, 2, 2, 0, 0, 0, 1, 2, 2, 1, 0,
       0])

### 上面的操作还少了一步--特征归一化

将所有的特征用统一的比例进行缩放，这样计算的距离才能真正表现出实际距离,常用的特征归一化：最大值最小值归一化、零均值归一化

$$x_{norm} = \frac{x - x_{min}}{x_{max} - x_{min}}$$  最大值最小值归一化

$$x_{norm} = \frac{x - \mu}{\sigma}$$  
$$ \mu = \frac{1}{n} \sum_{i=1}^{n}x_i$$  
$$\sigma = \sqrt{\frac{1}{n}\sum_{i=1}^{n}(x_i - \mu)^{2}}$$
零均值归一化

#### 最大值最小值归一化

In [52]:
iris = load_iris()

In [53]:
X = iris.data
y = iris.target

In [54]:
np.max(X[:,0])

np.float64(7.9)

In [55]:
np.min(X[:,0])

np.float64(4.3)

In [56]:
X[:,0] = (X[:,0] - np.min(X[:,0])) / (np.max(X[:,0]) - np.min(X[:,0]))

In [60]:
X[:5,0]

array([0.22222222, 0.16666667, 0.11111111, 0.08333333, 0.19444444])

#### 零均值归一化

In [61]:
np.mean(X[:,0])#计算平均值

np.float64(0.42870370370370364)

In [62]:
np.std(X[:,0])#计算标准差

np.float64(0.22925035882920577)

In [64]:
X[:,0] = (X[:,0] - np.mean(X[:,0]))/ np.std(X[:,0])

In [65]:
X[:5,0]

array([-0.90068117, -1.14301691, -1.38535265, -1.50652052, -1.02184904])

## scikit-learn中已经实现了特征值归一化

In [70]:
X = iris.data
y = iris.target

In [67]:
from sklearn.preprocessing import StandardScaler

In [69]:
standard = StandardScaler()

In [71]:
standard.fit(X)

In [72]:
standard.mean_ # 每一列的均值

array([2.26207941e-16, 3.05733333e+00, 3.75800000e+00, 1.19933333e+00])

In [74]:
standard.scale_ # 每一列的标准差

array([1.        , 0.43441097, 1.75940407, 0.75969263])

In [76]:
X= standard.transform(X) # 零均值归一化

**特别注意**：  
我们拿到数据集后，要拆分为训练集和测试集，我们计算训练集的 $\mu$ 和 $\sigma$ 后均一化训练集，也要用训练集的$\mu$ 和 $\sigma$ 应用在测试集上