In [11]:
import numpy as np

### 1.准备数据

In [12]:
# 加载数据集
X = np.genfromtxt('letter-recognition.data', delimiter=',', usecols=range(1, 17))
X

array([[ 2.,  8.,  3., ...,  8.,  0.,  8.],
       [ 5., 12.,  3., ...,  8.,  4., 10.],
       [ 4., 11.,  6., ...,  7.,  3.,  9.],
       ...,
       [ 6.,  9.,  6., ..., 12.,  2.,  4.],
       [ 2.,  3.,  4., ...,  9.,  5.,  8.],
       [ 4.,  9.,  6., ...,  7.,  2.,  8.]])

In [13]:
y = np.genfromtxt('letter-recognition.data', delimiter=',', usecols=0, dtype=np.str)
y

array(['T', 'I', 'D', ..., 'T', 'S', 'A'], dtype='<U1')

In [14]:
y = np.where(y == 'C', 1, -1)
y

array([-1, -1, -1, ..., -1, -1, -1])

### 2.模型训练与测试

In [15]:
from svm import SMO
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [16]:
# 创建模型
clf = SMO(C=1, tol=0.01, kernel='rbf', gamma=0.01)

In [17]:
# 将数据集分为训练集和测试集(7:3开)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
X_train, X_test, y_train, y_test

(array([[ 3., 10.,  4., ...,  8.,  7.,  9.],
        [ 4.,  7.,  4., ...,  8.,  8.,  8.],
        [ 5.,  8.,  7., ...,  8., 10.,  7.],
        ...,
        [ 6., 10.,  8., ...,  9.,  5.,  8.],
        [ 6.,  8.,  7., ...,  8.,  5., 11.],
        [ 3.,  3.,  6., ...,  8.,  3.,  8.]]),
 array([[ 3.,  8.,  5., ...,  5.,  2.,  8.],
        [ 7., 10.,  9., ..., 11.,  9., 12.],
        [ 6.,  6.,  6., ...,  9.,  2.,  6.],
        ...,
        [ 5.,  9.,  5., ..., 12.,  2.,  4.],
        [ 4.,  8.,  5., ...,  8.,  3.,  8.],
        [ 6., 11.,  9., ...,  9.,  3.,  6.]]),
 array([-1, -1, -1, ..., -1, -1, -1]),
 array([-1, -1, -1, ..., -1, -1, -1]))

In [18]:
# 训练模型
clf.train(X_train, y_train)

n_changed: 1159
sv num: 566
n_changed: 490
sv num: 312
n_changed: 278
sv num: 192
n_changed: 163
sv num: 125
n_changed: 115
sv num: 97
n_changed: 86
sv num: 73
n_changed: 64
sv num: 62
n_changed: 56
sv num: 52
n_changed: 47
sv num: 47
n_changed: 42
sv num: 43
n_changed: 38
sv num: 41
n_changed: 39
sv num: 39
n_changed: 35
sv num: 37
n_changed: 31
sv num: 37
n_changed: 25
sv num: 37
n_changed: 26
sv num: 36
n_changed: 16
sv num: 35
n_changed: 15
sv num: 34
n_changed: 15
sv num: 34
n_changed: 9
sv num: 34
n_changed: 8
sv num: 34
n_changed: 6
sv num: 34
n_changed: 4
sv num: 34
n_changed: 7
sv num: 34
n_changed: 2
sv num: 34
n_changed: 0
sv num: 34
n_changed: 490
sv num: 360
n_changed: 318
sv num: 230
n_changed: 201
sv num: 163
n_changed: 145
sv num: 123
n_changed: 114
sv num: 94
n_changed: 78
sv num: 77
n_changed: 66
sv num: 65
n_changed: 57
sv num: 63
n_changed: 55
sv num: 52
n_changed: 43
sv num: 46
n_changed: 33
sv num: 43
n_changed: 35
sv num: 42
n_changed: 27
sv num: 41
n_changed: 21

In [19]:
# 预测
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('准确率为 {:.4%}'.format(accuracy))

准确率为 99.3333%


In [20]:
# 了解识别信息
from sklearn.metrics import confusion_matrix
C = confusion_matrix(y_test, y_pred)
C

array([[5781,    0],
       [  40,  179]])

```python
array([[5777,    0],
       [  44,  179]])
```
这代表有 `5777 + 179` 个文字预测正确
`44 + 0` 个预测错误

因此,字母'C'预测错误概率该是很高,需要调整超参数优化性能

In [21]:
"""
@author: husiyuan
@date: 2020-6-23
"""
from sklearn.svm import SVC

# 用于保存各超参数组合的成绩
acc_list = []
p_list = []

# 待尝试的各超参数,可先粗调再细调
C_list = [0.1, 1, 10, 100]
gamma_list = [0.1, 1, 10, 100]
for C in C_list:
    for gamma in gamma_list:
        # 迭代不同超参数组合,创建模型
        clf = SVC(C=C, tol=0.01, kernel='rbf', gamma=gamma)
        # 训练,预测,计算准确率
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_pred, y_test)
        print('C = {1}, gamma={2}, accuracy={:.4%}'.format(C, gamma, accuracy))
        # 保存成绩
        acc_list.append(accuracy)
        p_list.append((C, gamma))

idx = np.argmax(acc_list)
print('best (C, gamma) is : ', p_list(idx))

now try C = 0.1 , gamma = 0.1
now try C = 0.1 , gamma = 1
now try C = 0.1 , gamma = 10
now try C = 0.1 , gamma = 100
now try C = 1 , gamma = 0.1
now try C = 1 , gamma = 1
now try C = 1 , gamma = 10
now try C = 1 , gamma = 100
now try C = 10 , gamma = 0.1
now try C = 10 , gamma = 1
now try C = 10 , gamma = 10
now try C = 10 , gamma = 100
now try C = 100 , gamma = 0.1
now try C = 100 , gamma = 1
now try C = 100 , gamma = 10
now try C = 100 , gamma = 100


TypeError: 'list' object is not callable

In [None]:
# 经过上述测试,发现
clf = SMO(C=5, tol=0.01, kernel='rbf', gamma=0.05)
clf.train(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print('准确率为 {:.4%}'.format(accuracy))

In [None]:
# 现在结果就比较令人满意了
C = confusion_matrix(y_test, y_pred)
C