# 第三题：支持向量机的分类任务

实验内容：
1. 使用支持向量机完成Breast_Cancer_Wisconsin数据集分类任务
2. 使用不同核函数和惩罚系数C在训练集上训练模型，并分别计算对应模型在测试集的精度，查准率，查全率，F1值，将结果填写到下表中。

| 核函数 | C | 精度 | 查准率 | 查全率 | F1 |
| - | - | - | - | - | - |
| rbf | 0.1 | 0.883 | 0.979 | 0.712 | 0.825 |
| rbf | 1 | 0.883 | 0.926 | 0.758 | 0.833 |
| linear | 0.1 | 0.924 | 0.921 | 0.879 | 0.899 |
| linear | 1 | 0.93 | 0.922 | 0.894 | 0.908 |
| sigmoid | 0.1 | 0.614 | 0.0 | 0.0 | 0.0 |
| sigmoid | 1 | 0.444 | 0.226 | 0.182 | 0.202 |

In [1]:
# Load the dataset
import numpy as np
import pandas as pd

# Forward slashes keep the path portable across operating systems and avoid
# the invalid "\B" / "\d" escape sequences of a backslash-separated literal.
data = pd.read_csv('data/Breast_Cancer_Wisconsin/data')
# Encode the label column numerically: 1 for "M", 0 for every other value.
data['diagnosis'] = data['diagnosis'].apply(lambda x: 1 if x == "M" else 0)

In [2]:
# Inspect the dtype of every column in the loaded table
data.dtypes

id                           int64
diagnosis                    int64
radius_mean                float64
texture_mean               float64
perimeter_mean             float64
area_mean                  float64
smoothness_mean            float64
compactness_mean           float64
concavity_mean             float64
concave points_mean        float64
symmetry_mean              float64
fractal_dimension_mean     float64
radius_se                  float64
texture_se                 float64
perimeter_se               float64
area_se                    float64
smoothness_se              float64
compactness_se             float64
concavity_se               float64
concave points_se          float64
symmetry_se                float64
fractal_dimension_se       float64
radius_worst               float64
texture_worst              float64
perimeter_worst            float64
area_worst                 float64
smoothness_worst           float64
compactness_worst          float64
concavity_worst     

In [3]:
# Separate the table into a feature matrix and a label vector.
# `data` is rebound from the DataFrame to its underlying NumPy array.
data = data.values
# Column 1 holds the encoded diagnosis label (flat 1-D vector).
y = data[:, 1]
# Features are every column from index 2 up to, but excluding, the last one.
x = data[:, 2:-1]

In [4]:
# Display the feature matrix
x

array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
        1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
        8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
        8.758e-02],
       ...,
       [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
        7.820e-02],
       [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
        1.240e-01],
       [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
        7.039e-02]])

In [5]:
# Display the label vector
y

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 0., 0., 0.,
       0., 0., 1., 1., 0., 1., 1., 0., 0., 0., 0., 1., 0., 1., 1., 0., 0.,
       0., 0., 1., 0., 1., 1., 0., 1., 0., 1., 1., 0., 0., 0., 1., 1., 0.,
       1., 1., 1., 0., 0., 0., 1., 0., 0., 1., 1., 0., 0., 0., 1., 1., 0.,
       0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1.,
       1., 0., 1., 1., 0., 0., 0., 1., 1., 0., 1., 0., 1., 1., 0., 1., 1.,
       0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 1., 0., 0., 0., 0., 1., 1., 0., 1., 0., 0., 1., 1., 0.,
       0., 1., 1., 0., 0., 0., 0., 1., 0., 0., 1., 1., 1., 0., 1., 0., 1.,
       0., 0., 0., 1., 0., 0., 1., 1., 0., 1., 1., 1., 1., 0., 1., 1., 1.,
       0., 1., 0., 1., 0., 0., 1., 0., 1., 1., 1., 1., 0., 0., 1., 1., 0.,
       0., 0., 1., 0., 0.

In [6]:
# Split the dataset: 30% is held out for testing; the fixed random_state
# makes the partition reproducible between runs.
from sklearn.model_selection import train_test_split

trainX, testX, trainY, testY = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=32,
)
# Show the shape of each partition as a sanity check.
tuple(part.shape for part in (trainX, trainY, testX, testY))

((398, 30), (398,), (171, 30), (171,))

**注意：计算线性核的时候，要使用 LinearSVC 这个类，不要使用SVC(kernel = 'linear')。LinearSVC不需要设置kernel参数！**

In [7]:
# 引入模型
from sklearn.svm import SVC
from sklearn.svm import LinearSVC

In [8]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [9]:
# YOUR CODE HERE

In [10]:
# RBF-kernel SVM with C=0.1: fit on the training split, score on the test split.
clf = SVC(C=0.1, kernel="rbf").fit(trainX, trainY)
predY = clf.predict(testX)
# Compute the metrics in the same order the report line prints them.
acc, prec, rec, f1 = (
    metric(testY, predY)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
print(f'Kernel: rbf, C: 0.1, Accuracy: {acc:.3}, Precision: {prec:.3}, Recall: {rec:.3}, F1: {f1:.3}')

Kernel: rbf, C: 0.1, Accuracy: 0.883, Precision: 0.979, Recall: 0.712, F1: 0.825


In [11]:
# RBF-kernel SVM with C=1: fit on the training split, score on the test split.
clf = SVC(C=1, kernel="rbf").fit(trainX, trainY)
predY = clf.predict(testX)
# Compute the metrics in the same order the report line prints them.
acc, prec, rec, f1 = (
    metric(testY, predY)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
print(f'Kernel: rbf, C: 1, Accuracy: {acc:.3}, Precision: {prec:.3}, Recall: {rec:.3}, F1: {f1:.3}')

Kernel: rbf, C: 1, Accuracy: 0.883, Precision: 0.926, Recall: 0.758, F1: 0.833


In [12]:
# Linear SVM (LinearSVC, primal formulation via dual=False) with C=0.1:
# fit on the training split, score on the test split.
clf = LinearSVC(dual=False, C=0.1).fit(trainX, trainY)
predY = clf.predict(testX)
# Compute the metrics in the same order the report line prints them.
acc, prec, rec, f1 = (
    metric(testY, predY)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
print(f'Kernel: linear, C: 0.1, Accuracy: {acc:.3}, Precision: {prec:.3}, Recall: {rec:.3}, F1: {f1:.3}')

Kernel: linear, C: 0.1, Accuracy: 0.924, Precision: 0.921, Recall: 0.879, F1: 0.899


In [13]:
# Linear SVM (LinearSVC, primal formulation via dual=False) with C=1:
# fit on the training split, score on the test split.
clf = LinearSVC(dual=False, C=1).fit(trainX, trainY)
predY = clf.predict(testX)
# Compute the metrics in the same order the report line prints them.
acc, prec, rec, f1 = (
    metric(testY, predY)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
print(f'Kernel: linear, C: 1, Accuracy: {acc:.3}, Precision: {prec:.3}, Recall: {rec:.3}, F1: {f1:.3}')

Kernel: linear, C: 1, Accuracy: 0.93, Precision: 0.922, Recall: 0.894, F1: 0.908


In [14]:
# Sigmoid-kernel SVM with C=0.1: fit on the training split, score on the test split.
clf = SVC(kernel="sigmoid", C=0.1)
clf.fit(trainX, trainY)
predY = clf.predict(testX)
acc = accuracy_score(testY, predY)
# In the recorded run this model predicts no positive samples, which leaves
# precision undefined and triggers a warning. zero_division=0 states the
# intended fallback explicitly: the reported value stays 0.0, without the warning.
prec = precision_score(testY, predY, zero_division=0)
rec = recall_score(testY, predY)
f1 = f1_score(testY, predY)
print(f'Kernel: sigmoid, C: 0.1, Accuracy: {acc:.3}, Precision: {prec:.3}, Recall: {rec:.3}, F1: {f1:.3}')

Kernel: sigmoid, C: 0.1, Accuracy: 0.614, Precision: 0.0, Recall: 0.0, F1: 0.0


  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
# Sigmoid-kernel SVM with C=1: fit on the training split, score on the test split.
clf = SVC(C=1, kernel="sigmoid").fit(trainX, trainY)
predY = clf.predict(testX)
# Compute the metrics in the same order the report line prints them.
acc, prec, rec, f1 = (
    metric(testY, predY)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
print(f'Kernel: sigmoid, C: 1, Accuracy: {acc:.3}, Precision: {prec:.3}, Recall: {rec:.3}, F1: {f1:.3}')

Kernel: sigmoid, C: 1, Accuracy: 0.444, Precision: 0.226, Recall: 0.182, F1: 0.202
