# 3.支持向量机

## 3.1 维度诅咒

In [None]:
import random
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time

In [None]:
# Number of random points generated for each dimensionality (read by generate_data).
N_sample = 500
# Largest dimensionality examined; the experiment below loops over 1..max_dim.
max_dim = 100

In [None]:
# Matplotlib text setup:
# - make SimHei the default sans-serif font (presumably so the Chinese labels
#   used below render correctly);
# - work around the minus sign '-' showing up as a square box in figures.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})


In [None]:
def euclidien_distence(x, y):
    """Return the Euclidean (L2) distance between two vectors.

    Args:
        x: Sequence or array of numbers.
        y: Sequence or array of numbers with the same length as ``x``.

    Returns:
        The distance as a NumPy floating-point scalar.
    """
    # Plain ndarrays are used instead of ``np.mat``: that alias was removed in
    # NumPy 2.0, and an element-wise difference needs no matrix semantics.
    vec1, vec2 = np.asarray(x), np.asarray(y)
    return np.sqrt(np.sum(np.square(vec1 - vec2)))

In [None]:
def generate_data(dim, n_samples=None):
    """Draw random points with every coordinate sampled by ``random.random()``.

    Args:
        dim: Number of coordinates per point.
        n_samples: Number of points to generate.  When omitted, the
            module-level ``N_sample`` is used, which matches the previous
            single-argument behaviour.

    Returns:
        A list of ``n_samples`` rows, each a list of ``dim`` floats in
        ``[0.0, 1.0)``.
    """
    if n_samples is None:
        n_samples = N_sample
    # Rows are filled one after another, coordinate by coordinate, so the
    # sequence of RNG draws is the same as with explicit nested loops.
    return [[random.random() for _ in range(dim)] for _ in range(n_samples)]

In [None]:
# NOTE: this cell takes a while to run.
# For every dimensionality 1..max_dim: generate N_sample random points, measure
# all pairwise Euclidean distances, and record lg((d_max - d_min) / d_min).
euc_diff_list = []  # one log10 value per dimensionality
time_0 = time.time()
for dim in range(1, max_dim + 1):
    # Progress report (dimension, elapsed whole seconds) at dim = 1, 11, 21, ...
    if dim % 10 == 1:
        print(dim, int(time.time() - time_0))
    data = generate_data(dim)
    # Every unordered pair (i, j) with i < j is measured exactly once.
    euc_distence_list = [
        euclidien_distence(data[i], data[j])
        for i in range(N_sample - 1)
        for j in range(i + 1, N_sample)
    ]
    nearest, farthest = min(euc_distence_list), max(euc_distence_list)
    euc_diff_list.append(math.log((farthest - nearest) / nearest, 10))
    


In [None]:

# Plot the recorded log10 distance contrast against the dimensionality.
x = list(range(1, max_dim + 1))  # dimensionalities 1..max_dim

ax = plt.gca()
ax.plot(x, euc_diff_list, label='欧氏距离 最大/最小')
ax.set_title('Curse of Dimensionality')
ax.set_xlabel('维度')
ax.set_ylabel('lg(欧氏距离)')
ax.legend(loc='upper right')

## 3.2 超参数：gamma与C

修改自CSDN，链接 https://blog.csdn.net/OldDriver1995/article/details/105211038

In [None]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.svm import SVC
import random
# Seed NumPy's global random generator so NumPy-based randomness in this section is repeatable.
np.random.seed(42)

from sklearn.datasets import make_moons
# Toy two-class dataset: 100 samples from make_moons with noise=0.15 and a fixed random_state.
X, y = make_moons(n_samples=100, noise=0.15, random_state=42)
def plot_dataset(X, y, axes):
    """Draw a two-class, two-feature dataset on the current Matplotlib axes.

    Args:
        X: Array indexed as ``X[:, 0]`` / ``X[:, 1]`` for the two features.
        y: Label array aligned with the rows of ``X``; rows labelled 0 are
            drawn with the ``"bd"`` format (blue diamonds) and rows labelled 1
            with ``"rd"`` (red diamonds).
        axes: Axis limits handed straight to ``plt.axis``.
    """
    for label, fmt in ((0, "bd"), (1, "rd")):
        members = y == label
        plt.plot(X[:, 0][members], X[:, 1][members], fmt)
    plt.axis(axes)
    plt.grid(True, which='both')
    plt.xlabel(r"$x_1$", fontsize=20)
    plt.ylabel(r"$x_2$", fontsize=20, rotation=0)

# Preview the raw dataset inside fixed axis limits [x_min, x_max, y_min, y_max].
plot_dataset(X, y, [-1.5, 2.5, -1, 1.5])
plt.show()

In [None]:
def plot_boundary(clf, axes):
    """Shade a fitted classifier's output over a 100x100 grid on the current axes.

    Two filled contour layers are drawn: the predicted class and the value of
    the decision function.

    Args:
        clf: Fitted estimator exposing ``predict`` and ``decision_function``
            for two-feature inputs.
        axes: ``[x_min, x_max, y_min, y_max]`` extent of the evaluation grid.
    """
    x1 = np.linspace(axes[0], axes[1], 100)
    # The vertical grid must span the y-limits; using the x-limits here would
    # evaluate the classifier over the wrong range of the second feature.
    x2 = np.linspace(axes[2], axes[3], 100)
    xx, yy = np.meshgrid(x1, x2)
    x_new = np.c_[xx.ravel(), yy.ravel()]
    y_pred = clf.predict(x_new).reshape(xx.shape)
    y_decision = clf.decision_function(x_new).reshape(xx.shape)
    plt.contourf(xx, yy, y_pred, alpha=0.4, cmap='coolwarm')
    plt.contourf(xx, yy, y_decision, alpha=0.3, cmap='coolwarm')

In [None]:
# Fit one RBF-kernel SVM per (gamma, C) combination and draw the nine decision
# surfaces in a 3x3 grid: gamma changes from row to row, C from column to column.
gamma1, gamma2, gamma3 = 0.1, 1, 10
C1, C2, C3 = 0.001, 1, 1000
gammas = (gamma1, gamma2, gamma3)
Cs = (C1, C2, C3)
hyperparams = tuple((gamma, C) for gamma in gammas for C in Cs)
svm_clfs = []

for gamma, C in hyperparams:
    rbf_kernel_svm_clf = Pipeline([
            ("scaler", StandardScaler()),
            ("svm_clf", SVC(kernel="rbf", gamma=gamma, C=C))
            ])
    rbf_kernel_svm_clf.fit(X, y)
    svm_clfs.append(rbf_kernel_svm_clf)  # ends up holding nine fitted pipelines

# Split the canvas into len(gammas) x len(Cs) panels with shared axes.
n_rows, n_cols = len(gammas), len(Cs)
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(21, 15), sharex=True, sharey=True)

for i, svm_clf in enumerate(svm_clfs):
    row, col = divmod(i, n_cols)
    plt.sca(axes[row, col])
    plot_boundary(svm_clf, [-1.5, 2.45, -1, 1.5])
    plot_dataset(X, y, [-1.5, 2.45, -1, 1.5])
    gamma, C = hyperparams[i]
    plt.title(r"$\gamma = {}, C={}$".format(gamma, C), fontsize=16)
    # The axes are shared, so keep the x label only on the bottom row and the
    # y label only on the left-most column.
    if row < n_rows - 1:
        plt.xlabel("")
    if col > 0:
        plt.ylabel("")
plt.savefig('SVM_gamma_C.png')
plt.show()

## 3.3 不同数据分布条件下的不同模型

改编自 sklearn https://scikit-learn.org/stable/auto_examples/preprocessing/plot_discretization_classification.html#sphx-glr-auto-examples-preprocessing-plot-discretization-classification-py

这个例子用到了 sklearn 的标准化训练流程工具：Pipeline（流水线）。我们会在后续的训练流程部分对它进行讲解。

这里可以先理解为，Pipeline是把各个打包好的模块排列进模型，执行时会依次执行排列好的模块。

事实上，这个例子本身就是学习流水线（Pipeline）用法的一个很好的范例。

In [None]:
# Code source: Tom Dupré la Tour
# Adapted from plot_classifier_comparison by Gaël Varoquaux and Andreas Müller
#
# License: BSD 3 clause

import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import ListedColormap

from sklearn.datasets import make_circles, make_classification, make_moons
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import KBinsDiscretizer, StandardScaler
from sklearn.svm import SVC, LinearSVC
from sklearn.utils._testing import ignore_warnings

h = 0.02  # step size of the background mesh (np.arange spacing) used for the decision-surface plots


def get_name(estimator):
    """Return a display name for ``estimator``.

    An ordinary estimator yields its class name.  An object whose class is
    named ``Pipeline`` yields the names of its steps joined by ``" + "``;
    steps are resolved recursively, so nested pipelines are flattened.
    """
    class_name = estimator.__class__.__name__
    if class_name != "Pipeline":
        return class_name
    # Each entry of ``steps`` is indexed at [1] to reach the step object.
    return " + ".join(get_name(entry[1]) for entry in estimator.steps)

In [None]:
np.logspace(-1, 1, 3)
# Exercise: run this yourself and work out what the function does.

In [None]:
# List of (estimator, param_grid) pairs; each param_grid is passed to
# GridSearchCV.  The search space is intentionally narrow to keep the runtime
# short -- a real use case should search a broader range.
# Every candidate standardises the inputs and then fits an SVC; the candidates
# differ only in the kernel options listed below.
classifiers = [
    (
        make_pipeline(StandardScaler(), SVC(random_state=0, **kernel_options)),
        {"svc__C": np.logspace(-3, 3, 20)},
    )
    for kernel_options in (
        {"kernel": "linear"},
        {"kernel": "poly"},  # degree left at the SVC default (3)
        {"kernel": "poly", "degree": 10},
        {},  # kernel left at the SVC default (rbf)
    )
]


In [None]:
# Display names, one per entry of ``classifiers`` (used for column titles and
# the printed scores).  All pipelines here end in an SVC, so the class name
# alone would give identical labels; when the final step has a ``kernel``
# attribute it is appended, together with the degree for polynomial kernels.
names = []
for pipeline, _param_grid in classifiers:
    label = get_name(pipeline).replace("StandardScaler + ", "")
    final_step = pipeline.steps[-1][1] if hasattr(pipeline, "steps") else pipeline
    kernel = getattr(final_step, "kernel", None)
    if kernel is not None:
        if kernel == "poly":
            detail = f"{kernel}, degree={getattr(final_step, 'degree', '?')}"
        else:
            detail = str(kernel)
        label = f"{label} ({detail})"
    names.append(label)

# Three synthetic two-feature classification problems of n_samples points each.
n_samples = 100
moons = make_moons(n_samples=n_samples, noise=0.2, random_state=0)
circles = make_circles(n_samples=n_samples, noise=0.2, factor=0.5, random_state=1)
classification = make_classification(
    n_samples=n_samples,
    n_features=2,
    n_redundant=0,
    n_informative=2,
    random_state=2,
    n_clusters_per_class=1,
)
datasets = [moons, circles, classification]

# Colour maps: PiYG for the decision surface, a two-colour map for the labels.
cm_piyg = plt.cm.PiYG
cm_bright = ListedColormap(["#b30065", "#178000"])

# One figure row per dataset; column 0 is reserved for the raw data and the
# remaining columns hold one classifier each.
n_rows, n_cols = len(datasets), len(classifiers) + 1
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(21, 9))

# Outer loop: datasets (figure rows).  Inner loop: classifiers (figure columns).
for ds_cnt, (X, y) in enumerate(datasets):
    print(f"\ndataset {ds_cnt}\n---------")

    # Hold out half of the samples for scoring.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.5, random_state=42
    )

    # Background mesh: the data extent padded by 0.5 on each side, sampled
    # every h units; mesh_points lists every grid node as an (x, y) row.
    x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
    y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    mesh_points = np.column_stack([xx.ravel(), yy.ravel()])
    x_limits = (xx.min(), xx.max())
    y_limits = (yy.min(), yy.max())

    # (points, labels, alpha) for each split, in drawing order: training
    # points with the default opacity, then test points at alpha=0.6.
    splits = (
        (X_train, y_train, None),
        (X_test, y_test, 0.6),
    )

    # First column: the dataset on its own.
    ax = axes[ds_cnt, 0]
    if ds_cnt == 0:
        ax.set_title("Input data")
    for points, labels, alpha in splits:
        ax.scatter(
            points[:, 0], points[:, 1], c=labels, cmap=cm_bright, alpha=alpha, edgecolors="k"
        )
    ax.set_xlim(*x_limits)
    ax.set_ylim(*y_limits)
    ax.set_xticks(())
    ax.set_yticks(())

    # Remaining columns: one grid-searched classifier per panel.
    for column, (name, (estimator, param_grid)) in enumerate(zip(names, classifiers), start=1):
        ax = axes[ds_cnt, column]

        clf = GridSearchCV(estimator=estimator, param_grid=param_grid)
        with ignore_warnings(category=ConvergenceWarning):
            clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)
        print(f"{name}: {score:.2f}")

        # Colour every mesh node by the decision function when the search
        # object offers one, otherwise by the predicted probability in column 1.
        Z = (
            clf.decision_function(mesh_points)
            if hasattr(clf, "decision_function")
            else clf.predict_proba(mesh_points)[:, 1]
        )
        ax.contourf(xx, yy, Z.reshape(xx.shape), cmap=cm_piyg, alpha=0.8)

        # Overlay the samples on top of the surface.
        for points, labels, alpha in splits:
            ax.scatter(
                points[:, 0], points[:, 1], c=labels, cmap=cm_bright, alpha=alpha, edgecolors="k"
            )
        ax.set_xlim(*x_limits)
        ax.set_ylim(*y_limits)
        ax.set_xticks(())
        ax.set_yticks(())

        if ds_cnt == 0:
            ax.set_title(name.replace(" + ", "\n"))
        # Test score in the lower-right corner, without the leading zero.
        ax.text(
            0.95,
            0.06,
            (f"{score:.2f}").lstrip("0"),
            size=15,
            bbox={"boxstyle": "round", "alpha": 0.8, "facecolor": "white"},
            transform=ax.transAxes,
            horizontalalignment="right",
        )


# Tighten the subplot spacing, then save the figure to disk before showing it.
plt.tight_layout()

plt.savefig('SVC_data_model')
plt.show()