# Workshop 6

Starter code for workshop 6. You should have seen most of it before, but make sure you understand what it is doing!

In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# To plot even prettier figures
import seaborn as sn

# General data handling (pure numerics are better in numpy)
import pandas as pd

In [2]:
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()

In [3]:
xarray = data.data
yarray = data.target
print(xarray.shape)
print(yarray.shape)
fullarray = np.concatenate((xarray,np.reshape(yarray,(-1,1))),axis=1)
print(fullarray.shape)

(569, 30)
(569,)
(569, 31)


In [4]:
fullarray[:,-1] = 1 - fullarray[:,-1]   # now invert the labels (so that malignant=1)
df = pd.DataFrame(fullarray,columns = list(data.feature_names) + ['target'])

Your code starts here...

# Splitting into separate datasets

In [5]:
#Q1
#按70:15:15的比例将数据分成训练集、验证集和测试集。


from sklearn.model_selection import train_test_split

bigtrain_set, test_set = train_test_split(fullarray, test_size=0.15, random_state=42, stratify=fullarray[:,-1])
train_set, val_set = train_test_split(bigtrain_set, test_size=0.15, random_state=42, stratify=bigtrain_set[:,-1])

**Note the use of "stratify" in the calls above, as these make sure that each dataset has roughly the same proportions of the classes.**

In [6]:
X_train = train_set[:,:-1] #【：-1】= 从位置0到位置-1之前的数,也就是说去掉最后一个字符或者列的结果
y_train = train_set[:,-1]
X_test = test_set[:,:-1]
y_test = test_set[:,-1]
X_val = val_set[:,:-1]
y_val = val_set[:,-1]
print(f'Shapes are {[X_train.shape,y_train.shape,X_test.shape,y_test.shape,X_val.shape,y_val.shape]}')

Shapes are [(410, 30), (410,), (86, 30), (86,), (73, 30), (73,)]


In [7]:
print(np.mean(y_train),np.mean(y_test),np.mean(y_val))

0.37317073170731707 0.37209302325581395 0.3698630136986301


In [8]:
#这些是每个数据集中类的比例（因为类的值为0和1，所以平均值正好等于1表示的类的比例）。

These are the proportions of the classes in each dataset (as classes are given values 0 and 1, so a mean is just equal to the proportion of the class represented by 1).

# Pipeline

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

preproc_pl = Pipeline([ ('imputer', SimpleImputer(strategy="median")), 
                        ('std_scaler', StandardScaler()) ])

# SVM Classifier

In [10]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, hinge_loss

In [11]:
from sklearn.metrics import roc_curve, auc

In [12]:
#Q2 
#在管道中，使用径向基函数（rbf）核、默认（超）参数构建SVM分类器，
#并在验证集上确定该分类器的精度。

svm_pl = Pipeline([('preproc',preproc_pl), ('svc',SVC(kernel='rbf'))])

In [13]:
svm_pl.fit(X_train,y_train)
y_val_pred = svm_pl.predict(X_val)
acc = accuracy_score(y_val,y_val_pred)
print(acc)

0.9726027397260274


In [14]:
X_mean = np.mean(X_train,axis=0)
print(X_mean.shape)

(30,)


In [15]:
#Q3
#现在我们想建立一个简单的基线来比较准确度值，就像我们在早期的回归研讨会上所做的那样。
#在这里，我们将分别为每个特征执行此操作，
#通过使用公式将特征值转换为简单的“预测概率”：y_pred=（x-xmin）/（xmax-xmin），
#该公式给出了0到1（包括）范围内的值。

#从第一个特征开始，写一个循环，以在0到1之间的一组均匀间隔的值设置该预测的阈值（y_pred），
#并为每个阈值计算精度。由此确定该功能的最大精确度（在所有测试阈值中）。




for n in range(X_train.shape[1]):
    xfeat = X_train[:,n]
    xmin = np.min(xfeat)
    xmax = np.max(xfeat)
    y_pred = (xfeat-xmin)/(xmax-xmin)
    fpr, tpr, thresholds = roc_curve(??, ??, pos_label=??)
    aucval = auc(??,??)
    if aucval<0.5: aucval = 1-aucval
    acc = 0
    for thr in xfeat:
        acc = np.max([acc, accuracy_score(y_train, y_pred>thr)])
    print(f'AUC for feature {n} = {aucval} ; Max accuracy = {acc}')

SyntaxError: invalid syntax (3436379679.py, line 18)

In [None]:
for n in [6, 4]:  # Best and worst feature numbers
    xfeat = X_train[:,n]
    xmin = np.min(xfeat)
    xmax = np.max(xfeat)
    y_pred = (xfeat-xmin)/(xmax-xmin)
    fpr, tpr, thresholds = roc_curve(y_train, y_pred, pos_label=1)
    plt.plot(??,??,'b')
    plt.title(f'ROC for feature {n}')
    plt.xlabel('FPR')
    plt.ylabel('TPR')
    plt.show()

In [None]:
#请注意，最差的特征不是AUC最低的特征，而是AUC最接近0.5的特征，因为小于0.5的特征只是反向特征，可以被否定以获得正性能（AUC>0.5）。

Note that the worst feature is not the one with the lowest AUC, it is the one with the AUC closest to 0.5, as ones less than this are simply inverted features and could be negated to get positive performance (with AUC>0.5).

In [16]:
#Q5
#从第3步中选择精度得分最高的两个功能。使用提供的代码（make_meshgrid和plot_contours）绘制步骤2中SVM分类器的决策边界
#[请注意，这里提供的代码是scikit learn examplesLinks中一个外部站点的修改版本。]。我们将使用原始数据绘制这些决策边界，
#因此在管道中传递到plot_等高线调用，而不仅仅是分类器部分。您需要根据原始特征值为make_meshgrid选择合适的范围。



def make_meshgrid(x, y, ns=100):
    """Create a mesh of points to plot in

    Parameters
    ----------
    x: data to base x-axis meshgrid on (only min and max used)
    y: data to base y-axis meshgrid on (only min and max used)
    ns: number of steps in grid, optional

    Returns
    -------
    xx, yy : ndarray
    """
    x_min, x_max = x.min(), x.max()
    y_min, y_max = y.min(), y.max()
    hx = (x_max - x_min)/ns
    hy = (y_max - y_min)/ns
    xx, yy = np.meshgrid(np.arange(x_min, x_max + hx, hx), np.arange(y_min, y_max + hy, hy))
    return xx, yy

In [17]:
def plot_contours(clf, xx, yy, xmean, n1, n2, **params):
    """Plot the decision boundaries for a classifier.

    Parameters
    ----------
    clf: a classifier
    xx: meshgrid ndarray
    yy: meshgrid ndarray
    xmean : 1d array of mean values (to populate constant features with)
    n1, n2: index numbers of features that change (for xx and yy)
    params: dictionary of params to pass to contourf, optional
    """
    # The following lines makes an MxN matrix to pass to the classifier (# samples x # features)
    # It does this by multiplying Mx1 and 1xN matrices, where the former is filled with 1's
    #  where M is the number of grid points in xx and N is the number of features in xmean
    #  It is done in such a way that the xmean vector is replaced in each row
    fullx = np.ones((xx.ravel().shape[0],1)) * np.reshape(xmean,(1,-1))
    fullx[:,n1] = xx.ravel()
    fullx[:,n2] = yy.ravel()
    Z = clf.predict(fullx)
    Z = Z.reshape(xx.shape)
    out = plt.contourf(xx, yy, Z, **params)
    return out

In [18]:
# we create an instance of SVM and fit out data. We do not scale our
# data since we want to plot the support vectors

#Q6
#现在，在决策边界上方，以两个类的不同颜色显示训练数据点的散点图。
#还可以使用相同的颜色但不同的符号添加验证数据点的散点图[提示：使用marker='s'获得正方形]。

n0=6
n1=7

x10, x90 = np.percentile(X_train[:,n0],[??,??])
y10, y90 = np.percentile(X_train[:,n1],[??,??])
xx, yy = make_meshgrid(np.array([x10, x90]), np.array([y10, y90]), 500)

plot_contours(svm_pl, xx, yy, X_mean, n0, n1, cmap=plt.cm.coolwarm, alpha=0.8)
plt.scatter(X_train[??,??], X_train[??,??], c=y_train, cmap=plt.cm.coolwarm, s=20, edgecolors="k")
plt.scatter(X_val[??,??], X_val[??,??], c=??, cmap=plt.cm.coolwarm, s=20, edgecolors="k", marker='s')

plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.xlabel(f"Feature {n0}")
plt.ylabel(f"Feature {n1}")
plt.title("Decision Boundary")

plt.show()

SyntaxError: invalid syntax (2362937716.py, line 11)

In [19]:
#Q7
#使用多项式（“多边形”）和线性（“线性”）核重新运行SVM分类。比较结果的准确性和决策边界图。

for kerneltype in ['linear','poly']:
    print(f'SVM KERNEL = {kerneltype}')
    svm_pl = Pipeline([('preproc',preproc_pl), ('svc',SVC(kernel=kerneltype))])
    
    svm_pl.fit(X_train,y_train)
    y_val_pred = svm_pl.predict(X_val)
    acc = accuracy_score(y_val,y_val_pred)
    print(f'  Accuracy = {acc}')
   
    plot_contours(svm_pl, xx, yy, X_mean, n0, n1, cmap=plt.cm.coolwarm, alpha=0.8)
    plt.scatter(X_train[??,??], X_train[??,??], c=y_train, cmap=plt.cm.coolwarm, s=20, edgecolors="k")
    plt.scatter(X_val[??,??], X_val[??,??], c=??, cmap=plt.cm.coolwarm, s=20, edgecolors="k", marker='s')
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.xlabel(f"Feature {n0}")
    plt.ylabel(f"Feature {n1}")
    plt.title(f"Decision Boundary for kernel = {kerneltype}")
    plt.show()

SyntaxError: invalid syntax (2180143251.py, line 14)

## Model selection

In [20]:
# Recalculating the above results as they were not stored there, but if they were, this could be avoided
for kerneltype in ['rbf','linear','poly']:
    svm_pl = Pipeline([('preproc',preproc_pl), ('svc',SVC(kernel=kerneltype))])   
    svm_pl.fit(X_train,y_train)
    y_val_pred = svm_pl.predict(X_val)
    acc = accuracy_score(y_??,y_val_pred)
    print(f'Validation accuracy = {acc} for kernel {kerneltype}')

SyntaxError: invalid syntax (3966769717.py, line 6)

### Choose RBF as best classifier as it has the best performance on the _validation_ set

In [21]:
#Q8
#选择最佳分类器，并在测试集上报告结果。查看它与验证集结果的差异。

kerneltype = 'rbf'
svm_pl = Pipeline([('preproc',preproc_pl), ('svc',SVC(kernel=kerneltype))])  
# Refit on combined training + validation set
svm_pl.fit(np.concatenate((X_train,X_val),axis=0),np.concatenate((y_train,y_??),axis=0))
# Evaluate this model (and only the selected best model) on the test set
y_test_pred = svm_pl.predict(X_test)
acc = accuracy_score(y_test,y_test_pred)
print(f'Final test accuracy = {acc} for selected model: kernel={kerneltype}')

SyntaxError: invalid syntax (2613289588.py, line 7)