In [None]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# re-imported before each cell executes, so edits to local modules
# (e.g. q2.py) take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import matplotlib.pyplot as plt
# Seed NumPy's legacy global RNG so any np.random-based results are repeatable across runs.
np.random.seed(0)

In [None]:
import os

# Directory that receives every figure saved by the cells below.
img_folder = 'imgs'
# exist_ok=True makes the creation idempotent and avoids the check-then-create
# race of testing os.path.exists() first.
os.makedirs(img_folder, exist_ok=True)

## Question: SVM Coding

### preamble:
<font size="3"> First, load the separable dataset of 200 instances with 2D features (with offset). </font>

In [None]:
# Plotting helpers used by this and the following cells. They were never
# imported, so a fresh "Restart & Run All" failed with NameError.
# NOTE(review): assumes plot_samples / plot_line live in utils next to
# plot_affinities — confirm against utils.py.
from utils import plot_samples, plot_line

# load the separable samples; the context manager closes the .npz archive
# once both arrays have been read into memory
with np.load('datasets/q2_separable.npz') as data_q2_separable:
    separable_x2 = data_q2_separable['x']
    separable_y2 = data_q2_separable['y']

# visualize the datapoints
fig, ax = plt.subplots(1)
plot_samples(ax, separable_x2, separable_y2, 'separable samples (offset)', legend=True)

### Q2(a) Primal SVM
<font size="3"> In this section, please complete the hard SVM primal problem solver 'solve_SVM_primal' (TODO marked as 'Q2(a)' in q2.py). For hard SVM, the default value of the argument 'regularization' is None. After you have completed the task, run the block below to check the decision boundary obtained. </font>

In [None]:
from q2 import solve_SVM_primal

# Solve the primal problem on the separable data. No `regularization` is
# passed, so the solver runs with its default (the hard-SVM setting described
# in the text above).
w, b = solve_SVM_primal(separable_x2, separable_y2)
fig, ax = plt.subplots(1)
plot_samples(ax, separable_x2, separable_y2, 'separable samples (offset)')
# Draw the learned decision boundary from (w, b) on top of the samples.
plot_line(ax, 'green', w, bias=b)
# Persist the figure for the write-up.
plt.savefig(os.path.join(img_folder, 'Q2_a.png'))

### Q2(b) Support Vectors
<font size="3"> To visualize the support vectors, finish implementing the method 'get\_support\_vectors'. Note that, to deal with numerical errors, two numbers are considered equal when their difference is within $\epsilon$. Run the plotting block below to verify your implementation.</font>

In [None]:
from q2 import get_support_vectors

# Recover the support vectors and the two margin boundaries for the (w, b)
# obtained from the primal solver above.
positive_vectors, positive_boundary, negative_vectors, negative_boundary = get_support_vectors(
    separable_x2, separable_y2, w, b)
fig, ax = plt.subplots(1)
plot_samples(ax, separable_x2, separable_y2, 'check support vectors')
# Decision boundary.
plot_line(ax, 'green', w, bias=b)
# Each boundary is indexed as [0] -> line weights, [1] -> bias.
plot_line(ax, 'pink', positive_boundary[0], bias=positive_boundary[1])
plot_line(ax, 'yellow', negative_boundary[0], bias=negative_boundary[1])
# Circle the support vectors with hollow markers, colour-matched to their boundary line.
ax.scatter(positive_vectors[:, 0], positive_vectors[:, 1], s=80, facecolors='none', edgecolors='pink')
ax.scatter(negative_vectors[:, 0], negative_vectors[:, 1], s=80, facecolors='none', edgecolors='y')
plt.savefig(os.path.join(img_folder, 'Q2_b.png'))

In [None]:
# load the nonseparable samples
data_q2_nonseparable = np.load('datasets/q2_non_separable.npz')
nonseparable_x2 = data_q2_nonseparable['x']
nonseparable_y2 = data_q2_nonseparable['y']

# visualize the datapoints
fig, ax = plt.subplots(1)
plot_samples(ax, nonseparable_x2, nonseparable_y2, 'non-separable samples (offset)', legend=True)

<font size="3"> In this section you need to implement the soft SVM with non-null regularization in the method 'solve\_SVM\_primal' in 'q2.py'. After you've completed the code, run the experiment below with different regularization values.</font>

In [None]:
# now test the results with different regularizations
regularozations = [0.1, 0.5, 5]
fig, axs = plt.subplots(1, len(regularozations), figsize=(len(regularozations)*4, 3))
for ax, C in zip(axs, regularozations):
    # compute the result
    w, b = solve_SVM_primal(nonseparable_x2, nonseparable_y2, regularization=C)
    plot_samples(ax, nonseparable_x2, nonseparable_y2, f'regularization={C}')
    plot_line(ax, 'green', w, bias=b)
    # mark support vectors
    positive_vectors, positive_boundary, negative_vectors, negative_boundary = get_support_vectors(
        nonseparable_x2, nonseparable_y2, w, b)
    plot_line(ax, 'pink', positive_boundary[0], bias=positive_boundary[1])
    plot_line(ax, 'yellow', negative_boundary[0], bias=negative_boundary[1])
    ax.scatter(positive_vectors[:, 0], positive_vectors[:, 1], s=80, facecolors='none', edgecolors='pink')
    ax.scatter(negative_vectors[:, 0], negative_vectors[:, 1], s=80, facecolors='none', edgecolors='y')
    plt.savefig(os.path.join(img_folder, 'Q2_c.png'))

### Dual SVM and Cross Validation
<font size="3"> In some cases, rather than the feature vectors we only have pairwise distances/similarities of the samples. We will explore kernel transformation and dual SVM in the following section.
In this section we use the wine dataset from [scikit-learn's `load_wine`](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_wine.html) and keep only two classes. </font>

#### Kernel Functions
<font size='3'> Kernel transformation: the pairwise relations between samples can become more obvious after a suitable kernel transformation is applied. For this question, you need to implement the RBF kernel function ($\mathcal{K}(x, y) = \exp(-\gamma\|x-y\|^2)$). Finish implementing the method 'get\_affinity\_matrix' in `q2.py`. </font>

In [None]:
from q2 import get_affinity_matrix
from utils import plot_affinities

# Load the wine features/labels; the context manager closes the .npz archive
# once both arrays have been read into memory.
with np.load('datasets/wine.npz') as wine_data:
    wine_x = wine_data['x']
    wine_y = wine_data['y']

# Build two pairwise affinity matrices over the same samples: one with the
# 'product' method and one with the RBF kernel (gamma=1e-5).
transformed_basic = get_affinity_matrix(wine_x, method='product')
transformed_rbf = get_affinity_matrix(wine_x, method='rbf', gamma=1e-5)

# Show the two affinity matrices side by side for comparison.
fig, axs = plt.subplots(1, 2, figsize=(10, 4))
plot_affinities(axs[0], transformed_basic, wine_y, title='dot product')
plot_affinities(axs[1], transformed_rbf, wine_y, title='rbf kernel')
plt.savefig(os.path.join(img_folder, 'Q2_d.png'))

<font size="3">Now implement the dual method and test it on the RBF embeddings. You should expect the best average test accuracy to be above 90%. </font>

In [None]:
from q2 import solve_SVM_dual

# Run the dual solver on the RBF affinity matrix once per regularization
# value, with folds=5. The return value is not used here, so any results are
# presumably reported by solve_SVM_dual itself — confirm in q2.py.
regularizations = [1, 5, 10]
for regularization in regularizations:
    # Banner separating the output of consecutive experiments.
    print(f"==========start the experiment on C={regularization}==========")
    solve_SVM_dual(transformed_rbf, wine_y, regularization=regularization, folds=5)