# Setup

In [None]:
import numpy as np
import matplotlib.pyplot as plt

from scipy.optimize import minimize

In [None]:
def get_simple():
    """Generate a small noisy sine-wave regression dataset.

    Returns:
        tuple: ``(X, y)`` where ``X`` holds 11 evenly spaced inputs in
            [-3, 3] and ``y`` is ``sin(X)`` plus Gaussian noise (scaled by
            0.2) drawn from NumPy's global random state.
    """
    inputs = np.linspace(-3, 3, 11)
    noise = np.random.randn(11) * .2
    targets = np.sin(inputs) + noise
    return inputs, targets

### **1. Training of SVRs via Constrained Optimization** <a class="anchor" id="optim"></a>

Throughout this notebook, we assume $\mathbf{X} \in \mathbb{R}^{N \times D}$ to be the $N \times D$ matrix of training examples and $\mathbf{t} \in \mathbb{R}^N$ to be the $N$-dimensional vector of training targets.
To express the dual SVR in standard form, we define the kernel matrix $K \in \mathbb{R}^{N \times N}$ such that each entry is $K_{ij} = k(\mathbf{x}_i , \mathbf{x}_j)$.

The dual form of the SVR was introduced as:
  \begin{align*}
  \widetilde{L}(\mathbf a,\widehat{\mathbf a}) =& - \frac{1}{2}  \sum_{n=1}^N  \sum_{m=1}^N (a_n - \widehat a _n) (a_m - \widehat a _m)k(\mathbf x_n,\mathbf x_m)\\ &- \epsilon  \sum_{n=1}^N (a_n + \widehat a _n) +  \sum_{n=1}^N (a_n - \widehat a _n) t_n
  \end{align*}


> To simplify the mathematical procedure, transform it first into matrix multiplication form!

The optimization objective is given by:
\begin{align}
\max_{\mathbf a, \widehat{\mathbf a}}\widetilde{L}(\mathbf a,\widehat{\mathbf a})
\end{align}
subject to
\begin{eqnarray*}
  0 \leqslant a_n \leqslant C\\
  0 \leqslant \widehat a_n \leqslant C
\end{eqnarray*}

Once we have found the optimal $\mathbf{a}$ and $\widehat{\mathbf{a}}$, the prediction function of the SVR is given by
\begin{equation}
y(\mathbf x) = \sum_{n=1}^N (a_n- \widehat a _n)k (\mathbf x, \mathbf x _n) +b
\end{equation}
where $b \in \mathbb{R}$ is the bias parameter.

We can estimate $b$ by considering a data point for which $0 < a_n < C$, which must have $\xi_n = 0$. Therefore this point must satisfy $\epsilon + y_n - t_n = 0$.
\begin{equation}
b = \frac{1}{N_\mathcal{M}} \sum_{n \in \mathcal{M}} \left( t_n - \epsilon - \sum_{m \in \mathcal{S}} (a_m- \widehat a _m)k (\mathbf x_n, \mathbf x _m)\right).
\end{equation}
Analogous results can be obtained by considering a point for which $0 < \widehat a_n < C$. $\mathcal{S} \subseteq \{1, \dots, N\}$ denotes the set of support vectors and $\mathcal{M} \subseteq \{1, \dots, N\}$ denotes the set of support vectors lying
on the margin with $N_\mathcal{M} = |\mathcal{M}|$.

> Below, implement an SVR for a simple regression problem by solving the dual problem above.
> For optimization make use of `scipy` and its [Optimization Module](https://docs.scipy.org/doc/scipy/reference/tutorial/optimize.html#sequential-least-squares-programming-slsqp-algorithm-method-slsqp).

In [None]:
class RBFKernel:
    """RBF (Gaussian) kernel ``k(x, x') = exp(-gamma * ||x - x'||^2)``."""

    def __init__(self, gamma=1):
        """Store the kernel hyperparameter.

        Args:
            gamma (float): Hyperparameter of RBF kernel; it scales the squared
                Euclidean distance inside the exponential.
        """
        self.gamma = gamma

    @staticmethod
    def _as_samples(X):
        """Return X as a 2-D float array with one sample per row."""
        arr = np.asarray(X, dtype=float)
        if arr.ndim < 2:
            # Scalars and 1-D inputs are treated as N one-dimensional samples.
            return arr.reshape(-1, 1)
        if arr.ndim == 2:
            # Already (N, D): keep the feature axis instead of flattening it.
            return arr
        # Higher-rank input: flatten all trailing axes into one feature axis.
        return arr.reshape(arr.shape[0], int(np.prod(arr.shape[1:])))

    def __call__(self, X_1, X_2):
        """Computes the kernel matrix.

        Args:
            X_1 (array-like): Input samples in shape (N, D) or (N,).
            X_2 (array-like): Input samples in shape (M, D) or (M,).

        Returns:
            ndarray: Kernel matrix of shape (N, M).
        """
        X_1 = self._as_samples(X_1)
        X_2 = self._as_samples(X_2)

        # Pairwise differences via broadcasting: (N, 1, D) - (1, M, D).
        diff = X_1[:, np.newaxis, :] - X_2[np.newaxis, :, :]

        # NxM squared Euclidean distances; the kernel needs the squared
        # distance, so no square root is taken.
        sq_dist = np.square(diff).sum(axis=2)

        # Compute NxM kernel matrix based on the squared distances.
        return np.exp(-self.gamma * sq_dist)

In [None]:
class SVR:
    def __init__(self, kernel_func, eps=0.2, C=1.0, random_state=42):
        """Implementation of a C-SVM for regression.

        Args:
            kernel_func (callable): Kernel to be used in the algorithm. It is
                called as ``kernel_func(X_1, X_2)`` and must return the kernel
                matrix between the two sample sets.
            eps (float): Half-width of the epsilon-insensitive tube; it enters
                the dual objective as the penalty ``eps * sum(a + a_hat)``.
                (default=0.2)
            C (float): Regularization parameter. The strength of the regularization is inversely
                proportional to C. Must be strictly positive. (default=1.0)
            random_state (int): Random state to ensure reproducibility when initializing a values.
        """
        self.C = C
        self.eps = eps
        self.kernel_func = kernel_func
        self.random_state = random_state

    def fit(self, X, t):
        """Fit the SVM model according to the given training data.

        Solves the dual problem with SLSQP under the box constraints
        ``0 <= a_n, a_hat_n <= C`` and then estimates the bias from the
        support vectors lying on the margin.

        Args:
            X (array-like): Training samples of shape (N, D).
            t (array-like): Training targets of shape (N).

        Returns:
            self: The fitted SVM object.
        """
        import warnings

        # Transform to ndarray.
        X, t = np.asarray(X), np.asarray(t, dtype=float)
        # Compute NxN kernel matrix based on kernel_func.
        K = self.kernel_func(X, X)
        # Number of samples.
        N = len(X)

        # Optimization
        # Step 1: Define the loss (negative dual objective) and its gradient.
        # The optimization vector stacks a (first N entries) and a_hat (last N).
        def loss(params):
            d = params[:N] - params[N:]
            return .5 * d @ K @ d + self.eps * np.sum(params) - d @ t

        def jac(params):
            # Gradient of the loss; assumes K is symmetric.
            Kd = K @ (params[:N] - params[N:])
            return np.concatenate((Kd + self.eps - t, -Kd + self.eps + t))

        # Step 2: Define the constraints.
        # Inequalities are written in matrix notation A x <= b, which scipy
        # expects in the form 0 <= b - A x. The first 2N rows encode
        # -x <= 0, the last 2N rows encode x <= C.
        # NOTE(review): the textbook SVR dual with a bias term additionally
        # requires sum(a - a_hat) = 0; only the box constraints are enforced
        # here - confirm whether that is intended.
        A = np.vstack((-np.eye(2 * N), np.eye(2 * N)))
        b = np.concatenate((np.zeros(2 * N), self.C * np.ones(2 * N)))
        constraints = (
            {'type': 'ineq', 'fun': lambda a: b - A @ a, 'jac': lambda a: -A},
        )

        # Initial point: uniform in [0, min(C, 1)) so that it is feasible
        # even when C < 1.
        rng = np.random.RandomState(self.random_state)
        a0 = rng.rand(2 * N) * min(self.C, 1.0)

        result = minimize(loss, a0, jac=jac, constraints=constraints, method='SLSQP')
        if not result.success:
            warnings.warn(f"SVR dual optimization did not converge: {result.message}")
        self.opt_result_ = result

        # Guard against tiny bound violations, then snap values that are
        # numerically at a bound onto that bound.
        solution = np.clip(result.x, 0, self.C)
        solution[np.isclose(solution, 0)] = 0
        solution[np.isclose(solution, self.C)] = self.C
        self.a_, self.a_hat = solution[:N], solution[N:]

        # Determine indices of support vectors.
        idx_support = self.a_ > 0
        idx_hat_support = self.a_hat > 0
        idx_any_support = idx_support | idx_hat_support

        # Determine indices of support vectors that lie on the margin.
        idx_margin = idx_support & (self.a_ < self.C)
        idx_hat_margin = idx_hat_support & (self.a_hat < self.C)

        # Determine bias parameter.
        # Margin points with 0 < a_n < C satisfy     y_n = t_n - eps,
        # margin points with 0 < a_hat_n < C satisfy y_n = t_n + eps.
        a_diff = self.a_ - self.a_hat
        f = K @ a_diff
        b_candidates = np.concatenate((
            t[idx_margin] - self.eps - f[idx_margin],
            t[idx_hat_margin] + self.eps - f[idx_hat_margin],
        ))
        if b_candidates.size > 0:
            self.b_ = float(b_candidates.mean())
        else:
            # No support vector lies strictly inside (0, C): fall back to the
            # mean training residual so that predict() stays usable.
            self.b_ = float(np.mean(t - f))

        # Store support vectors including their targets and dual coefficients.
        self.X_support_ = X[idx_any_support]
        self.t_support_ = t[idx_any_support]
        self.dual_coef_ = a_diff[idx_any_support]
        return self

    def predict(self, X):
        """Perform regression on samples in X.

        Args:
            X (array-like): Input samples whose targets are to be predicted.

        Returns:
            y (array-like): Predicted target of samples in X.
        """
        K = self.kernel_func(X, self.X_support_)
        # dual_coef_ uses the same support mask as X_support_, so the
        # columns of K and the coefficients always line up.
        return K @ self.dual_coef_ + self.b_

> Train the SVR on the given dataset and plot its support vectors.

In [None]:
# Load the noisy sine toy dataset.
X, y = get_simple()

kernel_func = RBFKernel(gamma=1)
# Create SVR with an RBF kernel, tube half-width eps and regularization C.

eps = .2
C = 1
svm = SVR(kernel_func=kernel_func, C=C, eps=eps).fit(X=X, t=y)  # <-- SOLUTION

# Make predictions for the SVM on a dense grid covering the training range.
axis = np.linspace(-3, 3, 201)
y_pred = svm.predict(axis)  # <-- SOLUTION

# Visualize predictions of the SVM together with the eps-tube around them.
fig, ax = plt.subplots()
ax.scatter(X, y, label='training data')
ax.plot(axis, y_pred, zorder=5, label='prediction')
ax.fill_between(axis, y_pred-eps, y_pred+eps, alpha=.3, color='r')
ax.scatter(svm.X_support_, svm.t_support_, color='g', s=100, facecolors='none', edgecolors='g', label='support vectors')
# The artists carry labels, so draw the legend that displays them.
ax.legend()
plt.show()

NameError: name 'get_simple' is not defined