## Mathematics of Machine Learning

### Chapter 3: Linear classification methods
### Section 3.4: Soft SVM Rule

In [None]:
import numpy as np
import scipy.optimize as opt
import matplotlib.pyplot as plt

#### (0) Data Preparation

Generate the training data

In [None]:
# Size of the dataset (number of training samples that are generated below)
m = 25

In [None]:
# Fix the seed so that the generated data set is reproducible
np.random.seed(17)
# Draw m two-dimensional inputs uniformly between -3 and 3; one sample per column
x = np.random.uniform(low=-3, high=3, size=(2, m))
# print(x)

In [None]:
# True parameter vector of the data-generating model, stored as a 2x1 column
w_true = np.array([[1], [2]])
# print(w_true)

In [None]:
# Probability of label +1 for every column of x under the Bernoulli model
# with h_{w_true,0}: the logistic sigmoid applied to <w_true, x>
p = 1 / (1 + np.exp(-(w_true.T @ x)))
# print(p)

In [None]:
# Draw the random labels according to the probabilities p:
# a uniform sample u <= p gives True, and 2*bool - 1 maps True/False to +1/-1
y = 2*(np.random.uniform(low=0, high=1, size=(1, m)) <= p) - 1
# print(y)

In [None]:
# Load the stored data set; this replaces the x and y generated above
x = np.genfromtxt("data_svm_soft_X.csv", delimiter=',')
y = np.genfromtxt("data_svm_soft_Y.csv", delimiter=',')
# Add a leading axis so that the labels form a single row
y = y[np.newaxis, ...]
# print(x)
# print(y)

#### (1) Soft SVM Rule

In [None]:
# Regularization weight. With lam = 1/m the penalty |w|^2 enters the objective
# with the same factor 1/m that the mean puts on each sample's loss
# (assuming the data set in use has m samples).
lam = 1.0 / m
print(lam)

In [None]:
# Define loss function
def hinge(w, x, y):
    """Return the hinge loss max(0, 1 - y_i * <w, x_i>) of every sample.

    Parameters
    ----------
    w : ndarray of shape (d,) or (d, 1)
        Weight vector of the homogeneous linear classifier.
    x : ndarray of shape (d, n)
        Inputs, one sample per column.
    y : ndarray of shape (1, n) or (n,)
        Labels in {-1, +1}.

    Returns
    -------
    ndarray of shape (n,)
        The per-sample hinge losses.
    """
    margins = np.multiply(y, np.dot(w.T, x))
    # The number of samples is taken from the arguments themselves, so the
    # function does not depend on a module-level sample count.
    return np.maximum(0.0, 1.0 - margins).reshape(-1)

In [None]:
# Objective of the soft SVM rule whose minimiser is determined below
def fun(w):
    """Return lam * |w|^2 plus the mean hinge loss of w on the data (x, y).

    Reads the module-level names lam, x and y.
    """
    penalty = lam * np.linalg.norm(w)**2
    empirical_risk = np.mean(hinge(w, x, y))
    return penalty + empirical_risk

In [None]:
# Minimise the objective with opt.fmin, starting from w = 0. With
# full_output=True the call returns five values, unpacked here as minimiser,
# minimum value, iteration count, function-call count and warning flag.
# The iteration count is bound to n_iter so that the builtin iter() is not shadowed.
w_S, RS_min, n_iter, funcalls, warnflag = opt.fmin(fun, np.zeros((2, 1)), maxfun=100000, full_output=True)
print(w_S)

In [None]:
# Store the solution as a column vector. reshape keeps the shape (2, 1) even
# if this cell is executed again, whereas wrapping every entry in a list
# would add one more axis on each run.
w_S = np.reshape(w_S, (-1, 1))

#### (1.1) Plot the objective function

In [None]:
# Discretize w1 and w2 on the symmetric intervals [-10*|w_S[k]|, 10*|w_S[k]|].
# np.linspace with a fixed number of points is used instead of a float-step
# np.arange, whose length can change by one through rounding; both axes
# therefore always have n_grid points and contain both end points.
n_grid = 1001
half_w1, half_w2 = 10 * np.abs(np.ravel(w_S)[:2])
# Column vectors of shape (n_grid, 1)
w1 = np.linspace(-half_w1, half_w1, n_grid).reshape(-1, 1)
w2 = np.linspace(-half_w2, half_w2, n_grid).reshape(-1, 1)
print(w1.shape)
print(w2.shape)

In [None]:
# Build the evaluation grid and flatten it column by column (order='F')
WW1, WW2 = np.meshgrid(w1, w2)
W1 = WW1.ravel(order='F')
W2 = WW2.ravel(order='F')
# One objective value per grid point; filled in by the evaluation loop
FW = np.zeros((W1.size, 1))

In [None]:
# Evaluate the objective at every grid point, in the flattened grid order
for i, (w1_i, w2_i) in enumerate(zip(W1, W2)):
    ww = np.array([[w1_i], [w2_i]])
    FW[i] = fun(ww)

In [None]:
# Generate graphic (contour plot)
fig, ax = plt.subplots()

# FW holds the objective values in the column-major (order='F') flattening of
# the meshgrid, so it is folded back with the same order and the meshgrid's
# own shape. A default row-major reshape to (len(w1), len(w2)) produces the
# transposed array, i.e. contours with the roles of w_1 and w_2 swapped.
CS = ax.contour(WW1, WW2, np.reshape(np.log(FW), WW1.shape, order='F'), 25)
# ax.clabel(CS, inline=True, fontsize=10)

# plot learned value
ax.scatter(w_S[0], w_S[1], c="r")
# true
# ax.scatter(w_true[0], w_true[1], marker="+")

ax.set_title('log({:.2f} |w|^2 + R_S(w))'.format(lam))
ax.set_xlabel('w_1')
ax.set_ylabel('w_2')

fig.colorbar(CS)
plt.tight_layout()
plt.show()

#### (2) Plot the training data

In [None]:
fig, ax = plt.subplots()

# End points of the x_1 interval [-3, 3] over which the lines are drawn.
# An array (rather than a list) makes the products below plain array arithmetic.
x1_ends = np.array([-3, 3])

# First plot the true hyperplane <w_true, x> = 0, i.e. x_2 = -(w_1/w_2) * x_1
ax.plot(x1_ends, -w_true[0]/w_true[1]*x1_ends, "--k", label="true hyperplane")

# Plot the learned hypothesis
ax.plot(x1_ends, -w_S[0]/w_S[1]*x1_ends, "--", c="g", label="SVM learned hyperplane")

# For comparison: plot logistic regression
def RS_log(w):
    """Return the mean logistic loss log(1 + exp(-y * <w, x>)) over the data.

    Reads the module-level arrays x and y. The mean is taken along axis 1,
    so for labels y of shape (1, n) the result is an array of length 1.
    """
    return np.mean(np.log(1 + np.exp(- np.multiply(y, (np.dot(w.T, x))))), axis=1)

# The iteration count is bound to n_iter so that the builtin iter() is not shadowed
w_LR, RS_min, n_iter, funcalls, warnflag = opt.fmin(RS_log, np.zeros((2, 1)), maxfun=100000, full_output=True)
# Column vector; reshape keeps the shape (2, 1) when the cell is run again
w_LR = np.reshape(w_LR, (-1, 1))
ax.plot(x1_ends, -w_LR[0]/w_LR[1]*x1_ends, "--", c="m", label="Log-Reg learned hyperplane")

# Then enter the classified points: indices of the samples labelled +1 and -1
inds = [i for (i, val) in enumerate(y[0]) if val == 1]
indm = [i for (i, val) in enumerate(y[0]) if val == -1]

ax.scatter(x[0][inds], x[1][inds], c="b", marker="+", linewidths=2)
ax.scatter(x[0][indm], x[1][indm], c="r", marker="d", linewidths=2)

ax.legend()
ax.set_xlabel("x_1")
ax.set_ylabel("x_2")

ax.set(xlim=(-3, 3), ylim=(-3, 3))
fig.tight_layout()