# Mathematics of Machine Learning

## Programming tasks

Import required modules

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.optimize as opt 

### a) Preparation of the data

In [2]:
# Loading the data set
T = np.loadtxt('heart.dat')
print(T)

[[70.  1.  4. ...  3.  3.  2.]
 [67.  0.  3. ...  0.  7.  1.]
 [57.  1.  2. ...  0.  7.  2.]
 ...
 [56.  0.  2. ...  0.  3.  1.]
 [57.  1.  4. ...  0.  6.  1.]
 [67.  1.  4. ...  3.  3.  2.]]


In [3]:
# Extract the real features
X = T[:, [0, 3, 4, 7, 9, 11]]
print(X)

[[7.00e+01 1.30e+02 3.22e+02 1.09e+02 2.40e+00 3.00e+00]
 [6.70e+01 1.15e+02 5.64e+02 1.60e+02 1.60e+00 0.00e+00]
 [5.70e+01 1.24e+02 2.61e+02 1.41e+02 3.00e-01 0.00e+00]
 ...
 [5.60e+01 1.40e+02 2.94e+02 1.53e+02 1.30e+00 0.00e+00]
 [5.70e+01 1.40e+02 1.92e+02 1.48e+02 4.00e-01 0.00e+00]
 [6.70e+01 1.60e+02 2.86e+02 1.08e+02 1.50e+00 3.00e+00]]


In [4]:
# Extract and transform the markers
Y = 2 * T[:, 13] - 3
print(Y)

[ 1. -1.  1. -1. -1. -1.  1.  1.  1.  1. -1. -1. -1.  1. -1. -1.  1.  1.
 -1. -1.  1. -1. -1. -1. -1. -1. -1. -1.  1. -1.  1. -1. -1.  1.  1.  1.
  1.  1. -1. -1.  1. -1. -1. -1.  1. -1.  1.  1.  1.  1.  1. -1. -1. -1.
 -1. -1.  1. -1.  1.  1. -1.  1. -1. -1. -1.  1. -1.  1. -1.  1.  1. -1.
 -1. -1. -1.  1. -1. -1. -1. -1.  1.  1.  1. -1. -1. -1. -1. -1. -1.  1.
 -1.  1.  1.  1.  1.  1. -1.  1. -1. -1. -1.  1. -1.  1.  1.  1. -1.  1.
  1. -1.  1. -1.  1. -1. -1. -1.  1.  1. -1.  1.  1.  1.  1. -1. -1. -1.
  1. -1. -1.  1.  1.  1. -1.  1. -1. -1. -1.  1. -1. -1.  1. -1.  1. -1.
  1.  1.  1.  1.  1. -1. -1. -1. -1. -1. -1. -1.  1. -1. -1.  1.  1.  1.
 -1.  1. -1. -1. -1. -1. -1.  1. -1.  1.  1. -1. -1.  1.  1.  1.  1. -1.
 -1.  1.  1. -1. -1. -1.  1. -1. -1.  1. -1.  1. -1.  1. -1. -1. -1. -1.
 -1.  1. -1.  1.  1.  1.  1. -1. -1. -1.  1. -1.  1. -1. -1.  1. -1. -1.
 -1. -1. -1. -1.  1.  1. -1.  1. -1. -1.  1.  1. -1. -1.  1.  1. -1.  1.
 -1.  1. -1.  1. -1. -1.  1. -1. -1.  1. -1.  1.  1

In [5]:
# Number of data pairs
m = len(Y)
print(m)

270


In [6]:
# Number of features
d = np.size(X, axis=1)
print(d)

6


### b) Splitting the data

Random selection of the indices of the training and test data

In [7]:
# Share of training data
p = 0.7
data_ind = np.random.permutation(m)
print((np.ceil(p*m)+1))
ind_train = data_ind[:int((np.ceil(p*m)+1))]
ind_test = [i for i in data_ind if i not in ind_train]


print(data_ind)
print(len(data_ind))
print(len(set(data_ind)))
print(ind_train)
print(len(ind_train))
print(ind_test)
print(len(ind_test))

190.0
[168 125   7 218 134  94 252  54 208  85 264 197 152 104 165  34   0 214
 222  75  16 200 238 262   1 216  31  95 192 106 203 115 145 253 127 123
  80 113 230 229 102 256  51 150 130 236 247 250  40  72  49   6 217 186
  42 227 148 265 204 187  52  25  61 267 221 107 189 169 114 117 185 138
 195   4  87  74 136 132 158 224  83 249  35 146 142  66 160  39 251 261
 135 257 240 231  56 258 194  22 118  58 112 164 170 198 202 196 263 176
  44 174 259 268  21 235   3 156  97 248  10  37 183 188 151   9  23  73
 153  43 175 149 191 223  64 133 155 228 108 119  45 266 172  50 139  38
 254 244 184  91 109 162   8 205  60  68   5 233 143 213 178 126 242  24
 220 163 207 232  84 234  11  67 210  36  89  53 122  28 144  27  46 209
  33 180 147  86  62 212 190  63  48  88 129 166 255 201 128 141 101 199
 193 237  98  92 260 121 173  65 206 171 100 239  12 211  29   2 219  93
 241 140  30 110 215 243 105  26  57  69 269  17  59  77  55 116  99 182
  82 179  90 245  78  81 161 131  70  47 120 

In [8]:
# Training data
X_train = X[ind_train, :]
Y_train = Y[ind_train]
print(X_train)
print(Y_train)

[[4.50e+01 1.38e+02 2.36e+02 1.52e+02 2.00e-01 0.00e+00]
 [5.40e+01 1.60e+02 2.01e+02 1.63e+02 0.00e+00 1.00e+00]
 [5.90e+01 1.10e+02 2.39e+02 1.42e+02 1.20e+00 1.00e+00]
 ...
 [6.00e+01 1.50e+02 2.40e+02 1.71e+02 9.00e-01 0.00e+00]
 [6.60e+01 1.60e+02 2.46e+02 1.20e+02 0.00e+00 3.00e+00]
 [4.10e+01 1.26e+02 3.06e+02 1.63e+02 0.00e+00 0.00e+00]]
[-1. -1.  1. -1. -1.  1.  1. -1.  1. -1.  1. -1. -1.  1. -1.  1.  1. -1.
 -1.  1.  1. -1. -1.  1. -1. -1. -1.  1. -1. -1.  1. -1.  1. -1. -1. -1.
  1. -1.  1. -1. -1. -1. -1. -1.  1. -1. -1.  1.  1. -1.  1.  1. -1.  1.
 -1.  1.  1. -1.  1. -1. -1. -1.  1. -1.  1.  1.  1.  1. -1.  1. -1. -1.
 -1. -1. -1. -1. -1. -1. -1. -1. -1.  1.  1.  1.  1. -1.  1. -1. -1.  1.
 -1.  1.  1.  1.  1.  1. -1. -1. -1.  1.  1. -1. -1. -1.  1. -1. -1.  1.
  1. -1. -1. -1. -1.  1. -1.  1.  1.  1. -1.  1. -1. -1. -1.  1. -1. -1.
 -1. -1.  1. -1.  1.  1. -1.  1. -1. -1.  1.  1. -1. -1.  1.  1. -1. -1.
 -1. -1. -1.  1. -1. -1.  1. -1. -1. -1. -1.  1. -1.  1.  1.  1. -1.

In [9]:
# Test data
X_test = X[ind_test, :]
Y_test = Y[ind_test]
print(X_test)
print(Y_test)

[[6.00e+01 1.17e+02 2.30e+02 1.60e+02 1.40e+00 2.00e+00]
 [5.00e+01 1.10e+02 2.54e+02 1.59e+02 0.00e+00 0.00e+00]
 [7.10e+01 1.60e+02 3.02e+02 1.62e+02 4.00e-01 2.00e+00]
 [5.80e+01 1.25e+02 3.00e+02 1.71e+02 0.00e+00 2.00e+00]
 [5.20e+01 1.34e+02 2.01e+02 1.58e+02 8.00e-01 1.00e+00]
 [5.90e+01 1.38e+02 2.71e+02 1.82e+02 0.00e+00 0.00e+00]
 [6.70e+01 1.20e+02 2.37e+02 7.10e+01 1.00e+00 0.00e+00]
 [7.70e+01 1.25e+02 3.04e+02 1.62e+02 0.00e+00 3.00e+00]
 [3.50e+01 1.26e+02 2.82e+02 1.56e+02 0.00e+00 0.00e+00]
 [4.30e+01 1.20e+02 1.77e+02 1.20e+02 2.50e+00 0.00e+00]
 [6.40e+01 1.40e+02 3.13e+02 1.33e+02 2.00e-01 0.00e+00]
 [5.40e+01 1.24e+02 2.66e+02 1.09e+02 2.20e+00 1.00e+00]
 [5.80e+01 1.20e+02 3.40e+02 1.72e+02 0.00e+00 0.00e+00]
 [5.40e+01 1.22e+02 2.86e+02 1.16e+02 3.20e+00 2.00e+00]
 [6.80e+01 1.20e+02 2.11e+02 1.15e+02 1.50e+00 0.00e+00]
 [5.70e+01 1.50e+02 2.76e+02 1.12e+02 6.00e-01 1.00e+00]
 [6.00e+01 1.02e+02 3.18e+02 1.60e+02 0.00e+00 1.00e+00]
 [6.90e+01 1.40e+02 2.54e+02 1.

### c) Logistic Regression

NOTE: We include the bias in the last position in the vector w.

In [10]:
# Empirical Risk Function
# RS_log = @(w) mean( log(1 + exp(- Y_train .* (X_train * w(1:d) + w(end)))) , 1)
def RS_log(w): return np.mean(np.log(1 + np.exp(-(np.multiply(Y_train, np.dot(X_train, w[0:d]) + w[-1])))), axis = 0)

Numerical calculation of ERM parameters...

... for this we allow enough iteration and choosing a random starting value:

In [11]:
# np.random.normal(size=(d+1,1))
w_LR, RS_min, iter, funcalls, warnflag = opt.fmin(RS_log, np.zeros((7, 1)), maxfun=100000, full_output=True)

print(w_LR)
print(RS_min)
print(iter)
print(funcalls)
print(warnflag)

Optimization terminated successfully.
         Current function value: 0.473308
         Iterations: 867
         Function evaluations: 1312
[-0.02460858  0.02001448  0.00547388 -0.02869171  0.67730437  1.00278759
 -0.11167897]
0.4733083915639057
867
1312
0


In [12]:
# Determine the misclassified training data via constraint violation:
Err_Train = np.mean(np.multiply(Y_train, np.dot(X_train, w_LR[0:d]) + w_LR[-1]) < 0)
print("{:.1f} percent of the training data is misclassified.".format(Err_Train * 100))

23.2 percent of the training data is misclassified.


ANSWER: If the sample were linearly separable, logistic regression would find the appropriate separating hypothesis. Because of the existing misclassifications, this is not that case.

In [13]:
# Determine the misclassified test data via constraint violation:
Err_Test = np.mean(np.multiply(Y_test, np.dot(X_test, w_LR[0:d]) + w_LR[-1]) < 0)
print("{:.1f} percent of the test data is misclassified.".format(Err_Test * 100))

18.8 percent of the test data is misclassified.


#### ANSWER:

In [14]:
print("So we estimate the expected risk of h_S to be {:.1f} percent.".format(Err_Test * 100))

So we estimate the expected risk of h_S to be 18.8 percent.


### d) Soft-margin SVM

Choice of lambda

In [15]:
# both terms equally weighted
lam = 1/m
Y_train = Y_train[:, None]

In [16]:
# Define loss function
def hinge(w, x, y): 
    # return np.amax(np.append(1 - np.multiply(y, (np.dot(x, w[0:d]) + w[-1])), np.zeros((len(y), 1)), axis = 1), axis = 1)
    return np.amax(np.append(1 - np.multiply(Y_train, (np.dot(X_train, w[0:d]) + w[-1])), np.zeros((len(Y_train), 1)), axis = 1), axis = 1)[:, None]

In [17]:
# Determine the solution
def fun(w): return lam * np.linalg.norm(w[0:d])**2 + np.mean(hinge(w, X_train, Y_train))

# w_SVM, RS_min, iter, funcalls, warnflag = opt.fmin(fun, np.random.randn(d+1, 1), maxfun=100000, full_output=True)
result = opt.minimize(fun, np.random.randn(d+1, 1), options={'disp': True})

w_SVM = result.x
RS_min = result.fun

print(w_SVM)
print(RS_min)
# print(iter)
# print(funcalls)
# print(warnflag)

         Current function value: 0.842107
         Iterations: 61
         Function evaluations: 1404
         Gradient evaluations: 174
[ 1.41389029e-09 -1.66170386e-08 -9.53661959e-09 -9.28719788e-09
 -1.09776492e-07 -2.36916540e-07 -9.99995396e-01]
0.842107057414959


In [18]:
# Determine the misclassified training data via constraint violation:
Err_Train = np.mean(np.multiply(Y_train, np.dot(X_train, w_SVM[0:d]) + w_SVM[-1]) < 0)
print("{:.1f} percent of the training data is misclassified.".format(Err_Train * 100))

42.1 percent of the training data is misclassified.


In [19]:
# Determine the misclassified test data via constraint violation:
Err_Test = np.mean(np.multiply(Y_test, np.dot(X_test, w_SVM[0:d]) + w_SVM[-1]) < 0)
print("{:.1f} percent of the test data is misclassified.".format(Err_Test * 100))

50.0 percent of the test data is misclassified.


In [20]:
print("So we estimate the expected risk of h_S to be {:.1f} percent.".format(Err_Test * 100))

So we estimate the expected risk of h_S to be 50.0 percent.
