# Mathematics of Machine Learning

## Programming tasks

Import required modules

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.optimize as opt 

### a) Preparation of the data

In [None]:
# Read the numeric table stored in 'heart.dat' into a float array
# (np.loadtxt splits each line on whitespace by default).
T = np.loadtxt(fname='heart.dat')

In [None]:
# Keep only the real-valued feature columns of the raw table
# (0-based column indices into T).
feature_columns = [0, 3, 4, 7, 9, 11]
X = T[:, feature_columns]
print(X)

In [None]:
# Build the label vector from column 13 of T via the affine map y -> 2*y - 3,
# which sends 1 to -1 and 2 to +1.
# NOTE(review): presumably the raw markers are coded as 1/2 -- confirm against
# the data set description.
Y = T[:, 13] * 2 - 3
print(Y)

In [None]:
# Sample size: one label per data pair
m = Y.shape[0]
print(m)

In [None]:
# Feature dimension = number of columns of the feature matrix X
d = X.shape[1]
print(d)

### b) Splitting the data

Random selection of the indices of the training and test data

In [None]:
# Share of training data
p = 0.7
# Random permutation of all sample indices: its first n_train entries form the
# training set, the remaining entries the test set.
data_ind = np.random.permutation(m)
# NOTE(review): the "+1" selects one sample more than ceil(p*m); it is kept as
# in the original split -- confirm that this is intended.
n_train = int(np.ceil(p * m) + 1)
print(n_train)
ind_train = data_ind[:n_train]
# The complement of the training indices is exactly the tail of the
# permutation (same elements, same order), so slice it instead of testing
# membership in ind_train element by element.
ind_test = data_ind[n_train:]

In [None]:
# Training data: the rows of X and the entries of Y selected by ind_train
X_train, Y_train = X[ind_train, :], Y[ind_train]

In [None]:
# Test data: the rows of X and the entries of Y selected by ind_test
X_test, Y_test = X[ind_test, :], Y[ind_test]

### c) Logistic Regression

NOTE: We include the bias in the last position in the vector w.

In [None]:
# Empirical Risk Function
# MATLAB original:
# RS_log = @(w) mean( log(1 + exp(- Y_train .* (X_train * w(1:d) + w(end)))) , 1)
def RS_log(w, X=None, Y=None):
    """Return the empirical logistic risk of the affine hypothesis ``w``.

    Computes ``mean_i log(1 + exp(-y_i * (<x_i, w[:-1]> + w[-1])))``.

    Parameters
    ----------
    w : array_like
        Parameter vector with the weights first and the bias in the last
        position. It is flattened, so a column vector is accepted as well.
    X : ndarray, optional
        Feature matrix with one sample per row. Defaults to the module-level
        ``X_train``.
    Y : array_like, optional
        Labels, one per row of ``X``. Defaults to the module-level
        ``Y_train``. It is flattened so that a column-shaped label array does
        not broadcast against the score vector into a square matrix.

    Returns
    -------
    numpy.float64
        The mean logistic loss over all samples.
    """
    X = X_train if X is None else X
    Y = Y_train if Y is None else Y
    w = np.ravel(w)
    margins = np.ravel(Y) * (np.dot(X, w[:-1]) + w[-1])
    # log(1 + exp(-z)) == logaddexp(0, -z); the latter does not overflow when
    # the margin z is strongly negative.
    return np.mean(np.logaddexp(0.0, -margins), axis=0)

Numerical calculation of ERM parameters...

... for this we allow a sufficiently large number of function evaluations and start from the zero vector:

In [None]:
# Minimise the empirical logistic risk with opt.fmin (Nelder-Mead simplex
# search), starting from the zero vector of length d + 1 (d weights plus the
# bias). A random start would be np.random.normal(size=d + 1).
# maxfun raises the cap on the number of function evaluations.
w_LR, RS_min, n_iter, funcalls, warnflag = opt.fmin(
    RS_log, np.zeros(d + 1), maxfun=100000, full_output=True)

print(w_LR)      # minimiser found by the search
print(RS_min)    # empirical risk at w_LR
print(n_iter)    # number of iterations performed
print(funcalls)  # number of function evaluations used
print(warnflag)  # 0: converged, 1: maxfun reached, 2: maxiter reached

In [None]:
# Training error of the logistic-regression hypothesis: a sample counts as
# misclassified when its margin y * (<w, x> + b) is negative.
Err_Train = (Y_train * (X_train.dot(w_LR[:d]) + w_LR[-1]) < 0).mean()
print("{:.1f} percent of the training data is misclassified.".format(100 * Err_Train))

ANSWER: If the sample were linearly separable, logistic regression would find a separating hypothesis. Since some of the training data is misclassified, this is not the case.

In [None]:
# Test error of the logistic-regression hypothesis: a sample counts as
# misclassified when its margin y * (<w, x> + b) is negative.
Err_Test = (Y_test * (X_test.dot(w_LR[:d]) + w_LR[-1]) < 0).mean()
print("{:.1f} percent of the test data is misclassified.".format(100 * Err_Test))

#### ANSWER:

In [None]:
# The test error serves as the estimate of the expected risk of h_S.
print("So we estimate the expected risk of h_S to be {:.1f} percent.".format(100 * Err_Test))

### d) Soft-margin SVM

Choice of lambda

In [None]:
# Regularisation weight: both terms equally weighted
lam = 1 / m
# Turn the training labels into a column vector with one row per sample.
# reshape(-1, 1) is idempotent, so re-running this cell does not stack a
# further axis onto Y_train the way Y_train[:, None] would.
Y_train = Y_train.reshape(-1, 1)

In [None]:
# Define loss function
def hinge(w, x, y):
    """Return the per-sample hinge loss ``max(0, 1 - y_i * (<x_i, w[:-1]> + w[-1]))``.

    Parameters
    ----------
    w : array_like
        Parameter vector with the weights first and the bias in the last
        position. It is flattened before use.
    x : ndarray
        Feature matrix with one sample per row.
    y : array_like
        Labels, one per row of ``x``; a 1-D array or a column vector.

    Returns
    -------
    ndarray
        Column vector of shape ``(n_samples, 1)`` holding the hinge losses.
    """
    w = np.ravel(w)
    # Flatten the labels: multiplying a column of labels with the 1-D score
    # vector would broadcast to an (n, n) matrix instead of pairing each label
    # with the score of its own sample.
    labels = np.ravel(y)
    scores = np.dot(x, w[:-1]) + w[-1]
    return np.maximum(0.0, 1.0 - labels * scores)[:, None]

In [None]:
# Determine the solution
def fun(w):
    """Soft-margin SVM objective for the parameter vector ``w``.

    Adds the penalty ``lam * ||w[0:d]||^2`` on the weights (the bias in the
    last position is not penalised) to the mean hinge loss on the training
    data.
    """
    penalty = lam * np.linalg.norm(w[0:d])**2
    mean_loss = np.mean(hinge(w, X_train, Y_train))
    return penalty + mean_loss

# Minimise the SVM objective from a random starting point. The start vector
# has to be one-dimensional: current SciPy releases reject an x0 with more
# than one dimension.
result = opt.minimize(fun, np.random.randn(d + 1), options={'disp': True})

w_SVM = result.x     # minimiser: d weights followed by the bias
RS_min = result.fun  # objective value at w_SVM

print(w_SVM)
print(RS_min)

In [None]:
# Determine the misclassified training data via constraint violation.
# Y_train was turned into a column vector before the SVM fit; flatten it here
# so that each label is multiplied with the score of its own sample instead of
# broadcasting to an (m, m) matrix of all label/score pairs.
Err_Train = np.mean(np.ravel(Y_train) * (np.dot(X_train, w_SVM[0:d]) + w_SVM[-1]) < 0)
print("{:.1f} percent of the training data is misclassified.".format(Err_Train * 100))

In [None]:
# Test error of the SVM hypothesis: a sample counts as misclassified when its
# margin y * (<w, x> + b) is negative.
Err_Test = (Y_test * (X_test.dot(w_SVM[:d]) + w_SVM[-1]) < 0).mean()
print("{:.1f} percent of the test data is misclassified.".format(100 * Err_Test))

In [None]:
# The test error serves as the estimate of the expected risk of h_S.
print("So we estimate the expected risk of h_S to be {:.1f} percent.".format(100 * Err_Test))