### Get results

In [1]:
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from sklearn.model_selection import train_test_split
import itertools
import matplotlib.pyplot as plt
import time

np.random.seed(42)

df = pd.read_csv('DATA.csv')

train, test = train_test_split(df, test_size=0.255, random_state=1939671)

X = np.array(train[['x1', 'x2']])
y = np.array(train['y'])

X_test = np.array(test[['x1', 'x2']])
y_test = np.array(test['y'])


def tanh(s, sigma):
    prod = 2 * sigma * s
    return (np.exp(prod) - 1) / (np.exp(prod) + 1)


def feedforward(X, W, b, v, sigma):
    linear_layer = (np.dot(X, W) + b)
    activation = tanh(linear_layer, sigma)
    pred = np.dot(activation, v)

    return pred

def backpropagation(x0, funcArgs):
    
    X = funcArgs[0]
    y = funcArgs[1]
    sigma = funcArgs[2]
    N = funcArgs[3]
    rho = funcArgs[4]
    P = len(y)
    
    W = x0[:int(X.shape[1] * N)].reshape((X.shape[1], N))
    b = x0[int(X.shape[1] * N):int(X.shape[1] * N + N)]
    v = x0[int(X.shape[1] * N + N):]

    linear_layer = (np.dot(X, W) + b)
    a_2 = tanh(linear_layer, sigma)
    dJdf = (1 / P) * (np.dot(a_2, v) - y)
    dtanh = 1 - tanh(linear_layer, sigma) ** 2

    dW1_1 = np.tensordot(dJdf, np.transpose(v), axes=0)
    dW1_2 = dW1_1 * dtanh

    dv = np.dot(dJdf, a_2) + rho * v
    db = np.sum(dW1_2, axis=0) + rho * b
    dW = np.tensordot(np.transpose(X), dW1_2, axes=1) + rho * W

    return np.concatenate((dW, db, dv), axis=None)

def loss(x0, funcArgs, test=False):
    X = funcArgs[0]
    y = funcArgs[1]
    sigma = funcArgs[2]
    N = funcArgs[3]
    rho = funcArgs[4]

    W = x0[:int(X.shape[1] * N)].reshape((X.shape[1], N))
    b = x0[int(X.shape[1] * N):int(X.shape[1] * N + N)]
    v = x0[int(X.shape[1] * N + N):]

    P = len(y)
    norm = np.linalg.norm(x0)
    pred = feedforward(X, W, b, v, sigma)
    if test:
        res = ((np.sum((pred - y) ** 2)) * P ** (-1)) * 0.5
    else:
        res = ((np.sum((pred - y) ** 2)) * P ** (-1) + rho * norm**2) * 0.5

    return res


def loss_test(X, y, sigma, W, b, v):
    P = len(y)
    pred = feedforward(X, W, b, v, sigma)
    res = ((np.sum((pred - y) ** 2)) * P ** (-1)) * 0.5

    return res


def feedforwardplot(x1, x2, W, b, v, sigma):
    X = np.array([x1, x2])
    linear_layer = (np.dot(X, W) + b)
    activation = tanh(linear_layer, sigma)
    pred = np.dot(activation, v)

    return pred


def train(X, y, sigma, N, rho, W, b, v, max_iter=1000,
          tol=1e-5, method='CG', func=loss, disp=False):
          
    x0 = np.concatenate((W, b, v), axis=None)
    funcArgs = [X, y, sigma, N, rho]
    
    res = minimize(func,
                   x0,
                   args=funcArgs, 
                   method=method, 
                   tol=tol,
                   jac=backpropagation,
                   options={'maxiter':max_iter, 
                            'disp': disp})  
    
    return res
    

def plotting(W, b, v, sigma):
    fig = plt.figure(figsize=(12, 8))
    ax = plt.axes(projection='3d')
    # create the grid
    x = np.linspace(-3, 3, 50)
    y = np.linspace(-2, 2, 50)
    X_plot, Y_plot = np.meshgrid(x, y)

    Z = []
    for x1 in x:
        z = []
        for x2 in y:
            z.append(feedforwardplot(x1, x2, W, b, v, sigma))
        Z.append(z)
    Z = np.array(Z)

    ax.plot_surface(X_plot, Y_plot, Z, rstride=1, cstride=1, cmap='viridis', edgecolor='none')

    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_zlabel('z')
    ax.set_title('F(x) learnt from MLP BackPropagation')
    plt.show()

sigma = 1
N = 70
rho = 1e-5
method = 'CG'

W = np.random.normal(size=(X.shape[1], N))
b = np.random.normal(size=N)
v = np.random.normal(size=N)

x0 = np.concatenate((W, b, v), axis=None)
funcArgs = [X, y, sigma, N, rho]

print('===================')
print('Sigma:', sigma)
print('N:', N)
print('Rho:', rho)

loss_start = loss(x0, funcArgs)
grad_norm_start = np.linalg.norm(backpropagation(x0, 
                            funcArgs=funcArgs))

start = time.time()
res = train(X, y, sigma=sigma, 
            N=N, rho=rho, 
            W=W, b=b, v=v,
            max_iter=5000, tol=1e-6, 
            method=method, func=loss,
            disp=True)
stop = time.time()

loss_end = res.fun

W=res.x[:int(X.shape[1] * N)].reshape((X.shape[1], N)),
b=res.x[int(X.shape[1] * N):int(X.shape[1] * N + N)],
v=res.x[int(X.shape[1] * N + N):]

funcArgs_test = [X_test, y_test, sigma, N, rho]

loss(x0, funcArgs)

val_loss = loss(res.x, funcArgs_test, test=True)

grad_norm_end = np.linalg.norm(backpropagation(np.concatenate((W, b, v), axis=None), 
                            funcArgs=funcArgs))

train_loss = loss(res.x, funcArgs, test=True)

best_loss_start = loss_start
best_loss_end = loss_end
N_best = N
sigma_best = sigma
rho_best = rho
convergence = res.success
method_best = method

print('N')
print(N_best)
print('')
print('sigma')
print(sigma_best)
print('')
print('rho')
print(rho_best)
print('')
print('W')
print(W)
print('')
print('b')
print(b)
print('')
print('v')
print(v)
print('')
print('Convergence?')
print(convergence)
print('')
print('Best Method?')
print(method_best)
print('')
print('Objective Function Start')
print(best_loss_start)
print('')
print('Objective Function End')
print(best_loss_end)
print('')
print('Gradient Norm Start')
print(grad_norm_start)
print('')
print('Gradient Norm End')
print(grad_norm_end)
print('')
print('Computation time')
print(round(stop-start, 2))
print('')
print('Final Train Error')
print(train_loss)
print('')
print('Final Test Error')
print(val_loss)


Sigma: 1
N: 70
Rho: 1e-05
         Current function value: 0.001075
         Iterations: 5000
         Function evaluations: 8371
         Gradient evaluations: 8371
N
70

sigma
1

rho
1e-05

W
(array([[ 0.12499954,  0.24585235,  0.22118693,  0.74194332, -0.09482024,
         0.21466581,  1.00631939,  0.39847833, -0.15230157,  0.40491002,
        -0.84144332, -0.56935407,  0.0051245 , -1.16500378, -1.36063927,
        -0.04643047, -0.61831262,  0.02902469, -1.42003251, -1.03031277,
         0.76500671, -0.10400655, -0.02458852, -1.15920794, -1.08348087,
         0.3317156 , -1.37170094,  0.98208925, -0.51705019,  0.05845514,
        -0.29731198,  1.39011614,  0.18801272, -0.74983759,  1.22051389,
        -0.937588  , -0.10540283, -1.65058822, -1.77238822,  0.17214847,
         0.7059292 ,  0.07561034,  0.08839814, -0.31467994, -0.53822852,
        -0.65418666, -0.65015316,  0.53474061,  0.06240729, -1.68101923,
         0.22058677, -0.14155731, -0.33286267,  0.38213244,  0.77625546,
  

In [3]:
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from sklearn.model_selection import train_test_split
import itertools
import matplotlib.pyplot as plt
import time

df = pd.read_csv('DATA.csv')

train, test = train_test_split(df, test_size=0.255, random_state=1939671)

X = np.array(train[['x1', 'x2']])
y = np.array(train['y'])

X_test = np.array(test[['x1', 'x2']])
y_test = np.array(test['y'])


def rbf(X, c, sigma):
    """
    This function is only applied for a single observation
    x belongs to R^2
    c belongs to R^{2, 10}
    return R^10, 186
    """
    minus_matrix = []
    for i in range(len(c.T)):
        minus_matrix.append(X - c.T[i])
    minus_matrix = np.array(minus_matrix)

    return np.exp(-(np.linalg.norm(minus_matrix, axis=2)/sigma)**2)


def feedforward(X, c, v, sigma):
    """
    This function is only applied for a single observation
    x belongs to R^2
    c belongs to R^{2, 10}
    v belongs to R^N
    return float
    """
    
    pred = np.dot(rbf(X, c, sigma).T, v)
    return pred


def backpropagation(x0, funcArgs):

    X = funcArgs[0]
    y = funcArgs[1]
    sigma = funcArgs[2]
    N = funcArgs[3]
    rho = funcArgs[4]
    P = len(y)
    
    c = x0[:int(X.shape[1]*N)].reshape((X.shape[1],N))
    v = x0[int(X.shape[1]*N):]
    
    z_1 = rbf(X, c, sigma).T
    dJdf = (1/P)*(np.dot(z_1, v) - y)

    minus_matrix = []
    for i in range(len(c.T)):
        minus_matrix.append(X - c.T[i])
    minus_matrix = np.array(minus_matrix)

    dW1_1 = np.dot(dJdf.reshape((P, 1)), v.reshape((1,N)))
    dzdc = ((2*z_1)/(sigma**2))*minus_matrix.T

    dv = np.dot(dJdf, z_1) + rho*v
    dc = np.sum(dzdc*dW1_1, axis=1) + rho*c

    return np.concatenate((dc, dv), axis=None)


def loss(x0, funcArgs, test=False):
    
    X = funcArgs[0]
    y = funcArgs[1]
    sigma = funcArgs[2]
    N = funcArgs[3]
    rho = funcArgs[4]
    
    c = x0[:int(X.shape[1]*N)].reshape((X.shape[1],N))
    v = x0[int(X.shape[1]*N):]

    P = len(y)
    pred = feedforward(X, c, v, sigma)
    norm = np.linalg.norm(x0)
    if test:
        res = ((np.sum((pred - y) ** 2)) * P ** (-1)) * 0.5
    else:
        res = ((np.sum((pred - y) ** 2)) * P ** (-1) + rho * norm ** 2) * 0.5
    
    return res


def feedforwardplot(x_i_1, x_i_2, c, v, sigma):
    x_i = np.array([x_i_1, x_i_2])
    pred = np.dot(np.exp(-(np.linalg.norm((x_i - c.T), axis=1)/sigma)**2), v)
    return pred


def train(X, y, sigma, N, rho, c_init, 
          v_init, max_iter=1000, tol=1e-5, method='CG', func=loss, disp=False):
    
    x0 = np.concatenate((c_init, v_init), axis=None)
    funcArgs = [X, y, sigma, N, rho]

    res = minimize(func,
                   x0,
                   args=funcArgs, 
                   method=method, 
                   tol=tol,
                   jac=backpropagation,
                   options={'maxiter':max_iter, 
                            'disp': disp})    
    
    return res

sigma = 1
N = 70
rho = 1e-5
method = 'CG'

c = np.random.normal(size=(X.shape[1], N))
v = np.random.normal(size=N)

x0 = np.concatenate((c, v), axis=None)
funcArgs = [X, y, sigma, N, rho]

print('===================')
print('Sigma:', sigma)
print('N:', N)
print('Rho:', rho)

loss_start = loss(x0, funcArgs)
grad_norm_start = np.linalg.norm(backpropagation(x0, 
                                 funcArgs=funcArgs))

start = time.time()
res = train(X, y, sigma=sigma, 
            N=N, rho=rho, 
            c_init=c, v_init=v,
            max_iter=5000, tol=1e-6, 
            method=method, func=loss,
            disp=True)
stop = time.time()

loss_end = res.fun

funcArgs_test = [X_test, y_test, sigma, N, rho]

loss(x0, funcArgs)

val_loss = loss(res.x, funcArgs_test, test=True)

grad_norm_end = np.linalg.norm(backpropagation(res.x, 
                               funcArgs=funcArgs))

train_loss = loss(res.x, funcArgs, test=True)

best_loss_start = loss_start
best_loss_end = loss_end
N_best = N
sigma_best = sigma
rho_best = rho
convergence = res.success
method_best = method

print('N')
print(N_best)
print('')
print('sigma')
print(sigma_best)
print('')
print('rho')
print(rho_best)
print('')
print('Convergence?')
print(convergence)
print('')
print('Best Method?')
print(method_best)
print('')
print('Objective Function Start')
print(best_loss_start)
print('')
print('Objective Function End')
print(best_loss_end)
print('')
print('Gradient Norm Start')
print(grad_norm_start)
print('')
print('Gradient Norm End')
print(grad_norm_end)
print('')
print('Computation time')
print(round(stop-start, 2))
print('')
print('Final Train Error')
print(train_loss)
print('')
print('Final Test Error')
print(val_loss)


Sigma: 1
N: 70
Rho: 1e-05
         Current function value: 0.001372
         Iterations: 5000
         Function evaluations: 8286
         Gradient evaluations: 8286
N
70

sigma
1

rho
1e-05

Convergence?
False

Best Method?
CG

Objective Function Start
3.9203789914884295

Objective Function End
0.0013722166771667965

Gradient Norm Start
3.65625481076387

Gradient Norm End
2.123438703032642e-05

Computation time
36.22

Final Train Error
0.00035705536688818216

Final Test Error
0.0020141031986431537
