### Question 1_1

In [1]:
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from sklearn.model_selection import train_test_split
import itertools
import matplotlib.pyplot as plt
import time

# Load the full dataset; the columns 'x1', 'x2' and 'y' are read from it below.
df = pd.read_csv('DATA.csv')

# Hold out 25.5% of the rows; the fixed random_state keeps the split reproducible.
train, test = train_test_split(df, test_size=0.255, random_state=1939671)

# Training inputs (two features per row) and targets.
X = np.array(train[['x1', 'x2']])
y = np.array(train['y'])

# Held-out inputs and targets. These must be taken from `test`, not `train`:
# otherwise any "test" metric computed from them is just the training metric.
X_test = np.array(test[['x1', 'x2']])
y_test = np.array(test['y'])

In [2]:
import copy

def J(f, x, dx=1e-8):
    """Approximate the gradient of a scalar function by forward differences.

    Parameters
    ----------
    f : callable
        Function taking an array shaped like ``x`` and returning a scalar.
    x : array_like
        Point at which the gradient is evaluated. Arrays of any number of
        dimensions are accepted (the 2-D case behaves as before).
    dx : float, optional
        Step added to one entry at a time.

    Returns
    -------
    numpy.ndarray
        Array with the shape of ``x`` whose entry ``idx`` is
        ``(f(x + dx * e_idx) - f(x)) / dx``.

    Notes
    -----
    ``x`` itself is never modified.
    """
    # Work on a float64 copy: adding dx to an integer (or float32) array would
    # be rounded away and every difference quotient would come out as zero.
    x_plus = np.array(x, dtype=float)
    base = f(x_plus)
    jac = np.zeros(x_plus.shape)
    for idx in np.ndindex(*x_plus.shape):
        saved = x_plus[idx]
        x_plus[idx] = saved + dx
        jac[idx] = (f(x_plus) - base) / dx
        # Undo only the perturbed entry instead of re-copying the whole array.
        x_plus[idx] = saved
    return jac

In [3]:
def J_(f, x, dx=1e-8):
    """Forward-difference gradient of a scalar function of a vector.

    Parameters
    ----------
    f : callable
        Function taking an array shaped like ``x`` and returning a scalar.
    x : array_like
        Point at which the gradient is evaluated. The loop runs over the
        first axis only, so for a 1-D ``x`` each entry is perturbed in turn;
        for higher-dimensional input the whole slice ``x[i]`` is shifted by
        ``dx`` at once (use ``J`` for element-wise gradients of 2-D arrays).
    dx : float, optional
        Step added to ``x[i]``.

    Returns
    -------
    numpy.ndarray
        Array with the shape of ``x``; entry ``i`` holds
        ``(f(x with x[i] + dx) - f(x)) / dx``.

    Notes
    -----
    ``x`` itself is never modified.
    """
    # Float64 working copy: with an integer (or float32) array the +dx
    # perturbation would be rounded away and the gradient would be all zeros.
    x_plus = np.array(x, dtype=float)
    base = f(x_plus)
    jac = np.zeros(x_plus.shape)
    for i in range(x_plus.shape[0]):
        saved = np.copy(x_plus[i])  # copy, because x_plus[i] is a view when x is N-D
        x_plus[i] = saved + dx
        jac[i] = (f(x_plus) - base) / dx
        # Restore just the perturbed slice rather than copying the full array.
        x_plus[i] = saved
    return jac

In [4]:
# Seed NumPy's global generator so the random initial weights, and with them
# every gradient printed in the following cells, are the same on each
# Restart & Run All. The value mirrors the random_state of the data split.
np.random.seed(1939671)

N = 10       # number of hidden units
P = len(y)   # number of training samples
rho = 0      # regularization weight
sigma = 1    # spread parameter passed to the tanh activation

W = np.random.randn(X.shape[1], N)  # input-to-hidden weights, one column per hidden unit
b = np.random.randn(N)              # hidden-layer biases
v = np.random.randn(N)              # hidden-to-output weights

In [5]:
def tanh(s, sigma):
    """Hyperbolic-tangent activation with spread ``sigma``.

    Mathematically this is ``(exp(2*sigma*s) - 1) / (exp(2*sigma*s) + 1)``,
    i.e. ``tanh(sigma * s)``.

    Parameters
    ----------
    s : array_like or float
        Pre-activation value(s).
    sigma : float
        Spread (slope) parameter multiplying ``s``.

    Returns
    -------
    numpy.ndarray or numpy scalar
        Activation with the shape of ``s``, with values in ``[-1, 1]``.
    """
    # Delegate to np.tanh: evaluating the exponential form directly overflows
    # for large positive sigma*s and yields inf/inf = nan instead of 1.
    return np.tanh(np.multiply(sigma, s))

def feedforward(X, W, b, v, sigma):
    """Forward pass of the single-hidden-layer tanh network.

    The inputs go through the affine map ``np.dot(X, W) + b``, then through
    ``tanh`` with spread ``sigma``; the hidden outputs are finally combined
    with the weights ``v``.

    Returns
    -------
    The result of ``np.dot(hidden, v)``.
    """
    hidden = tanh(np.dot(X, W) + b, sigma)
    return np.dot(hidden, v)

In [6]:
def _mlp_regularized_loss(X, y, W, b, v, sigma, rho):
    """Regularized half mean-squared error of the tanh network.

    Computes ``0.5 * (mean((feedforward(X, W, b, v, sigma) - y)**2)
    + rho * ||omega||**2)`` where ``omega`` stacks ``W``, ``v`` and ``b``.

    The penalty uses the *squared* norm so that its gradient with respect to
    each parameter is ``rho * parameter``, which is the term the analytic
    gradients in this notebook add (``rho*v``, ``rho*b``, ``rho*W``).

    NOTE: ``feedforward`` is looked up when this function is called. A later
    cell rebinds that name to the RBF forward pass with a different signature,
    so the MLP losses must be evaluated before that cell runs.
    """
    P = len(y)
    omega = np.concatenate((W, v, b), axis=None)
    residual = feedforward(X, W, b, v, sigma) - y
    return 0.5 * (np.sum(residual ** 2) / P + rho * np.dot(omega, omega))


def loss_v(v, X=X, y=y, sigma=sigma, N=N, rho=rho, b=b, W=W):
    """Regularized loss as a function of the output weights ``v``.

    Every other argument defaults to the module-level value that existed when
    this cell was executed; re-run the cell after changing those globals.
    ``N`` is accepted for signature compatibility and is not used.
    """
    return _mlp_regularized_loss(X, y, W, b, v, sigma, rho)


def loss_b(b, X=X, y=y, sigma=sigma, N=N, rho=rho, W=W, v=v):
    """Regularized loss as a function of the hidden biases ``b``.

    Every other argument defaults to the module-level value that existed when
    this cell was executed; re-run the cell after changing those globals.
    ``N`` is accepted for signature compatibility and is not used.
    """
    return _mlp_regularized_loss(X, y, W, b, v, sigma, rho)


def loss_W(W, X=X, y=y, sigma=sigma, N=N, rho=rho, b=b, v=v):
    """Regularized loss as a function of the input weights ``W``.

    Every other argument defaults to the module-level value that existed when
    this cell was executed; re-run the cell after changing those globals.
    ``N`` is accepted for signature compatibility and is not used.
    """
    return _mlp_regularized_loss(X, y, W, b, v, sigma, rho)

In [7]:
def J(f, x, dx=1e-8):
    """Approximate the gradient of a scalar function by forward differences.

    NOTE: this cell re-defines ``J``, which is also defined in an earlier
    cell; the definition below is the one in effect from here on.

    Parameters
    ----------
    f : callable
        Function taking an array shaped like ``x`` and returning a scalar.
    x : array_like
        Point at which the gradient is evaluated. Arrays of any number of
        dimensions are accepted (the 2-D case behaves as before).
    dx : float, optional
        Step added to one entry at a time.

    Returns
    -------
    numpy.ndarray
        Array with the shape of ``x`` whose entry ``idx`` is
        ``(f(x + dx * e_idx) - f(x)) / dx``.

    Notes
    -----
    ``x`` itself is never modified.
    """
    # Float64 working copy: on an integer (or float32) array the +dx step
    # would be rounded away, making every difference quotient zero.
    x_plus = np.array(x, dtype=float)
    base = f(x_plus)
    jac = np.zeros(x_plus.shape)
    for idx in np.ndindex(*x_plus.shape):
        saved = x_plus[idx]
        x_plus[idx] = saved + dx
        jac[idx] = (f(x_plus) - base) / dx
        # Restore the single perturbed entry; no full-array copy is needed.
        x_plus[idx] = saved
    return jac

In [8]:
# Analytic gradients of the regularized loss, keyed by parameter name.
grads = {}

# Forward pass, keeping the intermediates needed for the backward pass.
linear_layer = (np.dot(X, W) + b)        # pre-activations, one row per sample
a_2 = tanh(linear_layer, sigma)          # hidden activations
dJdf = (1/P)*(np.dot(a_2, v) - y)        # derivative of the loss w.r.t. each prediction

# The activation is tanh(sigma*t), whose derivative w.r.t. t is
# sigma*(1 - tanh(sigma*t)**2). The factor sigma is required for the b and W
# gradients to be correct whenever sigma != 1. a_2 is reused instead of
# evaluating the activation a second time.
dtanh = sigma*(1 - a_2**2)

# Outer product: entry (p, j) is dJdf[p] * v[j].
dW1_1 = dJdf[:, np.newaxis]*v[np.newaxis, :]
dW1_2 = dW1_1*dtanh                      # loss derivative w.r.t. the pre-activations

grads['v'] = np.dot(dJdf, a_2) + rho*v
grads['b'] = np.sum(dW1_2, axis=0) + rho*b
grads['W'] = np.dot(np.transpose(X), dW1_2) + rho*W

In [9]:
# Shape check of the per-sample, per-hidden-unit derivative array built above.
dW1_2.shape

(186, 10)

In [10]:
# Forward-difference estimate of the loss gradient w.r.t. b; compare with grads['b'] in the next cell.
J_(loss_b, b).T

array([-0.80613596,  0.74883886, -0.86719467, -0.2842885 , -1.51427066,
        0.42305066,  0.10000303, -0.72764239,  0.10300871,  0.75171336])

In [11]:
# Analytic gradient w.r.t. b, to be compared with the finite-difference estimate in the previous cell.
grads['b'].T

array([-0.80613599,  0.74883879, -0.86719474, -0.28428863, -1.51427073,
        0.42305048,  0.10000305, -0.72764253,  0.1030087 ,  0.75171325])

In [12]:
# Forward-difference estimate of the loss gradient w.r.t. v; compare with grads['v'] in the next cell.
J_(loss_v, v).T

array([-0.42828932,  0.25427891,  2.04459676,  2.22529222,  1.23179626,
       -0.77665261, -1.93779748, -0.35375036,  0.4781664 ,  1.14949685])

In [13]:
# Analytic gradient w.r.t. v, to be compared with the finite-difference estimate in the previous cell.
grads['v'].T

array([-0.42828943,  0.25427877,  2.04459668,  2.22529217,  1.23179624,
       -0.77665272, -1.93779762, -0.35375037,  0.47816636,  1.14949678])

In [14]:
# Element-wise forward-difference estimate of the loss gradient w.r.t. W; compare with grads['W'] in the next cell.
J(loss_W, W).T

array([[ 0.38439536,  0.68766992],
       [-0.06041025, -0.61983849],
       [ 0.43542601,  0.09192327],
       [-0.01864349, -0.14470665],
       [ 0.39348524,  0.30274565],
       [-0.01730402, -0.22060931],
       [-0.02837943,  0.04287344],
       [ 0.01389466,  0.57564229],
       [ 0.01219274, -0.12404229],
       [-0.29863125, -0.33205154]])

In [15]:
# Analytic gradient w.r.t. W, to be compared with the finite-difference estimate in the previous cell.
grads['W'].T

array([[ 0.38439525,  0.68766982],
       [-0.06041032, -0.61983862],
       [ 0.43542586,  0.09192315],
       [-0.0186436 , -0.14470671],
       [ 0.39348512,  0.30274559],
       [-0.01730416, -0.2206094 ],
       [-0.02837951,  0.04287343],
       [ 0.01389457,  0.57564218],
       [ 0.01219269, -0.12404235],
       [-0.29863126, -0.33205161]])

### Question 1_2 

In [16]:
# Seed NumPy's global generator so the random centres and weights, and with
# them the gradients printed in the following cells, are the same on every
# run of this section. The value mirrors the random_state of the data split.
np.random.seed(1939671)

N = 10                               # number of RBF units
c = np.random.randn(X.shape[1], N)   # centres, one column per unit
v = np.random.randn(N)               # output weights
sigma = 1                            # spread of the Gaussian units
P = len(y)                           # number of training samples
rho = 0                              # regularization weight

In [17]:
def rbf(X, c, sigma):
    """Gaussian radial-basis activations for every (centre, sample) pair.

    Parameters
    ----------
    X : array_like, shape (P, d) or (d,)
        Input samples, one per row. A single observation given as a 1-D
        vector is treated as a batch of one.
    c : array_like, shape (d, N)
        Centres stored column-wise (one column per unit).
    sigma : float
        Spread of the Gaussian.

    Returns
    -------
    numpy.ndarray, shape (N, P)
        Entry ``[j, p]`` is ``exp(-||X[p] - c[:, j]||**2 / sigma**2)``.
    """
    samples = np.asarray(X, dtype=float)
    if samples.ndim == 1:
        # Promote a lone observation to a one-row batch so the axis-2
        # reduction below is valid (the loop-based version raised here).
        samples = samples[np.newaxis, :]
    centres = np.asarray(c, dtype=float).T  # (N, d): one centre per row

    # Broadcast to (N, P, d): the difference between each centre and each sample.
    diff = samples[np.newaxis, :, :] - centres[:, np.newaxis, :]
    sq_dist = np.sum(diff ** 2, axis=2)
    return np.exp(-sq_dist / sigma ** 2)

def feedforward(X, c, v, sigma):
    """Output of the RBF network for the inputs in ``X``.

    NOTE: this re-uses the name ``feedforward`` from the tanh-network section
    with a different parameter list; from this cell on the name refers to the
    RBF version.

    Parameters
    ----------
    X : inputs, forwarded unchanged to ``rbf``
    c : centres, forwarded unchanged to ``rbf``
    v : output weights, one per RBF unit
    sigma : spread, forwarded unchanged to ``rbf``

    Returns
    -------
    The dot product of the transposed ``rbf`` activations with ``v``.
    """
    activations = rbf(X, c, sigma)
    return np.dot(activations.T, v)
    
def _rbf_regularized_loss(X, y, c, v, sigma, rho):
    """Regularized half mean-squared error of the RBF network.

    Computes ``0.5 * (mean((feedforward(X, c, v, sigma) - y)**2)
    + rho * ||omega||**2)`` where ``omega`` stacks ``v`` and ``c``.

    The penalty uses the *squared* norm so that its gradient with respect to
    each parameter is ``rho * parameter``, which is the term the analytic
    gradients in this notebook add (``rho*v``, ``rho*c``).
    """
    P = len(y)
    omega = np.concatenate((v, c), axis=None)
    residual = feedforward(X, c, v, sigma) - y
    return 0.5 * (np.sum(residual ** 2) / P + rho * np.dot(omega, omega))


def loss_v(v, X=X, y=y, sigma=sigma, N=N, rho=rho, c=c):
    """Regularized RBF loss as a function of the output weights ``v``.

    Every other argument defaults to the module-level value that existed when
    this cell was executed; re-run the cell after changing those globals.
    ``N`` is accepted for signature compatibility and is not used.
    """
    return _rbf_regularized_loss(X, y, c, v, sigma, rho)


def loss_c(c, X=X, y=y, sigma=sigma, N=N, rho=rho, v=v):
    """Regularized RBF loss as a function of the centres ``c``.

    Every other argument defaults to the module-level value that existed when
    this cell was executed; re-run the cell after changing those globals.
    ``N`` is accepted for signature compatibility and is not used.
    """
    return _rbf_regularized_loss(X, y, c, v, sigma, rho)

In [18]:
# Analytic gradients of the RBF loss, keyed by parameter name.
grads = {}

# Activations arranged with one row per sample and one column per unit.
z_1 = rbf(X, c, sigma).T
# Derivative of the loss w.r.t. each prediction.
dJdf = (np.dot(z_1, v) - y)*(1/P)

# Differences between every sample and every centre, stacked per centre.
minus_matrix = np.array([X - centre for centre in c.T])

# Outer product: entry (p, j) is dJdf[p] * v[j].
dW1_1 = dJdf[:, np.newaxis]*v[np.newaxis, :]
# Derivative of each Gaussian activation w.r.t. its centre coordinates.
dzdc = np.transpose(minus_matrix)*((2*z_1)/(sigma**2))

grads['v'] = np.dot(dJdf, z_1) + rho*v
# Sum the per-sample contributions (axis 1) to get one gradient per centre coordinate.
grads['c'] = (dzdc*dW1_1).sum(axis=1) + rho*c

In [19]:
# Analytic gradient w.r.t. v; compare with the finite-difference estimate in the next cell.
grads['v'].T

array([ 0.19615518,  0.10415326, -0.13837911,  0.14623971, -0.05062377,
        0.23559342,  0.20378928, -0.16549224,  0.10545867,  0.17595784])

In [20]:
# Forward-difference estimate of the RBF loss gradient w.r.t. v, to be compared with grads['v'] in the previous cell.
J_(loss_v, v).T

array([ 0.19615516,  0.10415326, -0.13837913,  0.14623971, -0.05062377,
        0.23559343,  0.20378927, -0.16549224,  0.10545866,  0.17595785])

In [21]:
# Analytic gradient w.r.t. the centres c; compare with the finite-difference estimate in the next cell.
grads['c'].T

array([[ 0.03341052,  0.19978988],
       [-0.0038941 , -0.0527783 ],
       [ 0.01005668,  0.00831174],
       [ 0.06070191, -0.05972991],
       [-0.01080392, -0.04613853],
       [-0.08542633, -0.01771355],
       [-0.05303662,  0.01380043],
       [ 0.00360469, -0.04113533],
       [ 0.24430081,  0.47257123],
       [-0.0947091 ,  0.5037001 ]])

In [22]:
# Element-wise forward-difference estimate of the RBF loss gradient w.r.t. c, to be compared with grads['c'] in the previous cell.
J(loss_c, c).T

array([[ 0.03341052,  0.19978985],
       [-0.00389409, -0.05277829],
       [ 0.01005667,  0.00831173],
       [ 0.06070191, -0.05972993],
       [-0.01080394, -0.04613854],
       [-0.08542633, -0.01771354],
       [-0.05303664,  0.01380043],
       [ 0.00360469, -0.04113534],
       [ 0.2443008 ,  0.47257123],
       [-0.09470909,  0.5037001 ]])