In [16]:
import pickle
import numpy as np
import plotly.express as ex
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
import json

__author__="Nuo Chen"

# MLP Model 
---
A basic MLP network:

**Input**:

**Learning rule**:
- Delta rule
- Backpropagation (generalised delta rule)

**Output**:


    

In [46]:
class Model:
    def __init__(self, nodes):
        """
        Arg:
            layers - [in, n1, n2, ..., nj, out] an array of numbers of nodes in each layer.
        Returns:
            a fully-connected network with default weights and biases sampled from N(0, 1)
        """
        self.n_layers = len(nodes)
        self.NODES = nodes
        self.init_param()
    def init_param(self, useBias=True):
        """
        Initialize the weights and biases with samples from normal distributions
        W = [shape(n1,n2), shape(n2,n3), ..., shape(nj,nj+1)], where nj = number of nodes in the layer
        B = [shape(n1,1), shape(n2,1), ..., shape(nj,1)]
        """
        self.useBias = useBias

        self.W = [np.random.normal(0,1,(i,j)) for i,j in zip(self.NODES[:-1], self.NODES[1:])]
        if useBias:
            self.B = [np.random.normal(0,1,(i,1)) for i in self.NODES[1:]]
        else:
            self.B = [np.zeros(i,1) for i in self.NODES[1:]]
            
    def init_training_param(self, n, batch_size, epochs, eta, lmbda, useMom=False):
        """
        Initialize the hyper-parameters
        """
        self.n_samples = n
        self.BATCH_SIZE = batch_size
        self.EPOCHS = epochs
        self.BATCHES = int(n/batch_size)
        self.LAMBDA = lmbda
        self.ETA = eta
        self.useMom = useMom
        if (useMom):
            self.alpha = .9
            self.theta = 0
            self.omega = 0

    def normalize(self, x):
        """
        Normalizes the inputs
        """
        mean = np.mean(x, axis=1, keepdims=True)
        std = np.std(x, axis=1, keepdims=True)
        x = (x-mean) / std
        return x

    @staticmethod
    def softmax():
        return lambda x: np.exp(x-np.max(x,axis=0)) / np.sum(np.exp(x-np.max(x,axis=0)), axis=0)
    @staticmethod
    def relu():
        return lambda x: np.maximum(0,x)
    @staticmethod
    def delta_rule():
        return lambda x,w,t,eta: -eta*(w@x - t)@x.T    
    @staticmethod
    def sigmoid():
        return lambda x: 2 / (1+np.exp(-x)) - 1
    
    def cross_entropy(self, p, y):
        """
        Returns the cross-entropy cost of the prediciton
        """
        p[p==0] = 1e-7
        cost = 1/y.shape[1] * -np.sum(y*np.log(p))
        w_sum = [w**2 for w in self.W]
        s = 0
        for w in w_sum: 
            s+= np.sum(w)
        cost += self.LAMBDA * s
        return cost
    def feedforward(self, activations, act_fn, out_fn):
        """
        s1 = w1 @ x + b1
        h1 = act_fn(s1)
        ....
        sn = wn @ h(n-1) + bn
        return out_fn(sn)
        """
        a = activations[0]
        for i in range(self.n_layers-1):
            s = self.W[i].T @ a + self.B[i]
            a = act_fn(a)
            activations.append(a)

        return out_fn(a)
    def backPropagation(self, y, p, activations):
        """
        Back propagate the network and calculate the gradients
        """
        dw = [np.zeros(w.shape) for w in self.W]
        db = [np.zeros(b.shape) for b in self.B]

        g = -(y - p)
        for i in range(len(self.W)-1, -1, -1):
            dw[i] = g @ activations[i].T * 1/self.BATCH_SIZE + 2 * self.LAMBDA * self.W[i].T
            db[i] =  (np.sum(g, axis=1) * 1/self.BATCH_SIZE).reshape(self.B[i].shape)
            g = self.W[i] @ g
            g[np.where(activations[i]<=0)] = 0
        
        return (dw, db)
    
    def backPass(self, labels, predictions, activations):
        dw = [np.zeros(w.shape) for w in self.W]
        db = [np.zeros(b.shape) for b in self.B]

        g = (p-y) * (1+p)*(1-p) * 0.5
        for i in range(len(self.W)-1, -1, -1):
            dw[i] = g
            db[i] =  (np.sum(g, axis=1) * 1/self.BATCH_SIZE).reshape(self.B[i].shape)
            g = g * ( (1+activations[i])*(1-activations[i]) * 0.5) * self.W[i].T

        return (dw, db)
            


    def accuracy(self, p, y):
        """
        Compute the accuracy of the predictions
        """
        predictions = np.argmax(p, axis=0)
        y = np.argmax(y, axis=0)
        acc = predictions.T[predictions == y].shape[0] / p.shape[1]
        return acc
    
    def update_batch(self, x, y):
        """
        For each batch: 
            Pass the input into the network and compute the predictions.
            Back propagate through the network to compute the gradients using the stored act>
            Update the weights and biases using the gradients
        """
        activations = [x]
        p = self.feedforward(activations, Model.relu(), Model.softmax())
        dw, db = self.backPropagation(y, p, activations)

        if (not self.useMom):
            for i in range(self.n_layers-1):
                self.W[i] = self.W[i] - self.ETA * dw[i].T
                if (self.useBias):
                    self.B[i] = self.B[i] - self.ETA * db[i]
        else:
            for i in range(self.n_layers-1):
                self.theta = self.alpha * self.theta - (1-self.alpha) * dw[i]
                self.omega = self.alpha * self.omega - (1-self.alpha) * db[i]
                self.W[i] = self.W[i] + self.ETA * self.theta
                if (self.useBias):
                    self.B[i] = self.B[i] + self.ETA * self.omega

    def SGD(self, features, targets, test_size, verbose=False):
        """
        Stochastic gradient descend method
        Trains the network a given number of epochs or cycles
        Return:
            Training cost and validation cost
            Training accuracy and validation accuracy
        """

        train_features, test_features, train_labels, test_labels = train_test_split(features, targets, test_size = test_size, random_state = 2020)

        training_cost = []
        validation_cost = []
        training_accuracy = []
        validation_accuracy = []
        for t in range(self.EPOCHS):
            # Shuffles the order of samples 
            idx = np.random.permutation(self.n_samples)
            for j in range(1, self.BATCHES):
                start = (j-1) * self.BATCH_SIZE
                end = j * self.BATCH_SIZE
                indices = idx[start:end]
                x_batch = train_features[:, indices]            
                y_batch = train_labels[:, indices]    
                self.update_batch(x_batch, y_batch)

                    # Check cost and accuracy 10 times per cycle 

            p_t = self.feedforward([train_features], Model.relu(), Model.softmax())
            p_v = self.feedforward([test_features],Model.relu(), Model.softmax())
            training_cost.append(self.cross_entropy(p_t, train_labels))
            validation_cost.append(self.cross_entropy(p_v, test_labels))
            training_accuracy.append(self.accuracy(p_t, train_labels))
            validation_accuracy.append(self.accuracy(p_v, test_labels))

            if (verbose):
                print("Epoch #{}--------------------------------------".format(t))
                print("Training Cost: {:.6f}".format(training_cost[-1]))
                print("Validation Cost: {:.6f}".format(validation_cost[-1]))
                print("Training Accuracy = {:.3f}".format(training_accuracy[-1]))
                print("Validation Accuracy = {:.3f}".format(validation_accuracy[-1]))
                print("-"*50)

        return (training_cost, validation_cost, training_accuracy, validation_accuracy)

    def save(self, filename):
        """
        Save the model to the file 'filename`.
        """
        data = {"Nodes": self.NODES,
                "W": [w.tolist() for w in self.W],
                "B": [b.tolist() for b in self.B]}
        with open(filename, "w") as f:
            json.dump(data, f)

    def load(self, filename):
        """
        Load the model
        """
        with open(filename, "r") as f:
            data = json.load(f)
        
        self.NODES = data["Nodes"]
        self.W = [np.array(w) for w in data["W"]]
        self.B = [np.array(b) for b in data["B"]]



# Two-layer MLP according to lab 1 instruction


In [9]:
class two_layer_model():
    def __init__(self, N):
        self.N = N
    def init_param(self, eta):
        self.W = np.random.uniform(0,1,(self.N,1))
        self.V = np.random.uniform(0,1,(self.N,1))
        self.dw = np.zeros((self.N,1))
        self.dv = np.zeros((self.N,1))
        self.ETA = eta

    def act_fn(self, x):
        return 2 / (1+np.exp(-x)) - 1

    def forward(self, x):
        x = np.concatenate(x, np.ones(1, x.shape[1]), axis=0)
        h_in = self.W * x
        h_out = act_fn(h_in)
        o_in = self.V * h_out
        o_out = act_fn(o_in)
        return (h_out, o_out)

    def backpass(self, x, y, h_out, o_out):
        delta_o = (o_out - y) * ((1+o_out)*(1-o_out)) * .5
        delta_h = (self.V.T * delta_o) * ((1+h_out)*(1-h_out)) * .5
        delta_h = delta_h[1:self.N, :]
        return delta_h, delta_o

    def update(self, x, h_out, delta_h, delta_o, alpha=.9):
        self.dw = (self.dw * alpha) - (delta_h * x.T) * (1-alpha)
        self.dv = (self.dv * alpha) - (delta-o * h_out.T) * (1-alpha)
        self.W = self.W + self.dw * self.ETA
        self.V = self.V + self.dv * self.ETA


In [66]:
def delta_rule(x,y,w,eta):
    return -eta * (w*x-y)*x


# Generate data

In [11]:
def linearly_separable(n, m1,m2, sigma1, sigma2):
    classA = np.random.normal(m1, sigma1, size=[n,2])
    classB = np.random.normal(m2, sigma2, size=[n,2])
    return classA, classB


## 3.1.1 - 3.1.2



In [31]:
N = 100
(A, B) = linearly_separable(N, [-3,0], [3,0], 1, 1)

fig = go.Figure()
fig.add_trace(go.Scatter(x=A[:,0], y=A[:,1], mode="markers"))
fig.add_trace(go.Scatter(x=B[:,0], y=B[:,1], mode="markers"))
fig.show()

In [67]:
features = np.concatenate([A,B],axis=0)
targets = np.concatenate([-np.ones((A.shape[0],1)), np.ones((B.shape[0],1))], axis=0)

# model1 = Model([N,2,1])
# model1.init_param(useBias=True)
# model1.init_training_param(N, 10, 3, .01, 0)
# t_cost, v_cost, t_acc, v_acc = model1.SGD(features, targets, .1, True)

w = np.random.uniform(0,1,(1,2))
print(w[:10])
for i in range(3):
    w += delta_rule(features, targets, w, .01)
print(w[:10] )

[[0.94567952 0.60252332]]


ValueError: non-broadcastable output operand with shape (1,2) doesn't match the broadcast shape (200,2)

[]
