In [58]:
import numpy as np
import glob, json
import ipdb
from builtins import range
from builtins import object
import os
import pickle as pickle

import numpy as np



def load_data(path,debug):
    if debug:
        print('Debug is ON!')
        load_key = 'gt_pose_align'
    else:
        load_key = 'gt_pose'
    jsons_train = {'squat':[],'reach':[],'lunge':[],'inline':[],
                   'hamstrings':[],'stretch':[],'deadbug':[],'pushup':[]
                  }
    jsons_val = {'squat':[],'reach':[],'lunge':[],'inline':[],
                 'hamstrings':[],'stretch':[],'deadbug':[],'pushup':[]
                }
    for (id,person) in enumerate(glob.glob(path+"/*/")):
        for move in glob.glob(person+"labels/*.json"):
            this = json.load(open(move,"r"))
            classid = move.split("/")[-1].split(".")[0]
            if id < 225:
                # NOTE: for bilateral movements, we grab the same example twice
                #  so then we don't have to worry about loss weighting
                #  due to class imbalance
                jsons_train[classid].append(np.array(this[load_key]['left']))
                jsons_train[classid].append(np.array(this[load_key]['right']))
            else:
                jsons_val[classid].append(np.array(this[load_key]['left']))
                jsons_val[classid].append(np.array(this[load_key]['right']))

    return jsons_train,jsons_val

def to_matrix(jsons):
    features = []
    labels = []
    sorted_key = sorted(jsons.keys())
    for counter, key in enumerate(sorted_key):
        for item in jsons[key]:
            features.append(item)
            labels.append(counter)
    features = np.array(features)
    labels = np.array(labels)
    return features, labels

def load_mds189(path,debug):
    js_train,js_val = load_data(path,debug)
    features_train, labels_train = to_matrix(js_train)
    feat_val, label_val = to_matrix(js_val)
    # random permute to shuffle the data
    np.random.seed(0)
    perm_train = np.random.permutation(features_train.shape[0])
    feat_train=features_train[perm_train, :]
    label_train=labels_train[perm_train]

    return feat_train, label_train, feat_val, label_val




class Solver(object):
    """
    """

    def __init__(self, model, data, **kwargs):
        """
        Construct a new Solver instance.

        Required arguments:
        - model: A model object conforming to the API described above
        - data: A dictionary of training and validation data containing:
          'X_train': Array, shape (N_train, d_1, ..., d_k) of training images
          'X_val': Array, shape (N_val, d_1, ..., d_k) of validation images
          'y_train': Array, shape (N_train,) of labels for training images
          'y_val': Array, shape (N_val,) of labels for validation images

        Optional arguments:
        - update_rule: A string giving the name of an update rule in optim.py.
          Default is 'sgd'.
        - optim_config: A dictionary containing hyperparameters that will be
          passed to the chosen update rule. Each update rule requires different
          hyperparameters (see optim.py) but all update rules require a
          'learning_rate' parameter so that should always be present.
        - lr_decay: A scalar for learning rate decay; after each epoch the
          learning rate is multiplied by this value.
        - batch_size: Size of minibatches used to compute loss and gradient
          during training.
        - num_epochs: The number of epochs to run for during training.
        - print_every: Integer; training losses will be printed every
          print_every iterations.
        - verbose: Boolean; if set to false then no output will be printed
          during training.
        - num_train_samples: Number of training samples used to check training
          accuracy; default is 1000; set to None to use entire training set.
        - num_val_samples: Number of validation samples to use to check val
          accuracy; default is None, which uses the entire validation set.
        - checkpoint_name: If not None, then save model checkpoints here every
          epoch.
        """
        self.model = model
        self.X_train = data['X_train']
        self.y_train = data['y_train']
        self.X_val = data['X_val']
        self.y_val = data['y_val']

        # Unpack keyword arguments
        self.update_rule = kwargs.pop('update_rule', 'sgd')
        self.optim_config = kwargs.pop('optim_config', {})
        self.lr_decay = kwargs.pop('lr_decay', 1.0)
        self.batch_size = kwargs.pop('batch_size', 100)
        self.num_epochs = kwargs.pop('num_epochs', 10)
        self.num_train_samples = kwargs.pop('num_train_samples', 1000)
        self.num_val_samples = kwargs.pop('num_val_samples', None)

        self.checkpoint_name = kwargs.pop('checkpoint_name', None)
        self.print_every = kwargs.pop('print_every', 10)
        self.verbose = kwargs.pop('verbose', True)

        # Throw an error if there are extra keyword arguments
        if len(kwargs) > 0:
            extra = ', '.join('"%s"' % k for k in list(kwargs.keys()))
            raise ValueError('Unrecognized arguments %s' % extra)

        # Make sure the update rule exists, then replace the string
        # name with the actual function
#         if not hasattr(optim, self.update_rule):
#             raise ValueError('Invalid update_rule "%s"' % self.update_rule)
        self.update_rule = self.__sgd
        self._reset()
        
    def __sgd(self, w, dw, config=None):

        if config is None: config = {}
        config.setdefault('learning_rate', 1e-2)

        w -= config['learning_rate'] * dw
        return w, config

        self._reset()


    def _reset(self):
        """
        Set up some book-keeping variables for optimization. Don't call this
        manually.
        """
        # Set up some variables for book-keeping
        self.epoch = 0
        self.best_val_acc = 0
        self.best_params = {}
        self.loss_history = []
        self.train_acc_history = []
        self.val_acc_history = []

        # Make a deep copy of the optim_config for each parameter
        self.optim_configs = {}
        for p in self.model.params:
            d = {k: v for k, v in self.optim_config.items()}
            self.optim_configs[p] = d


    def _step(self):
        """
        Make a single gradient update. This is called by train() and should not
        be called manually.
        """
        # Make a minibatch of training data
        num_train = self.X_train.shape[0]
        batch_mask = np.random.choice(num_train, self.batch_size)
        X_batch = self.X_train[batch_mask]
        y_batch = self.y_train[batch_mask]

        # Compute loss and gradient
        loss, grads = self.model.loss(X_batch, y_batch)
        self.loss_history.append(loss)

        # Perform a parameter update
        for p, w in self.model.params.items():
            dw = grads[p]
            config = self.optim_configs[p]
            next_w, next_config = self.update_rule(w, dw, config)
            self.model.params[p] = next_w
            self.optim_configs[p] = next_config


    def _save_checkpoint(self):
        if self.checkpoint_name is None: return
        checkpoint = {
          'model': self.model,
          'update_rule': self.update_rule,
          'lr_decay': self.lr_decay,
          'optim_config': self.optim_config,
          'batch_size': self.batch_size,
          'num_train_samples': self.num_train_samples,
          'num_val_samples': self.num_val_samples,
          'epoch': self.epoch,
          'loss_history': self.loss_history,
          'train_acc_history': self.train_acc_history,
          'val_acc_history': self.val_acc_history,
        }
        filename = '%s_epoch_%d.pkl' % (self.checkpoint_name, self.epoch)
        if self.verbose:
            print('Saving checkpoint to "%s"' % filename)
        with open(filename, 'wb') as f:
            pickle.dump(checkpoint, f)


    def check_accuracy(self, X, y, num_samples=None, batch_size=100):

        # Maybe subsample the data
        N = X.shape[0]
        if num_samples is not None and N > num_samples:
            mask = np.random.choice(N, num_samples)
            N = num_samples
            X = X[mask]
            y = y[mask]

        # Compute predictions in batches
        num_batches = N // batch_size
        if N % batch_size != 0:
            num_batches += 1
        y_pred = []
        for i in range(num_batches):
            start = i * batch_size
            end = (i + 1) * batch_size
            scores = self.model.loss(X[start:end])
            y_pred.append(np.argmax(scores, axis=1))
        y_pred = np.hstack(y_pred)
        acc = np.mean(y_pred == y)

        return acc


    def train(self):
        """
        Run optimization to train the model.
        """
        num_train = self.X_train.shape[0]
        iterations_per_epoch = max(num_train // self.batch_size, 1)
        num_iterations = self.num_epochs * iterations_per_epoch

        for t in range(num_iterations):
            self._step()

            # Maybe print training loss
#             if self.verbose and t % self.print_every == 0:
#                 print('(Iteration %d / %d) loss: %f' % (
#                        t + 1, num_iterations, self.loss_history[-1]))

            # At the end of every epoch, increment the epoch counter and decay
            # the learning rate.
            epoch_end = (t + 1) % iterations_per_epoch == 0
            if epoch_end:
                self.epoch += 1
                for k in self.optim_configs:
                    self.optim_configs[k]['learning_rate'] *= self.lr_decay

            # Check train and val accuracy on the first iteration, the last
            # iteration, and at the end of each epoch.
            first_it = (t == 0)
            last_it = (t == num_iterations - 1)
            if first_it or last_it or epoch_end:
                train_acc = self.check_accuracy(self.X_train, self.y_train,
                    num_samples=self.num_train_samples)
                val_acc = self.check_accuracy(self.X_val, self.y_val,
                    num_samples=self.num_val_samples)
                self.train_acc_history.append(train_acc)
                self.val_acc_history.append(val_acc)
                self._save_checkpoint()

                if self.verbose:
                    print('(Epoch %d / %d) train acc: %f; val_acc: %f' % (
                           self.epoch, self.num_epochs, train_acc, val_acc))

                # Keep track of the best model
                if val_acc > self.best_val_acc:
                    self.best_val_acc = val_acc
                    self.best_params = {}
                    for k, v in self.model.params.items():
                        self.best_params[k] = v.copy()

        # At the end of training swap the best params into the model
        self.model.params = self.best_params



# Part 2.1.1
## the following is computing the affine forward. Since X can be a (n, d1, d2, ... , dn) input, where n is the batch size, and d1-dn is the dimentions of the input matrix. to make this into a vector form (thus friendly to sigmoid functions, we must reshape X do be an NxD matrix, where D is the product of d1*d2*...*dn. As long as each X is treated the same, the results should not matter. We then take X*W + b for the forward pass. We must cache the results for the backward pass

In [59]:
def affine_forward(x, w, b):
    """
    Computes the forward pass for an affine (fully-connected) layer.

    Inputs:
    - x: A numpy array containing input data, of shape (N, d_1, ..., d_k)
    - w: A numpy array of weights, of shape (D, M)
    - b: A numpy array of biases, of shape (M,)

    Returns a tuple of:
    - out: output, of shape (N, M)
    - cache: (x, w, b)
    """
            
    ###########################################################################
    #                             MUH CODE                                    #
    ###########################################################################
    out = x.reshape(x.shape[0], np.prod(x.shape[1:])).dot(w) + b

    ###########################################################################
    #                             END OF MUH CODE                            #
    ###########################################################################
    cache = (x, w, b)
    return out, cache

In [76]:
# gradient checking: compare the analytical gradient with the numerical gradient
# taking the affine layer as an example
def eval_numerical_gradient_array(f, x, df, h=1e-5):
    """
    Evaluate a numeric gradient for a function that accepts a numpy
    array and returns a numpy array.
    """
    grad = np.zeros_like(x)
    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        ix = it.multi_index
        oldval = x[ix]
        x[ix] = oldval + h
        pos = f(x).copy()
        x[ix] = oldval - h
        neg = f(x).copy()
        x[ix] = oldval

        grad[ix] = np.sum((pos - neg) * df) / (2 * h)
        it.iternext()
    return grad

N = 5
D = 10
M = 4
x = np.random.normal(size=(N, 2,  5))
w = np.random.normal(size=(D, M))
b = np.random.normal(size=(M, ))
dout = np.random.normal(size=(N, M))

# do a forward pass first
out, cache = affine_forward(x, w, b)
# check grad f/grad w, the [0] below gets the output out of the (output, cache) original output

f=lambda w: affine_forward(x, w, b)[0]
# compute the analytical gradient you wrote, [1] get the dw out of the (dx, dw, db) original output
grad = affine_backward(dout, cache)[1]

# compute the numerical gradient using the provided utility function
ngrad = eval_numerical_gradient_array(f, w, dout)
print(grad)
print(ngrad)
# they should be similar enough within some small error tolerance

[[-1.17963333 -0.2407088   0.07179463  0.8960936 ]
 [ 0.0916162   0.67373255  0.08781161 -0.2502169 ]
 [-0.56694346  0.61797012  4.6695976  -2.00207644]
 [ 0.43159037  1.07146382  4.95578555 -3.02372032]
 [ 0.91257439 -0.1374363   0.99985599 -1.37357052]
 [ 1.31523062  0.11572081 -2.98276545 -0.67611179]
 [-2.41976023 -0.64414636  1.19960947  0.85543091]
 [-2.41893652 -0.4558098   0.39034347  1.67115788]
 [ 4.35240658  3.4989828   2.72381122 -6.12624107]
 [ 2.78337474  0.1922092  -1.64673608  0.04780971]]
[[-1.17963333 -0.2407088   0.07179463  0.8960936 ]
 [ 0.0916162   0.67373255  0.08781161 -0.2502169 ]
 [-0.56694346  0.61797012  4.6695976  -2.00207644]
 [ 0.43159037  1.07146382  4.95578555 -3.02372032]
 [ 0.91257439 -0.1374363   0.99985599 -1.37357052]
 [ 1.31523062  0.11572081 -2.98276545 -0.67611179]
 [-2.41976023 -0.64414636  1.19960947  0.85543091]
 [-2.41893652 -0.4558098   0.39034347  1.67115788]
 [ 4.35240658  3.4989828   2.72381122 -6.12624107]
 [ 2.78337474  0.1922092  -1.6

### Now for the affine backward, This part is a bit more interesting as we need to adjust the weights of W and b by using the d(output)/dW (or /db) from the backwards pass of dL/d(Output) where L is the loss function. dL/dX is the backpropagation. dL/dX = (dO/dX)*(dL/dO), thus backprop is <dout, W^T> where W^T is the transpose of the weights. Reshape it to the size of x to ensure it is the same shape for the next step in the backprop. Now dW is simple, since it needs to be the same shape of x dot it with the backprop to get the gradient of w, and db is just the sum of the backprop weights since it becomes a matrix of ones. 

In [60]:
def affine_backward(dout, cache):
    """
    Computes the backward pass for an affine layer.
    Inputs:
    - dout: Upstream derivative, of shape (N, M)
    - cache: Tuple of:
      - x: Input data, of shape (N, d_1, ... d_k)
      - w: Weights, of shape (D, M)
      - b: Biases, of shape (M,)

    Returns a tuple of:
    - dx: Gradient with respect to x, of shape (N, d1, ..., d_k)
    - dw: Gradient with respect to w, of shape (D, M)
    - db: Gradient with respect to b, of shape (M,)
    """
    x, w, b = cache
    ###########################################################################
    #                              MUH CODE                                   #
    ###########################################################################
    dx = dout.dot(w.T)
    dx = dx.reshape(x.shape)
    dw = x.reshape(x.shape[0], np.prod(x.shape[1:])).T.dot(dout)
    db = np.sum(dout, axis=0)
    
    ###########################################################################
    #                             END MUH CODE                                #
    ###########################################################################
    return dx, dw, db

## Part 2.1.2
### First the relu forward. To ensure we do not have any weights decay (become irrelevant too quickly, we use a relu function, which zeros out all negative numbers.  this way on the back propagation they do not decay more. This is relevant to the backprop, as we zero out the gradients that were not relevant for gradient adjustments. 

In [61]:
def relu_forward(x):
    ###########################################################################
    #                                MUH CODE                                #
    ###########################################################################
    out = x.copy()
    out[x <= 0] = 0
    ###########################################################################
    #                             END OF MUH CODE                             #
    ###########################################################################
    cache = x
    return out, cache

def relu_backward(dout, cache):
   
    dx, x = dout.copy(), cache
    ###########################################################################
    #                                MUH CODE                                 #
    ###########################################################################

    dx[x <= 0] = 0
    ###########################################################################
    #                             END OF MUH CODE                             #
    ###########################################################################
    return dx

### There are many other activation functions besides ReLU, and each activation function has its advantages and disadvantages. One issue commonly seen with activation functions is vanishing gradients, i.e., getting zero (or close to zero) gradient flow during backpropagation. Which of activation functions (among: linear, ReLU, tanh, sigmoid) experience this problem? Why? What types of one-dimensional inputs would lead to this behavior?

#### Since the relu shown in class may have this issue, making the input very negative and it being the wrong answer would further the decay of a sigmoid neron. The gradient will find the "low spot" geometrically, so a largre negive input will return a small gradient. for relu this mitigates the problem, by ensuring that negitive weights do not decay further by zeroing them out on the forward and backward propagation. This itself is prone to decay, however when used with other functions it helps prevent it.

## 2.1.3
### The softmax function. the gradient passed backwars is the probablity matrix, where p_i,j = true class is subtracted by one, where the loss function is -(sclass - max) + loss(log(sum(e^ci))

In [62]:
def softmax_loss(x, y):
    loss = 0.0
    dx = None
    ###########################################################################
    #                               MUH CODE                                  #
    ###########################################################################

    dx = np.exp(x - np.max(x, axis=1, keepdims=True))
    dx /= np.sum(dx, axis=1, keepdims=True)
    loss = -np.sum(x[np.arange(x.shape[0]), y] - np.max(x, axis=1, keepdims=True)) / x.shape[0] -np.sum( np.log(
        np.sum(dx[np.arange(x.shape[0]), y])))/ x.shape[0]

    dx[np.arange(x.shape[0]), y] -= 1

    ###########################################################################
    #                             END OF YOUR CODE                            #
    ###########################################################################
    return loss, dx / x.shape[0]

## 2.2
### Below is the class for the neural network, you may define as many layers you want from 0 to as much as memory will hold for the input (hidden dim). 

In [67]:
class FullyConnectedNet(object):

    def __init__(self, input_dim, hidden_dim=[10, 5], num_classes=8,
                 weight_scale=0.1):

        self.params = {}
        self.hidden_dim = hidden_dim

        self.layers = 1 + len(hidden_dim)
        modif_hidden_dims = [input_dim] + hidden_dim + [num_classes]
        for i in range(0, self.layers):
            W = 'W' + str(i + 1)
            b = 'b' + str(i + 1)
            self.params[b] = np.zeros(modif_hidden_dims[i + 1])
            self.params[W] = weight_scale * np.random.normal(size=(modif_hidden_dims[i], modif_hidden_dims[i + 1]))

        ############################################################################
        #                             END OF YOUR CODE                             #
        ############################################################################


    def loss(self, X, y=None):
        
        scores = X
        ############################################################################
        #                               FORWARD PASS                               #
        ############################################################################
        cache = {}
        scores, cache['cA1'] = affine_forward(scores, self.params['W1'], self.params['b1'])
        for i in range(1, self.layers):
            W = 'W' + str(i + 1)
            b = 'b' + str(i + 1)
            cA = 'cA' + str(i + 1)
            cR = 'cR' + str(i + 1)
            scores, cache[cR] = relu_forward(scores)
            scores, cache[cA] = affine_forward(scores, self.params[W], self.params[b])
        if y is None:
            return scores
        ############################################################################
        #                     BACK PROPAGATION GRADIENTS, LOSS                     #
        ############################################################################
        loss, grads = 0.0, {}
        loss, dx = softmax_loss(scores, y)
        regParam = 2
        dx, dw, db = affine_backward(dx, cache['cA' + str(self.layers)])
        grads['W' + str(self.layers)] = dw + regParam * self.params['W' + str(self.layers)]
        grads['b' + str(self.layers)] = db
        for i in range(self.layers - 1, 0, -1):
            W = 'W' + str(i)
            b = 'b' + str(i)
            cA = 'cA' + str(i)
            cR = 'cR' + str(i + 1)
            dx = relu_backward(dx, cache[cR])
            dx, dw, db = affine_backward(dx, cache[cA])
            grads[W] = dw + regParam*self.params[W]
            grads[b] = db + self.params[b]
        ############################################################################
        #                             RETURN LOSS, GRADS                           #
        ############################################################################

        return loss, grads

## 2.3
### running and posting results

In [73]:
path = '/Users/alexkern/desktop/cs189/cs189_hw6/resources/trainval'
# load the dataset
debug = False  # OPTIONAL: you can change this to True for debugging *only*. Your reported results must be with debug = False
feat_train, label_train, feat_val, label_val = load_mds189(path, debug)

data = {
    'X_train': feat_train,
    'y_train': label_train,
    'X_val': feat_val,
    'y_val': label_val}



In [79]:
hyperparams = {'lr_decay': .9,
               'num_epochs': 10,
               'batch_size': 1,
               'learning_rate': 0.00005
               }
hidden_dim = [100, 100]  # this should be a list of units for each hiddent layer

model = FullyConnectedNet(input_dim=75,
                          hidden_dim=hidden_dim)
solver = Solver(model, data,
                update_rule='sgd',
                optim_config={
                    'learning_rate': hyperparams['learning_rate'],
                },
                lr_decay=hyperparams['lr_decay'],
                num_epochs=hyperparams['num_epochs'],
                batch_size=hyperparams['batch_size'],
                print_every=100)
solver.train()

(Epoch 0 / 10) train acc: 0.127000; val_acc: 0.125000


  # This is added back by InteractiveShellApp.init_path()


(Epoch 1 / 10) train acc: 0.720000; val_acc: 0.698480
(Epoch 2 / 10) train acc: 0.845000; val_acc: 0.812500
(Epoch 3 / 10) train acc: 0.891000; val_acc: 0.859797
(Epoch 4 / 10) train acc: 0.906000; val_acc: 0.871622
(Epoch 5 / 10) train acc: 0.899000; val_acc: 0.885980
(Epoch 6 / 10) train acc: 0.919000; val_acc: 0.889358
(Epoch 7 / 10) train acc: 0.907000; val_acc: 0.880912
(Epoch 8 / 10) train acc: 0.904000; val_acc: 0.907939
(Epoch 9 / 10) train acc: 0.919000; val_acc: 0.897804
(Epoch 10 / 10) train acc: 0.912000; val_acc: 0.904561


### playing with the results, I found the fastest rate of decent with 2 100 layers, and even good results with zero layers. I have tried small outputs. Note the amount of layers that went into the model was dependent on the outcome, the deeper the slower it seemed even