Reference: https://www.geeksforgeeks.org/ml-stochastic-gradient-descent-sgd/

In [1]:
import numpy as np

In [2]:
def initialize(dict_network, seed = 99):
    np.random.seed(seed)
    
    for dict_layer in dict_network:        
        dim_layer_in = dict_layer['dim_in']
        dim_layer_out = dict_layer['dim_out']  
        dict_layer['W'] = np.random.uniform(-1, 1, (dim_layer_out, dim_layer_in))
        dict_layer['B'] = np.random.uniform(-1, 1, (dim_layer_out, 1))
        dict_layer['prev_dW'] = np.zeros((dim_layer_out, dim_layer_in))
        dict_layer['prev_dB'] = np.zeros((dim_layer_out,1))
        
    return dict_network

In [3]:
def forward_propagation(input_, dict_network):
    Z = input_
    
    for dict_layer in dict_network:

        activation = dict_layer['activation']
        activation_params = dict_layer['activation_params']
        W = dict_layer['W']
        B = dict_layer['B'].T.flatten()
    
        X = Z
        dict_layer['X'] = X
        
        V = np.dot(W, X) + B

        if len(activation_params) == 0:
            raise Exception('Activation parameter is empty.')   
            
        elif activation == 'logi':
            a = activation_params[0]
            Z = 1 / (1 + np.exp(-a*V))
            
        elif activation == 'tanh':
            a = activation_params[0]
            c = activation_params[1]
            Z = a * np.tanh(c*V)
            
        elif activation == 'relu':
            a = activation_params[0]
            Z = np.maximum(V,a*V)
            
        else:
            raise Exception('Activation function is not supported')
            
        dict_layer['Z'] = Z
        dict_layer['V'] = V

    return dict_network

In [4]:
def backward_propagation(output_data, dict_network, learning_rate):
    
    d = output_data    
    o = dict_network[-1]['Z']
    error = d - o
    
    for index, dict_layer in reversed(list(enumerate(dict_network))):
        
        activation = dict_layer['activation']
        activation_params = dict_layer['activation_params']
        Z = dict_layer['Z']
        X = dict_layer['X']
        V = dict_layer['V']
        W = dict_layer['W']
        
        if len(activation_params) == 0:
            raise Exception('Activation parameter is empty.')      
            
        elif activation == 'logi':
            a = activation_params[0]
            dZ =  a * error * Z * (1-Z)
            
        elif activation == 'tanh':
            a = activation_params[0]
            c = activation_params[1]
            dZ = (c / a) * error * (a - Z) * (a + Z)
            
        elif activation == 'relu':
            a = activation_params[0]
            mult = np.where(V > 0, 1, a)
            dZ = mult * error
            
        else:
            raise Exception('Activation function is not supported')
        
        dict_layer['dZ'] = dZ
        dW = learning_rate * np.outer(dZ, X)
        dict_layer['dW'] = dW
        dB = learning_rate * dZ
        dict_layer['dB'] = dB
        error =  np.dot(dZ, W) 
        
    return dict_network, error

### Added Functions

In [5]:
def update_weights(dict_network, updated_dict_network, delta_weights, delta_biases, momentum):
    
    for index, dict_layer in enumerate(updated_dict_network):
        
        W = dict_layer['W']
        B = dict_layer['B']
        
        prev_dW = dict_layer['prev_dW']
        prev_dB = dict_layer['prev_dB']
        
        dW = np.array(delta_weights[index])
        dB = np.array(delta_biases[index]).reshape(-1,1)

        update_W = W + momentum * prev_dW + dW
        update_B = B + momentum * prev_dB + dB

        dict_network[index]['W'] = update_W
        dict_network[index]['B'] = update_B
        
        dict_network[index]['prev_dW'] = dW
        dict_network[index]['prev_dB'] = dB
        
    return dict_network

In [6]:
def one_hot_encoder(value):
    
    temp_list = [0]*8
    temp_list[int(value)-1] = 1
    
    return temp_list

def convert_class(probs):
    
    probs[probs > 0.5] = 1
    probs[probs <= 0.5] = 0
    
    label = np.argmax(probs) + 1
    return label

In [32]:
def sgd(dict_network, data, momentum=0.7, learning_rate=0.15, batch_size=8, max_iter=400, seed=1):
    
    init_dict_network = dict_network # No random values of weights (W) and biases (B), just the general network architecture
    upd_dict_network = initialize(dict_network) # Has random values of weights (W) and biases (B)
    
    for i in range(max_iter):
        
        actual_label = []
        predict_label = []
        
        np.random.shuffle(data) # Shuffles data
        batch_list = np.array_split(data, data.shape[0]/batch_size) # Splits data into batches
        
        for batch in batch_list:
            
            dW_list = []
            dB_list = []
        
            sse_batch = 0
            
            for j in range(len(dict_network)): 
                dW_list.append([]) # initializes a list of lists where each list in the dW_list corresponds to the dWs of a neural layer
                dB_list.append([]) # initializes a list of lists where each list in the dB_list corresponds to the dBs of a neural layer
                
            for instance in batch:
                
                X = instance[1:] # features
                actual = instance[0] # actual class
                Y = one_hot_encoder(actual) # converts actual class to binary 
                
                dict_network = forward_propagation(X, dict_network) # Forward Propagation Function
                dict_network, error = backward_propagation(Y, dict_network, learning_rate) # Backward Propagation Function
                
                for index, dict_layer in enumerate(upd_dict_network): 
                    dW = dict_layer['dW']
                    dB = dict_layer['dB']
                    
                    dW_list[index].append(dW)
                    dB_list[index].append(dB)
                
                predict = convert_class(upd_dict_network[-1]['Z']) # converts probabilities to predicted class

                actual_label.append(int(actual)) # actual class label list
                predict_label.append(int(predict)) # predicted class label list
            
            dW_ave = [np.mean(layer, axis=0).tolist() for layer in dW_list] # averages dW per batch
            dB_ave = [np.mean(layer, axis=0).tolist() for layer in dB_list] # averages dB per batch

            upd_dict_network = update_weights(init_dict_network, upd_dict_network, dW_ave, dB_ave, momentum) # updates dictionary
    
    return predict_label, actual_label

In [33]:
def accuracy(predicted, actual):
    correct = 0
    
    for i in range(len(predicted)):
        if predicted[i] == actual[i]:
            correct += 1
 
    return correct / len(predicted)

### Data

In [34]:
X = np.genfromtxt("training_set.csv", delimiter=" ", dtype=float)
Y_ = np.genfromtxt("training_labels.csv", delimiter=" ", dtype=int)
Y = Y_.reshape(-1, 1)
data = np.append(Y, X, axis=1)
print(data.shape)
print(f'Labels: {set(Y_)}')
print(f'No of Features: {int(X.shape[1])}')

(3200, 355)
Labels: {1, 2, 3, 4, 5, 6, 7, 8}
No of Features: 354


### Initial Network Architecture

In [35]:
'''
!!!YOU CAN INPUT ANY NUMBER OF NODES AND LAYERS HERE!!!
!!!   JUST ADD A DICTIONARY FOR A LAYER SPECIFICS   !!!
!!!   CHANGE DIM_IN FOR NO OF NODES IN PREV LAYER   !!!
!!!   CHANGE DIM_OUT FOR NO OF NODES IN CUR LAYER   !!!
'''

nodes = 5
dim_in = int(X.shape[1])
dim_out = int(len(set(Y_)))

network_A = [
    {'dim_in': dim_in,    'dim_out': nodes,      'activation': 'tanh', 'activation_params': [1.716, 2/3]},
    {'dim_in': nodes,     'dim_out': nodes,      'activation': 'relu', 'activation_params': [0.01]},
    {'dim_in': nodes,     'dim_out': dim_out,    'activation': 'logi', 'activation_params': [2.0]}]

In [36]:
predict, actual = sgd(network_A,data)

In [38]:
print(f'Accuracy: {accuracy(predict, actual)*100}%')

Accuracy: 96.375%
