In [1]:
import numpy as np
import plotly
import pandas as pd
import matplotlib.pyplot as plt
import time

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold, ShuffleSplit
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit

from sklearn.metrics import make_scorer, accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import accuracy_score

import numpy as np
from scipy.special import expit
import pandas as pd
import sys

import warnings
warnings.filterwarnings('ignore')

#one-hot encoding method
df = pd.read_csv(
        'http://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data',
        header=None)

df.columns = ['buying', 'maint', 'doors', 'persons', 'trunk', 'safety', 'class']

# One hot encode caegorical attributes
df_dummies = pd.get_dummies(df.drop('class', axis=1))

# Convert class to integers
y = df['class'].replace(to_replace=['unacc', 'acc', 'good', 'vgood'],
                        value=range(4)).values
X = df_dummies.values

In [8]:
# just start with the vectorized version and minibatch
class LeagueIsLife():
    def __init__(self, n_hidden=30,
                 l2_C=0.0, epochs=500, eta=0.001, random_state=None, alpha=0.0, decrease_const=0.0, 
                 shuffle=True, minibatches=1, f_cost='quadratic', f_layer="sigmoid", tanh_a=0.0):
        
        #original initializer
        np.random.seed(random_state)
        self.n_hidden = n_hidden
        self.l2_C = l2_C
        self.epochs = epochs
        self.eta = eta
        # need to add to the original initializer 
        self.alpha = alpha
        self.decrease_const = decrease_const
        self.shuffle = shuffle
        self.minibatches = minibatches
        self.f_cost = f_cost
        self.f_layer = f_layer
        self.tanh_a = tanh_a
        
    def _initialize_weights(self):
        """Initialize weights with small random numbers."""
        if self.f_layer == 'relu':
            init_bound = np.sqrt(6. / (self.n_hidden + self.n_features_ + 1))
            W1 = np.random.uniform(-init_bound, init_bound,(self.n_hidden, self.n_features_ + 1))
            
            init_bound = np.sqrt(6. / (self.n_hidden + self.n_features_ + 1))
            W2 = np.random.uniform(-init_bound, init_bound,(self.n_hidden, self.n_features_ + 1))
            
            init_bound = np.sqrt(2. / (self.n_output_ + self.n_hidden + 1))
            W3 = np.random.uniform(-init_bound, init_bound,(self.n_output_, self.n_hidden + 1))
            
        elif self.f_layer =='tanh':
            W1_num_elems = (self.n_features_ + 1)*self.n_hidden
            W1 = np.random.uniform(-6. / np.sqrt(W1_num_elems), 6. / np.sqrt(W1_num_elems), size=W1_num_elems)
            W1 = W1.reshape(self.n_hidden, self.n_features_ + 1) # reshape to be W
            
            W2_num_elems = (self.n_features_ + 1)*self.n_hidden
            W2 = np.random.uniform(-6. / np.sqrt(W2_num_elems), 6. / np.sqrt(W2_num_elems), size=W2_num_elems)
            W2 = W2.reshape(self.n_hidden, self.n_features_ + 1) # reshape to be W
                
            W3_num_elems = (self.n_hidden + 1)*self.n_output_
            W3 = np.random.uniform(-6. / np.sqrt(W3_num_elems), 6. / np.sqrt(W3_num_elems), size=W3_num_elems)
            W3 = W3.reshape(self.n_output_, self.n_hidden + 1)
            
        else:
            W1_num_elems = (self.n_features_ + 1)*self.n_hidden
            W1 = np.random.uniform(-1.0, 1.0,size=W1_num_elems)
            W1 = W1.reshape(self.n_hidden, self.n_features_ + 1) # reshape to be W

            W2_num_elems = (self.n_features_ + 1)*self.n_hidden
            W2 = np.random.uniform(-1.0, 1.0,size=W2_num_elems)
            W2 = W2.reshape(self.n_hidden, self.n_features_ + 1) # reshape to be W
        
            W3_num_elems = (self.n_hidden + 1)*self.n_output_
            W3 = np.random.uniform(-1.0, 1.0, size=W3_num_elems)
            W3 = W3.reshape(self.n_output_, self.n_hidden + 1)
            
        return W1, W2, W3
        
    @staticmethod
    def _sigmoid(z):
        """Use scipy.special.expit to avoid overflow"""
        # 1.0 / (1.0 + np.exp(-z))
        return expit(z)
    
    @staticmethod
    def _relu(Z):
        return np.maximum(0,Z.copy())
    
    #@staticmethod
    def _tanh(self, Z):
        return np.tanh(Z) + (self.tanh_a * Z)
    
    def _feedforward(self, X, W1, W2, W3):
        """Compute feedforward step
        """
        A1 = self._add_bias_unit(X, how='column')
        Z1 = W1 @ A1.T
              
        A2 = self._add_bias_unit(A1, how='row')
        Z2 = W2 @ A2.T
        
        if self.f_layer == 'relu':
            A3 = self._relu(Z2)
        elif self.f_layer == 'tanh':
            A3 = self._tanh(Z2)
        else:
            A3 = self._sigmoid(Z2) 
            
        A3 = self._add_bias_unit(A3,how='row')
        Z3 = W3 @ A3
        
        A4 = self._sigmoid(Z3)
        return A1, Z1, A2, Z2, A3, Z3, A4
        
    def fit(self, X, y, print_progress=False):
        """ Learn weights from training data. With mini-batch"""
        X_data, y_data = X.copy(), y.copy()
        Y_enc = self._encode_labels(y)
        
        # init weights and setup matrices
        self.n_features_ = X_data.shape[1]
        self.n_output_ = Y_enc.shape[0]
        self.W1, self.W2, self.W3 = self._initialize_weights()

        delta_W1_prev = np.zeros(self.W1.shape)
        delta_W2_prev = np.zeros(self.W2.shape)
        delta_W3_prev = np.zeros(self.W2.shape)

        self.cost_ = []
        self.score_ = []
        for i in range(self.epochs):

            # adaptive learning rate
            self.eta /= (1 + self.decrease_const*i)

            if print_progress>0 and (i+1)%print_progress==0:
                sys.stderr.write('\rEpoch: %d/%d' % (i+1, self.epochs))
                sys.stderr.flush()

            if self.shuffle:
                idx_shuffle = np.random.permutation(y_data.shape[0])
                X_data, Y_enc, y_data = X_data[idx_shuffle], Y_enc[:, idx_shuffle], y_data[idx_shuffle]

            mini = np.array_split(range(y_data.shape[0]), self.minibatches)
            mini_cost = []
            for idx in mini:

                # feedforward
                A1, Z1, A2, Z2, A3, Z3, A4 = self._feedforward(X_data[idx],
                                                       self.W1,
                                                       self.W2,
                                                        self.W3)
                
                cost = self._cost(A4,Y_enc[:,idx],self.W1,self.W2,self.W3)
                mini_cost.append(cost) # this appends cost of mini-batch only

                # compute gradient via backpropagation
                grad1, grad2, grad3 = self._get_gradient(A1=A1, A2=A2, A3=A3, Z1=Z1, Z2=Z2,
                                                  Z3=Z3,A4=A4,
                                                  Y_enc=Y_enc[:, idx],
                                                  W1=self.W1,W2=self.W2,W3=self.W3)

                delta_W1, delta_W2, delta_W3 = self.eta * grad1, self.eta * grad2, self.eta * grad3
                self.W1 -= (delta_W1 + (self.alpha * delta_W1_prev))
                self.W2 -= (delta_W2 + (self.alpha * delta_W2_prev))
                self.W3 -= (delta_W3 + (self.alpha * delta_W3_prev))

                delta_W1_prev, delta_W2_prev, delta_W3_prev = delta_W1, delta_W2, delta_W3

            self.cost_.append(mini_cost)
            self.score_.append(accuracy_score(y_data,self.predict(X_data)))
        
    def get_params(self, deep=True):
        return {"alpha": self.alpha, "f_cost": self.f_cost, "f_layer": self.f_layer, "l2_C": self.l2_C, 
               "eta": self.eta, "decrease_const": self.decrease_const, "epochs": self.epochs}
    
    def _cost(self,A4,Y_enc,W1,W2,W3):
        # Cross entropy choice
        if self.f_cost == 'cross':
            '''Get the objective function value'''
            cost = np.mean((self._add_bias_unit(Y_enc)-A4)**2)
            L2_term = self._L2_reg(self.l2_C, W1, W2, W3)
            return cost + L2_term
        # Quadradic (default) choice
        else:
            '''Get the objective function value'''
            cost = -np.mean(np.nan_to_num((Y_enc*np.log(A3)+(1-Y_enc)*np.log(1-A3))))
            L2_term = self._L2_reg(self.l2_C, W1, W2)
            return cost + L2_term
    
    def _get_gradient(self, A1, A2, A3,A4, Z1, Z2,Z3, Y_enc, W1, W2,W3):
                
        # Cross entrophy choice
        if self.f_cost == 'cross':
            sigma4 = (A4-Y_enc)
        else:
            sigma4 = -2*(Y_enc-A4)*A4*(1-A4)
        
        # Relu choice
        if self.f_layer == 'relu':
            sigma3 = (W3.T @ sigma4) 
            Z3_with_bias = self._add_bias_unit(Z3,how='row')
            sigma3[Z3_with_bias<=0] = 0
        else:
            sigma3 = (W3.T @ sigma4)*A3*(1-A3)
        
        # Relu choice
        if self.f_layer == 'relu':
            sigma2 = (W2.T @ sigma3) 
            Z1_with_bias = self._add_bias_unit(Z1,how='row')
            sigma2[Z1_with_bias<=0] = 0
        else:
            sigma2 = (W2.T @ sigma3)*A2*(1-A2)
        
        grad1 = sigma2[1:,:] @ A1
        grad2 = sigma3 @ A2.T
        grad3 = sigma4 @ A3
        
        # regularize weights that are not bias terms
        grad1[:, 1:] += W1[:, 1:] * self.l2_C
        grad2[:, 1:] += W2[:, 1:] * self.l2_C
        grad3[:, 1:] += W3[:, 1:] * self.l3_C
        return grad1, grad2, grad3
    
    @staticmethod
    def _encode_labels(y):
        """Encode labels into one-hot representation"""
        onehot = pd.get_dummies(y).values.T
        return onehot
    
    @staticmethod
    def _add_bias_unit(X, how='column'):
        """Add bias unit (column or row of 1s) to array at index 0"""
        if how == 'column':
            ones = np.ones((X.shape[0], 1))
            X_new = np.hstack((ones, X))
        elif how == 'row':
            ones = np.ones((1, X.shape[1]))
            X_new = np.vstack((ones, X))
        return X_new
    
    @staticmethod
    def _L2_reg(lambda_, W1, W2, W3):
        """Compute L2-regularization cost"""
        # only compute for non-bias terms
        return (lambda_/2.0) * np.sqrt(np.mean(W1[:, 1:] ** 2) + np.mean(W2[:, 1:] ** 2) + np.mean(W3[:,1:] ** 2))

    def predict(self, X):
        """Predict class labels"""
        _, _, _, _, _, _, A4 = self._feedforward(X, self.W1, self.W2, self.W3)
        y_pred = np.argmax(A4, axis=0)
        return y_pred
    
    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            self.__setattr__(parameter, value)
        return self

In [9]:
a = LeagueIsLife(f_layer='relu', f_cost='cross', alpha=0, decrease_const=1e-09, 
           epochs=500, l2_C=0.01)
a.fit(X,y)

ValueError: shapes (4,31) and (30,1729) not aligned: 31 (dim 1) != 30 (dim 0)