In [11]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression as SKLogisticRegression
from sklearn.metrics import accuracy_score
from scipy.special import expit

USE_SHRUNK_DATASET = True

In [20]:
# load the data
data = np.loadtxt('../Data/Cov_Type/covtype.data', delimiter=',')

In [30]:
print('Data shape: {}'.format(data.shape))

if USE_SHRUNK_DATASET:
    # get the number of samples for the class witht the least samples
    min_samples = np.min(np.bincount(data[:, -1].astype(int)))

    # get all instances for the first class
    class_4 = data[data[:, -1] == 4]
    # print the first row of the first class
    print(class_4.shape)

    # get that many samples from each class
    alt_data = np.vstack([data[data[:, -1] == i][:min_samples] for i in range(1, 8)])

print('Data shape:', alt_data.shape)

# split the data into features and labels
X = data[:, :-1]
y = data[:, -1]

# normalize the features
X = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))


Data shape: (581012, 55)
(2747, 55)
Data shape: (0, 55)


In [None]:
print('X.shape =', X.shape)
print('y.shape =', y.shape)

In [9]:


X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2)

ValueError: With n_samples=0, test_size=0.2 and train_size=0.8, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [8]:
%%time

lr_sk = SKLogisticRegression(solver='liblinear') # all params default

lr_sk.fit(X_train, y_train)
yhat = lr_sk.predict(X_test)
print('Accuracy of: ',accuracy_score(y_test,yhat))

Accuracy of:  0.7156613856785109
CPU times: user 55.4 s, sys: 573 ms, total: 56 s
Wall time: 55.7 s


In [7]:
class LogisticRegressionSolver:
    # private:
    def __init__(self, eta, iterations=20):
        self.eta = eta
        self.iters = iterations
        # internally we will store the weights as self.w_ to keep with sklearn conventions
    
    def __str__(self):
        if(hasattr(self,'w_')):
            return 'Binary Logistic Regression Object with coefficients:\n'+ str(self.w_) # is we have trained the object
        else:
            return 'Untrained Binary Logistic Regression Object'
    
    # convenience, private and static:
    @staticmethod
    def _sigmoid(theta):
        # increase stability, redefine sigmoid operation
        return expit(theta) #1/(1+np.exp(-theta))
    
    @staticmethod
    def _add_bias(X):
        return np.hstack((np.ones((X.shape[0],1)),X)) # add bias term

    def _get_gradient(self, X, y, index):
        ydiff = y-self._predict_proba(X, index, add_bias=False).ravel() # get y difference
        gradient = np.mean(X * ydiff[:,np.newaxis], axis=0) # make ydiff a column vector and multiply through
        
        return gradient.reshape(self.class_w_[index].shape)

    def _predict_proba(self, X, index, add_bias=True):
        # add bias term if requested
        Xb = self._add_bias(X) if add_bias else X
        return self._sigmoid(Xb @ self.class_w_[index]) # return the probability y=1

    # public:

    # one vs all:
    def fit(self, X, y):
        self.unique_ = np.unique(y) # get each unique class value
        num_unique_classes = len(self.unique_)
        self.class_w_ = [] # will fill this array with the weights for each binary classifier

        # for each unique class value:
        for i, yval in enumerate(self.unique_):
            y_binary = (y == yval) # set the binary label for one vs all

            Xb = self._add_bias(X) # add bias term    

            num_samples, num_features = Xb.shape

            self.class_w_.append(np.zeros((num_features, 1))) # init weight vector to zeros
            
            # for as many as the max iterations
            for _ in range(self.iters):
                gradient = self._get_gradient(Xb, y, i)
                self.class_w_[i] += gradient*self.eta # multiply by learning rate

        self.w_ = np.hstack(self.class_w_).T

    def predict_proba(self, X):
        probs = []
        for i in range(len(self.class_w_)):
            probs.append(self._predict_proba(X, i)) # get probability for each classifier
    
    def predict(self,X):
        return self.unique_[np.argmax(self.predict_proba(X),axis=1)] # take argmax along row

In [8]:
lr = LogisticRegressionSolver(.01) # all params default

lr.fit(X_train, y_train)
yhat = lr.predict(X_test)
print('Accuracy of: ',accuracy_score(y_test,yhat))

KeyboardInterrupt: 