In [93]:
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression as SKLogisticRegression
from sklearn.metrics import accuracy_score
from scipy.special import expit

LOAD_FROM_PICKLE = True
USE_SHRUNK_DATASET = True

np.set_printoptions(precision=2, suppress=True)

In [94]:
# load the data
if LOAD_FROM_PICKLE:
    with open('../Data/Pickle/cover_data.pickle', 'rb') as handle:
        data = pickle.load(handle)

    print('Loaded data from pickle')
else:
    data = np.loadtxt('../Data/Cov_Type/covtype.data', delimiter=',')
    with open('../Data/Pickle/cover_data.pickle', 'wb') as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

Loaded data from pickle


In [95]:
print('Data shape: {}'.format(data.shape))

# used for faster testing
if USE_SHRUNK_DATASET:
    # get the number of samples for the class witht the least samples
    min_samples = np.bincount(data[:, -1].astype(int), minlength=7)[1:].min()

    # get min number of samples for each class
    data = np.concatenate([data[data[:, -1] == i][:min_samples] for i in range(1, 8)])

    print('New data shape:', data.shape)

# split the data into features and labels
X = data[:, :-1]
y = data[:, -1]

# normalize the features
X = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0) + .0000001)


Data shape: (581012, 55)
New data shape: (19229, 55)


In [96]:
print('X.shape =', X.shape)
print('y.shape =', y.shape)

X.shape = (19229, 54)
y.shape = (19229,)


In [97]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2)

In [98]:
%%time

lr_sk = SKLogisticRegression(solver='liblinear') # all params default

lr_sk.fit(X_train, y_train)
yhat = lr_sk.predict(X_test)
print('Accuracy of: ',accuracy_score(y_test,yhat))

Accuracy of:  0.6820072802912116
CPU times: user 1.91 s, sys: 58.2 ms, total: 1.97 s
Wall time: 1.86 s


In [None]:
class BinaryLogisticRegression:
    # private:
    def __init__(self, eta, iterations=20):
        self.eta = eta
        self.iters = iterations
        # internally we will store the weights as self.w_ to keep with sklearn conventions
    
    def __str__(self):
        if(hasattr(self,'w_')):
            return 'Binary Logistic Regression Object with coefficients:\n'+ str(self.w_) # is we have trained the object
        else:
            return 'Untrained Binary Logistic Regression Object'
    
    # convenience, private and static:
    @staticmethod
    def _sigmoid(theta):
        # increase stability, redefine sigmoid operation
        return expit(theta) #1/(1+np.exp(-theta))
    
    @staticmethod
    def _add_bias(X):
        return np.hstack((np.ones((X.shape[0],1)),X)) # add bias term

    def _get_gradient(self,X,y):
        ydiff = y-self.predict_proba(X,add_bias=False).ravel() # get y difference
        gradient = np.mean(X * ydiff[:,np.newaxis], axis=0) # make ydiff a column vector and multiply through
        
        return gradient.reshape(self.w_.shape)
    
    # public:
    def fit(self, X, y):
        Xb = self._add_bias(X) # add bias term
        num_samples, num_features = Xb.shape
        
        self.w_ = np.zeros((num_features,1)) # init weight vector to zeros
        
        # for as many as the max iterations
        for _ in range(self.iters):
            gradient = self._get_gradient(Xb,y)
            self.w_ += gradient*self.eta # multiply by learning rate 

        print(self.w_)

    def predict_proba(self, X, add_bias=True):
        # add bias term if requested
        Xb = self._add_bias(X) if add_bias else X
        return self._sigmoid(Xb @ self.w_) # return the probability y=1
    
    def predict(self,X):
        return (self.predict_proba(X)>0.5) #return the actual prediction

In [118]:
class LogisticRegression:
    def __init__(self, eta, iterations=20):
        self.eta = eta
        self.iters = iterations
        # internally we will store the weights as self.w_ to keep with sklearn conventions
    
    def __str__(self):
        if(hasattr(self,'w_')):
            return 'MultiClass Logistic Regression Object with coefficients:\n'+ str(self.w_) # is we have trained the object
        else:
            return 'Untrained MultiClass Logistic Regression Object'
        
    def fit(self,X,y):
        num_samples, num_features = X.shape
        self.unique_ = np.unique(y) # get each unique class value
        num_unique_classes = len(self.unique_)
        self.classifiers_ = [] # will fill this array with binary classifiers
        
        for i,yval in enumerate(self.unique_): # for each unique values
            y_binary = (y==yval) # create a binary problem
            # train the binary classifier for this class
            blr = BinaryLogisticRegression(self.eta, self.iters)
            blr.fit(X,y_binary)
            # add the trained classifier to the list
            self.classifiers_.append(blr)
            
        # save all the weights into one matrix, separate column for each class
        self.w_ = np.hstack([x.w_ for x in self.classifiers_]).T
        
    def predict_proba(self,X):
        probs = []
        for blr in self.classifiers_:
            probs.append(blr.predict_proba(X)) # get probability for each classifier
        
        return np.hstack(probs) # make into single matrix
    
    def predict(self,X):
        return self.unique_[np.argmax(self.predict_proba(X),axis=1)] # take argmax along row

In [127]:
lr = LogisticRegression(.1, .025, 1000) # all params default

lr.fit(X_train, y_train)
yhat = lr.predict(X_test)
print('Accuracy of: ',accuracy_score(y_test,yhat))


First 10 predictions:  [[0.25 0.25 0.07 0.07 0.17 0.07 0.18]
 [0.12 0.12 0.19 0.08 0.16 0.17 0.14]
 [0.16 0.19 0.09 0.08 0.26 0.08 0.13]
 [0.07 0.08 0.21 0.47 0.09 0.14 0.07]
 [0.15 0.15 0.13 0.08 0.17 0.16 0.17]
 [0.17 0.15 0.12 0.06 0.14 0.17 0.19]
 [0.08 0.08 0.19 0.43 0.08 0.13 0.08]
 [0.14 0.13 0.11 0.06 0.14 0.13 0.3 ]
 [0.17 0.17 0.11 0.07 0.16 0.13 0.2 ]
 [0.23 0.23 0.07 0.06 0.17 0.07 0.2 ]]
Accuracy of:  0.5405616224648986
