## Logistic Regression

In [None]:
class LogisticRegression:
    '''
    Binary logistic regression trained with batch gradient descent.

    The class keeps the hyper parameters of the algorithm as attributes and provides
    the functions for initializing the class, fitting the logistic regression model and
    using the fitted model to predict test samples.

    Attributes:
        lr:        learning rate of gradient descent
        max_itr:   maximum number of iterations for gradient descent
        tol:       if the decrease in loss between two iterations is smaller than tol,
                   then we stop iterating
        W:         concatenation of weight w and bias b, (num_variables+1)*1;
                   None until fit() has been called
        verbose:   whether or not to print the value of logistic loss every 10000 iterations

    '''
    # Probabilities are clipped to [_EPS, 1 - _EPS] before taking logarithms so that a
    # saturated prediction (exactly 0 or 1) cannot produce log(0) = -inf or 0 * inf = nan.
    _EPS = 1e-15
    # Scores are clipped to [-_MAX_SCORE, _MAX_SCORE] before exponentiation so that
    # np.exp never overflows; scores inside this range are left untouched.
    _MAX_SCORE = 500.0

    def __init__(self, lr=0.01, max_itr=100000, tol = 1e-5, verbose = False):
        self.lr = lr
        self.max_itr = max_itr
        self.tol = tol
        self.verbose = verbose
        self.W = None

    def __sigmoid(self, z):
        '''
        Sigmoid function, converts real-valued scores to values in [0,1]

        Args:
            z (matrix, num_samples*1): scores or real values

        Returns:
            A matrix with the same shape as z: values in the interval [0,1]
        '''
        z = np.clip(z, -self._MAX_SCORE, self._MAX_SCORE)
        return 1 / (1 + np.exp(-z))

    def __logistic_loss(self, h, y):
        '''
        Calculate the mean logistic (cross-entropy) loss

        Args:
            h (matrix, num_samples*1): predicted probabilities
            y (matrix, num_samples*1): labels, 0 or 1

        Returns:
            The loss averaged over all entries (a scalar)
        '''
        h = np.clip(h, self._EPS, 1 - self._EPS)
        return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()

    def __add_bias_column(self, x):
        '''
        Convert x to a 2-D float array and append the all-one column used for the bias

        Args:
            x (matrix, num_samples*num_variables): input samples (array or nested list)

        Returns:
            A matrix num_samples*(num_variables+1)

        Raises:
            ValueError: if x is not two-dimensional
        '''
        x = np.asarray(x, dtype=float)
        if x.ndim != 2:
            raise ValueError(
                'x must be a 2-D matrix (num_samples*num_variables), got {} dimension(s)'.format(x.ndim))
        return np.concatenate((x, np.ones((x.shape[0], 1))), axis=1)

    def fit(self, x, y):
        '''
        estimate the weight and bias in the logistic regression model by gradient descent

        Args:
            x (matrix, num_train*num_variables): input of training samples
            y (matrix, num_train*1): labels of training samples, 0 or 1

        Returns:
            self.W (matrix, (num_variables+1)*1): estimation of weight and bias, i.e (w,b)

        Raises:
            ValueError: if x is not 2-D, is empty, or x and y disagree on the number of samples
        '''
        ### Add the all-one vector to the last column
        X = self.__add_bias_column(x)
        y = np.asarray(y, dtype=float).reshape(-1, 1)
        m = X.shape[0]
        if m == 0:
            raise ValueError('cannot fit on an empty training set')
        if y.shape[0] != m:
            raise ValueError(
                'x has {} samples but y has {} labels'.format(m, y.shape[0]))

        # weight and bias initialization
        d = X.shape[1]
        self.W = np.zeros((d, 1))

        h = self.__sigmoid(np.dot(X, self.W))
        previous_loss = self.__logistic_loss(h, y)
        for i in range(self.max_itr):
            # h always holds the predictions for the current W (computed before the loop
            # or at the end of the previous iteration), so no extra forward pass is needed.
            gradient = np.dot(X.T, (h - y)) / m
            self.W -= self.lr * gradient

            # Calculate the new logistic loss
            h = self.__sigmoid(np.dot(X, self.W))
            current_loss = self.__logistic_loss(h, y)
            # The test is one-sided: the loop also ends if the loss went up.
            if previous_loss - current_loss < self.tol:
                print('Converged after {} iterations'.format(i+1))
                print('Logistic loss after {} iterations is {}'.format(i+1,current_loss))
                break
            previous_loss = current_loss
            if self.verbose and i % 10000 == 0:
                print('Logistic loss after {} iterations is {}'.format(i+1,current_loss))
        return self.W

    def predict_prob(self, x):
        '''
        predict the posterior probability p_1(x; W) of the test samples

        Args:
            x (matrix, num_test*num_variables): input of test samples

        Returns:
            y (matrix, num_test*1): predicted posterior probability p_1(x; W) of test samples

        Raises:
            AttributeError: if the model has not been fitted yet
            ValueError: if x is not 2-D
        '''
        if getattr(self, 'W', None) is None:
            raise AttributeError('the model has no fitted W yet; call fit() first')
        X = self.__add_bias_column(x)
        return self.__sigmoid(np.dot(X, self.W))

    def predict(self, x):
        '''
        predict the label of the test samples

        Args:
            x (matrix, num_test*num_variables): input of test samples

        Returns:
            y (matrix, num_test*1): predicted labels of test samples, 0 or 1
        '''
        return self.predict_prob(x).round()

In [None]:
def loadDataSet(dataset_path, file_type="txt"):
    '''
    Load a whitespace-separated numeric dataset from a text file.

    Every non-blank line is one sample: all columns but the last are features,
    the last column is the label. Blank lines are skipped.

    Args:
        dataset_path (str): path of the file to read
        file_type (str): format of the file; only "txt" is supported

    Returns:
        X (list of list of float): feature rows, one inner list per sample
        y (list of float): labels, one per sample

    Raises:
        ValueError: if file_type is not "txt", or a field cannot be parsed as a float
    '''
    if file_type != "txt":
        raise ValueError('Unsupported file_type {!r}; only "txt" is supported'.format(file_type))

    X = []                                       # feature matrix
    y = []                                       # label vector
    # The with-block closes the file even if parsing a line raises.
    with open(dataset_path) as fr:
        for line in fr:
            lineArr = line.split()               # split on any whitespace, drops the trailing '\n'
            if not lineArr:                      # blank line: nothing to parse
                continue
            X.append([float(v) for v in lineArr[:-1]])
            y.append(float(lineArr[-1]))
    return X, y

## 2.2 Load Dataset

In [None]:
# read the data
import numpy as np
from sklearn.preprocessing import MinMaxScaler

X_train, y_train = loadDataSet("horseColicTraining.txt")
X_test, y_test = loadDataSet("horseColicTest.txt")

# transform the data from list to np.array
X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)

# normalize
# The scaler is fit on the training samples only, so that no statistics of the test
# set (per-feature min / max) leak into the preprocessing. The test samples are then
# mapped with the training ranges and may fall slightly outside [0, 1].
# NOTE: outputs stored in this notebook from earlier runs were produced with a scaler
# fit on train+test stacked together; re-run the cells below to refresh them.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## 2.3 Model Fitting

In [None]:
### Build the logistic regressor with the gradient-descent settings for this run
model = LogisticRegression(lr=0.1, max_itr=100000, tol = 1e-8, verbose=True)

### Train on the training split; fit() returns the stacked parameters (w first, b last)
W = model.fit(X_train, y_train)

### Show the whole parameter vector as a single row
print(W.T)

### Show the weight part and the bias part separately
n_features = X_test.shape[1]
weights = W[:n_features, 0]
bias = W[n_features, 0]
print("The weight w of LR is \n {}.".format(weights))
print("The bias b of LR is {}.".format(bias))

Logistic loss after 1 iterations is 0.6898762822641373
Logistic loss after 10001 iterations is 0.5217484321994517
Converged after 11802 iterations
Logistic loss after 11802 iterations is 0.52171924670628
[[ 0.7611359  -0.2002035   1.00541571 -2.51425541  0.82184505 -0.60528775
  -0.36426046 -1.38660266 -0.16129892 -1.17278023  1.47758935 -0.60870321
   1.39102436 -0.31464443 -0.98595971  0.58698705 -0.70142135 -0.4994333
   1.04153942  0.06140696 -1.07582939  0.95021878]]
The weight w of LR is 
 [ 0.7611359  -0.2002035   1.00541571 -2.51425541  0.82184505 -0.60528775
 -0.36426046 -1.38660266 -0.16129892 -1.17278023  1.47758935 -0.60870321
  1.39102436 -0.31464443 -0.98595971  0.58698705 -0.70142135 -0.4994333
  1.04153942  0.06140696 -1.07582939].
The bias b of LR is 0.9502187772809589.


## 2.4 Prediction and Evaluation

In [None]:
# Predicted labels come back as a column; take that column so it lines up with y_test.
y_pred = model.predict(X_test)
n_correct = np.sum(y_pred[:, 0] == y_test)
accuracy = n_correct / len(y_test)
print("Accuracy of LR on the test dataset is {}.".format(accuracy))

Accuracy of LR on the test dataset is 0.7164179104477612.
