In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, accuracy_score

## Logistic Regression

In [4]:
class LogisticRegression:
    '''
    This is a class for Logistic Regression algorithm.
    
    The class contains the hyper parameters of the logistic regression algorithm as attributes.
    It also contains the functions for initializing the class, fitting the ridge regression model and use the fitted 
    model to predict test samples.
    
    Attributes:
        lr:        learning rate of gradient descent
        max_itr:   maximum number of iteration for gradient descent
        tol:       if the change in loss is smaller than tol, then we stop iteration
        W:         concatenation of weight w and bias b
        verbose:   whether or not print the value of logitic loss every 1000 iterations
        
    '''
    def __init__(self, lr=0.01, max_itr=100000, tol = 1e-5, verbose = False):
        self.lr = lr
        self.max_itr = max_itr
        self.tol = tol
        self.verbose = verbose
 
    def __sigmoid(self, z):
        '''
        Define the Sigmoid function to convert from real value to [0,1]
        
        Args: 
            z (matrix, num_samples*1): scores or real value
            
        Returns:
            A matrix (num_variables+1)*1: a value in the interval [0,1]
        '''
        return 1 / (1 + np.exp(-z))
    
    def __logistic_loss(self, h, y):
        '''
        Calculate the logistic loss
        '''
        return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()
    
    def fit(self, x, y):
        '''
        estimate the weight and bias in the logistic regression model by gradient descent
        
        Args: 
            x (matrix, num_train*num_variables): input of training samples
            y (matrix, num_test*1): labels of training samples, 0 or 1
            
        Returns:
            self.W (matrix, (num_variables+1)*1): estimation of weight and bias, i.e (w,b)
        '''
        ### Add the all-one vector to the last column 
        m = x.shape[0]
        X = np.concatenate((x, np.ones((m, 1))), axis=1)
        y = y.reshape(-1,1)
        # weight and bias initialization
        d = X.shape[1]
        self.W = np.zeros((d,1))
        
        z = np.dot(X, self.W)
        h = self.__sigmoid(z)
        previous_loss = self.__logistic_loss(h, y)
        for i in range(self.max_itr):
            #Calculate the gradient and update w and b
            z = np.dot(X, self.W)
            h = self.__sigmoid(z)
            gradient = np.dot(X.T, (h - y)) / m
            self.W -= self.lr * gradient
            
            #Calculate the new logistic loss
            z = np.dot(X, self.W)
            h = self.__sigmoid(z)
            current_loss = self.__logistic_loss(h, y)
            if previous_loss - current_loss < self.tol:
                print('Converged after {} iterations'.format(i+1))
                print('Logistic loss after {} iterations is {}'.format(i+1,current_loss))
                break
            else:
                previous_loss = current_loss
            if(self.verbose == True and i % 10000 == 0):
                print('Logistic loss after {} iterations is {}'.format(i+1,current_loss))
        return self.W
    
    def predict_prob(self, x):
        '''
        predict the posterior probability p_1(x; W) of the test samples
        
        Args: 
            x (matrix, num_test*num_variables): input of test samples
            
        Returns:
            y (matrix, num_test*1): predicted posterior probability p_1(x; W) of test samples
        ''' 
        m = x.shape[0]
        X = np.concatenate((x, np.ones((m, 1))), axis=1)
        return self.__sigmoid(np.dot(X, self.W))
    
    def predict(self, x):
        '''
        predict the label of the test samples
        
        Args: 
            x (matrix, num_test*num_variables): input of test samples
            
        Returns:
            y (matrix, num_test*1): predicted labels of test samples, 0 or 1
        ''' 
        return self.predict_prob(x).round()

In [56]:
# load datasets function
def load_data(data_file_name):
    data_dir = "..\..\data\data_classification"
    data_path = os.path.join(data_dir, data_file_name)
    print(data_path)
    df = pd.read_csv(data_path)
    data_X = df.iloc[:,:-1]
    data_y = df.iloc[:,-1]
    scaler_X = StandardScaler()
    data_X = scaler_X.fit_transform(data_X)
    data_y = pd.Categorical(data_y).codes.reshape(-1)
    return data_X, data_y


def main():

    # model-specified parameters
    n_estimators = 5

    # read dataset from csv file
    data_name = "messidor_classification"
    data_X, data_y = load_data("{}.csv".format(data_name))
    pd.csv.split(5)

    # CrossValidation with 5 splits
    kf = KFold(n_splits=5)
    res_list = []
    for train_index, test_index in kf.split(data_X):
        train_X, train_y = data_X[train_index,:], data_y[train_index]
        test_X, test_y = data_X[test_index,:], data_y[test_index]
    
    return train_X, train_y,test_X, test_y

def run():
    ### initiate the logistic regressor
    model = LogisticRegression(lr=0.1, max_itr=100000, tol = 1e-8, verbose=True)
    ### fit the model with training data and get the estimation of parameters (w & b)
    W = model.fit(main()[0], main()[1])
    ### Print the estimated w and b
    print(W.T)
    ### Print the estimated w and b
    print("The weight w of LR is \n {}.".format(W[:main()[2].shape[1],0].T))
    print("The bias b of LR is {}.".format(W[main()[2].shape[1],0]))

    y_pred = model.predict(main()[2])
    accuracy = np.sum(y_pred[:,0] == main()[3])/len(main()[3])
    print("Accuracy of LR on the test dataset is {}.".format(accuracy))

In [57]:
run()

..\..\data\data_classification\messidor_classification.csv
..\..\data\data_classification\messidor_classification.csv
Logistic loss after 1 iterations is 0.6850519586171843


  return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()
  return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()


Logistic loss after 10001 iterations is nan
Logistic loss after 20001 iterations is nan
Logistic loss after 30001 iterations is nan
Logistic loss after 40001 iterations is nan
Logistic loss after 50001 iterations is nan
Logistic loss after 60001 iterations is nan
Logistic loss after 70001 iterations is nan
Logistic loss after 80001 iterations is nan
Logistic loss after 90001 iterations is nan
[[ 5.39305776e-01 -3.02237216e-01  1.73272888e+01 -3.86641401e+00
  -8.11513173e+00 -3.18147228e+00 -1.20821519e+00  8.77367275e-01
   5.83633193e-01 -4.78478074e-01  1.07891459e-01 -6.28442453e-01
   4.30643900e-01 -2.14016203e+00  3.99311985e+00  4.64742622e-01
   4.98400131e-03 -4.80248669e-02 -5.19470564e-02  1.39062746e+00]]
..\..\data\data_classification\messidor_classification.csv
The weight w of LR is 
 [ 5.39305776e-01 -3.02237216e-01  1.73272888e+01 -3.86641401e+00
 -8.11513173e+00 -3.18147228e+00 -1.20821519e+00  8.77367275e-01
  5.83633193e-01 -4.78478074e-01  1.07891459e-01 -6.2844245