In [17]:
from encodings import search_function
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

## Logistic Regression

In [20]:
class LogisticRegression:
    '''
    Binary logistic regression trained with batch gradient descent.

    The class stores the hyper parameters of the optimisation as attributes.
    It also contains the functions for initializing the class, fitting the
    logistic regression model and using the fitted model to predict test samples.

    Attributes:
        lr:        learning rate of gradient descent
        max_itr:   maximum number of iterations for gradient descent
        tol:       if the decrease in loss between two consecutive iterations
                   is smaller than tol, then we stop iterating
        W:         concatenation of weight w and bias b, shape (num_variables+1, 1);
                   set by fit()
        verbose:   whether or not to print the value of the logistic loss
                   every 10000 iterations

    '''
    def __init__(self, lr=0.01, max_itr=100000, tol = 1e-5, verbose = False):
        self.lr = lr
        self.max_itr = max_itr
        self.tol = tol
        self.verbose = verbose

    def __sigmoid(self, z):
        '''
        Numerically stable sigmoid, mapping real-valued scores to [0,1]

        Args:
            z (matrix, num_samples*1): scores or real value

        Returns:
            A matrix num_samples*1: values in the interval [0,1]
        '''
        z = np.asarray(z, dtype=float)
        # exp(-|z|) is always in (0, 1], so it can never overflow.
        # For z >= 0 the sigmoid is 1 / (1 + exp(-z));
        # for z <  0 it is exp(z) / (1 + exp(z)). Both share the denominator.
        e = np.exp(-np.abs(z))
        return np.where(z >= 0, 1.0, e) / (1.0 + e)

    def __logistic_loss(self, h, y):
        '''
        Calculate the mean logistic (cross-entropy) loss

        Args:
            h (matrix, num_samples*1): predicted probabilities of class 1
            y (matrix, num_samples*1): true labels, 0 or 1

        Returns:
            float: average logistic loss over the samples
        '''
        # Keep probabilities strictly inside (0, 1) so that log() never sees 0.
        eps = 1e-15
        h = np.clip(h, eps, 1 - eps)
        return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()

    def __with_bias(self, x):
        '''
        Append an all-one column to x so that the bias is learned as the last weight

        Args:
            x (matrix, num_samples*num_variables): input samples

        Returns:
            A matrix num_samples*(num_variables+1)
        '''
        x = np.asarray(x, dtype=float)
        return np.concatenate((x, np.ones((x.shape[0], 1))), axis=1)

    def fit(self, x, y):
        '''
        estimate the weight and bias in the logistic regression model by gradient descent

        Args:
            x (matrix, num_train*num_variables): input of training samples
            y (matrix, num_train*1 or vector of length num_train): labels of
                training samples, 0 or 1

        Returns:
            self.W (matrix, (num_variables+1)*1): estimation of weight and bias, i.e (w,b)

        Raises:
            ValueError: if x has no samples or x and y disagree on the number of samples
        '''
        ### Add the all-one vector to the last column
        X = self.__with_bias(x)
        m = X.shape[0]
        y = np.asarray(y, dtype=float).reshape(-1, 1)
        if m == 0:
            raise ValueError("fit() needs at least one training sample")
        if y.shape[0] != m:
            # Without this check a length-1 y would silently broadcast.
            raise ValueError(
                "x and y have inconsistent numbers of samples: {} vs {}".format(m, y.shape[0]))

        # weight and bias initialization
        self.W = np.zeros((X.shape[1], 1))

        h = self.__sigmoid(np.dot(X, self.W))
        previous_loss = self.__logistic_loss(h, y)
        for i in range(self.max_itr):
            # Gradient of the mean logistic loss: X^T (p - y) / m.
            # h already holds the probabilities for the current self.W.
            grad = np.dot(X.T, h - y) / m
            self.W = self.W - self.lr * grad

            # Calculate the new logistic loss
            h = self.__sigmoid(np.dot(X, self.W))
            current_loss = self.__logistic_loss(h, y)
            # Stops when the loss decrease falls below tol; note this is also
            # true when the loss goes up (e.g. learning rate too large).
            if previous_loss - current_loss < self.tol:
                print('Converged after {} iterations'.format(i+1))
                print('Logistic loss after {} iterations is {}'.format(i+1,current_loss))
                break
            previous_loss = current_loss
            if self.verbose and i % 10000 == 0:
                print('Logistic loss after {} iterations is {}'.format(i+1,current_loss))
        return self.W

    def predict_prob(self, x):
        '''
        predict the posterior probability p_1(x; W) of the test samples

        Args:
            x (matrix, num_test*num_variables): input of test samples

        Returns:
            y (matrix, num_test*1): predicted posterior probability p_1(x; W) of test samples
        '''
        return self.__sigmoid(np.dot(self.__with_bias(x), self.W))

    def predict(self, x):
        '''
        predict the label of the test samples

        Args:
            x (matrix, num_test*num_variables): input of test samples

        Returns:
            y (matrix, num_test*1): predicted labels of test samples, 0 or 1
        '''
        # A sample is labelled 1 only when its probability is strictly above 0.5.
        return np.where(self.predict_prob(x) > 0.5, 1, 0)

In [61]:
# load datasets function
def load_data(data_file_name, data_dir=None):
    '''
    Load a classification dataset from a CSV file.

    The last column of the file is used as the label; all other columns are
    used as features. Features are standardized with StandardScaler and the
    labels are converted to integer category codes.

    Args:
        data_file_name (str): name of the CSV file inside data_dir
        data_dir (str, optional): directory that contains the CSV file.
            Defaults to ../../../data/data_classification relative to the
            current working directory.

    Returns:
        data_X (matrix, num_samples*num_variables): standardized features
        data_y (vector, num_samples): integer codes of the labels
    '''
    if data_dir is None:
        # Built with os.path.join instead of a backslash literal: "\." and "\d"
        # are invalid escape sequences, and backslashes only work on Windows.
        data_dir = os.path.join("..", "..", "..", "data", "data_classification")
    data_path = os.path.join(data_dir, data_file_name)
    df = pd.read_csv(data_path)
    data_X = df.iloc[:, :-1]
    data_y = df.iloc[:, -1]
    # NOTE(review): the scaler is fit on the full dataset, i.e. before any
    # train/test split done by the caller, so test statistics leak into training.
    scaler_X = StandardScaler()
    data_X = scaler_X.fit_transform(data_X)
    data_y = pd.Categorical(data_y).codes.reshape(-1)
    return data_X, data_y

def main(lr=1, max_itr=100000, tol=1e-8, test_size=0.33, random_state=2200, verbose=True):
    '''
    Train and evaluate LogisticRegression on the messidor classification dataset.

    Loads the data, makes one random train/test split, fits the model on the
    training part, prints the estimated parameters and prints the accuracy on
    the test part. The defaults reproduce the previously hard-coded settings,
    so main() without arguments behaves as before.

    Args:
        lr (float): learning rate of gradient descent
        max_itr (int): maximum number of gradient descent iterations
        tol (float): stopping tolerance on the decrease of the loss
        test_size (float): fraction of the samples held out for testing
        random_state (int): seed of the random train/test split
        verbose (bool): whether the model prints the loss during training
    '''
    # read dataset from csv file
    data_name = "messidor_classification"
    data_X, data_y = load_data("{}.csv".format(data_name))

    # Randomly assigning a train and test set (single hold-out split)
    train_X, test_X, train_y, test_y = train_test_split(
        data_X, data_y, test_size=test_size, random_state=random_state)

    ### initiate the logistic regressor
    model = LogisticRegression(lr=lr, max_itr=max_itr, tol=tol, verbose=verbose)
    ### fit the model with training data and get the estimation of parameters (w & b)
    W = model.fit(train_X, train_y)
    ### Print the full parameter vector (w, b) as a row
    print(W.T)
    ### Print the estimated w and b separately; the bias is the last entry of W
    n_features = test_X.shape[1]
    print("The weight w of LR is \n {}.".format(W[:n_features,0].T))
    print("The bias b of LR is {}.".format(W[n_features,0]))

    y_pred = model.predict(test_X)
    # Fraction of test samples whose predicted label matches the true label
    accuracy = np.sum(y_pred[:,0] == test_y)/len(test_y)
    print("Accuracy of LR on the test dataset is {}.".format(accuracy))

In [47]:
# Settings recorded by the author for the run whose output follows:
# test_size=0.33, random_state=2200, lr=0.1, max_itr=100000, tol = 1e-8, verbose=True
# NOTE(review): no arguments are passed here, so these values were presumably
# set inside main() before this cell was executed - confirm before comparing runs.
main()

Logistic loss after 1 iterations is 0.6838623451102486
Logistic loss after 10001 iterations is 0.5100447489191648
Logistic loss after 20001 iterations is 0.49833145587368816
Logistic loss after 30001 iterations is 0.49234880920929286
Logistic loss after 40001 iterations is 0.4887060952482046
Logistic loss after 50001 iterations is 0.48628621226189894
Logistic loss after 60001 iterations is 0.4845822765472486
Logistic loss after 70001 iterations is 0.4833318116095802
Logistic loss after 80001 iterations is 0.48238539816737697
Logistic loss after 90001 iterations is 0.4816515366397042
[[ 0.52244598 -0.29337878 17.03643152 -3.30565035 -9.06976625 -3.00483606
  -0.21214185  0.3362852   0.56957169 -0.21787326 -0.07485432 -0.59727419
   0.59130027 -1.73886879  1.80174032  0.50608556  0.08228581 -0.26506719
  -0.2188865   1.08788073]]
The weight w of LR is 
 [ 0.52244598 -0.29337878 17.03643152 -3.30565035 -9.06976625 -3.00483606
 -0.21214185  0.3362852   0.56957169 -0.21787326 -0.07485432 -0

In [49]:
# Settings recorded by the author for the run whose output follows:
# test_size=0.33, random_state=2200, lr=0.01, max_itr=100000, tol = 1e-8, verbose=True
# NOTE(review): no arguments are passed here, so these values were presumably
# set inside main() before this cell was executed - confirm before comparing runs.
main()

Logistic loss after 1 iterations is 0.6921641748267755
Logistic loss after 10001 iterations is 0.5555033483898666
Logistic loss after 20001 iterations is 0.542446259552765
Logistic loss after 30001 iterations is 0.5340283371704706
Logistic loss after 40001 iterations is 0.527955115531314
Logistic loss after 50001 iterations is 0.5233343289308922
Logistic loss after 60001 iterations is 0.5196705066902175
Logistic loss after 70001 iterations is 0.5166664586948004
Logistic loss after 80001 iterations is 0.5141356876108761
Logistic loss after 90001 iterations is 0.5119565769119439
[[ 0.39141833 -0.26602959  6.12217218  1.34022129 -2.14053075 -2.90379363
  -1.75791881  0.47324419  0.55613116 -0.2982995   0.06378874 -0.64538668
   0.54006301 -1.08652891  1.03865021  0.87287042  0.06689074 -0.2864109
  -0.20862822  0.69042134]]
The weight w of LR is 
 [ 0.39141833 -0.26602959  6.12217218  1.34022129 -2.14053075 -2.90379363
 -1.75791881  0.47324419  0.55613116 -0.2982995   0.06378874 -0.645386

In [51]:
# Settings recorded by the author for the run whose output follows:
# test_size=0.33, random_state=2200, lr=0.01, max_itr=1000000, tol = 1e-8, verbose=True
# NOTE(review): no arguments are passed here, so these values were presumably
# set inside main() before this cell was executed - confirm before comparing runs.
main()

Logistic loss after 1 iterations is 0.6921641748267755
Logistic loss after 10001 iterations is 0.5555033483898666
Logistic loss after 20001 iterations is 0.542446259552765
Logistic loss after 30001 iterations is 0.5340283371704706
Logistic loss after 40001 iterations is 0.527955115531314
Logistic loss after 50001 iterations is 0.5233343289308922
Logistic loss after 60001 iterations is 0.5196705066902175
Logistic loss after 70001 iterations is 0.5166664586948004
Logistic loss after 80001 iterations is 0.5141356876108761
Logistic loss after 90001 iterations is 0.5119565769119439
Logistic loss after 100001 iterations is 0.5100470636570057
Logistic loss after 110001 iterations is 0.5083500496768416
Logistic loss after 120001 iterations is 0.5068246841432131
Logistic loss after 130001 iterations is 0.5054409811540579
Logistic loss after 140001 iterations is 0.5041763991833677
Logistic loss after 150001 iterations is 0.5030136115342634
Logistic loss after 160001 iterations is 0.5019390202560

In [62]:
# Final recorded run; no settings were noted for this execution.
# NOTE(review): presumably run with the values currently written in main()
# (lr=1, tol=1e-8) - confirm.
main()

Logistic loss after 1 iterations is 0.6503645821819763
Logistic loss after 10001 iterations is 0.481070252852604
Logistic loss after 20001 iterations is 0.478701109161681
Logistic loss after 30001 iterations is 0.4781392787274469
Converged after 39261 iterations
Logistic loss after 39261 iterations is 0.4779747259564311
[[ 0.5963301  -0.28901531 22.62791265 -9.19124023 -9.01601166 -2.3929928
  -0.29851959  0.27779321  0.57521055 -0.19728158 -0.10612059 -0.59066354
   0.55910458 -1.63791733  1.60915241  0.57491034  0.07528688 -0.25517917
  -0.21667281  1.23362117]]
The weight w of LR is 
 [ 0.5963301  -0.28901531 22.62791265 -9.19124023 -9.01601166 -2.3929928
 -0.29851959  0.27779321  0.57521055 -0.19728158 -0.10612059 -0.59066354
  0.55910458 -1.63791733  1.60915241  0.57491034  0.07528688 -0.25517917
 -0.21667281].
The bias b of LR is 1.2336211669052164.
Accuracy of LR on the test dataset is 0.7684210526315789.
