In [1]:
from matplotlib import pyplot as plt
import numpy as np 
import math
import matplotlib.pyplot as plt
%matplotlib inline

### Defined by me

In [2]:
def sigmoid(s):
    '''
    Logistic (sigmoid) function 1 / (1 + e^(-s)).

    Args:
    s: a scalar or a numpy array; the function is applied element-wise.

    Returns:
        A value (or array of the same shape as s) with every entry mapped
        through the logistic function.
    '''
    denominator = 1 + np.exp(-s)
    return 1 / denominator

In [80]:
def logistic_regression(data, label, max_iter, learning_rate):
    '''
    Train a logistic regression classifier with batch gradient descent.

    Args:
    data: train data with shape (N, d): N samples with d features each
          (in this notebook: 1, symmetry, average intensity).

    label: labels of the train data, either shape (N,) or (N, 1).
           1 for digit number 1 and -1 for digit number 5.

    max_iter: number of gradient descent iterations to run.

    learning_rate: step size used for every weight update.

    Returns:
        w: the separator with shape (d, 1), initialised to np.zeros((d, 1)).

    Raises:
        ValueError: if data contains no samples, or if the number of labels
                    does not match the number of samples.
    '''
    data = np.asarray(data, dtype=float)
    # Accept both (N,) and (N, 1) label layouts by flattening to 1-D.
    targets = np.asarray(label, dtype=float).reshape(-1)

    N, m = data.shape
    if N == 0:
        raise ValueError("logistic_regression() needs at least one training sample")
    if targets.shape[0] != N:
        raise ValueError("label must contain exactly one entry per row of data")

    w = np.zeros((m, 1))
    for _ in range(max_iter):
        # y_n * (w^T x_n) for every sample at once, shape (N,).
        margins = targets * np.dot(data, w).reshape(-1)
        # Per-sample factor -y_n * sigmoid(-y_n * w^T x_n), shape (N,).
        factors = -targets * sigmoid(-margins)
        # Average of factor_n * x_n over all samples, as a (m, 1) column.
        g = np.dot(data.T, factors).reshape((m, 1)) / N
        # Step against the gradient.
        w = w - learning_rate * g
    return w

In [4]:
def accuracy(x, y, w):
    '''
    Compute the accuracy of a logistic regression model.

    Args:
    x: input data with shape (n, d), where n is the number of samples and d
       the number of features per sample.

    y: labels of x (+1 / -1), with shape (n,) or (n, 1).

    w: the separator learnt by logistic_regression, with shape (d, 1)
       (a flat (d,) vector also works).

    Returns:
        accuracy: fraction of correctly classified samples as a Python float.
        A sample is classified as 1 when its predicted probability is
        >= 0.5 and as -1 otherwise.

    Raises:
        ValueError: if y is empty, or if the number of predictions differs
                    from the number of labels.
    '''
    truth = np.asarray(y).reshape(-1)
    if truth.size == 0:
        raise ValueError("accuracy() needs at least one labelled sample")

    # One probability per sample, flattened so (n, 1) and (n,) scores behave alike.
    probabilities = np.asarray(sigmoid(np.dot(x, w))).reshape(-1)
    if probabilities.shape[0] != truth.shape[0]:
        raise ValueError("x and y must describe the same number of samples")

    predictions = np.where(probabilities >= 0.5, 1, -1)
    # Mean of the boolean matches equals (#correct / #samples).
    return float(np.mean(predictions == truth))

In [60]:
def thirdorder(data):
    '''
    Apply a 3rd order polynomial transform to two input features.

    Args:
    data: 2-D numpy array with one row per sample. Only the first two
          columns are read; they are treated as x_1 and x_2 (the caller in
          this notebook passes the symmetry and intensity columns, without
          the leading bias column).

    Return:
    result: numpy array with shape (:, 10) whose columns are, in order,
            1, x_1, x_2, x_1^2, x_1*x_2, x_2^2, x_1^3, x_1^2*x_2,
            x_1*x_2^2, x_2^3. The first dimension is the number of samples.
    '''
    n_samples, _ = data.shape

    # Keep both features as (n, 1) columns so they can be joined side by side.
    x1 = data[:, 0:1]
    x2 = data[:, 1:2]

    x1_sq, x2_sq = np.power(x1, 2), np.power(x2, 2)
    x1_cu, x2_cu = np.power(x1, 3), np.power(x2, 3)

    columns = [
        np.ones((n_samples, 1)),  # 1
        x1,                       # x_1
        x2,                       # x_2
        x1_sq,                    # (x_1)^2
        x1 * x2,                  # x_1 * x_2
        x2_sq,                    # (x_2)^2
        x1_cu,                    # (x_1)^3
        x1_sq * x2,               # (x_1)^2 * x_2
        x1 * x2_sq,               # x_1 * (x_2)^2
        x2_cu,                    # (x_2)^3
    ]
    return np.concatenate(columns, axis=1)

### Defined by Professor

In [82]:
#Use for testing the training and testing processes of a model
def train_test_a_model(modelname, train_data, train_label, test_data, test_label, max_iter, learning_rate):
    '''
    Train and evaluate a model over a sweep of iteration counts and a sweep
    of learning rates, printing train/test accuracy for every case.

    Args:
    modelname: name printed in the start and end banners.
    train_data, train_label: samples and labels used for fitting.
    test_data, test_label: samples and labels used for the test accuracy.
    max_iter: sequence of iteration counts; the learning-rate sweep uses
              the entry at index 3.
    learning_rate: sequence of learning rates; the iteration sweep uses the
                   entry at index 1.
    '''
    def fit_and_report(template, case_no, n_iter, step):
        # Fit once, score on both splits, then print one result line.
        weights = logistic_regression(train_data, train_label, n_iter, step)
        train_acc = accuracy(train_data, train_label, weights)
        test_acc = accuracy(test_data, test_label, weights)
        print(template % (case_no, train_acc, test_acc))

    print(modelname+" testing...")

    # max iteration test cases (learning rate fixed to learning_rate[1])
    for case_no, n_iter in enumerate(max_iter):
        fit_and_report("max iteration testcase%d: Train accuracy: %f, Test accuracy: %f",
                       case_no, n_iter, learning_rate[1])

    # learning rate test cases (iteration count fixed to max_iter[3])
    for case_no, step in enumerate(learning_rate):
        fit_and_report("learning rate testcase%d: Train accuracy: %f, Test accuracy: %f",
                       case_no, max_iter[3], step)

    print(modelname+" test done.")

In [72]:
def extract_feature(image):
    '''
    Compute the two hand-crafted features of one digit image.

    Args:
    image: 256 pixel values; they are reshaped to a 16 x 16 grid.

    Returns:
        (symmetry, intensity):
        symmetry  - negated sum of absolute differences between the image
                    and its left-right mirrored copy, divided by 256.
        intensity - sum of all pixel values divided by 256.
    '''
    pixels = np.reshape(image, (16, 16))
    mirrored = np.flip(pixels, 1)  # reverse the column order
    # Builtin sum over a 2-D array adds the rows; the second sum collapses the rest.
    asymmetry = sum(sum(abs(pixels - mirrored)))
    brightness = sum(sum(pixels))
    return -asymmetry / 256, brightness / 256

In [73]:
def load_data(dataloc):
    '''
    Load a whitespace-separated numeric text file.

    Args:
    dataloc: anything np.loadtxt accepts as its first argument
             (e.g. a file path).

    Returns:
        A 2-D numpy array with one row per line of the file and one column
        per value on a line. The result stays 2-D even when the file holds
        a single line.
    '''
    # ndmin=2 keeps the (rows, columns) layout for single-row input; the
    # previous unpack-then-transpose round trip failed on 1-D results.
    return np.loadtxt(dataloc, ndmin=2)

In [74]:
def load_features(dataloc):
    '''
    Load a digit data file and turn every sample into a feature vector.

    Args:
    dataloc: location of the data file, forwarded to load_data.

    Returns:
        (features, labels):
        features - array with one row [1, symmetry, intensity] per sample.
        labels   - 1-D array holding 1 where the first column of the raw
                   row equals 1 and -1 otherwise.
    '''
    raw = load_data(dataloc)
    n_rows, _ = raw.shape

    records = []
    for row in raw[:n_rows]:
        # Column 0 is the digit; the remaining columns are the pixels.
        target = 1 if row[0] == 1 else -1
        symmetry, intensity = extract_feature(row[1:])
        records.append([target, 1, symmetry, intensity])

    table = np.array(records)
    return table[:, 1:], table[:, 0]

In [75]:
def test_logistic_regression():
    '''
    Run the logistic regression experiment on the plain
    (1, symmetry, intensity) features.

    Loads the train and test files, then calls train_test_a_model with a
    fixed list of iteration counts and learning rates. If the experiment
    raises, the original hint is printed together with the actual error so
    the failure can be diagnosed.
    '''
    max_iter = [100, 200, 500,1000]
    learning_rate = [0.1, 0.2, 0.5]
    traindataloc,testdataloc = "../data/train.txt", "../data/test.txt"
    train_data,train_label = load_features(traindataloc)
    test_data, test_label = load_features(testdataloc)
    try:
        train_test_a_model("logistic regression", train_data, train_label, test_data,test_label, max_iter, learning_rate)
    except Exception as err:
        # Catch Exception rather than everything so KeyboardInterrupt and
        # SystemExit still propagate.
        # NOTE(review): the hint mentions cross_entropy_error(), which is not
        # defined in the visible part of this notebook.
        print("Please finish logistic_regression() and cross_entropy_error() functions \n\
        before you run the test_logistic_regression() function.\n")
        print("Underlying error: %s: %s" % (type(err).__name__, err))

In [28]:
# Run the baseline logistic regression experiment; the recorded output follows.
# NOTE(review): this cell's execution count (28) is lower than those of the
# definitions it relies on (75, 80, 82), so the output shown below may come
# from older code - re-run with Restart Kernel -> Run All to confirm.
test_logistic_regression()

logistic regression testing...
max iteration testcase0: Train accuracy: 0.834721, Test accuracy: 0.827830
max iteration testcase1: Train accuracy: 0.924407, Test accuracy: 0.900943
max iteration testcase2: Train accuracy: 0.966047, Test accuracy: 0.941038
max iteration testcase3: Train accuracy: 0.973735, Test accuracy: 0.950472
learning rate testcase0: Train accuracy: 0.966047, Test accuracy: 0.941038
learning rate testcase1: Train accuracy: 0.973735, Test accuracy: 0.950472
learning rate testcase2: Train accuracy: 0.978860, Test accuracy: 0.962264
logistic regression test done.


In [83]:
def test_thirdorder_logistic_regression():
    '''
    Run the logistic regression experiment on 3rd order polynomial features.

    Loads the train and test files, expands columns 1 and 2 of the feature
    matrices (the bias column 0 is left out) with thirdorder, then calls
    train_test_a_model with a fixed list of iteration counts and learning
    rates. If the experiment raises, the original hint is printed together
    with the actual error so the failure can be diagnosed.
    '''
    max_iter = [100, 200, 500,1000]
    learning_rate = [0.1, 0.2, 0.5]
    traindataloc,testdataloc = "../data/train.txt", "../data/test.txt"
    train_data,train_label = load_features(traindataloc)
    test_data, test_label = load_features(testdataloc)
    try:
        new_train_data = thirdorder(train_data[:,1:3])
        new_test_data = thirdorder(test_data[:,1:3])
        train_test_a_model("3rd order logistic regression", new_train_data, train_label,
                           new_test_data, test_label, max_iter, learning_rate)
    except Exception as err:
        # Catch Exception rather than everything so KeyboardInterrupt and
        # SystemExit still propagate.
        print("Please finish thirdorder() function before you run\n\
                the test_thirdorder_logistic_regression() function.\n")
        print("Underlying error: %s: %s" % (type(err).__name__, err))

In [84]:
# Run the 3rd order polynomial experiment; the recorded output follows.
test_thirdorder_logistic_regression()

3rd order logistic regression testing...
max iteration testcase0: Train accuracy: 0.924407, Test accuracy: 0.898585
max iteration testcase1: Train accuracy: 0.958360, Test accuracy: 0.941038
max iteration testcase2: Train accuracy: 0.970532, Test accuracy: 0.948113
max iteration testcase3: Train accuracy: 0.975016, Test accuracy: 0.955189
learning rate testcase0: Train accuracy: 0.970532, Test accuracy: 0.948113
learning rate testcase1: Train accuracy: 0.975016, Test accuracy: 0.955189
learning rate testcase2: Train accuracy: 0.978219, Test accuracy: 0.964623
3rd order logistic regression test done.
