In [42]:
import numpy as np
import random
import os


In [43]:
def nan_check(data, label):
    """Drop every sample (row) that contains at least one NaN feature.

    data     2-D array-like of numeric features, one sample per row
    label    array-like of labels aligned with the rows of data

    Returns (data, label) as new arrays with the NaN rows removed from both,
    so the two stay aligned. The inputs are not modified.
    """

    # Casting to float lets np.isnan test the values directly (this also
    # covers 'nan' strings) rather than comparing str(value) with 'nan'.
    features = np.asarray(data, dtype=float)

    # One flag per row, computed over however many columns the data has
    # instead of a hard-coded 16.
    nan_rows = np.flatnonzero(np.isnan(features).any(axis=1))

    # np.delete returns copies with the flagged rows left out.
    return np.delete(data, nan_rows, 0), np.delete(label, nan_rows, 0)

In [44]:
def shuffle(data_set, label_set):
    """Randomly permute the samples, keeping each label with its data row.

    data_set    the data samples, one per row
    label_set   the labels, aligned with the rows of data_set

    Returns (shuffled_data, shuffled_label) as new float arrays; the inputs
    are not modified. The permutation is drawn from the standard `random`
    module, so random.seed() makes it reproducible.
    """

    data_set = np.asarray(data_set)
    label_set = np.asarray(label_set)

    # list(range(...)) works on Python 2 and 3 alike (xrange exists only on
    # Python 2), and gives random.shuffle a plain mutable list to permute.
    order = list(range(len(label_set)))
    random.shuffle(order)

    # One fancy-indexing step replaces the row-by-row copy loop. The float
    # cast keeps the dtype that the former np.zeros buffers produced.
    return data_set[order].astype(float), label_set[order].astype(float)

In [45]:
def get_data(set_type):
    """Load one dataset from disk and return it cleaned and shuffled.

    set_type    the type of data set to build: 'train', 'dev' or 'eval'

    The label file for the chosen set is read first. As indexed below, its
    first column is the label and its second column is the name of the file
    holding that sample's features. Only the first line of each feature file
    is used. Rows containing NaN are then dropped and the samples shuffled.

    Returns (data_set, label_set). Raises KeyError for an unknown set_type
    and ValueError when a feature file's first line holds no values.
    """

    data_path = {'train': 'train/lab/hw1train_labels.txt',
                 'dev': 'dev/lab/hw1dev_labels.txt',
                 'eval': 'eval/lab/hw1eval_labels.txt'}

    # dtype=str replaces the Python 2-era 'string' alias. ndmin=2 keeps a
    # one-line label file as a single row instead of collapsing it to 1-D,
    # which would make row[0] / row[1] index characters instead of columns.
    label_array = np.loadtxt(data_path[set_type], dtype=str, ndmin=2)

    # create empty arrays to hold the labels and the data
    label_set = np.zeros([len(label_array), 1])
    data_set = np.zeros([len(label_array), 16])

    for i, row in enumerate(label_array):
        # build the label set
        label_set[i] = float(row[0])

        # build the data set: only the first line of the file is needed, so
        # read just that line rather than the whole file
        with open(row[1]) as data_file:
            values = data_file.readline().split()
        if not values:
            raise ValueError('no feature values in %s' % row[1])

        # a line with fewer values than columns leaves the remaining
        # columns at zero
        data_set[i, :len(values)] = [float(value) for value in values]

    data_set, label_set = nan_check(data_set, label_set) # delete the rows containing 'nan'

    return shuffle(data_set, label_set) # return the shuffled data set and label set

In [46]:
def linear_regression_gradient(data, label, weight, b):
    """Gradient of the summed squared error of the linear model w.x + b.

    data      samples, indexed in step with label
    label     target value for each sample
    weight    current weight vector
    b         current bias

    Returns (gradient_w, gradient_b), accumulated over all samples without
    averaging. Both are the plain integer 0 when label is empty.
    """

    gradient_w = 0
    gradient_b = 0
    for idx, target in enumerate(label):
        sample = data[idx]
        # derivative of (target - z)**2 with respect to z, where z = w.x + b
        scaled_residual = (-2) * (target - (np.dot(weight, sample) + b))
        gradient_w += scaled_residual * sample
        gradient_b += scaled_residual

    return gradient_w, gradient_b

In [54]:
def logistic_regression_gradient(data, label, weight, b):
    """Averaged gradient used for the logistic regression training step.

    data      samples, indexed in step with label
    label     target value for each sample
    weight    current weight vector
    b         current bias

    For each sample, with z = w.x + b, the term
    (-2) * (z - label) * z * (1 - z) is accumulated into gradient_b and,
    multiplied by the sample, into gradient_w.

    Returns (gradient_w, gradient_b), each divided by len(label).

    NOTE(review): z is used directly in the z * (1 - z) factor; no sigmoid
    is applied anywhere in this function. Confirm this is the intended model.
    """

    gradient_w = 0
    gradient_b = 0
    sample_count = len(label)
    for idx in range(sample_count):
        z = np.dot(weight, data[idx]) + b
        factor = (-2) * (z - label[idx]) * z * (1 - z)
        gradient_w += factor * data[idx]
        gradient_b += factor

    return gradient_w / sample_count, gradient_b / sample_count

In [55]:
def gradient_descent(weight, b, learning_rate, gradient_w, gradient_b):
    """Take one gradient-descent step and return the updated (weight, b).

    weight, b                 current parameters
    learning_rate             step size
    gradient_w, gradient_b    gradients for weight and b

    The updates use augmented assignment, so a NumPy array passed as weight
    (or b) is modified in place as well as returned.
    """

    step_w = learning_rate * gradient_w
    step_b = learning_rate * gradient_b

    weight -= step_w
    b -= step_b

    return weight, b

In [56]:
def compute_MSE(data, label, weight, b, mse):
    """Compute the mean square error of the linear prediction w.x + b.

    data, label    samples and their target values, indexed in step
    weight, b      model parameters
    mse            starting value of the accumulator; it is included in the
                   total before the division by len(label)

    Returns the accumulated squared error divided by len(label).
    """

    for idx, target in enumerate(label):
        residual = target - (np.dot(weight, data[idx]) + b)
        mse += residual ** 2

    return mse / len(label)

In [57]:
def compute_mse(dev_data, dev_label, w, b):
    """Compute the mean square error of (w, b) on the given data.

    Delegates to compute_MSE with the accumulator started at zero.
    """

    return compute_MSE(dev_data, dev_label, w, b, 0)

In [58]:
def compute_acc(data, label, w, b):
    """Accuracy: fraction of samples whose rounded prediction equals the label.

    The prediction for a sample is round(w.x + b).
    Returns a float between 0 and 1.
    """

    hits = sum(1 for idx in range(len(label))
               if label[idx] == round(np.dot(w, data[idx]) + b))
    return hits / float(len(label))

In [59]:
def activate(epoch = 1000, lr = 0.0001):
    """Train on the 'train' set and report dev-set metrics after every epoch.

    epoch    number of gradient-descent updates to run
    lr       learning rate used for each update

    Each epoch applies one update computed by logistic_regression_gradient
    on the training set, then prints the dev-set loss (compute_mse) and
    error rate (1 - compute_acc).

    Returns the final (w, b) so the trained parameters can be reused.
    """

    train_data, train_label = get_data('train') # build the dataset for training
    dev_data, dev_label = get_data('dev')

    # parameter initialization: one weight per feature column, drawn
    # uniformly from [-1, 1)
    w = 2 * np.random.random(size = train_data.shape[1]) - 1
    b = 0

    for i in range(epoch):
        g_w, g_b = logistic_regression_gradient(train_data, train_label, w, b)
        w, b = gradient_descent(w, b, lr, g_w, g_b)

        mse = compute_mse(dev_data, dev_label, w, b)
        acc = compute_acc(dev_data, dev_label, w, b)

        # The loss can come back as a one-element array (labels are stored
        # as a column), so unwrap it to a scalar before formatting with %f.
        # print(...) with one parenthesised argument runs on Python 2 and 3.
        print("epoch %d, loss: %f, error rate: %s " % (i, np.asarray(mse).item(), 1 - acc))

    return w, b

In [None]:
# Run training: 2000 epochs with a learning rate of 0.00001.
activate(epoch=2000, lr=0.00001)

epoch 0, loss: 12.331356, error rate: 0.847286821705 
epoch 1, loss: 11.834105, error rate: 0.844186046512 
epoch 2, loss: 11.381098, error rate: 0.843410852713 
epoch 3, loss: 10.966595, error rate: 0.838759689922 
epoch 4, loss: 10.585818, error rate: 0.843410852713 
epoch 5, loss: 10.234752, error rate: 0.841860465116 
epoch 6, loss: 9.910003, error rate: 0.839534883721 
epoch 7, loss: 9.608678, error rate: 0.838759689922 
epoch 8, loss: 9.328298, error rate: 0.837209302326 
epoch 9, loss: 9.066722, error rate: 0.835658914729 
epoch 10, loss: 8.822097, error rate: 0.832558139535 
epoch 11, loss: 8.592804, error rate: 0.829457364341 
epoch 12, loss: 8.377428, error rate: 0.831007751938 
epoch 13, loss: 8.174724, error rate: 0.831007751938 
epoch 14, loss: 7.983590, error rate: 0.827131782946 
epoch 15, loss: 7.803053, error rate: 0.827131782946 
epoch 16, loss: 7.632243, error rate: 0.827131782946 
epoch 17, loss: 7.470386, error rate: 0.826356589147 
epoch 18, loss: 7.316788, error 