In [1]:
import numpy as np
import random
import os


In [2]:
def nan_check(data, label):
    """Drop every sample (row) that contains at least one NaN value.

    data     numeric array with one sample per row; any number of columns
    label    array whose first axis is aligned with the rows of data

    Returns a (data, label) tuple of new arrays in which the rows holding a
    NaN in data have been removed from both. The inputs are not modified.
    """

    # Detect NaNs numerically over *all* columns instead of comparing the
    # string form of a fixed number of columns.
    nan_mask = np.isnan(np.asarray(data, dtype=float))
    if nan_mask.ndim > 1:
        # Collapse everything but the sample axis: a row is bad if any of
        # its entries is NaN.
        nan_mask = nan_mask.any(axis=tuple(range(1, nan_mask.ndim)))

    # Each offending row index appears exactly once.
    nan_rows = np.flatnonzero(nan_mask)

    # np.delete returns copies, so the caller's arrays stay intact.
    return np.delete(data, nan_rows, 0), np.delete(label, nan_rows, 0)

In [3]:
def shuffle(data_set, label_set):
    """Randomly shuffle the samples, keeping data and labels aligned.

    data_set    the data samples, one sample per row

    label_set   the labels, one per sample (same length as data_set)

    Returns (shuffled_data, shuffled_label) as new float arrays; the inputs
    are not modified. The permutation is drawn from the standard-library
    `random` module, so `random.seed(...)` makes it reproducible.

    Raises ValueError if data_set and label_set differ in length.
    """

    sample_count = len(label_set)
    if len(data_set) != sample_count:
        raise ValueError("data_set has %d samples but label_set has %d"
                         % (len(data_set), sample_count))

    # range() works on both Python 2 and 3 (xrange is Python-2 only).
    order = list(range(sample_count))
    random.shuffle(order)
    order = np.array(order, dtype=np.intp)  # explicit int dtype also covers the empty case

    # One fancy-index per array applies the same permutation to both;
    # the float conversion keeps the float result arrays of the original.
    shuffled_data = np.asarray(data_set, dtype=float)[order]
    shuffled_label = np.asarray(label_set, dtype=float)[order]
    return shuffled_data, shuffled_label

In [4]:
def get_data(set_type):
    """Load one data split from disk and return it cleaned and shuffled.

    set_type    which split to build: 'train', 'dev' or 'eval'

    The label file of the split has one sample per line: the first column is
    the label, the second column is the path of the file whose first line
    holds that sample's feature values. Paths are resolved relative to the
    current working directory.

    Returns (data_set, label_set) after rows containing NaN have been removed
    by nan_check() and the remaining samples reordered by shuffle().

    Raises KeyError for an unknown set_type and IndexError when a data file
    is empty or holds more values than the data set has columns.
    """

    data_path = {'train': 'train/lab/hw1train_labels.txt', 'dev': 'dev/lab/hw1dev_labels.txt',
                 'eval': 'eval/lab/hw1eval_labels.txt'}
    n_features = 16  # number of feature columns reserved per sample

    # dtype=str is accepted on Python 2 and 3 ('string' is Python-2 only);
    # ndmin=2 keeps a one-line label file as a single (label, path) row
    # instead of collapsing it to a 1-D array.
    label_array = np.loadtxt(data_path[set_type], dtype=str, ndmin=2)
    sample_count = len(label_array)

    # Pre-allocate the label and data arrays.
    label_set = np.zeros([sample_count, 1])
    data_set = np.zeros([sample_count, n_features])

    for i in range(sample_count):
        # Column 0: the label of this sample.
        label_set[i] = float(label_array[i][0])

        # Column 1: the file holding the features; only its first line is used.
        data_file_name = label_array[i][1]
        with open(data_file_name) as data_file:
            first_line = data_file.readline()
        if not first_line:
            raise IndexError("data file %s is empty" % data_file_name)

        values = first_line.split()
        if len(values) > n_features:
            raise IndexError("data file %s holds %d values, expected at most %d"
                             % (data_file_name, len(values), n_features))
        # Columns beyond the values present in the file stay 0.
        data_set[i, :len(values)] = [float(value) for value in values]

    data_set, label_set = nan_check(data_set, label_set)  # drop the rows containing NaN

    return shuffle(data_set, label_set)  # shuffled data set and label set

In [5]:
def linear_regression_gradient(data, label, weight, b):
    """Sum the squared-error gradient of a linear model over all samples.

    For every sample i the prediction is np.dot(weight, data[i]) + b; the
    contribution -2 * (label[i] - prediction) is added to the bias gradient
    and, multiplied by data[i], to the weight gradient.

    Returns (gradient_w, gradient_b). Both are the integer 0 when label is
    empty.
    """

    gradient_w = 0
    gradient_b = 0
    for i, target in enumerate(label):
        sample = data[i]
        prediction = np.dot(weight, sample) + b
        # Shared factor of both partial derivatives.
        scaled_residual = (-2) * (target - prediction)
        gradient_w += scaled_residual * sample
        gradient_b += scaled_residual

    return gradient_w, gradient_b

In [6]:
def gradient_descent(weight, b, learning_rate, gradient_w, gradient_b):
    """Take one gradient-descent step and return the updated (weight, b).

    Each parameter is moved against its gradient, scaled by learning_rate.
    The update uses `-=`, so a numpy array passed as weight (or b) is
    modified in place and the same object is returned.
    """

    weight_step = learning_rate * gradient_w
    weight -= weight_step

    bias_step = learning_rate * gradient_b
    b -= bias_step

    return weight, b

In [7]:
def compute_MSE(data, label, weight, b, mse):
    """Accumulate squared prediction errors onto mse and average them.

    For every sample i the squared difference between label[i] and
    np.dot(weight, data[i]) + b is added to mse (the starting value of the
    accumulator, normally 0). The total is then divided by len(label).
    """

    sample_count = len(label)
    for i, target in enumerate(label):
        residual = target - (np.dot(weight, data[i]) + b)
        mse += residual ** 2

    return mse / sample_count

In [8]:
def compute_mse(dev_data, dev_label, w, b):
    """Return the mean squared error of the linear model (w, b) on a data set.

    Thin wrapper around compute_MSE() that starts the accumulator at 0.
    """

    initial_total = 0
    return compute_MSE(dev_data, dev_label, w, b, initial_total)

In [11]:
def compute_acc(data, label, w, b):
    """Return the fraction of samples whose rounded prediction equals the label.

    data     array with one sample per row
    label    one label per sample, shaped (n,) or (n, 1)
    w        weight vector of the linear model
    b        bias; a scalar or a one-element array

    A prediction is np.dot(data[i], w) + b rounded to the nearest integer
    (exact .5 ties go to the even integer, like Python 3's round()).

    Raises ValueError if label holds more than one value per sample and
    ZeroDivisionError if label is empty.
    """

    targets = np.asarray(label, dtype=float)
    sample_count = targets.shape[0]
    if targets.size != sample_count:
        raise ValueError("label must hold exactly one value per sample")
    targets = targets.reshape(-1)

    # Only the rows that have a label take part.
    features = np.asarray(data, dtype=float)[:sample_count]

    # np.rint works element-wise on arrays; the built-in round() does not
    # accept the one-element arrays that b turns into during training.
    predictions = np.rint(np.dot(features, w) + b).reshape(-1)

    correct = int(np.count_nonzero(predictions == targets))
    return correct / float(sample_count)

In [17]:
def activate(epoch = 300, lr = 0.000001):
    """Train the linear model by gradient descent and log dev-set metrics.

    epoch    number of full passes over the training set
    lr       learning rate of each gradient-descent step

    The 16 weights start uniformly at random in [-1, 1) and the bias at 0.
    After every epoch the loss (compute_mse) and the error rate
    (1 - compute_acc) on the dev set are printed. Nothing is returned.
    """

    # parameter initialization
    weights = 2 * np.random.random(size = 16) - 1
    bias = 0

    # training split drives the updates, dev split is only evaluated
    train_data, train_label = get_data('train')
    dev_data, dev_label = get_data('dev')

    for epoch_idx in range(epoch):
        grad_w, grad_b = linear_regression_gradient(train_data, train_label, weights, bias)
        weights, bias = gradient_descent(weights, bias, lr, grad_w, grad_b)

        dev_loss = compute_mse(dev_data, dev_label, weights, bias)
        dev_error = 1 - compute_acc(dev_data, dev_label, weights, bias)

        print("epoch %d, loss: %f, error rate: %s " % (epoch_idx, dev_loss, dev_error))

In [18]:
# Run training with the defaults (epoch=300, lr=0.000001); prints dev-set loss and error rate per epoch.
activate()

epoch 0, loss: 5.423835, error rate: 0.820930232558 
epoch 1, loss: 2.971491, error rate: 0.749612403101 
epoch 2, loss: 2.669611, error rate: 0.73488372093 
epoch 3, loss: 2.541482, error rate: 0.728682170543 
epoch 4, loss: 2.441614, error rate: 0.724031007752 
epoch 5, loss: 2.352348, error rate: 0.723255813953 
epoch 6, loss: 2.270014, error rate: 0.713178294574 
epoch 7, loss: 2.193340, error rate: 0.709302325581 
epoch 8, loss: 2.121609, error rate: 0.704651162791 
epoch 9, loss: 2.054287, error rate: 0.698449612403 
epoch 10, loss: 1.990933, error rate: 0.696899224806 
epoch 11, loss: 1.931170, error rate: 0.692248062016 
epoch 12, loss: 1.874676, error rate: 0.686821705426 
epoch 13, loss: 1.821167, error rate: 0.681395348837 
epoch 14, loss: 1.770400, error rate: 0.680620155039 
epoch 15, loss: 1.722157, error rate: 0.67519379845 
epoch 16, loss: 1.676249, error rate: 0.672868217054 
epoch 17, loss: 1.632506, error rate: 0.667441860465 
epoch 18, loss: 1.590776, error rate: 0.