In [4]:
import numpy as np 
from matplotlib import pyplot as plt

#########################################
#Part1 readFile()
#########################################   
# Args : 
#   fileName  - string
#
# Return:
#   (x,y)  - tuple
#   x,y:numpy array
def readFile (filename):
    data = np.loadtxt(filename)
    print(type(data),np.shape(data))
   
    data = np.matrix(data)
    shape = np.shape(data)
    print(shape)
    cols = shape[1]-1
    # number of data
    m = data.shape[0]
    #seperate data to features and label
    x = data[:,np.arange(0,cols)]
    y = data[:,[57]]
    return (x,y)

#########################################
#Part2 standardize
#########################################   
def standardize (np_Xy):
    x  = np_Xy[0]
    y = np_Xy[1]
    avg = np.mean(x,axis=0)
    std = np.std(x,axis=0)
    x = (x - avg)/std
    return (x,y)

#########################################
#Part3 train
#########################################
def sigmoid(x):
    #deal with overflow
    if(x>1000):
        return 1
    if(x<-1000):
        return 0
    res = 1/(1 + np.math.exp(-x))
    return res

def sigmoid_v(x):
    return 1/(1 + np.exp(-x))
#to accept vctor as input
sigmoid_vec = sigmoid_v

#define cost function: 
cost_list = []
#deal with overflow
def mlog(x):
    if(x == 0):
        return - 1000000000
    return np.math.log(x)

log_vec = np.vectorize(mlog)

def cost_function(w,y,y_estimate,m,lambda_reg):
    A =  np.dot(y.T,log_vec(y_estimate)) 
    t1 = 1- y
    t2 = 1- y_estimate
    A = A + np.dot(t1.T,log_vec(t2))
    cost_reg = (-A + lambda_reg/2*(w.T*w))/m
    return float(cost_reg)

def predict_test(x):
    if x >0.8:
        return 1
    return 0

def train (np_Xy, iterations, learning_rate, lambda_reg):
    test_v = np.vectorize(predict_test)
    #seperate data to features and label
    x = np_Xy[0]
    y = np_Xy[1]
    print(x.shape,y.shape)
    shape = np.shape(x)
    # number of data
    m = shape[0]
    # number of features
    cols = shape[1]
    #initial params
    w = np.ones(shape=(cols,1),dtype=float)
    dw = np.zeros(shape=(cols,1),dtype=float)
    print(np.shape(w))
    w = np.matrix(w)
    dw = np.matrix(dw)
    x = np.matrix(x)
    y = np.matrix(y)
    b = 0
    db = 0
    #iterations for training
    for i in range(iterations):
        print("iteration",i)
        #propagation
        t = np.dot(x,w) 
        A = sigmoid_vec(t)
        dy = A - y
        #compute cost fucntion
        cost = cost_function(w,y,A,m,lambda_reg)
        cost_list.append(cost)
        #compute accuracy
        predict_y = test_v(A)
        acu = 1 - (np.mean(np.abs(predict_y - y)))
        accuracy_list.append(acu)
        #back propagation
        dw = (np.dot(x.T,dy)+lambda_reg*w)/m
        db =(np.sum(dy))/m
        #update params(by gradient descent)
        w = w - learning_rate * dw
        b = b - learning_rate * db
    return w,b

def predict(w,b,X):
    t = np.dot(X,w) + b
    y = sigmoid_vec(t)
    # try to explore a proper threshold I set 0.7 here
    return predict_test(y)

accuracy_list = []
def accuracy (w, b, np_Xy):
    X = np_Xy[0]
    Y = np_Xy[1]
    predict_value =np.matrix(np.array([predict(w,b,x) for x in X])).T
    return 1 - (np.mean(np.abs(predict_value - Y)))

#########################################
#Part4 Cross Validation
#########################################

def shuffle_transfrom(data,k_fold):
    u_data = np.hstack((data[0],data[1]))
    print('union data')
    print(u_data.shape)
    np.random.shuffle(u_data)
    print('shuffled data')
    print(u_data.shape)
    rows = np.shape(u_data)[0]
    len_fold = int(rows/k_fold)
    data_cv = []
    for i in range(k_fold):
        fold = u_data[i*len_fold:(i+1)*len_fold - 1]
        x = fold[:,np.arange(0,57)]
        y = fold[:,[57]]
        data_cv.append((x,y))
    return data_cv

def get_block_data(data_cv,i):
    test_data = data_cv[i]
    train_features = None
    train_label = None
    for k in range(len(data_cv)):
        print('get',k)
        fold = data_cv[k]
        if k != i :
            if train_features is None:
                train_features= np.copy(fold[0])
                train_label = np.copy(fold[1])
                print('init',train_features.shape,train_label.shape)
            else:
                print('concatenate',train_features.shape,train_label.shape)
                train_features = np.concatenate((train_features,fold[0]),0)
                train_label = np.concatenate((train_label,fold[1]),0)
    train_data = (train_features,train_label)
    return test_data,train_data    

def cross_validation(k_fold = 10,iterations=30, learning_rate=0.7, lambda_reg=0.4):
    cv_accs = []
    #
    data = readFile('../data/spam.data')
    data = standardize(data)
    data_cv = shuffle_transfrom(data,k_fold)
    for i in range(k_fold):
        print('cv',i)
        test_data,train_data = get_block_data(data_cv,i)
        w,b = train(train_data,iterations, learning_rate, lambda_reg)
        acc = accuracy(w,b,test_data)
        cv_accs.append(acc)
        print('acc',acc)
    accus = np.array(cv_accs)
    print('average accuracy:',accus.mean())
    plt.figure()
    plt.plot(cv_accs)
    plt.figure()
    plt.plot(accuracy_list)
    plt.show()
 

def test():
    data = readFile('../data/spam.data')
    data2 = standardize(data)
    w,b = train(data2,20,0.6,0.2)
    print("accuracy",accuracy(w,b,data2))
    # draw cost and accuracy curve in training iterations
    plt.subplot(2,1,1)
    plt.plot(cost_list)
    plt.title("Train :cost")
    plt.subplot(2,1,2)
    plt.title("Train :accuracy")
    plt.plot(accuracy_list)
    plt.show()

#########################################
#Part5 train and test
#########################################


In [5]:
#test train
test()

<class 'numpy.ndarray'> (4601, 58)
(4601, 58)
(4601, 57) (4601, 1)
(57, 1)
iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
accuracy 0.8356878939361009


In [6]:
cross_validation()

<class 'numpy.ndarray'> (4601, 58)
(4601, 58)
union data
(4601, 58)
shuffled data
(4601, 58)
cv 0
get 0
get 1
init (459, 57) (459, 1)
get 2
concatenate (459, 57) (459, 1)
get 3
concatenate (918, 57) (918, 1)
get 4
concatenate (1377, 57) (1377, 1)
get 5
concatenate (1836, 57) (1836, 1)
get 6
concatenate (2295, 57) (2295, 1)
get 7
concatenate (2754, 57) (2754, 1)
get 8
concatenate (3213, 57) (3213, 1)
get 9
concatenate (3672, 57) (3672, 1)
(4131, 57) (4131, 1)
(57, 1)
iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
acc 0.8474945533769063
cv 1
get 0
init (459, 57) (459, 1)
get 1
get 2
concatenate (459, 57) (459, 1)
get 3
concatenate (918, 57) (918, 1)
get 4
conca