In [1]:
import numpy as np
import math
import copy

In [8]:
def read_data(file_name):
    file = open(file_name, 'r')
    lines = file.readlines()
    k = len(lines[0].split())
    n = len(lines)
    data = np.zeros((n,k))
    for i in range(n):
        line_words = lines[i].split()
        for j in range(k):
            data[i,j] = eval(line_words[j])
    labels = data[:,0]
    features = data[:,1:]
    features = (features - np.mean(features,axis = 0))/np.var(features, axis=0) #normalize features
    return labels, features

In [18]:
def leave_one_accuracy(labels, features, mask):
    n = len(labels)
    correct_count = 0
    for i in range(n):
        distance_array = np.sum(np.power((features - features[i])*mask,2), axis = 1)
        distance_array[i] = 100000  #max value
        nearest_neighbor = int(np.argmin(distance_array))
        correct_count += (labels[i] == labels[nearest_neighbor])
    return correct_count / n

In [19]:
def forward_selection(file_name):
    labels, features = read_data(file_name)
    m = np.shape(features)[1]
    print("The data has {} features in total.".format(m))
    indeces = [i for i in range(m)]
    selection = []
    best_features = []
    best_accuracy = 0
    for i in range(m):
        print("in {}th round".format(i+1))
        max_accuracy = 0
        max_index = -1
        for f in indeces:
            current_select = copy.copy(selection)
            current_select.append(f)
            mask = np.zeros(m)
            mask[current_select] = 1
            accuracy = leave_one_accuracy(labels, features, mask)
            print("select feature {}, with accuracy {}".format(f+1, accuracy))
            if (accuracy > max_accuracy):
                max_accuracy = accuracy
                max_index = f
        selection.append(max_index)
        indeces.remove(max_index)
        print("feature set {} was best, with accuracy {}".format([i+1 for i in selection], max_accuracy))
        if max_accuracy > best_accuracy :
            best_accuracy = max_accuracy
            best_features = copy.copy(selection)
    print("Finished search! The best feature set is {}. with accuracy {}.".format([i+1 for i in best_features], best_accuracy))

In [20]:
def backward_elimination(file_name):
    labels, features = read_data(file_name)
    m = np.shape(features)[1]
    print("The data has {} featuers in total.".format(m))
    selection = [i for i in range(m)]
    best_features = []
    best_accuracy = 0
    for i in range(m-1):
        print("in {}th round".format(i+1))
        max_accuracy = 0
        max_index = -1
        for f in selection:
            current_indeces = copy.copy(selection)
            current_indeces.remove(f)
            mask = np.zeros(m)
            mask[current_indeces] = 1
            accuracy = leave_one_accuracy(labels, features, mask)
            print("remove feature {}, with accuracy {}".format(f+1,accuracy))
            if (accuracy > max_accuracy):
                max_accuracy = accuracy
                max_index = f
        selection.remove(max_index)
        print("feature set {} was best, with accuracy {}".format([i+1 for i in selection], max_accuracy))
        if max_accuracy > best_accuracy :
            best_accuracy = max_accuracy
            best_features = copy.copy(selection)
    print("Finished search! The best feature set is {}. with accuracy {}.".format([i+1 for i in best_features], best_accuracy))

In [21]:
forward_selection("data/small/CS205_small_testdata__10.txt")

The data has 10 features in total.
in 1th round
select feature 1, with accuracy 0.6966666666666667
select feature 2, with accuracy 0.8133333333333334
select feature 3, with accuracy 0.6733333333333333
select feature 4, with accuracy 0.6866666666666666
select feature 5, with accuracy 0.6666666666666666
select feature 6, with accuracy 0.6866666666666666
select feature 7, with accuracy 0.6933333333333334
select feature 8, with accuracy 0.67
select feature 9, with accuracy 0.6466666666666666
select feature 10, with accuracy 0.66
feature set [2] was best, with accuracy 0.8133333333333334
in 2th round
select feature 1, with accuracy 0.8433333333333334
select feature 3, with accuracy 0.81
select feature 4, with accuracy 0.8533333333333334
select feature 5, with accuracy 0.79
select feature 6, with accuracy 0.87
select feature 7, with accuracy 0.8466666666666667
select feature 8, with accuracy 0.8533333333333334
select feature 9, with accuracy 0.9633333333333334
select feature 10, with accurac

In [177]:
backward_elimination("data/small/CS205_small_testdata__10.txt")

The data has 10 featuers in total.
in 1th round
remove feature 1, with accuracy 0.71
remove feature 2, with accuracy 0.66
remove feature 3, with accuracy 0.7166666666666667
remove feature 4, with accuracy 0.76
remove feature 5, with accuracy 0.7266666666666667
remove feature 6, with accuracy 0.72
remove feature 7, with accuracy 0.7166666666666667
remove feature 8, with accuracy 0.7266666666666667
remove feature 9, with accuracy 0.7033333333333334
remove feature 10, with accuracy 0.7633333333333333
feature set [1, 2, 3, 4, 5, 6, 7, 8, 9] was best, with accuracy 0.7633333333333333
in 2th round
remove feature 1, with accuracy 0.7433333333333333
remove feature 2, with accuracy 0.7233333333333334
remove feature 3, with accuracy 0.7433333333333333
remove feature 4, with accuracy 0.7633333333333333
remove feature 5, with accuracy 0.7833333333333333
remove feature 6, with accuracy 0.7433333333333333
remove feature 7, with accuracy 0.75
remove feature 8, with accuracy 0.78
remove feature 9, wit

In [178]:
forward_selection("data/small/CS205_small_testdata__19.txt")

The data has 10 features in total.
in 1th round
select feature 1, with accuracy 0.6933333333333334
select feature 2, with accuracy 0.7533333333333333
select feature 3, with accuracy 0.6966666666666667
select feature 4, with accuracy 0.7566666666666667
select feature 5, with accuracy 0.68
select feature 6, with accuracy 0.7066666666666667
select feature 7, with accuracy 0.7266666666666667
select feature 8, with accuracy 0.77
select feature 9, with accuracy 0.7666666666666667
select feature 10, with accuracy 0.8566666666666667
feature set [10] was best, with accuracy 0.8566666666666667
in 2th round
select feature 1, with accuracy 0.8433333333333334
select feature 2, with accuracy 0.8133333333333334
select feature 3, with accuracy 0.8433333333333334
select feature 4, with accuracy 0.9766666666666667
select feature 5, with accuracy 0.8633333333333333
select feature 6, with accuracy 0.8433333333333334
select feature 7, with accuracy 0.8166666666666667
select feature 8, with accuracy 0.85333

In [2]:
a = np.array([1,2,3])
b = np.array([3,2,1])
m = np.array([[3,2,1],[2,2,2]])

In [20]:
x = (m - a)

array([[ 0,  0, -2],
       [ 0,  0, -1]])

In [16]:
y = np.array([2,3])

In [17]:
np.amin(y)

2

In [7]:
math.sqrt(sum(np.power(a-b,2)))

2.8284271247461903

In [28]:
a = 0

In [29]:
a += (1==1)

In [4]:
a = np.zeros(5)
a

array([0., 0., 0., 0., 0.])

In [5]:
a[[0,2]] = 1

array([1., 0., 1., 0., 0.])