In [8]:
import numpy as np
from scipy import stats

def Slide_window(my_window,layers):#Stop instead of useless counting
    return (my_window[layers-2] > my_window[layers-1]) and (my_window[layers-1] > my_window[layers])

#Normalizing data 
def Normalize_data(data):
    data_nor = stats.zscore(data)
    return data_nor

#Performing Leave One Out Cross Validation and minimax like pruning
def CrossValidation_ML(features_in,data_nor,select_f, p, tolerate_error):
    error_counter = 0
    feature=list(features_in)
    if p==1:
        feature.append(select_f)
    if p ==2:
        feature.remove(select_f)
    count = 0
    dis = np.inf
    final=0
    for i in range(0,len(data_nor)):
        dis = np.inf
        for k in range(0,len(data_nor)):
            if not np.array_equal(k,i):
                ss = 0
                for j in range(0, len(feature)):#Calculating Euclidean Distance to determine the nearest neighbors
                    ss=ss+pow((data_nor[k][feature[j]] - data_nor[i][feature[j]]),2)
                    d=np.sqrt(ss)
                if d < dis:
                    dis = d
                    final = k
        if (data_nor[final][0]==data_nor[i][0]):
            count += 1
            accuracy = (count / (len(data_nor)-1))#Calculating accuracy
        else:
            error_counter += 1
        
        if tolerate_error < error_counter:
            print("Using feature(s), the feature has been stop by minimax like pruning")
            return 0
    print("Using feature(s)",feature, "accuracy is", round(accuracy*100,2), "%")
    return accuracy

#Performing Leave One Out Cross Validation
def CrossValidation(features_in,data_nor,select_f, p):
    feature=list(features_in)
    if p==1:
        feature.append(select_f)
    if p ==2:
        feature.remove(select_f)
    count = 0
    dis = np.inf
    final=0
    for i in range(0,len(data_nor)):
        dis = np.inf
        for k in range(0,len(data_nor)):
            if not np.array_equal(k,i):
                ss = 0
                for j in range(0, len(feature)):#Calculating Euclidean Distance to determine the nearest neighbors
                    ss=ss+pow((data_nor[k][feature[j]] - data_nor[i][feature[j]]),2)
                    d=np.sqrt(ss)
                if d < dis:
                    dis = d
                    final = k
        if (data_nor[final][0]==data_nor[i][0]):
            count += 1
            accuracy = (count / (len(data_nor)-1))#Calculating accuracy
    print("Using feature(s)",feature, "accuracy is", round(accuracy*100,2), "%")
    return accuracy


def ForwardSelection(data_nor,NF):
    print("Beginning search.\n")
    current_features = []
    final_acc = 0
    best_feature=[]
    my_windows = [0 for _ in range(NF)]
    counter =  0
    for i in range(1, NF+1):
        print("\n On level %d of the search tree" % (i),"contains", current_features)
        feature_select = 0
        cur_acc=0.0
        for j in range(1, NF+1):
            if j not in current_features:
                if j == 1:
                    acc = CrossValidation(current_features,data_nor,j,1)
                else:
                    acc = CrossValidation_ML(current_features,data_nor,j,1,(int) ((1-cur_acc) * len(data_nor) ))
                if acc> cur_acc:
                    cur_acc = acc
                    feature_select = j
        
        current_features.append(feature_select)
        print("\n On level %d of the search tree," % (i),"feature %d was added to the current set" % (feature_select))
        print("\n With ", len(current_features), " features, the accuracy is: ", round(cur_acc * 100,2), "%")
        
        my_windows[counter] = cur_acc        
        if counter > 2 and Slide_window(my_windows, counter):
            break
        counter += 1
        if cur_acc >= final_acc: 
            final_acc= cur_acc
            best_feature = list(current_features)

    print()
    print("Finish search!! The best feature subset is:", best_feature,"which has an accuracy of", round(final_acc * 100,2), "%")


def BackwardElimination(data_nor,NF):
    param_stop= input("\nType the accuracy param to early abandon(0-100): \n You may input 100 if you don't want early abondon \n \n")
    param_stop= int(param_stop)
    
    print("Beginning search.\n")
    final_acc = 0
    best_feature=[]
    current_features = [i for i in range(1, NF+1)]

    for i in range(1, NF):
        print("\n On level %d of the search tree" % (i),"contains", current_features)
        feature_select = 0
        cur_acc = 0
        for j in range(1,NF):
            if (j in current_features):
                acc = CrossValidation(current_features,data_nor,j,2)
                if acc > cur_acc:
                    cur_acc = acc
                    feature_select = j
        if feature_select in current_features: 
            current_features.remove(feature_select) 
            print("\n On level ", i, " feature ", feature_select, " was removed from the current set")
            print("\n With ", len(current_features), " features, the accuracy is: ", round(cur_acc * 100,2), "%")
        if cur_acc >= final_acc: 
            final_acc = cur_acc
            best_feature= list(current_features)
        if final_acc * 100 > param_stop:
            print()
            print("Early abandon!! The best feature subset is:", best_feature,"which has an accuracy of", round(final_acc * 100,2), "%")
            return 

    print()
    print("Finish search!! The best feature subset is:", best_feature,"which has an accuracy of", round(final_acc * 100,2), "%")


def main():
    print("Welcome to the Feature Selection Algorithm:")
    files = input("Type in the name of the file to test: ")
    
    algorithm=input("\nType the algorithm you want to run:\n \n 1.Forward Selection\n 2.Backward Elimination\n \n")
   
    data=np.loadtxt(files)
#     print(data[1])
#     data = data[1000:]
    

    data = data[:(int)(len(data)*0.9)]#sample the data
    N=len(data)
    
    
    
#     print(N)
    data_normal = Normalize_data(data)
    NF= len(data_normal[0])-1
    print ("\nThis dataset has "+ str(NF)+ " features (no including the class attribute), with "+str(N)+ " instances")
   
    if (algorithm == "1"):
        ForwardSelection(data_normal,NF)
    elif(algorithm == "2"):
        BackwardElimination(data_normal,NF)
    else:
        print("Error input!")
    
if __name__ == '__main__':
     main()

Welcome to the Feature Selection Algorithm:
Type in the name of the file to test: CS170_small_Data__31.txt

Type the algorithm you want to run:
 
 1.Forward Selection
 2.Backward Elimination
 
1

This dataset has 10 features (no including the class attribute), with 900 instances
Beginning search.


 On level 1 of the search tree contains []
Using feature(s) [1] accuracy is 85.76 %
Using feature(s), the feature has been stop by minimax like pruning
Using feature(s), the feature has been stop by minimax like pruning
Using feature(s), the feature has been stop by minimax like pruning
Using feature(s), the feature has been stop by minimax like pruning
Using feature(s), the feature has been stop by minimax like pruning
Using feature(s), the feature has been stop by minimax like pruning
Using feature(s), the feature has been stop by minimax like pruning
Using feature(s), the feature has been stop by minimax like pruning
Using feature(s), the feature has been stop by minimax like pruning

 On

In [5]:
from numpy import genfromtxt
# script for 
files = input("Type in the name of the file to test: ")
my_data = genfromtxt(files, delimiter=',')
print(my_data[0][0])
my_data = np.delete(my_data,0,axis= 0)

print(my_data)

last_column = my_data[:, -1].reshape(-1, 1) # convert into vector
# delete the last
other_columns = my_data[:, :-1]
# put them into together
new_data = np.hstack((last_column, other_columns))
np.savetxt('new_data.txt',new_data)

Type in the name of the file to test: diabetes.csv
nan
[[  6.    148.     72.    ...   0.627  50.      1.   ]
 [  1.     85.     66.    ...   0.351  31.      0.   ]
 [  8.    183.     64.    ...   0.672  32.      1.   ]
 ...
 [  5.    121.     72.    ...   0.245  30.      0.   ]
 [  1.    126.     60.    ...   0.349  47.      1.   ]
 [  1.     93.     70.    ...   0.315  23.      0.   ]]
