In [1]:
import pandas as pd
import math
import numpy as np
from scipy.io import arff
import heapq
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

In [2]:
# Load the Weka ARFF export. loadarff() returns a pair whose first element
# holds the records; only that element is used to build the DataFrame.
data = arff.loadarff('veh-prime.arff')
df = pd.DataFrame(data=data[0])
df.head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f27,f28,f29,f30,f31,f32,f33,f34,f35,CLASS
0,0.063,0.16,0.509,-0.967,0.058,0.0,0.874,0.271,1.307,-0.011,...,-0.924,-0.077,0.108,-0.003,0.381,-0.314,0.929,0.184,-0.001,b'noncar'
1,-0.037,-0.325,-0.626,-0.029,0.121,-0.409,-0.002,-0.835,-0.595,-0.253,...,0.27,0.533,0.152,-0.978,0.157,0.011,-0.254,0.453,-0.621,b'noncar'
2,-0.0,1.253,0.833,-0.97,1.516,0.014,-0.378,1.197,0.546,-0.402,...,-0.408,1.55,0.01,-0.652,-0.403,-0.151,0.0,0.049,-0.113,b'car'
3,-0.743,-0.082,-0.626,0.723,-0.006,-0.0,-0.08,-0.297,0.166,0.311,...,0.819,-0.077,-0.099,-0.001,-0.291,1.633,0.686,1.528,-0.0,b'noncar'
4,-0.939,-1.054,-0.14,0.036,-0.766,0.0,-0.272,1.077,5.236,-0.366,...,0.676,0.533,-0.003,0.122,-0.179,-1.449,0.024,-1.698,0.083,b'noncar'


In [3]:
# Feature columns: every column except the last one, which holds the label.
features = df.columns.drop(df.columns[-1])
# Target column, kept as a one-column DataFrame.
train_targets = df[['CLASS']]
# The class values are bytes objects after loading, hence the b'...' keys.
old_labels = [b'noncar',b'car']
new_labels = [0,1]
# Encode the labels as integers through an explicit mapping.
# Series.map + astype guarantees an int64 column and fails loudly on an
# unexpected label, whereas DataFrame.replace on an object column depends on
# implicit downcasting that pandas >= 2.2 deprecates.
label_to_int = dict(zip(old_labels, new_labels))
train_n_targets = train_targets['CLASS'].map(label_to_int).astype('int64').to_frame()
train_n_targets.head()

Unnamed: 0,CLASS
0,0
1,0
2,1
3,0
4,0


In [4]:
#getting the distance matrix for every test data
def get_distance(X_train,X_test):
    """Euclidean distance from every test row to every training row.

    X_train, X_test: 2-D arrays indexed positionally by row.
    Returns a DataFrame with one row per test sample and one column per
    training sample.
    """
    rows = [
        [(np.sum((X_test[t] - X_train[r])**2))**0.5
         for r in range(X_train.shape[0])]
        for t in range(X_test.shape[0])
    ]
    return pd.DataFrame(rows)

#Comparing the accuracy
def score(ypred,testing):
    """Fraction of predictions that equal the true labels.

    ypred:   sequence/array of predicted labels.
    testing: sequence/array of true labels; any shape holding the same number
             of elements (for example an (n, 1) column) is accepted.

    Returns a float in [0, 1].
    Raises ZeroDivisionError when ypred is empty and ValueError when the two
    inputs cannot be compared element-wise.
    """
    # Flatten first: an (n,) vector against an (n, 1) column would broadcast
    # to an (n, n) matrix, and counting zeros over its rows with list.count()
    # raises "truth value of an array is ambiguous" as soon as n > 1.
    predicted = np.asarray(ypred).ravel()
    actual = np.asarray(testing).ravel()
    matches = int(np.count_nonzero(np.equal(predicted, actual)))
    return matches / len(predicted)

In [5]:
def get_k_outcome(matrix_distance,k,train_targets,test_traget):
    """k-NN majority vote on precomputed distances, scored against the truth.

    matrix_distance: DataFrame or 2-D array with one row per test sample and
                     one column per training sample.
    k:               iterable of neighbourhood sizes.
    train_targets:   0/1 labels of the training samples, in column order of
                     matrix_distance.
    test_traget:     true labels of the test samples, forwarded to score().

    Returns the accuracy obtained with the LAST value in k (results for
    earlier values are overwritten); returns [] when k is empty.
    """
    distances = np.asarray(matrix_distance)
    labels = np.asarray(train_targets)
    test_outcome = []
    for i in k:
        Y = []
        for row in distances:
            # A stable argsort yields i DISTINCT positions even when several
            # distances are equal. Recovering positions with list.index()
            # returns the first match every time, so a tied neighbour would
            # be counted repeatedly and the others ignored.
            nearest = np.argsort(row, kind="stable")[:i]
            # Vote over the neighbours actually available (fewer than i when
            # the training set is smaller than i); a tie goes to class 1.
            if labels[nearest].sum() * 2 < len(nearest):
                Y.append(0)
            else:
                Y.append(1)
        test_outcome = score(Y,test_traget)
    return test_outcome    # accuracy of the k-NN prediction

In [6]:
#Pearson product-moment correlation coefficient
def get_ppc(feature, target_tocheck):
    """Pearson product-moment correlation between a feature and the target.

    feature, target_tocheck: equally long sequences (list, array or pandas
    Series) of numeric values. Values are paired by position, so a Series
    with any index works.

    Returns a float in [-1, 1], or NaN when the correlation is undefined
    (empty input or one of the inputs is constant).
    Raises ValueError when the two inputs differ in length.
    """
    x = np.asarray(feature, dtype=float).ravel()
    y = np.asarray(target_tocheck, dtype=float).ravel()
    if x.size != y.size:
        raise ValueError(
            "feature and target_tocheck must have the same length "
            "({} != {})".format(x.size, y.size)
        )
    if x.size == 0:
        return float("nan")

    # Centre first, then accumulate. The E[x^2] - E[x]^2 shortcut loses
    # precision and can go slightly negative, in which case ** 0.5 would
    # produce a complex number.
    x_centered = x - x.mean()
    y_centered = y - y.mean()
    sum_xx = float(np.dot(x_centered, x_centered))
    sum_yy = float(np.dot(y_centered, y_centered))
    denominator = (sum_xx * sum_yy) ** 0.5
    if denominator == 0.0:
        # Zero variance in one input: correlation is undefined.
        return float("nan")
    return float(np.dot(x_centered, y_centered)) / denominator


In [9]:
# Absolute Pearson correlation of every raw feature with the 0/1 class label.
pcc = [
    abs(get_ppc(df[f], target_tocheck = train_n_targets['CLASS']))
    for f in features
]
index = list(range(36))

# One row per feature together with its |PCC| value.
c = dict(Feature=features, Pcc=pcc)
filter_result = pd.DataFrame(c)
print(filter_result)

   Feature       Pcc
0       f0  0.069795
1       f1  0.308811
2       f2  0.195732
3       f3  0.009214
4       f4  0.436922
5       f5  0.000098
6       f6  0.035295
7       f7  0.352141
8       f8  0.087773
9       f9  0.013005
10     f10  0.056876
11     f11  0.042117
12     f12  0.002179
13     f13  0.368269
14     f14  0.368224
15     f15  0.031478
16     f16  0.366025
17     f17  0.113945
18     f18  0.017931
19     f19  0.137636
20     f20  0.299049
21     f21  0.056605
22     f22  0.351350
23     f23  0.005508
24     f24  0.007780
25     f25  0.153096
26     f26  0.341043
27     f27  0.015606
28     f28  0.156904
29     f29  0.020829
30     f30  0.008955
31     f31  0.290783
32     f32  0.093174
33     f33  0.038810
34     f34  0.266093
35     f35  0.030855


In [11]:
# Rank the features from highest to lowest |PCC|.
ranking = filter_result.sort_values('Pcc', ascending=False)
print(ranking)

   Feature       Pcc
4       f4  0.436922
13     f13  0.368269
14     f14  0.368224
16     f16  0.366025
7       f7  0.352141
22     f22  0.351350
26     f26  0.341043
1       f1  0.308811
20     f20  0.299049
31     f31  0.290783
34     f34  0.266093
2       f2  0.195732
28     f28  0.156904
25     f25  0.153096
19     f19  0.137636
17     f17  0.113945
32     f32  0.093174
8       f8  0.087773
0       f0  0.069795
10     f10  0.056876
21     f21  0.056605
11     f11  0.042117
33     f33  0.038810
6       f6  0.035295
15     f15  0.031478
35     f35  0.030855
29     f29  0.020829
18     f18  0.017931
27     f27  0.015606
9       f9  0.013005
3       f3  0.009214
30     f30  0.008955
24     f24  0.007780
23     f23  0.005508
12     f12  0.002179
5       f5  0.000098


In [12]:
# Feature names in ranking order (highest |PCC| first) as a plain list.
filter_feature = ranking['Feature']
f_feature = list(filter_feature)
print(f_feature)

['f4', 'f13', 'f14', 'f16', 'f7', 'f22', 'f26', 'f1', 'f20', 'f31', 'f34', 'f2', 'f28', 'f25', 'f19', 'f17', 'f32', 'f8', 'f0', 'f10', 'f21', 'f11', 'f33', 'f6', 'f15', 'f35', 'f29', 'f18', 'f27', 'f9', 'f3', 'f30', 'f24', 'f23', 'f12', 'f5']


In [13]:
# Z-score normalisation of the feature columns: (value - column mean) / column std.
# The feature-only view gets its own name so that `df` keeps its CLASS column;
# rebinding `df` here would make the earlier features/targets cell fail when
# it is re-run.
feature_df = df[features]
zdf = (feature_df - feature_df.mean())/feature_df.std()
zdf.head()

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f26,f27,f28,f29,f30,f31,f32,f33,f34,f35
0,0.122985,0.15999,0.509079,-2.539525,0.057933,0.05444,2.234571,0.270975,1.307035,-0.043243,...,-0.328991,-2.340529,-0.076986,0.299813,-0.059698,0.38098,-0.314029,2.463273,0.183995,0.066622
1,-0.14239,-0.325026,-0.626,-0.08122,0.120933,-1.083403,0.004844,-0.835001,-0.594944,-0.687021,...,-0.061992,0.662641,0.532987,0.419432,-2.581561,0.156995,0.010974,-0.691686,0.452991,-1.573437
2,-0.044201,1.253025,0.833102,-2.547387,1.515941,0.093388,-0.952208,1.196956,0.546044,-1.083396,...,0.072008,-1.042676,1.549942,0.033388,-1.738354,-0.402969,-0.151027,-0.01429,0.048997,-0.229646
3,-2.015936,-0.082018,-0.626,1.889617,-0.006067,0.05444,-0.193693,-0.297013,0.166048,0.813354,...,-1.26399,2.043495,-0.076986,-0.262941,-0.054525,-0.290976,1.63299,1.815213,1.527976,0.069268
4,-2.536071,-1.054049,-0.139966,0.089132,-0.766071,0.05444,-0.682401,1.076958,5.235993,-0.987627,...,7.283997,1.683819,0.532987,-0.001954,0.263618,-0.178984,-1.44904,0.049716,-1.697978,0.288824


In [14]:
# Sanity check: recompute |PCC| on the z-scored features; the values are
# expected to match those obtained from the raw features.
filter2 = [
    abs(get_ppc(zdf[f], target_tocheck = train_n_targets['CLASS']))
    for f in features
]
z = dict(Feature=features, Pcc=filter2)
filter_2 = pd.DataFrame(z)
print(filter_2)

   Feature       Pcc
0       f0  0.069795
1       f1  0.308811
2       f2  0.195732
3       f3  0.009214
4       f4  0.436922
5       f5  0.000098
6       f6  0.035295
7       f7  0.352141
8       f8  0.087773
9       f9  0.013005
10     f10  0.056876
11     f11  0.042117
12     f12  0.002179
13     f13  0.368269
14     f14  0.368224
15     f15  0.031478
16     f16  0.366025
17     f17  0.113945
18     f18  0.017931
19     f19  0.137636
20     f20  0.299049
21     f21  0.056605
22     f22  0.351350
23     f23  0.005508
24     f24  0.007780
25     f25  0.153096
26     f26  0.341043
27     f27  0.015606
28     f28  0.156904
29     f29  0.020829
30     f30  0.008955
31     f31  0.290783
32     f32  0.093174
33     f33  0.038810
34     f34  0.266093
35     f35  0.030855


In [16]:
# Filter-method evaluation: for j = 1..len(f_feature), run leave-one-out
# k-NN on the j best-ranked (z-scored) features and record the mean accuracy.
loo = LeaveOneOut()
k = [7]  # neighbourhood size(s) handed to get_k_outcome
# The label column does not change between splits, so build it once.
y = np.array(train_n_targets).reshape(-1,1)

list_ACC = []
# Derive the upper bound from the ranking instead of hard-coding 36 features.
for j in range(1, len(f_feature) + 1):
    X = zdf[f_feature[0:j]]
    X_values = X.values
    S = []
    # Score each split as it is produced rather than first storing every
    # train/test copy of the data in lists.
    for train_index, test_index in loo.split(X):
        distances = get_distance(X_values[train_index], X_values[test_index])
        S.append(get_k_outcome(distances, k, y[train_index], y[test_index]))
    Ave_acc = np.mean(S)
    list_ACC.append(Ave_acc)
print(list_ACC)

[0.6808510638297872, 0.789598108747045, 0.8156028368794326, 0.8238770685579196, 0.8356973995271868, 0.8368794326241135, 0.8617021276595744, 0.8747044917257684, 0.8947990543735225, 0.8853427895981087, 0.9018912529550828, 0.9042553191489362, 0.8888888888888888, 0.8912529550827423, 0.9089834515366431, 0.8971631205673759, 0.9101654846335697, 0.9243498817966903, 0.91725768321513, 0.925531914893617, 0.9113475177304965, 0.9042553191489362, 0.8947990543735225, 0.8841607565011821, 0.8782505910165485, 0.8794326241134752, 0.875886524822695, 0.8699763593380615, 0.8546099290780141, 0.8498817966903073, 0.8380614657210402, 0.83451536643026, 0.8262411347517731, 0.8274231678486997, 0.8108747044917257, 0.8262411347517731]


In [None]:
# Best accuracy over all evaluated feature counts.
print(max(list_ACC))
# Pair each ranked feature with the accuracy stored at the same position.
Final = dict(Feature=f_feature, Acc=list_ACC)
final = pd.DataFrame(Final)
print(final)
final.to_csv('filter_answers.csv', index=False)

0.6808510638297872
0.789598108747045
0.8156028368794326
0.8238770685579196
0.8356973995271868
0.8368794326241135
0.8617021276595744
0.8747044917257684
0.8947990543735225
0.8853427895981087
0.9018912529550828
0.9042553191489362
0.8888888888888888
0.8912529550827423
0.9089834515366431
0.8971631205673759
0.9101654846335697
0.9243498817966903
0.91725768321513
0.925531914893617
0.9113475177304965
0.9042553191489362
0.8947990543735225
0.8841607565011821
0.8782505910165485
0.8794326241134752
0.875886524822695
0.8699763593380615
0.8546099290780141
0.8498817966903073
0.8380614657210402
0.83451536643026
0.8262411347517731
0.8274231678486997
0.8108747044917257
0.8262411347517731
[0.6808510638297872, 0.789598108747045, 0.8156028368794326, 0.8238770685579196, 0.8356973995271868, 0.8368794326241135, 0.8617021276595744, 0.8747044917257684, 0.8947990543735225, 0.8853427895981087, 0.9018912529550828, 0.9042553191489362, 0.8888888888888888, 0.8912529550827423, 0.9089834515366431, 0.8971631205673759, 0.9

0.925531914893617
   Feature       Acc
0       f4  0.680851
1      f13  0.789598
2      f14  0.815603
3      f16  0.823877
4       f7  0.835697
5      f22  0.836879
6      f26  0.861702
7       f1  0.874704
8      f20  0.894799
9      f31  0.885343
10     f34  0.901891
11      f2  0.904255
12     f28  0.888889
13     f25  0.891253
14     f19  0.908983
15     f17  0.897163
16     f32  0.910165
17      f8  0.924350
18      f0  0.917258
19     f10  0.925532
20     f21  0.911348
21     f11  0.904255
22     f33  0.894799
23      f6  0.884161
24     f15  0.878251
25     f35  0.879433
26     f29  0.875887
27     f18  0.869976
28     f27  0.854610
29      f9  0.849882
30      f3  0.838061
31     f30  0.834515
32     f24  0.826241
33     f23  0.827423
34     f12  0.810875
35      f5  0.826241


In [1]:
# Scratch list of the integers 1..10.
a = list(range(1, 11))

In [3]:
# First 2*2 = 4 elements of a copy of `a`; `a` itself is left untouched.
b = a.copy()[:(2*2)]

In [4]:
# Display the truncated list as the cell output.
b


[1, 2, 3, 4]

In [13]:
# Backward elimination (wrapper method): starting from all features, each
# round tries dropping every remaining feature in turn, scores the reduced set
# with leave-one-out k-NN, and keeps the reduced set with the best accuracy.
list_features = list(features)
max_score = []
best_kernel_set = []
loo = LeaveOneOut()
k = [7]  # neighbourhood size(s) handed to get_k_outcome
# The label column does not change between splits, so build it once.
y = np.array(train_n_targets).reshape(-1,1)
# One feature is removed per round, so len - 1 rounds leave a single feature.
# A fixed 36 rounds would finish by scoring an empty feature set.
for draw in range(len(list_features) - 1):
    result_of_feature_combine = []
    kernel_set = []
    print('the Featureas we have to do {}'.format(list_features))
    for j in list_features:

        kernel = list_features.copy()
        print('the feature we ignoring : {}'.format(j))
        kernel.remove(j)
        print('the Featurea we ARE LOOPING are {}'.format(kernel))
        X = zdf[kernel]
        X_values = X.values
        kernel_set.append(kernel)
        S = []
        # Score each split as it is produced rather than first storing every
        # train/test copy of the data in lists.
        for train_index, test_index in loo.split(X):
            distances = get_distance(X_values[train_index], X_values[test_index])
            S.append(get_k_outcome(distances, k, y[train_index], y[test_index]))
        Ave_acc = np.mean(S)  # mean leave-one-out accuracy for this candidate set
        print('the Accuraccy is {}'.format(Ave_acc))
        result_of_feature_combine.append(Ave_acc)
    print('the Feature set is {}'.format(list_features))
    best_score = max(result_of_feature_combine)
    print('the Max Accuraccy in this set is {}'.format(best_score))

    max_score.append(best_score)
    # index() returns the first candidate that reached the best score.
    get = result_of_feature_combine.index(best_score)
    print('the Best Feature set is {}'.format(kernel_set[get]))

    # The winning reduced set becomes the starting point of the next round.
    list_features = kernel_set[get].copy()
    best_kernel_set.append(list_features)
print('the Max Accuraccy for each set are {}'.format(max_score))
dictonary_outcome = {'score':max_score,'kernel':best_kernel_set}
data_backwarsd = pd.DataFrame(dictonary_outcome)
data_backwarsd.to_csv('kernel_backward.csv',index = False)

the Featureas we have to do ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'f29', 'f30', 'f31', 'f32', 'f33', 'f34', 'f35']
the feature we ignoring : f0
the Featurea we ARE LOOPING are ['f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'f29', 'f30', 'f31', 'f32', 'f33', 'f34', 'f35']
the Accuraccy is 0.8274231678486997
the feature we ignoring : f1
the Featurea we ARE LOOPING are ['f0', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'f29', 'f30', 'f31', 'f32', 'f33', 'f34', 'f35']
the Accuraccy is 0.8132387706855791
the feature we ignoring : f2
the Featurea we ARE LOOPING are 

the Accuraccy is 0.8274231678486997
the feature we ignoring : f24
the Featurea we ARE LOOPING are ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23', 'f25', 'f26', 'f27', 'f28', 'f29', 'f30', 'f31', 'f32', 'f33', 'f34', 'f35']
the Accuraccy is 0.8203309692671394
the feature we ignoring : f25
the Featurea we ARE LOOPING are ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23', 'f24', 'f26', 'f27', 'f28', 'f29', 'f30', 'f31', 'f32', 'f33', 'f34', 'f35']
the Accuraccy is 0.8215130023640662
the feature we ignoring : f26
the Featurea we ARE LOOPING are ['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21', 'f22', 'f23', 'f24', 'f25', 'f27', 'f28', 'f29', 'f30', 'f31', 'f32', 'f33', 'f34', 'f35'

KeyboardInterrupt: 