In [1]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

In [2]:
class KNN(object):
    """Random-subspace ensemble of classifiers combined by majority vote.

    ``resample_times`` clones of ``model`` are each trained on a random
    subset of ``feature_subset_size`` feature columns; predictions of all
    clones are combined by a per-sample majority vote.

    Parameters
    ----------
    feature_subset_size : int
        Number of feature columns drawn (without replacement) per clone.
    train_set : pandas.DataFrame
        Training data. Must contain a ``'target'`` column holding the labels;
        every other column is treated as a feature.
    resample_times : int
        Number of clones (feature subsets) to train.
    model : estimator
        Prototype estimator; it is passed to ``clone`` and the clone is
        fitted with ``fit(X, Y)`` and queried with ``predict(X)``.

    Raises
    ------
    ValueError
        If ``train_set`` is not a DataFrame.
    """

    def __init__(self,feature_subset_size,train_set,resample_times,model):
        if not isinstance(train_set,pd.DataFrame):
            raise ValueError('train_set must be a DataFrame object')
        # reset_index returns a new frame, so the caller's DataFrame keeps
        # its own index instead of being renumbered behind its back.
        train_set = train_set.reset_index(drop=True)
        feature_columns = [c for c in train_set.columns.tolist() if c != 'target']
        self.feature_subset_size = feature_subset_size
        self.train_set = train_set[feature_columns]
        self.resample_times = resample_times
        self.model = model
        self.train_label = train_set['target'].values

    def fit(self):
        """Train one clone of ``self.model`` per random feature subset.

        Stores the fitted clones in ``self.S`` and the column labels each
        clone was trained on in ``self.F`` (same order).
        """
        all_colums_list = self.train_set.columns.tolist()
        Y = self.train_label
        S = []
        F = []
        for _ in range(self.resample_times):
            # Draw column positions rather than labels: passing the labels
            # themselves to np.random.choice would coerce mixed-type labels
            # (e.g. ints and strings) to one dtype and break the lookup.
            positions = np.random.choice(len(all_colums_list),self.feature_subset_size,replace=False)
            select_colums = [all_colums_list[p] for p in positions]
            X = self.train_set[select_colums].values
            S.append(clone(self.model).fit(X,Y))
            F.append(select_colums)
        # Assigned after the loop so the attributes exist even when
        # resample_times is 0.
        self.S = S
        self.F = F

    def predict(self,test_set,C):
        """Evaluate the ensemble on a labelled test set.

        Parameters
        ----------
        test_set : pandas.DataFrame
            Must contain the training feature columns and a ``'target'``
            column with the true labels.
        C : indexable
            Cost matrix, read as ``C[predicted_label][true_label]``; labels
            therefore have to be valid indices into ``C``.

        Returns
        -------
        (float, number)
            Accuracy of the majority vote and the summed cost over all test
            samples.

        Raises
        ------
        AttributeError
            If no fitted models are available (``fit`` not called, or called
            with ``resample_times`` of 0).
        """
        S = getattr(self,'S',None)
        F = getattr(self,'F',None)
        if not S or not F:
            raise AttributeError('no fitted models found; call fit() with resample_times >= 1 before predict()')
        # Use the feature columns remembered from training, not a notebook
        # global, so the object works outside the cell that created it.
        colums = self.train_set.columns.tolist()
        self.test_set = test_set[colums]
        self.test_label = test_set['target'].values

        # One row of predictions per clone, transposed to one row per sample.
        votes = np.array([fitted.predict(self.test_set[cols].values) for fitted,cols in zip(S,F)]).T
        true_label = []
        for sample_votes in votes:
            unique, counts = np.unique(sample_votes, return_counts=True)
            # argmax returns the first maximum, so ties go to the smallest label.
            true_label.append(unique[np.argmax(counts)])

        correct = 0
        cs = 0
        for predicted, actual in zip(true_label, self.test_label):
            if predicted == actual:
                correct += 1
            cs += C[predicted][actual]
        return correct*1.0 / len(true_label),cs

In [3]:
def make_cost_matrix(train_set):
    """Build a class-imbalance-based integer cost matrix from training labels.

    Parameters
    ----------
    train_set : pandas.DataFrame
        Must contain a ``'target'`` column with the class labels.

    Returns
    -------
    numpy.ndarray
        Square integer matrix with one row/column per distinct label, ordered
        as returned by ``np.unique``. Off-diagonal entry ``[i][j]`` is
        ``2000 * count_i / count_j``; each diagonal entry is a random draw
        from ``np.random.uniform(0, 1000)``. Values are truncated when stored
        in the integer array.
    """
    label_set = train_set['target'].values
    unique,counts = np.unique(label_set,return_counts=True)
    n_classes = len(unique)
    cost_matrix = np.zeros((n_classes,n_classes),dtype=int)
    for i in range(n_classes):
        for j in range(n_classes):
            if i==j:
                cost_matrix[i][j] = np.random.uniform(0,1000)
            else:
                # counts is aligned with unique by position, so index it with
                # i/j directly; indexing by the label value only works when
                # the labels happen to be exactly 0..k-1.
                cost_matrix[i][j] = 2000*counts[i] / counts[j]
    return cost_matrix
    

In [5]:
# Load the Wine data: the first CSV column is the class label, the remaining
# columns are numeric features named 0..n-1.
Wine_data = pd.read_csv('data_set/wine.data',header=None)
Wine_data.columns = ['target'] + [i for i in range(0,Wine_data.shape[1]-1)]

# Re-encode the class labels as consecutive integers 0..k-1 (sorted order).
unique = np.unique(Wine_data['target'].values,return_counts=False)
replace_dict = {unique[i]:i for i in range(len(unique))}
# print(replace_dict)
# Assign the re-encoded column back instead of calling replace(inplace=True)
# on the column selection: the chained in-place form does not update the
# parent frame under pandas Copy-on-Write and warns on recent pandas versions.
Wine_data['target'] = Wine_data['target'].map(replace_dict)
print(Wine_data)

# Min-max scale every feature column to [0, 1].
# NOTE(review): min/max are computed on the full data set before the
# train/test split performed later in the notebook, so the test rows
# influence the scaling -- confirm this is acceptable for the experiment.
Wine_features = Wine_data.iloc[:,1:]
Wine_data_ = (Wine_features-Wine_features.min())/(Wine_features.max()-Wine_features.min())
Wine_data_['target'] = Wine_data['target'].values
# print(data)
print(Wine_data_)
train_DataFrame = Wine_data_

     target      0     1     2     3    4     5     6     7     8      9  \
0         0  14.23  1.71  2.43  15.6  127  2.80  3.06  0.28  2.29   5.64   
1         0  13.20  1.78  2.14  11.2  100  2.65  2.76  0.26  1.28   4.38   
2         0  13.16  2.36  2.67  18.6  101  2.80  3.24  0.30  2.81   5.68   
3         0  14.37  1.95  2.50  16.8  113  3.85  3.49  0.24  2.18   7.80   
4         0  13.24  2.59  2.87  21.0  118  2.80  2.69  0.39  1.82   4.32   
..      ...    ...   ...   ...   ...  ...   ...   ...   ...   ...    ...   
173       2  13.71  5.65  2.45  20.5   95  1.68  0.61  0.52  1.06   7.70   
174       2  13.40  3.91  2.48  23.0  102  1.80  0.75  0.43  1.41   7.30   
175       2  13.27  4.28  2.26  20.0  120  1.59  0.69  0.43  1.35  10.20   
176       2  13.17  2.59  2.37  20.0  120  1.65  0.68  0.53  1.46   9.30   
177       2  14.13  4.10  2.74  24.5   96  2.05  0.76  0.56  1.35   9.20   

       10    11    12  
0    1.04  3.92  1065  
1    1.05  3.40  1050  
2    1.03  3.17

In [6]:
# iris = load_iris()
# iris_feature = iris.data
# iris_class = iris.target
# iris_DataFrame_feature= pd.DataFrame(iris_feature)
# iris_DataFrame = iris_DataFrame_feature.copy()
# iris_DataFrame['target'] = iris_class
# train_DataFrame = iris_DataFrame

In [7]:

# print(iris_DataFrame)
# C = np.array([[0, 1000, 1500], [2810, 0, 2292], [11, 16, 0]])
# Experiment settings, named so they can be tuned in one place.
SEED = 0
N_TRIALS = 20
TEST_FRACTION = 0.3
N_NEIGHBORS = 2
FEATURE_SUBSET_SIZE = 2
N_RESAMPLES = 50

# Seed NumPy's global RNG once: the random split, the random feature subsets
# and the random cost-matrix diagonal all draw from it, so without a seed the
# numbers printed below cannot be reproduced on a re-run.
np.random.seed(SEED)

sums=[]
cs_sum=[]
for i in range(N_TRIALS):
    train_set,test_set = train_test_split(train_DataFrame,test_size=TEST_FRACTION)
    C = make_cost_matrix(train_set)
    knn =KNeighborsClassifier(n_neighbors=N_NEIGHBORS)
    knn_classifier = KNN(FEATURE_SUBSET_SIZE,train_set,N_RESAMPLES,knn)
    knn_classifier.fit()
    acc,cs = knn_classifier.predict(test_set,C)
    sums.append(acc)
    # Store the per-sample average cost so trials are comparable.
    cs_sum.append(cs/len(test_set))
    print("acc:",acc,"   cs:",cs_sum[-1])
print("average_acc:",sum(sums)*1.0/len(sums))
print("average_cs:",sum(cs_sum)*1.0/len(cs_sum))
# print(type(sum))

acc: 0.8703703703703703    cs: 1003.5555555555555
acc: 0.9814814814814815    cs: 359.27777777777777
acc: 0.9444444444444444    cs: 446.75925925925924
acc: 0.9629629629629629    cs: 299.8703703703704
acc: 0.8703703703703703    cs: 1029.5555555555557
acc: 0.9444444444444444    cs: 433.72222222222223
acc: 0.7962962962962963    cs: 722.2037037037037
acc: 0.9074074074074074    cs: 387.48148148148147
acc: 0.8888888888888888    cs: 440.2962962962963
acc: 0.8703703703703703    cs: 659.1296296296297
acc: 0.8703703703703703    cs: 653.0555555555555
acc: 0.9629629629629629    cs: 688.7407407407408
acc: 0.9074074074074074    cs: 501.74074074074076
acc: 0.9814814814814815    cs: 550.425925925926
acc: 0.9629629629629629    cs: 619.5185185185185
acc: 0.8888888888888888    cs: 647.8518518518518
acc: 0.8703703703703703    cs: 533.1481481481482
acc: 0.8703703703703703    cs: 734.9444444444445
acc: 0.9259259259259259    cs: 410.1296296296296
acc: 0.9629629629629629    cs: 748.925925925926
average_acc: 0.