In [123]:
import graphlab
import numpy as np
import sklearn as skl
import time
import itertools
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import KFold
graphlab.canvas.set_target('ipynb')

In [76]:
wine_data = graphlab.SFrame('winequality-white.csv')

------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[float,float,float,float,float,float,float,float,float,float,float,int]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


In [77]:
features = ['fixed acidity','volatile acidity','citric acid','residual sugar','chlorides',
            'free sulfur dioxide','total sulfur dioxide','density','pH','sulphates','alcohol']

In [78]:
train_data,validation_data = wine_data.random_split(.8,seed=1)

In [79]:
training_label = train_data['quality']
validating_label = validation_data['quality']
training_data = train_data[features]
validating_data = validation_data[features]
validation_data

fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide
6.3,0.3,0.34,1.6,0.049,14.0,132.0
8.1,0.22,0.43,1.5,0.044,28.0,129.0
8.3,0.42,0.62,19.25,0.04,41.0,172.0
7.4,0.34,0.42,1.1,0.033,17.0,171.0
6.6,0.27,0.41,1.3,0.052,16.0,142.0
8.3,0.14,0.34,1.1,0.042,7.0,47.0
7.4,0.25,0.36,2.05,0.05,31.0,100.0
6.5,0.39,0.23,5.4,0.051,25.0,149.0
7.3,0.24,0.39,17.95,0.057,45.0,149.0
7.2,0.19,0.31,1.6,0.062,31.0,173.0

density,pH,sulphates,alcohol,quality
0.994,3.3,0.49,9.5,6
0.9938,3.22,0.45,11.0,6
1.0002,2.98,0.67,9.7,5
0.9917,3.12,0.53,11.3,6
0.9951,3.42,0.47,10.0,6
0.9934,3.47,0.4,10.2,6
0.992,3.19,0.44,10.8,6
0.9934,3.24,0.35,10.0,5
0.9999,3.21,0.36,8.6,5
0.9917,3.35,0.44,11.7,6


In [80]:
training_data_np = training_data.to_numpy()
training_label_np = training_label.to_numpy()
validation_data_np = validating_data.to_numpy()
validation_label_np = validating_label.to_numpy()

In [119]:
wine_data_np = wine_data['quality'].to_numpy()

In [81]:
class_names = ['1','2','3','4','5','6','7','8','9','10']

In [82]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [83]:
def knn_model(neighbournum,algorithm,metric):
    for i in range(1,neighbournum):
        print '************** neighbours_num=',i,' ***************'
        starttime = time.time()
        knn_lin_mandis = KNeighborsClassifier(n_neighbors=i,algorithm=algorithm,metric=metric)
        knn_lin_mandis.fit(training_data_np,training_label_np)
        stoptime = time.time()
        usetime = stoptime - starttime
        print 'time:',usetime
        pre = knn_lin_mandis.predict(validation_data_np)
        pre_correct = sum(pre == validation_label_np)
        accuracy = pre_correct / float(len(validation_label_np))
        print 'accuracy:', accuracy
        # Compute confusion matrix
        cnf_matrix = confusion_matrix(validation_label_np, pre)
        np.set_printoptions(precision=2)
        # Plot non-normalized confusion matrix
        plt.figure()
        plot_confusion_matrix(cnf_matrix, classes=class_names,
                          title='Confusion matrix, without normalization')

In [126]:
folds = graphlab.cross_validation.KFold(wine_data,10)

In [130]:
def knn_model_with10N_kfold(algorithm,metric):
    for train_set,validation_set in folds:
        t_d_np = train_set[features].to_numpy()
        t_l_np = train_set['quality'].to_numpy()
        v_d_np = validation_set[features].to_numpy()
        v_l_np = validation_set['quality'].to_numpy()
        starttime = time.time()
        knn_lin_mandis = KNeighborsClassifier(n_neighbors=10,algorithm=algorithm,metric=metric)
        knn_lin_mandis.fit(t_d_np,t_l_np)
        stoptime = time.time()
        usetime = stoptime - starttime
        print 'time:',usetime
        pre = knn_lin_mandis.predict(v_d_np)
        pre_correct = sum(pre == v_l_np)
        accuracy = pre_correct / float(len(v_l_np))
        print 'accuracy:', accuracy
        # Compute confusion matrix
        cnf_matrix = confusion_matrix(v_l_np, pre)
        np.set_printoptions(precision=2)
        # Plot non-normalized confusion matrix
        plt.figure()
        plot_confusion_matrix(cnf_matrix, classes=class_names,
                          title='Confusion matrix, without normalization')

# 1. k-nn use k-d tree

## 1.1 use train and validation set with Manhattan distance

In [84]:
knn_model(11,'kd_tree','manhattan')

************** neighbours_num= 1  ***************
time: 0.00522398948669
accuracy: 0.604125736739
Confusion matrix, without normalization
[[  0   1   1   4   0   0   0]
 [  0   7  16  12   2   0   0]
 [  0   7 179  67  23   3   1]
 [  0   2 110 312  52   9   0]
 [  0   1  23  41 100   3   0]
 [  0   0   4  11   9  17   0]
 [  0   0   0   0   1   0   0]]
************** neighbours_num= 2  ***************
time: 0.0100500583649
accuracy: 0.511787819253
Confusion matrix, without normalization
[[  0   1   4   1   0   0   0]
 [  1   8  20   7   1   0   0]
 [  0  17 200  54   8   1   0]
 [  0  10 193 257  25   0   0]
 [  0   2  37  77  52   0   0]
 [  0   0  12  14  11   4   0]
 [  0   0   0   1   0   0   0]]
************** neighbours_num= 3  ***************
time: 0.00424814224243
accuracy: 0.501964636542
Confusion matrix, without normalization
[[  0   1   2   3   0   0   0]
 [  1   7  20   8   1   0   0]
 [  1  11 155  95  17   1   0]
 [  0  11 132 283  56   3   0]
 [  1   4  41  58  61   3  

## 1.2 use train and validation set with Euclidean distance

In [18]:
knn_model(11,'kd_tree','euclidean')

************** neighbours_num= 1  ***************
time: 0.00609016418457
accuracy: 0.602161100196
Confusion matrix, without normalization
[[  0   1   2   3   0   0   0]
 [  1   7  13  13   3   0   0]
 [  0   7 183  69  19   1   1]
 [  0   7 105 309  55   9   0]
 [  0   1  23  43  97   4   0]
 [  0   0   3  12   9  17   0]
 [  0   0   0   0   1   0   0]]
************** neighbours_num= 2  ***************
time: 0.00489902496338
accuracy: 0.486247544204
Confusion matrix, without normalization
[[  0   1   4   1   0   0   0]
 [  1   8  19   8   1   0   0]
 [  0  17 201  54   7   1   0]
 [  0  14 203 239  29   0   0]
 [  1   1  53  69  43   1   0]
 [  0   0  12  13  12   4   0]
 [  0   0   0   1   0   0   0]]
************** neighbours_num= 3  ***************
time: 0.0105559825897
accuracy: 0.481335952849
Confusion matrix, without normalization
[[  0   1   2   3   0   0   0]
 [  1   5  23   7   1   0   0]
 [  1  12 152  97  17   1   0]
 [  0   8 143 274  59   1   0]
 [  1   1  45  63  55   3  

## 1.3 use train and validation set with chebyshev distance

In [28]:
knn_model(11,'kd_tree','chebyshev')

************** neighbours_num= 1  ***************
time: 0.00736117362976
accuracy: 0.595284872299
Confusion matrix, without normalization
[[  0   1   3   1   1   0   0]
 [  1   8  14  12   1   1   0]
 [  0   4 184  70  20   2   0]
 [  0   6  95 304  69  11   0]
 [  0   1  27  41  94   5   0]
 [  0   0   5  12   8  16   0]
 [  0   0   0   0   1   0   0]]
************** neighbours_num= 2  ***************
time: 0.0120341777802
accuracy: 0.494106090373
Confusion matrix, without normalization
[[  0   1   4   0   1   0   0]
 [  1   9  18   8   0   1   0]
 [  1  13 204  54   7   1   0]
 [  0  14 198 240  32   1   0]
 [  1   1  49  69  46   2   0]
 [  0   0  10  20   7   4   0]
 [  0   0   0   1   0   0   0]]
************** neighbours_num= 3  ***************
time: 0.00272989273071
accuracy: 0.470530451866
Confusion matrix, without normalization
[[  0   1   3   1   1   0   0]
 [  0   5  23   8   0   1   0]
 [  2  12 151  96  17   2   0]
 [  0  12 150 261  60   2   0]
 [  1   1  46  60  57   3  

## 2.4 using k-fold cross validate with Manhattan distance

In [131]:
knn_model_with10N_kfold('kd_tree','manhattan')

time: 0.00491499900818
accuracy: 0.39387755102
Confusion matrix, without normalization
[[  0   1   1   2   0   0]
 [  0   2   5   9   0   0]
 [  1   1  69  83   8   1]
 [  0   1  80 113  19   0]
 [  0   0  13  56   9   0]
 [  0   0   1  10   5   0]]
time: 0.00419402122498
accuracy: 0.416326530612
Confusion matrix, without normalization
[[ 0  0  1  1  0  0  0]
 [ 0  2 11  6  0  0  0]
 [ 0  3 90 54  1  0  0]
 [ 0  2 77 95 15  2  0]
 [ 0  0 25 63 17  0  0]
 [ 0  0  7 12  2  0  0]
 [ 0  0  1  3  0  0  0]]
time: 0.00366497039795
accuracy: 0.40612244898
Confusion matrix, without normalization
[[  0   0   1   2   0   0]
 [  0   0  11  10   1   0]
 [  0   2  66  60   9   0]
 [  0   1  68 121  16   0]
 [  0   0  22  62  12   0]
 [  0   0   4  16   6   0]]
time: 0.00372290611267
accuracy: 0.414285714286
Confusion matrix, without normalization
[[  0   0   3   0   0   0   0]
 [  0   1  11   8   1   0   0]
 [  0   2  65  76   7   2   0]
 [  0   1  87 124  15   1   0]
 [  0   1  20  40  13   0   0]


## 2.5 using k-fold cross validate with Euclidean distance

In [132]:
knn_model_with10N_kfold('kd_tree','euclidean')

time: 0.00397396087646
accuracy: 0.4
Confusion matrix, without normalization
[[  0   1   2   1   0   0]
 [  0   2   6   8   0   0]
 [  0   2  61  88  11   1]
 [  0   1  73 127  12   0]
 [  0   0  17  54   6   1]
 [  0   0   1  10   5   0]]
time: 0.00357604026794
accuracy: 0.418367346939
Confusion matrix, without normalization
[[  0   1   0   1   0   0   0]
 [  0   3   9   6   1   0   0]
 [  0   3  81  60   4   0   0]
 [  0   2  70 102  15   2   0]
 [  0   0  21  65  19   0   0]
 [  0   0   7  13   1   0   0]
 [  0   0   2   2   0   0   0]]
time: 0.00354695320129
accuracy: 0.383673469388
Confusion matrix, without normalization
[[  0   0   1   2   0   0]
 [  0   0  13   8   1   0]
 [  0   2  61  65   9   0]
 [  0   1  68 116  21   0]
 [  0   1  20  64  11   0]
 [  0   0   5  15   6   0]]
time: 0.00488209724426
accuracy: 0.402040816327
Confusion matrix, without normalization
[[  0   0   2   1   0   0   0]
 [  0   1  10   8   2   0   0]
 [  0   4  63  69  12   4   0]
 [  0   1  91 120  14 

## using k-fold cross validate with chebyshev distance

In [133]:
knn_model_with10N_kfold('kd_tree','chebyshev')

time: 0.00505685806274
accuracy: 0.40612244898
Confusion matrix, without normalization
[[  0   1   2   1   0   0]
 [  0   1   6   9   0   0]
 [  0   0  59  93  10   1]
 [  0   2  66 129  16   0]
 [  0   1  18  49  10   0]
 [  0   0   1  10   5   0]]
time: 0.00255393981934
accuracy: 0.40612244898
Confusion matrix, without normalization
[[ 0  1  0  1  0  0  0]
 [ 0  0 10  9  0  0  0]
 [ 0  5 78 60  5  0  0]
 [ 0  1 71 99 18  2  0]
 [ 0  0 17 66 22  0  0]
 [ 0  0  4 16  1  0  0]
 [ 0  0  1  3  0  0  0]]
time: 0.00316500663757
accuracy: 0.391836734694
Confusion matrix, without normalization
[[  0   0   1   2   0   0]
 [  0   1  11   9   1   0]
 [  0   2  58  68   9   0]
 [  0   0  71 121  14   0]
 [  0   2  19  63  12   0]
 [  0   0   5  16   5   0]]
time: 0.0027379989624
accuracy: 0.4
Confusion matrix, without normalization
[[  0   0   2   1   0   0   0]
 [  0   2  10   8   1   0   0]
 [  0   7  65  65  13   2   0]
 [  0   1  86 121  18   2   0]
 [  0   0  24  42   8   0   0]
 [  0   0   

# 2. k-nn use linear search

## 2.1 use train and validation set with Manhattan distance

In [9]:
validation_data_np.shape,training_data_np.shape,validation_label.shape,training_label_np.shape

((1018, 11), (3880, 11), (1018,), (3880,))

In [30]:
knn_model(11,'brute','manhattan')

************** neighbours_num= 1  ***************
time: 0.00127005577087
accuracy: 0.604125736739
Confusion matrix, without normalization
[[  0   1   1   4   0   0   0]
 [  0   7  16  12   2   0   0]
 [  0   7 179  67  23   3   1]
 [  0   2 110 312  52   9   0]
 [  0   1  23  41 100   3   0]
 [  0   0   4  11   9  17   0]
 [  0   0   0   0   1   0   0]]
************** neighbours_num= 2  ***************
time: 0.000601053237915
accuracy: 0.511787819253
Confusion matrix, without normalization
[[  0   1   4   1   0   0   0]
 [  1   8  20   7   1   0   0]
 [  0  17 200  54   8   1   0]
 [  0  10 193 257  25   0   0]
 [  0   2  37  77  52   0   0]
 [  0   0  12  14  11   4   0]
 [  0   0   0   1   0   0   0]]
************** neighbours_num= 3  ***************
time: 0.000591039657593
accuracy: 0.501964636542
Confusion matrix, without normalization
[[  0   1   2   3   0   0   0]
 [  1   7  20   8   1   0   0]
 [  1  11 155  95  17   1   0]
 [  0  11 132 283  56   3   0]
 [  1   4  41  58  61   

## 2.2 use train and validation set with euclidean distance

In [32]:
knn_model(11,'brute','euclidean')

************** neighbours_num= 1  ***************
time: 0.00154995918274
accuracy: 0.602161100196
Confusion matrix, without normalization
[[  0   1   2   3   0   0   0]
 [  1   7  13  13   3   0   0]
 [  0   7 183  69  19   1   1]
 [  0   7 105 309  55   9   0]
 [  0   1  23  43  97   4   0]
 [  0   0   3  12   9  17   0]
 [  0   0   0   0   1   0   0]]
************** neighbours_num= 2  ***************
time: 0.00058913230896
accuracy: 0.486247544204
Confusion matrix, without normalization
[[  0   1   4   1   0   0   0]
 [  1   8  19   8   1   0   0]
 [  0  17 201  54   7   1   0]
 [  0  14 203 239  29   0   0]
 [  1   1  53  69  43   1   0]
 [  0   0  12  13  12   4   0]
 [  0   0   0   1   0   0   0]]
************** neighbours_num= 3  ***************
time: 0.00103402137756
accuracy: 0.481335952849
Confusion matrix, without normalization
[[  0   1   2   3   0   0   0]
 [  1   5  23   7   1   0   0]
 [  1  12 152  97  17   1   0]
 [  0   8 143 274  59   1   0]
 [  1   1  45  63  55   3 

## 2.3 use train and validation set with cosine distance

In [43]:
train_data

fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide
7.0,0.27,0.36,20.7,0.045,45.0,170.0
6.3,0.3,0.34,1.6,0.049,14.0,132.0
8.1,0.28,0.4,6.9,0.05,30.0,97.0
7.2,0.23,0.32,8.5,0.058,47.0,186.0
7.2,0.23,0.32,8.5,0.058,47.0,186.0
8.1,0.28,0.4,6.9,0.05,30.0,97.0
6.2,0.32,0.16,7.0,0.045,30.0,136.0
7.0,0.27,0.36,20.7,0.045,45.0,170.0
8.1,0.27,0.41,1.45,0.033,11.0,63.0
8.6,0.23,0.4,4.2,0.035,17.0,109.0

density,pH,sulphates,alcohol
1.001,3.0,0.45,8.8
0.994,3.3,0.49,9.5
0.9951,3.26,0.44,10.1
0.9956,3.19,0.4,9.9
0.9956,3.19,0.4,9.9
0.9951,3.26,0.44,10.1
0.9949,3.18,0.47,9.6
1.001,3.0,0.45,8.8
0.9908,2.99,0.56,12.0
0.9947,3.14,0.53,9.7


In [95]:
knn = graphlab.nearest_neighbor_classifier.create(train_data,target='quality',distance='cosine')
knn.summary()

Class                                : NearestNeighborClassifier

Schema
------
Number of examples                   : 3880
Number of feature columns            : 11
Number of unpacked features          : 11
Number of distance components        : 1
Number of classes                    : 7

Training Summary
----------------
Training time (seconds)              : 0.0836



In [140]:
eva1 = knn.evaluate(validation_data,metric='accuracy',max_neighbors=10)
eva2 = knn.evaluate(validation_data,metric='confusion_matrix',max_neighbors=10)
pre = knn.predict(validation_data,max_neighbors=10)
print 'accuracy:', eva
print 'confusion_matrix:' ,eva2





accuracy: {'accuracy': 0.47544204322200395}
confusion_matrix: {'confusion_matrix': Columns:
	target_label	int
	predicted_label	int
	count	int

Rows: 26

Data:
+--------------+-----------------+-------+
| target_label | predicted_label | count |
+--------------+-----------------+-------+
|      4       |        8        |   1   |
|      4       |        7        |   1   |
|      6       |        8        |   1   |
|      6       |        7        |   55  |
|      7       |        8        |   3   |
|      8       |        5        |   5   |
|      8       |        7        |   12  |
|      5       |        6        |  108  |
|      5       |        8        |   2   |
|      3       |        5        |   2   |
+--------------+-----------------+-------+
[26 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.}


## 2.4 using k-fold cross validate with Manhattan distance


In [134]:
knn_model_with10N_kfold('brute','manhattan')

time: 0.000869989395142
accuracy: 0.39387755102
Confusion matrix, without normalization
[[  0   1   1   2   0   0]
 [  0   2   5   9   0   0]
 [  1   1  69  83   8   1]
 [  0   1  80 113  19   0]
 [  0   0  13  56   9   0]
 [  0   0   1  10   5   0]]
time: 0.000609159469604
accuracy: 0.416326530612
Confusion matrix, without normalization
[[ 0  0  1  1  0  0  0]
 [ 0  2 11  6  0  0  0]
 [ 0  3 90 54  1  0  0]
 [ 0  2 77 95 15  2  0]
 [ 0  0 25 63 17  0  0]
 [ 0  0  7 12  2  0  0]
 [ 0  0  1  3  0  0  0]]
time: 0.000692844390869
accuracy: 0.40612244898
Confusion matrix, without normalization
[[  0   0   1   2   0   0]
 [  0   0  11  10   1   0]
 [  0   2  66  60   9   0]
 [  0   1  68 121  16   0]
 [  0   0  22  62  12   0]
 [  0   0   4  16   6   0]]
time: 0.00140881538391
accuracy: 0.414285714286
Confusion matrix, without normalization
[[  0   0   3   0   0   0   0]
 [  0   1  11   8   1   0   0]
 [  0   2  65  76   7   2   0]
 [  0   1  87 124  15   1   0]
 [  0   1  20  40  13   0   

## 2.5 using k-fold cross validate with euclidean distance

In [136]:
knn_model_with10N_kfold('brute','euclidean')

time: 0.00065803527832
accuracy: 0.4
Confusion matrix, without normalization
[[  0   1   2   1   0   0]
 [  0   2   6   8   0   0]
 [  0   2  61  88  11   1]
 [  0   1  73 127  12   0]
 [  0   0  17  54   6   1]
 [  0   0   1  10   5   0]]
time: 0.00096607208252
accuracy: 0.418367346939
Confusion matrix, without normalization
[[  0   1   0   1   0   0   0]
 [  0   3   9   6   1   0   0]
 [  0   3  81  60   4   0   0]
 [  0   2  70 102  15   2   0]
 [  0   0  21  65  19   0   0]
 [  0   0   7  13   1   0   0]
 [  0   0   2   2   0   0   0]]
time: 0.00084400177002
accuracy: 0.383673469388
Confusion matrix, without normalization
[[  0   0   1   2   0   0]
 [  0   0  13   8   1   0]
 [  0   2  61  65   9   0]
 [  0   1  68 116  21   0]
 [  0   1  20  64  11   0]
 [  0   0   5  15   6   0]]
time: 0.00090503692627
accuracy: 0.402040816327
Confusion matrix, without normalization
[[  0   0   2   1   0   0   0]
 [  0   1  10   8   2   0   0]
 [  0   4  63  69  12   4   0]
 [  0   1  91 120  14 

## 2.6 using k-fold cross validate with cosine distance

In [139]:
for train,validation in folds:
    knn = graphlab.nearest_neighbor_classifier.create(train,target='quality',distance='cosine')
    knn.summary()
    eva1 = knn.evaluate(validation,metric='accuracy',max_neighbors=10)
    eva2 = knn.evaluate(validation,metric='confusion_matrix',max_neighbors=10)
    pre = knn.predict(validation,max_neighbors=10)
    print 'accuracy:', eva
    print 'confusion_matrix:' ,eva2

Class                                : NearestNeighborClassifier

Schema
------
Number of examples                   : 4408
Number of feature columns            : 11
Number of unpacked features          : 11
Number of distance components        : 1
Number of classes                    : 7

Training Summary
----------------
Training time (seconds)              : 0.1879





accuracy: {'accuracy': 0.47544204322200395}
confusion_matrix: {'confusion_matrix': Columns:
	target_label	int
	predicted_label	int
	count	int

Rows: 23

Data:
+--------------+-----------------+-------+
| target_label | predicted_label | count |
+--------------+-----------------+-------+
|      7       |        8        |   5   |
|      7       |        4        |   2   |
|      7       |        5        |   12  |
|      4       |        4        |   2   |
|      3       |        6        |   3   |
|      4       |        5        |   7   |
|      6       |        4        |   1   |
|      8       |        5        |   2   |
|      3       |        5        |   1   |
|      4       |        6        |   7   |
+--------------+-----------------+-------+
[23 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.}


Class                                : NearestNeighborClassifier

Schema
------
Number of examples                   : 4408
Number of feature columns            : 11
Number of unpacked features          : 11
Number of distance components        : 1
Number of classes                    : 7

Training Summary
----------------
Training time (seconds)              : 0.1942





accuracy: {'accuracy': 0.47544204322200395}
confusion_matrix: {'confusion_matrix': Columns:
	target_label	int
	predicted_label	int
	count	int

Rows: 22

Data:
+--------------+-----------------+-------+
| target_label | predicted_label | count |
+--------------+-----------------+-------+
|      8       |        5        |   1   |
|      9       |        6        |   3   |
|      5       |        7        |   3   |
|      3       |        6        |   2   |
|      9       |        5        |   1   |
|      7       |        4        |   1   |
|      4       |        7        |   1   |
|      8       |        7        |   4   |
|      8       |        6        |   15  |
|      4       |        4        |   2   |
+--------------+-----------------+-------+
[22 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.}


Class                                : NearestNeighborClassifier

Schema
------
Number of examples                   : 4408
Number of feature columns            : 11
Number of unpacked features          : 11
Number of distance components        : 1
Number of classes                    : 7

Training Summary
----------------
Training time (seconds)              : 0.2064





accuracy: {'accuracy': 0.47544204322200395}
confusion_matrix: {'confusion_matrix': Columns:
	target_label	int
	predicted_label	int
	count	int

Rows: 20

Data:
+--------------+-----------------+-------+
| target_label | predicted_label | count |
+--------------+-----------------+-------+
|      6       |        4        |   2   |
|      8       |        6        |   19  |
|      3       |        6        |   2   |
|      8       |        7        |   3   |
|      4       |        7        |   5   |
|      7       |        8        |   1   |
|      4       |        6        |   6   |
|      5       |        4        |   2   |
|      5       |        5        |   60  |
|      5       |        7        |   11  |
+--------------+-----------------+-------+
[20 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.}


Class                                : NearestNeighborClassifier

Schema
------
Number of examples                   : 4408
Number of feature columns            : 11
Number of unpacked features          : 11
Number of distance components        : 1
Number of classes                    : 7

Training Summary
----------------
Training time (seconds)              : 0.2394





accuracy: {'accuracy': 0.47544204322200395}
confusion_matrix: {'confusion_matrix': Columns:
	target_label	int
	predicted_label	int
	count	int

Rows: 22

Data:
+--------------+-----------------+-------+
| target_label | predicted_label | count |
+--------------+-----------------+-------+
|      7       |        8        |   1   |
|      7       |        7        |   11  |
|      9       |        6        |   1   |
|      5       |        7        |   11  |
|      5       |        5        |   71  |
|      4       |        7        |   1   |
|      8       |        7        |   2   |
|      7       |        5        |   25  |
|      4       |        4        |   1   |
|      6       |        7        |   21  |
+--------------+-----------------+-------+
[22 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.}


Class                                : NearestNeighborClassifier

Schema
------
Number of examples                   : 4408
Number of feature columns            : 11
Number of unpacked features          : 11
Number of distance components        : 1
Number of classes                    : 7

Training Summary
----------------
Training time (seconds)              : 0.3509





accuracy: {'accuracy': 0.47544204322200395}
confusion_matrix: {'confusion_matrix': Columns:
	target_label	int
	predicted_label	int
	count	int

Rows: 20

Data:
+--------------+-----------------+-------+
| target_label | predicted_label | count |
+--------------+-----------------+-------+
|      6       |        4        |   1   |
|      6       |        7        |   24  |
|      4       |        5        |   12  |
|      8       |        6        |   3   |
|      4       |        7        |   2   |
|      8       |        7        |   2   |
|      7       |        6        |   23  |
|      4       |        6        |   11  |
|      5       |        4        |   2   |
|      8       |        5        |   8   |
+--------------+-----------------+-------+
[20 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.}


Class                                : NearestNeighborClassifier

Schema
------
Number of examples                   : 4408
Number of feature columns            : 11
Number of unpacked features          : 11
Number of distance components        : 1
Number of classes                    : 7

Training Summary
----------------
Training time (seconds)              : 0.216





accuracy: {'accuracy': 0.47544204322200395}
confusion_matrix: {'confusion_matrix': Columns:
	target_label	int
	predicted_label	int
	count	int

Rows: 20

Data:
+--------------+-----------------+-------+
| target_label | predicted_label | count |
+--------------+-----------------+-------+
|      7       |        8        |   1   |
|      7       |        7        |   26  |
|      8       |        7        |   5   |
|      4       |        8        |   1   |
|      6       |        4        |   3   |
|      6       |        7        |   37  |
|      8       |        6        |   14  |
|      4       |        6        |   2   |
|      5       |        4        |   2   |
|      8       |        5        |   2   |
+--------------+-----------------+-------+
[20 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.}


Class                                : NearestNeighborClassifier

Schema
------
Number of examples                   : 4408
Number of feature columns            : 11
Number of unpacked features          : 11
Number of distance components        : 1
Number of classes                    : 7

Training Summary
----------------
Training time (seconds)              : 0.1999





accuracy: {'accuracy': 0.47544204322200395}
confusion_matrix: {'confusion_matrix': Columns:
	target_label	int
	predicted_label	int
	count	int

Rows: 21

Data:
+--------------+-----------------+-------+
| target_label | predicted_label | count |
+--------------+-----------------+-------+
|      7       |        8        |   1   |
|      3       |        6        |   4   |
|      5       |        4        |   1   |
|      4       |        6        |   4   |
|      7       |        4        |   1   |
|      4       |        5        |   4   |
|      8       |        5        |   2   |
|      7       |        5        |   4   |
|      4       |        4        |   1   |
|      6       |        4        |   3   |
+--------------+-----------------+-------+
[21 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.}


Class                                : NearestNeighborClassifier

Schema
------
Number of examples                   : 4408
Number of feature columns            : 11
Number of unpacked features          : 11
Number of distance components        : 1
Number of classes                    : 7

Training Summary
----------------
Training time (seconds)              : 0.1736





accuracy: {'accuracy': 0.47544204322200395}
confusion_matrix: {'confusion_matrix': Columns:
	target_label	int
	predicted_label	int
	count	int

Rows: 19

Data:
+--------------+-----------------+-------+
| target_label | predicted_label | count |
+--------------+-----------------+-------+
|      4       |        7        |   1   |
|      8       |        7        |   8   |
|      4       |        5        |   4   |
|      7       |        8        |   1   |
|      7       |        7        |   22  |
|      7       |        5        |   17  |
|      6       |        4        |   2   |
|      4       |        6        |   7   |
|      3       |        5        |   1   |
|      8       |        5        |   1   |
+--------------+-----------------+-------+
[19 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.}


Class                                : NearestNeighborClassifier

Schema
------
Number of examples                   : 4409
Number of feature columns            : 11
Number of unpacked features          : 11
Number of distance components        : 1
Number of classes                    : 7

Training Summary
----------------
Training time (seconds)              : 0.1718





accuracy: {'accuracy': 0.47544204322200395}
confusion_matrix: {'confusion_matrix': Columns:
	target_label	int
	predicted_label	int
	count	int

Rows: 19

Data:
+--------------+-----------------+-------+
| target_label | predicted_label | count |
+--------------+-----------------+-------+
|      4       |        7        |   2   |
|      7       |        6        |   40  |
|      6       |        4        |   1   |
|      6       |        7        |   62  |
|      6       |        8        |   4   |
|      7       |        8        |   3   |
|      4       |        4        |   1   |
|      7       |        5        |   7   |
|      5       |        4        |   1   |
|      4       |        6        |   8   |
+--------------+-----------------+-------+
[19 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.}


Class                                : NearestNeighborClassifier

Schema
------
Number of examples                   : 4409
Number of feature columns            : 11
Number of unpacked features          : 11
Number of distance components        : 1
Number of classes                    : 7

Training Summary
----------------
Training time (seconds)              : 0.1555





accuracy: {'accuracy': 0.47544204322200395}
confusion_matrix: {'confusion_matrix': Columns:
	target_label	int
	predicted_label	int
	count	int

Rows: 19

Data:
+--------------+-----------------+-------+
| target_label | predicted_label | count |
+--------------+-----------------+-------+
|      8       |        6        |   3   |
|      4       |        5        |   4   |
|      6       |        5        |   43  |
|      8       |        7        |   2   |
|      5       |        4        |   1   |
|      4       |        6        |   5   |
|      7       |        4        |   1   |
|      7       |        5        |   8   |
|      6       |        7        |   65  |
|      6       |        8        |   5   |
+--------------+-----------------+-------+
[19 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.}


## 2.7 using k-fold cross validate with chebyshev distance

In [141]:
knn_model_with10N_kfold('brute','chebyshev')

time: 0.000767230987549
accuracy: 0.395918367347
Confusion matrix, without normalization
[[  0   1   2   1   0   0]
 [  0   1   6   9   0   0]
 [  0   0  58  93  11   1]
 [  0   1  72 126  14   0]
 [  0   1  17  51   9   0]
 [  0   0   0  11   5   0]]
time: 0.000631093978882
accuracy: 0.422448979592
Confusion matrix, without normalization
[[  0   1   0   1   0   0   0]
 [  0   0   9  10   0   0   0]
 [  0   5  81  57   5   0   0]
 [  0   0  65 107  17   2   0]
 [  0   0  16  70  19   0   0]
 [  0   0   7  13   1   0   0]
 [  0   0   1   3   0   0   0]]
time: 0.000849008560181
accuracy: 0.383673469388
Confusion matrix, without normalization
[[  0   0   1   2   0   0]
 [  0   0   9  12   1   0]
 [  0   2  58  69   8   0]
 [  0   0  68 118  20   0]
 [  0   2  18  64  12   0]
 [  0   0   7  13   6   0]]
time: 0.000625133514404
accuracy: 0.4
Confusion matrix, without normalization
[[  0   0   2   1   0   0   0]
 [  0   2  10   9   0   0   0]
 [  0   6  66  65  13   2   0]
 [  0   1  89 120 