In [1]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.metrics import classification_report
from random import shuffle
import pickle

def pickle_operating(fname, item):
    """Save ``item`` to, or load an object from, ``<fname>.pickle``.

    The mode is selected by ``item``: pass ``None`` to load, anything else
    to save. The resolved file name is printed in both modes.

    Args:
        fname: Path of the pickle file without the ``.pickle`` suffix.
        item: Object to serialise, or ``None`` to read the file instead.

    Returns:
        The unpickled object when loading; ``None`` when saving.
    """
    file_name = '%s.pickle' % fname
    print(file_name)
    # Compare against None explicitly: a truthiness test would treat empty
    # containers / 0 as "load" and raises ValueError for multi-element
    # numpy arrays, so such items could never be saved.
    if item is None:
        # NOTE(review): pickle.load can execute arbitrary code embedded in
        # the file -- only load pickles from a trusted source.
        with open(file_name, 'rb') as fs:
            return pickle.load(fs)
    with open(file_name, 'wb') as fs:
        pickle.dump(item, fs, protocol=pickle.HIGHEST_PROTOCOL)

In [2]:
def learner(model):
    """Build an unfitted classifier for the given model name.

    Args:
        model: ``'knn'`` for a 3-nearest-neighbours classifier or ``'tree'``
            for a decision tree with default settings.

    Returns:
        A new, unfitted classifier instance.

    Raises:
        NameError: If ``model`` is not one of the supported names.
    """
    if model == 'knn':
        clf = KNeighborsClassifier(n_neighbors=3)
    elif model == 'tree':
        clf = tree.DecisionTreeClassifier()
    else:
        # The message lists the names this function actually accepts.
        raise NameError(
            'Unknown machine learning model %r. Please use one of: knn, tree' % (model,)
        )
    return clf


def experiment(train_data, test_data, model):
    """Fit ``model`` on the training rows and print a report for the test rows.

    Each row is indexed as ``row[0]`` (features) and ``row[1]`` (label).
    The caller's sequences are not modified.

    Args:
        train_data: Sequence of ``(features, label)`` rows used for fitting.
        test_data: Sequence of ``(features, label)`` rows used for scoring.
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.

    Returns:
        None. The classification report is printed to stdout.
    """
    # Shuffle a copy so the caller's (typically cached) dataset keeps its
    # order across repeated experiments.
    train_rows = list(train_data)
    shuffle(train_rows)
    X_train = [row[0] for row in train_rows]
    y_train = [row[1] for row in train_rows]

    # The test rows are not shuffled: predictions are made per sample and the
    # report aggregates over all of them, so their order does not matter.
    X_test = [row[0] for row in test_data]
    y_test = [row[1] for row in test_data]

    model.fit(X_train, y_train)
    print("Scores on test set: %s" % classification_report(y_test, model.predict(X_test)))
    # print("") emits a blank line on both Python 2 and 3; a bare print()
    # shows up as "()" when run under Python 2.
    print("")

In [9]:
# Load the Caltech split and evaluate each supported classifier on it.
dataset = pickle_operating('Caltech_data_3', None)
print(len(dataset['train']), len(dataset['test']))
for model_name in ('knn', 'tree'):
    model = learner(model_name)
    experiment(dataset['train'], dataset['test'], model)

Caltech_data_3.pickle
(320, 324)
Scores on test set:              precision    recall  f1-score   support

          1       0.18      0.61      0.27        36
          2       0.41      0.35      0.38        31
          3       0.54      0.58      0.56        33
          4       0.00      0.00      0.00        21
          5       0.83      0.56      0.67        43
          6       0.32      0.20      0.24        46
          7       0.32      0.22      0.26        32
          8       0.07      0.04      0.05        27
          9       0.88      0.28      0.42        25
         10       0.94      0.57      0.71        30

avg / total       0.46      0.36      0.37       324

()
Scores on test set:              precision    recall  f1-score   support

          1       0.21      0.19      0.20        36
          2       0.26      0.29      0.28        31
          3       0.47      0.58      0.52        33
          4       0.26      0.33      0.29        21
          5       0

In [10]:
# Load the MNIST split and evaluate each supported classifier on it.
dataset = pickle_operating('MNIST_data_2', None)
print(len(dataset['train']), len(dataset['test']))
for model_name in ('knn', 'tree'):
    model = learner(model_name)
    experiment(dataset['train'], dataset['test'], model)

MNIST_data_2.pickle
(60000, 10000)
Scores on test set:              precision    recall  f1-score   support

          0       0.97      0.99      0.98       980
          1       0.99      1.00      0.99      1135
          2       0.96      0.98      0.97      1032
          3       0.96      0.96      0.96      1010
          4       0.95      0.94      0.95       982
          5       0.97      0.95      0.96       892
          6       0.98      0.98      0.98       958
          7       0.97      0.95      0.96      1028
          8       0.97      0.95      0.96       974
          9       0.92      0.94      0.93      1009

avg / total       0.96      0.96      0.96     10000

()
Scores on test set:              precision    recall  f1-score   support

          0       0.90      0.90      0.90       980
          1       0.96      0.96      0.96      1135
          2       0.85      0.86      0.86      1032
          3       0.82      0.82      0.82      1010
          4      