In [6]:
# from torchvision import dataset of MNIST and transforms
from torchvision import datasets,transforms

import numpy as np

# import evaluation metrics of accyracy_score
from sklearn.metrics import accuracy_score
from time import time,sleep

import torch

In [2]:
# 1.download and load MNIST dataset and split for data and target
def load_and_split_dataset():

    # download and load train dataset,params of download=true means need download dataset first,params transform means need change data type from ndarray to tensor(张量)
    train_dataset = datasets.MNIST(
        root='./data',
        download=True,
        transform=transforms.ToTensor(),
        train=True
    )

    # download and load train dataset,params of download=true means need download dataset first,params transform means need change data type from ndarray to tensor(张量)
    test_dataset = datasets.MNIST(
        root='./data',
        download=True,
        transform=transforms.ToTensor(),
        train=False
    )

    # split data and target from train dataset
    train_x = []
    train_y = []

    # iter for each samples
    for i in range(len(train_dataset)):

        # get samples and target
        images,target = train_dataset[i]


        # append each samples to list
        train_x.append(images.view(-1)) # reshape tensor
        train_y.append(target)

        # only need 5000 train samples
        if i > 5000:
            break

    # split data and target from test dataset
    test_x = []
    test_y = []

    # iter for each samples
    for i in range(len(test_dataset)):

        # get samples and target
        images, target = test_dataset[i]


        # append each samples to list
        test_x.append(images.view(-1))  # reshape tensor
        test_y.append(target)

        # only need 200 test samples
        if i > 200:
            break

    print('samples all classes:{}'.format(set(train_y)))

    return train_x,train_y,test_x,test_y

In [3]:
# 2.define the KNN model
def KNN_Model(train_x,train_y,test_x,test_y,k):
    '''
    build the knn model
    :param train_x: train dataset
    :param train_y: train target(label)
    :param test_x: test dataset
    :param test_y: test target(label)
    :param k: neighbor counts
    :return:
    '''

    # get local timestamp
    since = time()

    # get train dataset and test dataset of samples number
    m = test_x.size(0)
    n = train_x.size(0)

    # test dataset and train dataset original(原本的) dimension(维度) is m*1,**2 is square(平方) for each samples,sum(dim=1,means sum for line(sample),keepdim=true means keep 2 dimension),expand means change the disension for keep m*n
    xx = (test_x ** 2).sum(dim=1, keepdim=True).expand(m, n)
    yy = (train_x ** 2).sum(dim=1, keepdim=True).expand(n, m).transpose(0, 1) # transpose means reshape(转置)

    # cal neighbors distance matrix
    dist_mat = xx + yy - 2 * test_x.matmul(train_x.transpose(0, 1)) # code means:(x-y)**2

    # sort for neighbors distance metrix,need to find which samples is the nearest neighbor
    mink_idxs = dist_mat.argsort(dim=-1)

    # empty list,to save k nums nearest neighbors
    res = []
    for idxs in mink_idxs:
        # voting
        res.append(np.bincount(np.array([train_y[idx] for idx in idxs[:k]])).argmax())

    # 断言调试，断定找到的k个最近的邻居的个数与测试集样本个数
    assert len(res) == len(test_y)

    # cal accuracy_score
    print("accuracy_score:{}".format(accuracy_score(test_y, res)))

    # cal training time
    time_elapsed = time() - since

    print('KNN mat training complete in {}m {}s'.format(time_elapsed // 60, time_elapsed % 60))

In [4]:
def test():
    train_x,train_y,test_x,test_y = load_and_split_dataset()
    KNN_Model(torch.stack(train_x), train_y, torch.stack(test_x), test_y, 7)

In [7]:
test()

samples all classes:{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
accuracy_score:0.9405940594059405
KNN mat training complete in 0.0m 0.41864442825317383s
