# PyTorch1实现kNN分类和回归

In [1]:
import sys
from collections import Counter
from collections.abc import Iterable

import numpy as np
import torch
from sklearn.datasets import load_iris, load_boston
from sklearn.model_selection import train_test_split


# Record the interpreter and PyTorch versions this notebook was executed with.
print("python version:", sys.version_info)
print(torch.__name__, torch.__version__)

python version: sys.version_info(major=3, minor=7, micro=2, releaselevel='final', serial=0)
torch 1.6.0


## 1. PyTorch实现kNN基类

In [2]:
class TorchKNNBase:
    """Abstract base class for k-nearest-neighbour models built on PyTorch.

    Subclasses implement ``_predict_sample`` to turn the labels of the k
    nearest training samples into a single prediction.

    Attributes:
        n_neighbors: A int number, number of neighbors.
        _metric: A method object, one of {_manhattan_distance, _euclidean_distance,
            _chebyshev_distance}.
        _X_train: feature data for training. A torch.Tensor matrix of
            (n_samples, n_features) shape, stored as float32. None until fit() is called.
        _y_train: target data for training. A torch.Tensor array of (n_samples, ) shape.
            None until fit() is called.
    """
    def __new__(cls, *args, **kwargs):
        # Only the abstract base itself is blocked; subclasses can be created
        # normally. object.__new__ must not receive the constructor arguments.
        if cls is TorchKNNBase:
            raise TypeError("Can't instantiate an object from TorchKNNBase! ")
        return super().__new__(cls)

    def __init__(self, n_neighbors=5, metric="euclidean"):
        """Init method.

        Args:
            n_neighbors: int, optional (default = 5), the integer must be greater than 0.
                Number of nearest training samples used for each prediction.
            metric: {"manhattan", "euclidean", "chebyshev"}, optional, default 'euclidean'.

        Raises:
            ValueError: metric value is out of options.
            AssertionError: n_neighbors is not an integer or is not greater than 0.
        """
        assert isinstance(n_neighbors, int) and n_neighbors > 0
        self.n_neighbors = n_neighbors
        if metric == "manhattan":
            self._metric = self._manhattan_distance
        elif metric == "euclidean":
            self._metric = self._euclidean_distance
        elif metric == "chebyshev":
            self._metric = self._chebyshev_distance
        else:
            raise ValueError(f'No such metric as {metric}, please option from: {"manhattan", "euclidean", "chebyshev"}')
        self._X_train, self._y_train = None, None

    def fit(self, X_train, y_train):
        """Store the training data (kNN is a lazy learner, nothing is optimised).

        Args:
            X_train: A matrix of (n_samples, n_features) shape, data type must be
                continuous value type. Converted to a float32 torch.Tensor.
            y_train: A array of (n_samples, ) shape.

        Raises:
            AssertionError: X_train or y_train is not iterable, or their lengths differ.
            ValueError: X_train is not a 2-D matrix.
        """
        assert isinstance(X_train, Iterable) and isinstance(y_train, Iterable)
        assert len(X_train) == len(y_train)
        features = torch.as_tensor(X_train, dtype=torch.float32)
        if features.ndim != 2:
            raise ValueError("X_train must be a matrix of (n_samples, n_features) shape")
        # Assign only after validation so a failed fit leaves the model untouched.
        self._X_train = features
        # torch.as_tensor returns an existing tensor unchanged.
        self._y_train = torch.as_tensor(y_train)

    def predict(self, X_test):
        """predict test data.

        Args:
            X_test: A matrix of (n_samples, n_features) shape,
                or a single sample of (n_features, ) shape.

        Returns:
            A list of per-sample predictions for a matrix input, or a single
            prediction for a single-sample input.

        Raises:
            AssertionError: X_test is not iterable.
            RuntimeError: the model has not been fitted yet.
            ValueError: X_test value with a mismatched shape.
        """
        assert isinstance(X_test, Iterable)
        if self._X_train is None:
            raise RuntimeError("fit must be called before predict")
        X_test = torch.as_tensor(X_test, dtype=torch.float32)
        n_features = self._X_train.shape[1]

        # Check the rank explicitly: indexing shape[1] on a 1-D tensor would
        # raise IndexError instead of the documented ValueError.
        if X_test.ndim == 1 and X_test.shape[0] == n_features:
            return self._predict_sample(X_test)
        if X_test.ndim == 2 and X_test.shape[1] == n_features:
            return [self._predict_sample(sample) for sample in X_test]
        raise ValueError("Mismatched shape for X_test")

    def _manhattan_distance(self, x):
        """L1 distance from x to every training sample, shape (n_samples, )."""
        return torch.sum(torch.abs(self._X_train - x), dim=1)

    def _euclidean_distance(self, x):
        """L2 distance from x to every training sample, shape (n_samples, )."""
        return torch.sqrt(torch.sum(torch.square(self._X_train - x), dim=1))

    def _chebyshev_distance(self, x):
        """L-infinity distance from x to every training sample, shape (n_samples, )."""
        # torch.max with dim returns a (values, indices) pair; only the values
        # are distances.
        return torch.max(torch.abs(self._X_train - x), dim=1).values

    def _find_k_labels(self, sample):
        """Return the training targets of the n_neighbors samples closest to sample."""
        distance = self._metric(sample)
        # largest=False selects the smallest distances, i.e. the nearest neighbours.
        _, k_nearest_index = torch.topk(distance, self.n_neighbors, largest=False)
        return self._y_train[k_nearest_index]

    def _predict_sample(self, sample):
        """Predict a single sample; must be implemented by subclasses."""
        raise NotImplementedError("Can't call predict method for TorchKNNBase object! ")

    def _score_validation(self, X_test, y_test):
        """Validate score() inputs and return (X_test, y_test) with y_test as a tensor.

        Raises:
            AssertionError: an input is not iterable, or the lengths differ.
        """
        assert isinstance(X_test, Iterable) and isinstance(y_test, Iterable)
        assert len(X_test) == len(y_test)
        return X_test, torch.as_tensor(y_test)

## 2. PyTorch实现kNN Classifier

In [3]:
class TorchKNNClassifier(TorchKNNBase):
    """kNN Classifier with PyTorch, inherits from TorchKNNBase.

    A sample is assigned the most frequent label among its k nearest
    training samples.

    Attributes:
        n_neighbors: A int number, number of neighbors.
        _metric: A method object, one of {_manhattan_distance, _euclidean_distance,
            _chebyshev_distance}.
        _X_train: feature data for training. A torch.Tensor matrix of
            (n_samples, n_features) shape, data type must be continuous value type.
        _y_train: label data for training. A torch.Tensor array of (n_samples, ) shape,
            data type must be discrete value.
    """
    def __new__(cls, *args, **kwargs):
        # Overrides the base-class __new__, which refuses instantiation.
        # The constructor arguments must NOT be forwarded: object.__new__
        # raises TypeError on extra arguments when __new__ is overridden,
        # which made e.g. TorchKNNClassifier(n_neighbors=3) fail.
        return object.__new__(cls)

    def score(self, X_test, y_test):
        """Use test dataset to evaluate the trained model.

        Args:
            X_test: A matrix of (n_samples, n_features) shape.
            y_test: A array of (n_samples, ) shape. data type must be
                discrete value.
        Returns:
            return accuracy, a float number. accuracy = correct_count / y_test.shape[0]
        """
        X_test, y_test = self._score_validation(X_test, y_test)

        y_pred = torch.as_tensor(self.predict(X_test))
        correct_count = torch.sum(y_pred == y_test)
        return correct_count.item() / y_test.shape[0]

    def _predict_sample(self, sample):
        """Return the majority label among the k nearest training samples."""
        k_labels = self._find_k_labels(sample)
        # tolist() yields plain Python scalars, so the prediction does not
        # depend on a NumPy conversion of the label tensor.
        # most_common(1) -> [(label, count)]; ties go to the label seen first.
        return Counter(k_labels.tolist()).most_common(1)[0][0]

In [4]:
# Sanity-check the classifier on the iris dataset.
X_data, y_data = load_iris(return_X_y=True)
# random_state=1 fixes the shuffle so the train/test split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, shuffle=True, random_state=1)
# Default constructor arguments: n_neighbors=5, metric="euclidean".
cls = TorchKNNClassifier()
cls.fit(X_train, y_train)
# Last expression of the cell: the accuracy on the held-out split is shown as the cell output.
cls.score(X_test, y_test)

1.0

## 3. PyTorch实现kNN Regressor

In [5]:
class TorchKNNRegressor(TorchKNNBase):
    """kNN Regressor with PyTorch, inherits from TorchKNNBase.

    A sample's prediction is the mean target of its k nearest training samples.

    Attributes:
        n_neighbors: A int number, number of neighbors.
        _metric: A method object, one of {_manhattan_distance, _euclidean_distance,
            _chebyshev_distance}.
        _X_train: feature data for training. A torch.Tensor matrix of
            (n_samples, n_features) shape, data type must be continuous value type.
        _y_train: target data for training. A torch.Tensor array of (n_samples, ) shape,
            numeric values (integer targets are averaged as floats).
    """
    def __new__(cls, *args, **kwargs):
        # Overrides the base-class __new__, which refuses instantiation.
        # The constructor arguments must NOT be forwarded: object.__new__
        # raises TypeError on extra arguments when __new__ is overridden,
        # which made e.g. TorchKNNRegressor(n_neighbors=3) fail.
        return object.__new__(cls)

    def score(self, X_test, y_test):
        """Use test dataset to evaluate the trained model.

        Args:
            X_test: A matrix of (n_samples, n_features) shape.
            y_test: A array of (n_samples, ) shape with numeric target values.
        Returns:
            return R^2, R^2 = 1 - u / v. u = sum((y_pred - y_true)^2), v = sum((y_true - y_true_mean)^2)
        """
        X_test, y_test = self._score_validation(X_test, y_test)
        # torch.mean only accepts floating-point tensors; integer targets
        # would otherwise raise a RuntimeError.
        if not y_test.is_floating_point():
            y_test = y_test.to(torch.get_default_dtype())

        y_pred = torch.as_tensor(self.predict(X_test))
        y_true_mean = torch.mean(y_test)
        u = torch.sum(torch.square(y_pred - y_test), dim=0)
        v = torch.sum(torch.square(y_test - y_true_mean), dim=0)
        r_squared = 1 - u / v
        return r_squared.item()

    def _predict_sample(self, sample):
        """Return the mean target of the k nearest training samples as a float."""
        k_labels = self._find_k_labels(sample)
        # Cast only when needed so floating-point targets keep their precision.
        if not k_labels.is_floating_point():
            k_labels = k_labels.to(torch.get_default_dtype())
        return torch.mean(k_labels).item()

### 3.1 boston房价数据集验证算法

In [6]:
# Sanity-check the regressor on the Boston housing dataset.
# NOTE(review): load_boston was removed in scikit-learn 1.2; this cell presumably
# needs an older scikit-learn release — confirm the pinned version.
X_data, y_data = load_boston(return_X_y=True)
# random_state=1 fixes the shuffle so the train/test split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, shuffle=True, random_state=1)
# Default constructor arguments: n_neighbors=5, metric="euclidean".
# The name `cls` is reused from the classifier cell and now refers to a regressor.
cls = TorchKNNRegressor()
cls.fit(X_train, y_train)
# Last expression of the cell: the R^2 score on the held-out split is shown as the cell output.
cls.score(X_test, y_test)

0.5281871701699631

## 4. 总结

1. PyTorch API
    > torch.sum(input, dim)   
    > torch.mean(input, dim)  
    > torch.max(input, dim)  # 指定dim时返回(values, indices)，取最大值需用 .values  
    >  
    > torch.topk(input, k, dim, largest)  # 求topk独家神器
    >  
    > tensor.item()  
    > tensor.numpy()  
    >  
    > torch.sqrt()  
    > torch.square()  
    > torch.abs()  
    > torch.as_tensor(data, dtype)  
    >  
    > torch.float32
    
2. Numpy vs TensorFlow2 vs PyTorch1
    > |Numpy|TensorFlow2|PyTorch1|
    > |-|-|-|
    > |np.sum(a, axis)|tf.reduce_sum(tensor, axis)|torch.sum(tensor, dim)|
    > |np.mean(a, axis)|tf.reduce_mean(tensor, axis)|torch.mean(tensor, dim)|  
    > |np.max(a, axis)|tf.reduce_max(input_tensor, axis)|torch.max(tensor, dim).values|
    > |np.argsort(a, axis)|tf.argsort(input_tensor, axis)|torch.argsort(input, dim)|
    > |-|tf.math.top_k(input, k, sorted, name)|torch.topk(input, k, dim, largest, sorted)| 
    > |tensor1[tensor2]|tf.gather(params=tensor1, indices=tensor2)|tensor1[tensor2]|
    > |1|a = tf.constant(1); a.numpy() == 1|a = torch.tensor(1); a.item() == 1|
    > |np.array(object, dtype)|tf.constant(value, dtype), tf.Variable()|torch.as_tensor(data, dtype)|
    > |np.float32|tf.float32|torch.float32|
    
3. tf2转torch1之基础API（一）
    > tf.reduce_改torch.
    > axis改dim
    > tf.constant改torch.as_tensor
    > tf.float32改torch.float32
    > tf.cast(tensor, dtype=tf.float32)改tensor.to(torch.float32)  
    > torch.IntTensor不能直接除以一个int值！tf.Tensor可以，并返回float64的Tensor

## 作业

1. 惯例，敲两遍以上。  
2. 重新梳理1.1～1.5节的内容，整理笔记，画思维导图，变成自己的知识。
3. 试着实现kd树

## 相关链接

<a href="./01.1.kNN.ipynb" style=""> 1.1 kNN k近邻算法原理 </a>  
<a href="./01.2.kNN-sklearn.ipynb" style=""> 1.2 sklearn中使用kNN做分类、回归任务 </a>  
<a href="./01.3.kNN-numpy.ipynb" style=""> 1.3 numpy实现kNN分类和回归 </a>    
<a href="./01.4.kNN-tf2.ipynb"> 1.4 TensorFlow2实现kNN分类和回归 </a>    
  
<a href="./02.1.LinearRegression.ipynb"> 2.1 Linear Regression线性回归算法原理 </a>

## 项目源码  

https://github.com/LossJ     
进入后点击Statistic-Machine-Learning

## 其他

备注：学校/公司-真实姓名。  
**不要问你自己的项目遇到的问题！不要问课堂以外的问题！不要问课上留的作业！**  

<img src="./imgs/1.WeChatQRCode.jpg" width="450px" align="left">