In [1]:
import numpy as np
from sklearn import datasets

In [11]:
class KNN:
    """
    Small k-nearest / radius-nearest neighbours classifier built on NumPy.

    Votes are tallied with ``np.bincount``, so class labels must be
    non-negative integers.
    """

    def __init__(self, method='k',k=None, r=None, weight=False, distance_metric='euclidean'):
        """
        method: str  'k' / 'k_nearest' selects k-nearest neighbours,
                     'r' / 'radius' / 'radius_nearest' selects radius neighbours
        k: int  number of neighbours (None lets every training point vote)
        r: number  radius; a training point is a neighbour when its distance is < r
        weight: bool  if True every vote is weighted by 1 / distance
        distance_metric: str 'euclidean', 'manhattan', 'chebyshev'
        """

        self.k = k
        self.r = r
        self.method = method
        self.weight = weight
        self.k_name = ['k','k_nearest']
        self.r_name = ['r','radius','radius_nearest']
        # Label printed by best_fit; it follows the selected method rather than
        # whether k happens to be set.
        self.name = "r" if method in self.r_name else "k"
        self.distance_metric = distance_metric

    def feed(self, X_tr, y_tr, seed=44):
        """
        ---------------------------------------------------
        Description: shuffles a matrix and a vector with the same permutation
        ---------------------------------
        Input:
        X_tr: Matrix type (one sample per row).
        y_tr: Vector type (one label per sample).
        seed: Seed passed to numpy.random.RandomState
        ---------------------------------
        Output:
        X,y
        X is a matrix and y is a vector, shuffled in the same order.
        ---------------------------------
        Raises: ValueError if X_tr and y_tr have different lengths.
        ---------------------------------------------------
        """
        X_tr = np.asarray(X_tr)
        y_tr = np.asarray(y_tr)
        if len(X_tr) != len(y_tr):
            raise ValueError(
                f"X_tr and y_tr must have the same length, got {len(X_tr)} and {len(y_tr)}"
            )

        index = np.arange(len(X_tr))
        np.random.RandomState(seed).shuffle(index)

        return X_tr[index], y_tr[index]

    def stack(self, X_tr, y_tr, test_percentage=.2):
        """
        Splits both inputs into train and test sets. The first
        int(n_samples * test_percentage) rows become the test set and the rest
        the training set (80/20 by default).

        test_percentage: fraction of the samples used for testing, in [0, 1].

        Output: X_train, y_train, X_test, y_test
        Raises: ValueError if test_percentage is outside [0, 1].
        """
        if not 0 <= test_percentage <= 1:
            raise ValueError(f"test_percentage must be in [0, 1], got {test_percentage!r}")

        idx = int(len(X_tr) * test_percentage)

        X_train = X_tr[idx:]
        y_train = y_tr[idx:]

        X_test = X_tr[:idx]
        y_test = y_tr[:idx]

        return X_train,y_train,X_test,y_test

    def _distances(self, X_tr, x_test):
        """Distance from x_test to every sample of X_tr under self.distance_metric."""
        diff = np.abs(np.asarray(x_test) - np.asarray(X_tr))
        if diff.ndim == 1:
            # 1-D training data: every entry is a single-feature sample.
            diff = diff[:, np.newaxis]

        if self.distance_metric == 'euclidean':
            return np.sqrt(np.sum(diff**2, axis=1))
        if self.distance_metric == 'manhattan':
            return np.sum(diff, axis=1)
        if self.distance_metric == 'chebyshev':
            return np.max(diff, axis=1)
        raise ValueError(
            f"Unknown distance_metric {self.distance_metric!r}; "
            "expected 'euclidean', 'manhattan' or 'chebyshev'"
        )

    def _predict_one(self, X_tr, y_tr, x_test, n_classes):
        """Classify one point; n_classes is the minimum length of the vote tally."""
        distance = self._distances(X_tr, x_test)
        if distance.size == 0:
            raise ValueError("Cannot predict from an empty training set")

        if self.method in self.k_name:
            # k=None slices the whole array, i.e. every training point votes.
            neighbors = np.argsort(distance)[:self.k]
        elif self.method in self.r_name:
            # float() keeps fractional radii (int() silently truncated r=0.5 to 0).
            neighbors = np.flatnonzero(distance < float(self.r))
        else:
            raise ValueError(
                f"Unknown method {self.method!r}; expected one of {self.k_name + self.r_name}"
            )

        weights = None
        if self.weight:
            # A training point identical to x_test has distance 0 and gets an
            # infinite weight, so its class wins; only the warning is silenced.
            with np.errstate(divide='ignore'):
                weights = 1 / distance[neighbors]

        # With no neighbour inside the radius every count is 0 and argmax
        # returns class 0.
        occurrences = np.bincount(y_tr[neighbors], minlength=n_classes, weights=weights)
        return np.argmax(occurrences)

    def single_predict(self, X_tr, y_tr, x_test):
        """
        Predict the class of a single point.

        X_tr: training matrix (one sample per row)
        y_tr: training labels, non-negative integers
        x_test: the point to classify

        Output: the predicted class label
        Raises: ValueError for an empty training set, an unknown method or an
                unknown distance_metric.
        """
        y_tr = np.asarray(y_tr)
        return self._predict_one(X_tr, y_tr, x_test, len(np.unique(y_tr)))

    def best_fit(self, X, y, low=1, high=31, weight=True,list_prev=False,list_ytest=False):

        """
        Tries every integer value in range(low, high) for k (or r, depending on
        self.method) on one fixed shuffled 80/20 split and reports the value
        with the highest accuracy (rounded to 2 decimals, first one on ties).
        The best value is stored back in self.k / self.r.

        INPUT:
        list_prev: if True also return the array of accuracies, one per value tried
        list_ytest: if True also return the predictions made for every value tried
        weight: unused, kept for backward compatibility; weighting is controlled
                by the `weight` argument of the constructor
        -----------------------------------------------------------------------
        ITERATION:
        low= lowest value tried (included)
        high= upper limit (excluded)
        --------------------------------------------------------------------

        OUTPUT: best_predictions[, accuracies][, all_predictions]
        RAISES: ValueError if the range is empty or self.method is unknown
        ---------------------------------------------------------------------
        """

        if self.method in self.r_name:
            attribute = 'r'
        elif self.method in self.k_name:
            attribute = 'k'
        else:
            raise ValueError(
                f"Unknown method {self.method!r}; expected one of {self.k_name + self.r_name}"
            )

        candidates = range(low, high)
        if len(candidates) == 0:
            raise ValueError(f"Empty search range: low={low!r}, high={high!r}")

        # The shuffle uses a fixed seed, so the split is the same for every
        # candidate and only needs to be built once.
        X_tr, y_tr = self.feed(X, y)
        X_train, y_train, X_test, y_test = self.stack(X_tr, y_tr)

        fits = []
        for value in candidates:
            setattr(self, attribute, value)
            fits.append(self.knn_predict(X_test, X_train, y_train))
        fits = np.array(fits)

        previsions = np.array([np.round(np.mean(prevision == y_test), 2) for prevision in fits])

        # best_position indexes `fits`; idx is the matching parameter value.
        best_position = int(np.argmax(previsions))
        best = previsions[best_position]
        idx = low + best_position
        setattr(self, attribute, idx)

        print(f"""
        In a range of {low}:{high} 
        The best fit  is with {self.name} = {idx} which have {best} percent of trueth""")

        best_predictions = fits[best_position]
        if list_prev and list_ytest:
            return best_predictions, previsions, fits
        if list_prev:
            return best_predictions, previsions
        if list_ytest:
            return best_predictions, fits
        return best_predictions

    def knn_predict(self, X_tst, x_tr, y_tr):
        """
        Extend the prediction to a set of points.

        X_tst: points to classify (one per row)
        x_tr: training matrix
        y_tr: training labels, non-negative integers

        Output: float array with one predicted label per row of X_tst
        """
        x_tr = np.asarray(x_tr)
        y_tr = np.asarray(y_tr)
        n_classes = len(np.unique(y_tr))

        # dtype=float keeps the return type callers have always received.
        return np.array(
            [self._predict_one(x_tr, y_tr, element, n_classes) for element in X_tst],
            dtype=float,
        )
    

# Usage examples

## 1) Iris Dataset

### Find the best r for the dataset

In [33]:
# Search the default range of r on Iris with a distance-weighted
# radius-neighbours model.
data = datasets.load_iris()
X0 = data['data']
y0 = data['target']
# The class defined in this notebook is KNN; KNN2 is not defined in the cells
# above, so it fails on a fresh kernel.
knn_model_with_weight = KNN(method='r', weight=True)
final_predict = knn_model_with_weight.best_fit(X0,y0)


        In a range of 1:31 
        The best fit  is with r = 1 which have 0.97 percent of trueth


### Verify the parameters found

In [34]:
# Rebuild the model with the r reported by the search and recreate the split.
# KNN is the class defined in this notebook (KNN2 is not defined in the cells above).
final_model = KNN(method='r', r=1, weight=True)

X_tr, y_tr = final_model.feed(X0,y0)
X_train,y_train,X_test,y_test = final_model.stack(X_tr,y_tr)

In [35]:
# Shapes of the split, in the order train X, train y, test X, test y.
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

((120, 4), (120,), (30, 4), (30,))

In [36]:
# Argument order is (points to classify, training matrix, training labels).
final_predict = final_model.knn_predict(
    X_test,
    X_train,
    y_train,
)

In [37]:
# Element-wise difference between predicted and true labels (0 where they agree).
residuals = np.subtract(final_predict, y_test)

In [38]:
# Fraction of test labels predicted correctly.
print(f"Test set score: {np.mean(final_predict == y_test):.2f}")

Test set score: 0.97


## 2) Wine Dataset

### Find the best r for the dataset

In [12]:
# Radius search on the Wine dataset using the Chebyshev distance.
data2 = datasets.load_wine()

X = data2['data']
y = data2['target']

# KNN is the class defined in this notebook (KNN2 is not defined in the cells above).
wine_model = KNN(method='r', weight=True, distance_metric='chebyshev')

best_fit = wine_model.best_fit(X,y)


        In a range of 1:31 
        The best fit  is with r = 14 which have 0.77 percent of trueth


In [13]:
# Radius search on the Wine dataset using the Manhattan distance.
data2 = datasets.load_wine()

X = data2['data']
y = data2['target']

# KNN is the class defined in this notebook (KNN2 is not defined in the cells above).
wine_model = KNN(method='r', weight=True, distance_metric='manhattan')

best_fit = wine_model.best_fit(X,y)


        In a range of 1:31 
        The best fit  is with r = 25 which have 0.77 percent of trueth


In [14]:
# Radius search on the Wine dataset using the Euclidean distance.
data2 = datasets.load_wine()

X = data2['data']
y = data2['target']

# KNN is the class defined in this notebook (KNN2 is not defined in the cells above).
wine_model = KNN(method='r', weight=True, distance_metric='euclidean')

best_fit = wine_model.best_fit(X,y)


        In a range of 1:31 
        The best fit  is with r = 23 which have 0.77 percent of trueth


### Verify the parameters found

In [16]:
# Rebuild the model with the r reported by the Euclidean search (the default metric).
# KNN is the class defined in this notebook (KNN2 is not defined in the cells above).
wine_knn_model = KNN(method='r',r=23, weight=True)

# Dedicated names so the Iris arrays X0 / y0 used by earlier cells are not overwritten.
X_wine,y_wine = wine_knn_model.feed(X,y)
X_train,y_train,X_test,y_test = wine_knn_model.stack(X_wine,y_wine)

In [17]:
# Sanity check of the split: shapes of test X, train X, test y, train y.
tuple(part.shape for part in (X_test, X_train, y_test, y_train))

((35, 13), (143, 13), (35,), (143,))

In [18]:
# Predictions of the final model on the held-out wine samples;
# argument order is (points to classify, training matrix, training labels).
wine_predict = wine_knn_model.knn_predict(
    X_test,
    X_train,
    y_train,
)

In [19]:
# Fraction of test labels predicted correctly. The extra positional argument
# previously passed to format() was never used by the template and is dropped.
print("Test set score: {:.2f}".format(np.mean(wine_predict == y_test)))

Test set score: 0.77
