In [1]:
import numpy as np
from collections import Counter
import pandas as pd

def minkowski_dist(x1,x2,p):
    return round(np.power(np.sum(abs((x1-x2))**p),1/p),2)

In [2]:
class KNN:

    def __init__(self, k = 5, weight = False, p = 2 , show_nn = False , solver = 'mean'):
        self.k = k
        self.weight = weight
        self.p = p
        self.show_nn = show_nn
        self.solver = solver

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        y_pred = [self._predict(x) for x in X]
        return np.array(y_pred)

    def _predict(self, x):
        # Compute distances between x and all examples in the training set
        distances = [minkowski_dist(x, x_train ,self.p) for x_train in self.X_train]
        # Sort by distance and return indices of the first k neighbors
        k_idx = np.argsort(distances)[:self.k]
        # Extract the labels of the k nearest neighbor training samples
        k_neighbor_labels = [self.y_train[i] for i in k_idx]  
        k_distance = [distances[i] for i in k_idx]
        if self.weight == True:
            weighted_nn = [1 / i for i in k_distance]
            df = pd.DataFrame(list(zip(k_neighbor_labels,weighted_nn)), columns= ['nn_label','weights'])
            w=df['weights'].sum()     
            s=(df['nn_label']*df['weights']).sum()
            cls=(s/w)
            
        else :    
            # return the most common class label
            df = pd.DataFrame(list(zip(k_neighbor_labels,k_distance)), columns= ['nn_label','Distance'])
            if self.solver == 'median':
                cls=df['nn_label'].median()
            else:
                cls=df['nn_label'].mean()
                        
        if self.show_nn == True:
            print("For point {} ,class = {}".format(list(round(num,3) for num in list(x)),round(cls,3)))
            print(df)
            print('\n')
        return cls
        
    def score(self,X,y):
        y_pred=self.predict(X)
        d = y-y_pred
        mse=np.mean(d**2)
        mae=np.mean(abs(d))
        rmse=np.sqrt(mse)
        r2=1-(sum(d**2)/sum((y-np.mean(y))**2))
        print("MSE = {}".format(mse))
        print("MAE = {}".format(mae))
        print("RMSE = {}".format(rmse))
        print("R_squared = {}".format(r2))

In [3]:
df=pd.read_csv(r"Fish.csv")

In [4]:
X=df.iloc[:,df.columns !='Weight']
y=df.iloc[:,df.columns =='Weight']

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3 , random_state=100)

In [7]:
from feature_engine.encoding import OneHotEncoder
encoder=OneHotEncoder(variables=['Species'] ,drop_last=True)
X_train=encoder.fit_transform(X_train)
X_test=encoder.transform(X_test)

In [8]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
X_train=scaler.fit_transform(X_train)
X_test=scaler.transform(X_test)

In [9]:
y_train=y_train.values
y_test=y_test.values

In [10]:
y_train=y_train.reshape(-1)
y_test=y_test.reshape(-1)

In [12]:
k = 3
clf = KNN(k = k)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

MSE = 3618.3039120370363
MAE = 40.37708333333333
RMSE = 60.15233920669284
R_squared = 0.9662565199799055


In [13]:
k = 3
clf = KNN(k = k,show_nn = True,weight = True)
clf.fit(X_train, y_train)

In [14]:
clf.score(X_test,y_test)

For point [-0.551, -0.529, -0.489, -0.44, -0.355, -0.511, -0.751, 2.746, -0.348, -0.297, -0.297] ,class = 155.459
   nn_label   weights
0     160.0  5.882353
1     145.0  4.761905
2     161.0  4.166667


For point [0.251, 0.246, 0.432, 1.34, 0.242, 1.956, -0.751, -0.364, -0.348, -0.297, -0.297] ,class = 480.688
   nn_label    weights
0     500.0  10.000000
1     475.0   5.555556
2     450.0   5.263158


For point [1.462, 1.433, 1.31, 0.883, 1.911, -0.511, 1.332, -0.364, -0.348, -0.297, -0.297] ,class = 1070.47
   nn_label   weights
0    1100.0  5.000000
1    1000.0  2.857143
2    1100.0  1.818182


For point [0.319, 0.338, 0.516, 1.599, 0.701, 1.956, -0.751, -0.364, -0.348, -0.297, -0.297] ,class = 647.558
   nn_label   weights
0     600.0  3.703704
1     700.0  3.225806
2     650.0  2.857143


For point [0.143, 0.155, 0.331, 0.927, 0.167, 1.956, -0.751, -0.364, -0.348, -0.297, -0.297] ,class = 411.623
   nn_label   weights
0     363.0  4.000000
1     450.0  3.571429
2     430.0  3.125

   nn_label   weights
0     850.0  2.857143
1     690.0  2.000000
2    1100.0  1.886792


For point [-0.023, -0.027, -0.142, -0.249, -0.125, -0.511, 1.332, -0.364, -0.348, -0.297, -0.297] ,class = 244.502
   nn_label   weights
0     260.0  5.263158
1     218.0  3.846154
2     250.0  3.703704


For point [-0.404, -0.392, -0.472, -0.377, -0.414, -0.511, 1.332, -0.364, -0.348, -0.297, -0.297] ,class = 157.871
   nn_label   weights
0     145.0  4.347826
1     180.0  3.846154
2     150.0  3.703704


For point [0.964, 0.931, 0.837, -0.595, -0.261, -0.511, -0.751, -0.364, 2.872, -0.297, -0.297] ,class = 332.25
   nn_label   weights
0     300.0  5.263158
1     430.0  2.325581
2     300.0  1.785714


For point [1.364, 1.341, 1.243, -0.258, 0.431, -0.511, -0.751, -0.364, 2.872, -0.297, -0.297] ,class = 523.429
   nn_label  weights
0     500.0  2.50000
1     510.0  2.12766
2     567.0  2.00000


For point [-0.697, -0.666, -0.725, -0.766, -0.515, -0.511, 1.332, -0.364, -0.348, -0.297, -0.297] ,cla