In [1]:
import numpy as np
from collections import Counter
import pandas as pd

def minkowski_dist(x1, x2, p):
    """Return the order-``p`` Minkowski distance between ``x1`` and ``x2``.

    The absolute element-wise differences are raised to the power ``p``,
    summed, and the ``1/p``-th power of that sum is taken.  The result is
    rounded to two decimal places before being returned.
    """
    gap = abs(x1 - x2)
    powered_total = np.sum(gap ** p)
    distance = np.power(powered_total, 1 / p)
    return round(distance, 2)

In [2]:
class RNN:
    """Radius-neighbours classifier that also reports outliers.

    A query point is labelled by a vote among the training points whose
    Minkowski distance to it is strictly smaller than ``r``.  A query point
    with no training point inside that radius receives no label (``None``)
    and is recorded as an outlier.

    Parameters
    ----------
    r : float, default 1
        Radius; only training points with ``distance < r`` take part in the vote.
    weight : bool, default False
        If true, every neighbour votes with weight ``1 / distance`` and the
        class with the largest summed weight wins.  Otherwise the most
        frequent neighbour label wins.
    p : int or float, default 2
        Order of the Minkowski distance handed to ``minkowski_dist``.
    show_nn : bool, default False
        If true, print each labelled point together with a table of its
        neighbours (labels plus weights or distances).
    mark_outlier : bool, default True
        Stored on the instance only; no method of this class reads it.
    """

    def __init__(self, r=1, weight=False, p=2, show_nn=False, mark_outlier=True):
        self.r = r
        self.weight = weight
        self.p = p
        self.show_nn = show_nn
        self.mark_outlier = mark_outlier
        # Position in the most recently predicted X -> feature values (as a list).
        self.outliers = {}

    def fit(self, X, y):
        """Memorise the training samples ``X`` and their labels ``y``.

        Inputs are converted to NumPy arrays (labels flattened to 1-D) so that
        DataFrames and column-vector labels are iterated row by row.
        """
        self.X_train = np.asarray(X)
        self.y_train = np.asarray(y).ravel()

    def predict(self, X):
        """Label every row of ``X``.

        Returns an array with one entry per row: the winning class label, or
        ``None`` for rows detected as outliers.  The outlier record is rebuilt
        from scratch on every call, so it always describes this ``X`` only.
        """
        # Reset first: entries left over from a previous call would otherwise
        # corrupt outliers_detected() and the denominator used by score().
        self.outliers = {}
        y_pred = [self._predict(x, idx) for idx, x in enumerate(np.asarray(X))]
        return np.array(y_pred)

    def _predict(self, x, idx):
        """Label a single sample ``x`` located at position ``idx`` of the batch."""
        # Distance from x to every training sample.
        distances = np.array([minkowski_dist(x, x_train, self.p) for x_train in self.X_train])
        # Positions of the training samples strictly inside the radius.
        neighbor_idx = np.flatnonzero(distances < self.r)
        if neighbor_idx.size == 0:
            print('{} is detected as outlier'.format(x))
            self.outliers[idx] = x.tolist()
            return None

        labels = np.asarray(self.y_train)[neighbor_idx]
        neighbor_dist = distances[neighbor_idx]

        if self.weight:
            # A zero distance (exact match after rounding) yields an infinite
            # weight; that is intended, so silence the divide warning.
            with np.errstate(divide='ignore'):
                weights = 1 / neighbor_dist
            df = pd.DataFrame({'nn_label': labels, 'weights': weights})
            per_class = df.groupby('nn_label', as_index=False).agg('sum')
            cls = per_class.nn_label[per_class['weights'].idxmax()]
        else:
            # Plain majority vote; ties go to the label encountered first.
            cls = Counter(labels).most_common(1)[0][0]
            if self.show_nn:
                df = pd.DataFrame({'nn_label': labels, 'Distance': neighbor_dist})

        if self.show_nn:
            # tolist() yields plain Python floats, keeping the printout free of
            # NumPy scalar reprs regardless of the NumPy version.
            print("For point {} ,class = {}".format(x.tolist(), cls))
            print(df)
            print('\n')
        return cls

    def score(self, X, y):
        """Return the accuracy on ``X``/``y`` computed over non-outlier rows.

        Outlier rows are excluded from the denominator.  If every row is an
        outlier (or ``X`` is empty) there is nothing to score and ``nan`` is
        returned.
        """
        y_true = np.asarray(y).ravel()
        y_pred = self.predict(X)
        n_scored = len(y_true) - len(self.outliers)
        if n_scored <= 0:
            return float('nan')
        return np.sum(y_true == y_pred) / n_scored

    def outliers_detected(self):
        """Return the outliers found by the most recent ``predict``/``score`` call."""
        return self.outliers
                                                                        

In [3]:
# Load the dataset from a CSV file in the working directory.
df = pd.read_csv('heart.csv')

In [4]:
# Store these three columns with the generic `object` dtype
# (presumably so the one-hot encoder applied later treats them as categorical).
for col in ('cp', 'slope', 'thal'):
    df[col] = df[col].astype('object')

In [5]:
# Boolean column mask selecting the label column; everything else is a feature.
is_target = df.columns == 'target'
X = df.iloc[:, ~is_target]
y = df.iloc[:, is_target]

In [6]:
from sklearn.model_selection import train_test_split

# 75/25 split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=100,
)

In [7]:
from feature_engine.encoding import OneHotEncoder

# Fit the encoder on the training split only, then apply it to both splits.
categorical_cols = ['cp', 'slope', 'thal']
encoder = OneHotEncoder(variables=categorical_cols, drop_last=True)
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)

In [8]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split and reuse them for the test split.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [9]:
# Replace the pandas label containers with their underlying NumPy arrays.
y_train, y_test = y_train.values, y_test.values

In [10]:
# Flatten the label arrays to one dimension.
y_train, y_test = y_train.reshape(-1), y_test.reshape(-1)

In [11]:
# Unweighted vote with radius 1; show_nn=True prints the neighbours of every labelled point.
R = 1
clf = RNN(r=R, show_nn=True)
clf.fit(X_train, y_train)
accuracy = clf.score(X_test, y_test)
print("custom RNN classification accuracy", accuracy)

[-0.72477224 -1.57870443 -0.07718592  0.5402931  -0.4197211   0.87416566
 -0.48688003 -0.66772739 -0.72018086 -0.67771959 -0.96110812  1.52912893
 -0.42695628  1.08751658 -0.96110812  0.93596638 -0.81799612 -0.24647041] is detected as outlier
[-0.94800602  0.63343079  1.10667927 -0.28636859 -0.4197211   0.87416566
 -0.13447902 -0.66772739  2.26337033 -0.67771959 -0.96110812  1.52912893
 -0.42695628 -0.91952621  1.04046567  0.93596638 -0.81799612 -0.24647041] is detected as outlier
[-1.95255804 -1.57870443 -0.66911852 -0.58697284 -0.4197211   0.87416566
  0.87867389 -0.66772739 -0.89568387 -0.67771959 -0.96110812  1.52912893
 -0.42695628  1.08751658 -0.96110812  0.93596638 -0.81799612 -0.24647041] is detected as outlier
[-1.84094115  0.63343079 -0.66911852 -0.28636859 -0.4197211   0.87416566
  1.40727541  1.49761715  2.43887334 -0.67771959 -0.96110812 -0.65396709
 -0.42695628 -0.91952621  1.04046567 -1.06841445  1.22249969 -0.24647041] is detected as outlier
[ 0.83786424 -1.57870443  1.

[-1.50609047  0.63343079 -1.26105112 -0.21121752 -0.4197211   0.87416566
  0.12982174 -0.66772739 -0.89568387 -0.67771959 -0.96110812 -0.65396709
  2.34216018  1.08751658 -0.96110812  0.93596638 -0.81799612 -0.24647041] is detected as outlier
[-1.05962291 -1.57870443 -1.1426646  -1.62029995 -0.4197211   0.87416566
 -0.53093016 -0.66772739 -0.89568387 -0.67771959 -0.96110812 -0.65396709
  2.34216018 -0.91952621  1.04046567  0.93596638 -0.81799612 -0.24647041] is detected as outlier
[ 0.61463046  0.63343079 -0.07718592 -0.75606273 -0.4197211  -1.03386901
 -0.79523092  1.49761715  1.21035226  1.28205553  1.04046567 -0.65396709
 -0.42695628 -0.91952621  1.04046567 -1.06841445  1.22249969 -0.24647041] is detected as outlier
[ 1.3959487   0.63343079 -0.66911852 -0.32394412 -0.4197211  -1.03386901
 -0.9273813   1.49761715  1.38585527  1.28205553  1.04046567 -0.65396709
 -0.42695628 -0.91952621  1.04046567 -1.06841445  1.22249969 -0.24647041] is detected as outlier
[ 0.39139668  0.63343079  0.

In [12]:
# Dict mapping test-row position -> feature values for each point flagged as an outlier.
lst = clf.outliers_detected()
outlier_count = len(lst)
print(outlier_count)
lst

72


{0: [-0.7247722359769968,
  -1.5787044347526529,
  -0.07718592453760997,
  0.5402931022362157,
  -0.41972110157675935,
  0.8741656633016291,
  -0.4868800328576335,
  -0.6677273939351784,
  -0.7201808570313808,
  -0.677719589715212,
  -0.9611081175181706,
  1.5291289331242306,
  -0.4269562819149832,
  1.0875165781229765,
  -0.9611081175181706,
  0.9359663764533636,
  -0.8179961167749047,
  -0.24647041110730084],
 1: [-0.9480060182928426,
  0.6334307917217434,
  1.1066792693838392,
  -0.2863685866631904,
  -0.41972110157675935,
  0.8741656633016291,
  -0.13447902063385406,
  -0.6677273939351784,
  2.2633703263117204,
  -0.677719589715212,
  -0.9611081175181706,
  1.5291289331242306,
  -0.4269562819149832,
  -0.9195262123966631,
  1.0404656685059095,
  0.9359663764533636,
  -0.8179961167749047,
  -0.24647041110730084],
 2: [-1.9525580387141481,
  -1.5787044347526529,
  -0.6691185214983345,
  -0.5869728371720654,
  -0.41972110157675935,
  0.8741656633016291,
  0.8786738895095118,
  -0.6677

In [13]:
# Inverse-distance-weighted vote with radius 3; show_nn=True prints the neighbours of every labelled point.
R = 3
clf = RNN(r=R, weight=True, show_nn=True)
clf.fit(X_train, y_train)
accuracy = clf.score(X_test, y_test)
print("custom RNN classification accuracy", accuracy)

For point [-0.7247722359769968, -1.5787044347526529, -0.07718592453760997, 0.5402931022362157, -0.41972110157675935, 0.8741656633016291, -0.4868800328576335, -0.6677273939351784, -0.7201808570313808, -0.677719589715212, -0.9611081175181706, 1.5291289331242306, -0.4269562819149832, 1.0875165781229765, -0.9611081175181706, 0.9359663764533636, -0.8179961167749047, -0.24647041110730084] ,class = 1
   nn_label   weights
0         1  0.369004
1         1  0.384615
2         1  0.336700
3         1  0.348432
4         1  0.490196
5         1  0.335570
6         0  0.369004
7         1  0.347222
8         1  0.436681
9         1  0.352113


For point [-0.9480060182928426, 0.6334307917217434, 1.1066792693838392, -0.2863685866631904, -0.41972110157675935, 0.8741656633016291, -0.13447902063385406, -0.6677273939351784, 2.2633703263117204, -0.677719589715212, -0.9611081175181706, 1.5291289331242306, -0.4269562819149832, -0.9195262123966631, 1.0404656685059095, 0.9359663764533636, -0.817996116774904

   nn_label   weights
0         1  0.393701
1         1  0.423729


[ 0.27977978  0.63343079  1.10667927  0.55908087 -0.4197211  -1.03386901
 -1.67623345  1.49761715 -0.36917484  0.30216797  1.04046567 -0.65396709
 -0.42695628 -0.91952621  1.04046567 -1.06841445 -0.81799612  4.05728215] is detected as outlier
[ 0.39139668 -1.57870443  0.27797363  1.36695479  2.38253449 -1.03386901
  0.08577161 -0.66772739 -0.89568387  1.28205553 -0.96110812 -0.65396709
  2.34216018  1.08751658 -0.96110812  0.93596638 -0.81799612 -0.24647041] is detected as outlier
For point [1.1727149137076915, -1.5787044347526529, 1.4026455678642014, 0.4275665082953876, -0.41972110157675935, 0.8741656633016291, -0.09042889410588163, -0.6677273939351784, -0.19367182467671581, -0.677719589715212, -0.9611081175181706, 1.5291289331242306, -0.4269562819149832, 1.0875165781229765, -0.9611081175181706, 0.9359663764533636, -0.8179961167749047, -0.24647041110730084] ,class = 1
   nn_label   weights
0         1  0.336700
1     

4         0  0.357143


For point [-0.3899215625032283, -1.5787044347526529, -0.6691185214983345, 0.9160484153723094, -0.41972110157675935, -1.0338690056355806, 0.30602224464587024, -0.6677273939351784, -0.3691748354616042, -0.677719589715212, -0.9611081175181706, 1.5291289331242306, -0.4269562819149832, 1.0875165781229765, -0.9611081175181706, 0.9359663764533636, -0.8179961167749047, -0.24647041110730084] ,class = 1
   nn_label   weights
0         1  0.854701
1         1  0.534759
2         1  0.990099
3         1  0.502513
4         1  0.377358
5         1  0.420168


[ 2.17726693 -1.57870443 -0.66911852  0.42756651 -0.4197211  -1.03386901
 -1.27978231  1.49761715 -0.72018086  0.30216797 -0.96110812 -0.65396709
  2.34216018  1.08751658 -0.96110812  0.93596638 -0.81799612 -0.24647041] is detected as outlier
For point [0.056546002128463085, 0.6334307917217434, 0.041200594854534936, 2.0057388234669813, -0.41972110157675935, 0.8741656633016291, -0.7952309185534405, 1.4976171549689001, 0.

In [14]:
# Dict mapping test-row position -> feature values for each point flagged as an outlier.
lst = clf.outliers_detected()
outlier_count = len(lst)
print(outlier_count)
lst

26


{3: [-1.8409411475562254,
  0.6334307917217434,
  -0.6691185214983345,
  -0.2863685866631904,
  -0.41972110157675935,
  0.8741656633016291,
  1.407275407845181,
  1.4976171549689001,
  2.4388733370966085,
  -0.677719589715212,
  -0.9611081175181706,
  -0.6539670908958973,
  -0.4269562819149832,
  -0.9195262123966631,
  1.0404656685059095,
  -1.0684144485929907,
  1.2224996910042532,
  -0.24647041110730084],
 6: [-1.6177073652403797,
  0.6334307917217434,
  0.5147466724231146,
  -0.8875770876809403,
  -0.41972110157675935,
  0.8741656633016291,
  1.2310749017332911,
  1.4976171549689001,
  0.33283720767794894,
  -0.677719589715212,
  -0.9611081175181706,
  -0.6539670908958973,
  -0.4269562819149832,
  1.0875165781229765,
  -0.9611081175181706,
  -1.0684144485929907,
  1.2224996910042532,
  -0.24647041110730084],
 7: [-1.5060904740824568,
  0.6334307917217434,
  0.2187803739427523,
  -0.8124260250537216,
  -0.41972110157675935,
  0.8741656633016291,
  -0.7952309185534405,
  -0.6677273939