In [1]:
from time import perf_counter, perf_counter_ns
from typing import Literal
from copy import deepcopy

import numpy as np
import pandas as pd

import plotly.graph_objects as go

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors

---

## Подготовка датасета

In [2]:
# Load the Alzheimer's disease dataset; header=0 takes column names from the first CSV row.
df = pd.read_csv('../datasets/alzheimers_disease_data.csv', header=0)
# Preview the first rows.
df.head()

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis,DoctorInCharge
0,4751,73,0,0,2,22.927749,0,13.297218,6.327112,1.347214,...,0,0,1.725883,0,0,0,1,0,0,XXXConfid
1,4752,89,0,0,0,26.827681,0,4.542524,7.619885,0.518767,...,0,0,2.592424,0,0,0,0,1,0,XXXConfid
2,4753,73,0,3,1,17.795882,0,19.555085,7.844988,1.826335,...,0,0,7.119548,0,1,0,1,0,0,XXXConfid
3,4754,74,1,0,1,33.800817,1,12.209266,8.428001,7.435604,...,0,1,6.481226,0,0,0,0,0,0,XXXConfid
4,4755,89,0,0,0,20.716974,0,18.454356,6.310461,0.795498,...,0,0,0.014691,0,0,1,1,0,0,XXXConfid


In [3]:
# Column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2149 entries, 0 to 2148
Data columns (total 35 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   PatientID                  2149 non-null   int64  
 1   Age                        2149 non-null   int64  
 2   Gender                     2149 non-null   int64  
 3   Ethnicity                  2149 non-null   int64  
 4   EducationLevel             2149 non-null   int64  
 5   BMI                        2149 non-null   float64
 6   Smoking                    2149 non-null   int64  
 7   AlcoholConsumption         2149 non-null   float64
 8   PhysicalActivity           2149 non-null   float64
 9   DietQuality                2149 non-null   float64
 10  SleepQuality               2149 non-null   float64
 11  FamilyHistoryAlzheimers    2149 non-null   int64  
 12  CardiovascularDisease      2149 non-null   int64  
 13  Diabetes                   2149 non-null   int64

In [4]:
# Summary statistics (count / mean / std / quantiles) of the numeric columns.
df.describe()

Unnamed: 0,PatientID,Age,Gender,Ethnicity,EducationLevel,BMI,Smoking,AlcoholConsumption,PhysicalActivity,DietQuality,...,FunctionalAssessment,MemoryComplaints,BehavioralProblems,ADL,Confusion,Disorientation,PersonalityChanges,DifficultyCompletingTasks,Forgetfulness,Diagnosis
count,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,...,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0,2149.0
mean,5825.0,74.908795,0.506282,0.697534,1.286645,27.655697,0.288506,10.039442,4.920202,4.993138,...,5.080055,0.208004,0.156817,4.982958,0.205212,0.158213,0.150768,0.158678,0.301536,0.353653
std,620.507185,8.990221,0.500077,0.996128,0.904527,7.217438,0.453173,5.75791,2.857191,2.909055,...,2.892743,0.405974,0.363713,2.949775,0.40395,0.365026,0.357906,0.365461,0.459032,0.478214
min,4751.0,60.0,0.0,0.0,0.0,15.008851,0.0,0.002003,0.003616,0.009385,...,0.00046,0.0,0.0,0.001288,0.0,0.0,0.0,0.0,0.0,0.0
25%,5288.0,67.0,0.0,0.0,1.0,21.611408,0.0,5.13981,2.570626,2.458455,...,2.566281,0.0,0.0,2.342836,0.0,0.0,0.0,0.0,0.0,0.0
50%,5825.0,75.0,1.0,0.0,1.0,27.823924,0.0,9.934412,4.766424,5.076087,...,5.094439,0.0,0.0,5.038973,0.0,0.0,0.0,0.0,0.0,0.0
75%,6362.0,83.0,1.0,1.0,2.0,33.869778,1.0,15.157931,7.427899,7.558625,...,7.546981,0.0,0.0,7.58149,0.0,0.0,0.0,0.0,1.0,1.0
max,6899.0,90.0,1.0,3.0,3.0,39.992767,1.0,19.989293,9.987429,9.998346,...,9.996467,1.0,1.0,9.999747,1.0,1.0,1.0,1.0,1.0,1.0


In [5]:
# Drop the PatientID and DoctorInCharge columns, which are not used as model features.
# errors='ignore' keeps the cell idempotent: re-running it after the columns are
# already gone no longer raises KeyError.
df = df.drop(columns=['PatientID', 'DoctorInCharge'], errors='ignore')

In [6]:
# Target vector: the Diagnosis column.
y = df['Diagnosis']
# Feature matrix: every remaining column.
X = df.drop(columns=['Diagnosis'])
X.shape

(2149, 32)

In [7]:
# Partition the feature columns by dtype (integer vs floating point).
int_cols = X.select_dtypes(include=['int']).columns.tolist()
float_cols = X.select_dtypes(include=['float']).columns.tolist()
print('Int cols: ', len(int_cols))
print('Float cols: ', len(float_cols))
# Sanity check: the two groups together should account for every feature column.
print(len(int_cols) + len(float_cols))

Int cols:  20
Float cols:  12
32


In [8]:
# Number of distinct values per integer column; used in the next cell to tell
# binary and low-cardinality columns apart from the remaining integer columns.
X[int_cols].nunique()

Age                          31
Gender                        2
Ethnicity                     4
EducationLevel                4
Smoking                       2
FamilyHistoryAlzheimers       2
CardiovascularDisease         2
Diabetes                      2
Depression                    2
HeadInjury                    2
Hypertension                  2
SystolicBP                   90
DiastolicBP                  60
MemoryComplaints              2
BehavioralProblems            2
Confusion                     2
Disorientation                2
PersonalityChanges            2
DifficultyCompletingTasks     2
Forgetfulness                 2
dtype: int64

In [9]:
# Split the integer columns into binary / low-cardinality categorical / remaining.
#
# All three lists are rebuilt from X on every run instead of removing items from
# the existing `int_cols` list in place. The previous in-place `.remove()` made
# the cell non-idempotent: on a second run `int_cols` only held the three
# remaining columns, so `bin_cols` / `cat_cols` came out empty and the
# `cat_cols[0]` lookup in the ColumnTransformer cell failed.
all_int_cols = X.select_dtypes(include=['int']).columns.tolist()
int_cardinality = X[all_int_cols].nunique()

# Exactly two distinct values -> binary flag.
bin_cols = [col for col in all_int_cols if int_cardinality[col] == 2]
# 3..9 distinct values -> treated as categorical.
cat_cols = [col for col in all_int_cols if 2 < int_cardinality[col] < 10]
# Everything else stays a plain integer feature (original column order preserved).
int_cols = [col for col in all_int_cols if col not in bin_cols and col not in cat_cols]

print('int_cols: ', len(int_cols))
print('cat_cols: ', len(cat_cols))
print('binary cols: ', len(bin_cols))

int_cols:  3
cat_cols:  2
binary cols:  15


In [10]:
# Preprocessing pipeline, one transformer per column group:
#   * float + remaining integer columns -> standardised;
#   * cat_cols[0] -> one-hot encoded (first level dropped);
#   * cat_cols[1] -> ordinal encoded;
#   * binary columns -> passed through unchanged.
# NOTE(review): cat_cols is indexed by position; judging by the nunique() output
# above this is presumably Ethnicity (index 0) and EducationLevel (index 1) —
# confirm, since a change in column order would silently swap the encoders.
col_transformer = ColumnTransformer(
    transformers = [
        ('float_cols', StandardScaler(), float_cols + int_cols),
        ('cat_cols', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), [cat_cols[0]]),
        ('ord_cols', OrdinalEncoder(), [cat_cols[1]]),
        ('bin_cols', 'passthrough', bin_cols),
    ],
    # Any column not listed above is kept as-is.
    remainder = 'passthrough'
)

In [11]:
# Class balance of the target.
y.value_counts()

Diagnosis
0    1389
1     760
Name: count, dtype: int64

In [12]:
# 50/50 split, stratified on the target so both halves keep the class ratio;
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, stratify=y, random_state=14122025)
# y_train, y_test = y_train.values, y_test.values
print(X_train.shape, X_test.shape)

(1074, 32) (1075, 32)


In [13]:
# Fit the preprocessing on the training half only, then apply it to the test half.
# NOTE(review): X_train / X_test are overwritten with the transformed data, so
# this cell is not idempotent — re-run the split cell before re-running this one.
X_train = col_transformer.fit_transform(X_train)
X_test  = col_transformer.transform(X_test)

- Bishop C. M. *Pattern Recognition and Machine Learning*. Springer, 2006 — страница 126
- Hastie T., Tibshirani R., Friedman J. *The Elements of Statistical Learning: Data Mining, Inference, and Prediction*, 2nd ed. Springer, 2009 — страница 208
- Murphy K. P. *Probabilistic Machine Learning: An Introduction*. MIT Press, 2022 — страница 557

---

In [14]:
def uniform(r):
    """Rectangular (uniform) kernel: 1.0 where |r| <= 1, otherwise 0.0.

    Accepts NumPy arrays as well as Python scalars and sequences (the input is
    converted with ``np.asarray``). The kernel is symmetric in ``r``, matching
    ``tricube``.

    Returns a float array of the same shape as ``r``.
    """
    return (np.abs(np.asarray(r)) <= 1).astype(float)

def inverse(r, eps: float = 1e-12):
    """Inverse-distance kernel ``1 / |r|``, with the denominator floored at ``eps``.

    Parameters
    ----------
    r : array-like
        Normalised distances.
    eps : float, default 1e-12
        Lower bound for the denominator. Without it a zero distance (an exact
        duplicate of the query point) produces an infinite weight and a
        division-by-zero warning.

    Returns a float array of the same shape as ``r``.
    """
    r = np.asarray(r, dtype=float)
    return 1.0 / np.maximum(np.abs(r), eps)

def triangular(r):
    """Triangular kernel ``max(0, 1 - |r|)``.

    The absolute value keeps the kernel symmetric and bounded by 1 for negative
    arguments, consistent with ``tricube``. For the non-negative normalised
    distances used by the classifiers the result is unchanged.
    """
    return np.maximum(0.0, 1.0 - np.abs(r))

def epanechnikov(r):
    """Epanechnikov (parabolic) kernel ``max(0, 1 - r**2)``."""
    parabola = 1.0 - r * r
    # Clamp the negative part (|r| > 1) to zero.
    return np.maximum(0.0, parabola)

def gaussian(r):
    """Gaussian kernel in the form ``exp(-2 * r**2)``."""
    # Same evaluation order as (-2.0 * r) * r, so results match bit for bit.
    exponent = (-2.0 * r) * r
    return np.exp(exponent)

def tricube(r):
    """Tricube kernel ``max(0, (1 - |r|**3)**3)``."""
    abs_cubed = np.abs(r) ** 3
    core = (1.0 - abs_cubed) ** 3
    # For |r| > 1 the cube is negative; clamp it to zero.
    return np.maximum(0.0, core)

In [15]:
class KNearestNeighbors:
    """Kernel-weighted k-nearest-neighbours classifier (variable-width Parzen window).

    For every query point the k + 1 nearest reference points are located. The
    largest of those k + 1 distances defines the bandwidth ``h``; every neighbour
    that is strictly closer than the farthest one votes for its class with
    weight ``kernel(d / h)``.

    The reference set is the whole training set, or the subset ``proto_idx``
    once ``select_prototypes`` has been run. Class labels must be non-negative
    integers because votes are accumulated with ``np.bincount``.
    """

    def __init__(self, kernel: callable = uniform):
        # kernel: callable mapping an array of normalised distances to weights.
        self.kernel = kernel
        # Small constant added to the bandwidth so d / h never divides by zero.
        self.eps = 1e-15

    def fit(self, X, y):
        """Store the training data and reset any previous prototype selection.

        Raises ValueError if X and y have a different number of samples.
        Returns self, like ``SklearnParzenKNN.fit``.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X and y must have the same number of samples, got {X.shape[0]} and {y.shape[0]}"
            )
        self.X_train = X
        self.y_train = y
        self.proto_idx = None
        return self

    def predict(self, X, k=1, norm: int = 2):
        """Predict class labels for X using k neighbours and the given Minkowski norm."""
        dists = self.compute_distances(X, norm)
        labels, _ = self.predict_labels(dists, k=k)
        return labels

    def predict_proba(self, X, k=1, norm: int = 2):
        """Return per-class scores for X (see ``predict_labels`` for normalisation)."""
        dists = self.compute_distances(X, norm)
        _, probs = self.predict_labels(dists, k=k)
        return probs

    def _reference_set(self):
        """Return (X_ref, y_ref): the prototypes if selected, otherwise all training data."""
        if self.proto_idx is None:
            return self.X_train, self.y_train
        return self.X_train[self.proto_idx], self.y_train[self.proto_idx]

    @staticmethod
    def _pairwise_distances(X, X_ref, norm):
        """Distance matrix of shape (len(X), len(X_ref)) under the given norm."""
        X = np.asarray(X, dtype=float)
        X_ref = np.asarray(X_ref, dtype=float)

        if norm == 2:
            # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b ; clamp tiny negatives that
            # come from rounding before taking the square root.
            return np.sqrt(
                np.maximum(
                    0.0,
                    np.power(X, 2).sum(axis=1, keepdims=True) +       # (num_test, 1)
                    np.power(X_ref, 2).sum(axis=1, keepdims=True).T + # (1, num_train)
                    -2 * (X @ X_ref.T)
                )
            )

        # General case: explicit differences, shape (num_test, num_train, D).
        diff = X[:, None, :] - X_ref[None, :, :]
        return np.linalg.norm(diff, ord=norm, axis=2)

    def _loo_distances(self, X_ref, norm):
        """Pairwise distances within X_ref with the diagonal set to +inf.

        An infinite self-distance means a point can never be its own neighbour.
        """
        dists = self._pairwise_distances(X_ref, X_ref, norm)
        np.fill_diagonal(dists, np.inf)
        return dists

    def compute_distances(self, X, norm: int = 2):
        """Distances from every row of X to every point of the current reference set."""
        X_ref, _ = self._reference_set()
        return self._pairwise_distances(X, X_ref, norm)

    def _vote(self, dists, y_ref, k):
        """Kernel-weighted vote over the k nearest columns of ``dists``.

        ``dists`` may be a plain or a masked array; masked entries are treated as
        infinitely far away. ``y_ref`` holds the label of every column.
        """
        filled = np.asarray(np.ma.filled(dists, np.inf))  # masked = +inf
        num_test, num_ref = filled.shape
        if not 1 <= k < num_ref:
            raise ValueError(
                f"k must satisfy 1 <= k < {num_ref} (k + 1 reference points are needed), got k={k}"
            )

        n_classes = int(np.max(self.y_train)) + 1
        y_pred = np.zeros(num_test)
        # Zero-initialised so rows that cannot be predicted hold zeros, not garbage.
        class_probs = np.zeros((num_test, n_classes))

        for i in range(num_test):
            row = filled[i]
            # Indices of the k + 1 smallest distances (unordered).
            nearest = np.argpartition(row, k)[:k + 1]
            d = row[nearest]

            # If everything was masked / inf -- can't predict
            finite = np.isfinite(d)
            if not finite.any():
                y_pred[i] = -1
                continue

            d_max = np.max(d)
            h = d_max + self.eps          # bandwidth
            # Exclude the farthest neighbour(s). Comparing against d_max directly
            # avoids the (d_max + eps) - eps round-trip, whose rounding decided
            # whether the farthest neighbour was excluded.
            closer = finite & (d < d_max)
            r = d[closer] / h             # normalized distances
            w = self.kernel(r)            # kernel weights

            scores = np.bincount(y_ref[nearest[closer]], weights=w, minlength=n_classes)
            y_pred[i] = np.argmax(scores)

            total = scores.sum()
            if np.isfinite(total) and total > 0:
                class_probs[i] = scores / total
            else:
                class_probs[i] = scores

        return y_pred, class_probs

    def predict_labels(self, dists, k=1):
        """Predict labels from a precomputed distance matrix.

        Parameters
        ----------
        dists : array or masked array, shape (num_queries, num_reference)
            Distances to the current reference set; masked entries are ignored.
        k : int
            Number of voting neighbours; requires 1 <= k < num_reference.

        Returns
        -------
        y_pred : float array, shape (num_queries,)
            Predicted class index, or -1 when no finite distance is available.
        class_probs : float array, shape (num_queries, n_classes)
            Kernel-weighted class scores, normalised to sum to 1 per row when the
            total weight is positive and finite. A row is all zeros when no
            neighbour is strictly closer than the farthest one or when the query
            cannot be predicted.

        Raises ValueError if k is out of range.
        """
        _, y_ref = self._reference_set()
        return self._vote(dists, y_ref, k)

    def leave_one_out(self, k_min: int, k_max: int, norm: int = 2, plot: bool = False):
        """Leave-one-out error on the reference set for every k in [k_min, k_max].

        Returns (opt_k, errors): the k with the smallest error (the first one on
        ties) and the error rate for each k. With ``plot=True`` the error curve
        is also shown.
        """
        X_ref, y_ref = self._reference_set()

        assert k_min > 0, "k_min should be at least 1"
        assert k_min <= k_max, "k_min should not exceed k_max"
        n = X_ref.shape[0]
        assert k_max <= n - 2, "k_max should be <= n-2 (because you need k+1 neighbors after removing self)."

        dists = self._loo_distances(X_ref, norm)

        ks = np.arange(k_min, k_max + 1)
        errors = np.zeros((k_max + 1 - k_min,))

        for i, k in enumerate(ks):
            y_pred, _ = self._vote(dists, y_ref, k)
            errors[i] = (1. - np.mean(y_pred == y_ref)) # rate of mistakes (0-1 loss)

        best_idx = int(np.argmin(errors))
        opt_k = int(ks[best_idx])
        opt_err = float(errors[best_idx])

        if plot:
            fig = go.Figure()

            fig.add_trace(go.Scatter(
                x=ks,
                y=errors,
                mode="lines",
                name="LOO empirical risk",
            ))

            # Highlight optimum
            fig.add_trace(go.Scatter(
                x=[opt_k],
                y=[opt_err],
                mode="markers",
                name=f"min @ k={opt_k}",
                marker=dict(size=12, symbol="x"),
            ))

            fig.update_layout(
                title="LOO empirical risk vs k",
                xaxis_title="k (number of neighbors)",
                yaxis_title="Empirical risk (LOO error rate)",
                hovermode="x unified",
            )

            # Vertical dashed line at the optimum
            fig.add_vline(
                x=opt_k,
                line_dash="dash",
                annotation_text=f"k* = {opt_k}",
                annotation_position="top",
            )

            fig.show()

        return opt_k, errors

    def select_prototypes(
        self, 
        k: int = 1, 
        norm: int = 2,
        method: Literal['add', 'remove'] = 'remove',
        n_iter: int = 10,
        min_size: int = 10,     # min prototype set size
        remove_fraction: float = 1.0,
        seed: int = 14122025,
        inplace: bool = False,
        verbose: bool = False
    ):
        """Iteratively edit the training set by removing leave-one-out mistakes.

        Each pass classifies every current prototype from the others and removes
        the misclassified ones (all of them, or a random ``remove_fraction``).

        Parameters
        ----------
        k, norm : neighbour count and Minkowski norm used for the LOO vote.
        method : only 'remove' is implemented.
        n_iter : maximum number of passes.
        min_size : the prototype set is never reduced below
            ``max(min_size, k + 2)``; k + 2 is what an LOO vote with k needs.
        remove_fraction : share of the misclassified points removed per pass.
        seed : seed of the random generator used to sample removals.
        inplace : if True the training data is replaced by the prototypes;
            otherwise the selection is stored in ``self.proto_idx``.
        verbose : print progress for every pass.

        Side effect: in both modes later predictions use the selected prototypes.

        Returns (proto_idx, history), where ``history`` records the set size,
        the LOO error and the removed indices of every pass.

        Raises NotImplementedError for method='add' and ValueError for any other
        unknown method.
        """
        assert k >= 1
        n_total = self.X_train.shape[0]
        assert n_total >= (k + 2)

        if method == 'add':
            raise NotImplementedError("method='add' is not implemented; use method='remove'")
        if method != 'remove':
            raise ValueError(f"unknown method {method!r}; expected 'add' or 'remove'")

        rng = np.random.default_rng(seed)

        # An LOO vote needs k + 1 other points, so k + 2 is a hard lower bound.
        min_size = max(min_size, k + 2)

        history = {"sizes": [], "loo_errors": [], "removed_each_pass": []}
        proto_idx = np.arange(n_total, dtype=int)

        for it in range(n_iter):
            n_proto = proto_idx.shape[0]
            if n_proto <= min_size:
                if verbose:
                    print(f"[ENN] Stop: |prototype_set|={n_proto} <= min_size={min_size}")
                break

            X_ref = self.X_train[proto_idx]
            y_ref = self.y_train[proto_idx]

            # LOO vote on the candidate set, passed explicitly so the model
            # state is not touched while the selection is running.
            y_pred, _ = self._vote(self._loo_distances(X_ref, norm), y_ref, k)

            loo_err = 1.0 - np.mean(y_pred == y_ref)
            history["sizes"].append(int(n_proto))
            history["loo_errors"].append(float(loo_err))

            mis_idx_local = np.where(y_pred != y_ref)[0]
            n_mis = mis_idx_local.size

            if verbose:
                print(f"[ENN] pass {it+1}/{n_iter}: |prototype_set|={proto_idx.size}, LOO_err={loo_err:.4f}, mis={n_mis}")

            if n_mis == 0:
                break

            if remove_fraction < 1.0:
                m = max(1, int(np.ceil(remove_fraction * n_mis)))
            else:
                m = n_mis
            # Never remove so many points that the set falls below min_size.
            m = min(m, n_proto - min_size)

            if remove_fraction < 1.0 or m < n_mis:
                chosen_local = rng.choice(mis_idx_local, size=m, replace=False)
            else:
                chosen_local = mis_idx_local

            # Remove the selected misclassified indices from the prototype set
            keep_mask = np.ones(n_proto, dtype=bool)
            keep_mask[chosen_local] = False
            removed = proto_idx[~keep_mask]
            proto_idx = proto_idx[keep_mask]

            history["removed_each_pass"].append(removed)

        self.proto_idx = proto_idx.copy()

        if inplace:
            self.X_train = self.X_train[self.proto_idx]
            self.y_train = self.y_train[self.proto_idx]
            self.proto_idx = None  # now all remaining points are prototypes

        return proto_idx, history
            

In [16]:
# Baseline model: always predict the majority class of the training labels.
# The class is derived from y_train instead of being hard-coded as 0, so the
# baseline stays correct if the class balance or the label encoding changes.
majority_class = pd.Series(y_train).mode().iloc[0]
print(f'Baseline accuracy: {(y_test == majority_class).mean() * 100:.2f}%')

Baseline accuracy: 64.65%


In [17]:
# Custom kernel-weighted kNN with the Gaussian kernel, fitted on the transformed training half.
knn_clf = KNearestNeighbors(gaussian)
knn_clf.fit(X_train, y_train)

In [18]:
# Pick k by leave-one-out error over k = 1..200 and plot the error curve.
opt_k, _ = knn_clf.leave_one_out(1, 200, plot=True)

In [31]:
# Edit the training set: up to 100 passes, each removing 10% of the points that
# are misclassified in leave-one-out. The selected prototype indices are also
# stored on knn_clf, so the predictions below use the reduced reference set.
tmp, _ = knn_clf.select_prototypes(
    k=opt_k,
    norm=2,
    method='remove',
    n_iter=100,
    remove_fraction=0.1,
    verbose=True,
)

[ENN] pass 1/100: |prototype_set|=1074, LOO_err=0.2132, mis=229
[ENN] pass 2/100: |prototype_set|=1051, LOO_err=0.1998, mis=210
[ENN] pass 3/100: |prototype_set|=1030, LOO_err=0.1854, mis=191
[ENN] pass 4/100: |prototype_set|=1010, LOO_err=0.1703, mis=172
[ENN] pass 5/100: |prototype_set|=992, LOO_err=0.1573, mis=156
[ENN] pass 6/100: |prototype_set|=976, LOO_err=0.1465, mis=143
[ENN] pass 7/100: |prototype_set|=961, LOO_err=0.1342, mis=129
[ENN] pass 8/100: |prototype_set|=948, LOO_err=0.1266, mis=120
[ENN] pass 9/100: |prototype_set|=936, LOO_err=0.1207, mis=113
[ENN] pass 10/100: |prototype_set|=924, LOO_err=0.1093, mis=101
[ENN] pass 11/100: |prototype_set|=913, LOO_err=0.1051, mis=96
[ENN] pass 12/100: |prototype_set|=903, LOO_err=0.0986, mis=89
[ENN] pass 13/100: |prototype_set|=894, LOO_err=0.0895, mis=80
[ENN] pass 14/100: |prototype_set|=886, LOO_err=0.0835, mis=74
[ENN] pass 15/100: |prototype_set|=878, LOO_err=0.0763, mis=67
[ENN] pass 16/100: |prototype_set|=871, LOO_err=0.

In [32]:
# Time a single prediction pass over the test set with k=5.
start = perf_counter_ns()
y_pred = knn_clf.predict(X_test, k=5)
end = perf_counter_ns()

# Test-set quality metrics.
acc     = accuracy_score(y_test, y_pred)
f1      = f1_score(y_test, y_pred)
print(f'Accuracy: {acc*100:.2f} %')
print(f'F1:       {f1*100:.2f} %')
# perf_counter_ns() is in nanoseconds; divide by 1e6 to report milliseconds.
print(f'Estimation time: {(end - start)/1e6:.4f} ms.')

Accuracy: 69.12 %
F1:       24.55 %
Estimation time: 76.5231 ms.


In [33]:
# Same evaluation with the k selected by leave-one-out.
y_pred = knn_clf.predict(X_test, k=opt_k)
acc     = accuracy_score(y_test, y_pred)
f1      = f1_score(y_test, y_pred)
print(f'Accuracy: {acc*100:.2f} %')
print(f'F1:       {f1*100:.2f} %')

Accuracy: 67.26 %
F1:       13.73 %


## Sklearn

In [None]:
class SklearnParzenKNN:
    """Variable-width Parzen-window kNN built on scikit-learn's KNeighborsClassifier.

    scikit-learn is asked for k + 1 neighbours. The largest of those distances
    is the bandwidth ``h``; the farthest neighbour(s) get zero weight and the
    remaining ones vote with weight ``kernel(d / h)``.
    """

    def __init__(self, k: int, kernel, metric="minkowski", p=2, eps=1e-12):
        assert k >= 1
        self.k = k
        self.kernel = kernel
        self.metric = metric
        self.p = p
        # Small constant added to the bandwidth so d / h never divides by zero.
        self.eps = eps

        # sklearn will use k+1 neighbors so weights() sees k+1 distances.
        self.model = KNeighborsClassifier(
            n_neighbors=k + 1,
            weights=self._weights,
            metric=metric,
            p=p if metric == "minkowski" else None
        )

    def _weights(self, distances: np.ndarray) -> np.ndarray:
        """Kernel weights for one row or a batch of rows of neighbour distances.

        The last axis holds the neighbours of a single query. The farthest
        neighbour(s) of every query get weight 0. The comparison is done against
        the row maximum itself rather than ``(max + eps) - eps``, because that
        round-trip is not exact in floating point and could leave the farthest
        neighbour with a non-zero weight.
        """
        d = np.asarray(distances, dtype=float)

        # Works for both the single-row (1-D) and the batch (2-D) case.
        d_max = np.max(d, axis=-1, keepdims=True)
        h = d_max + self.eps
        w = np.asarray(self.kernel(d / h), dtype=float)

        # Zero out farthest neighbor(s) per row (without mutating the kernel output)
        return np.where(d >= d_max, 0.0, w)

    def fit(self, X, y):
        """Fit the underlying sklearn model and keep the data for leave_one_out."""
        self.X_train_ = np.asarray(X)
        self.y_train_ = np.asarray(y)
        self.model.fit(X, y)
        self.classes_ = self.model.classes_
        return self

    def predict(self, X):
        """Predict class labels with the underlying sklearn model."""
        return self.model.predict(X)

    def predict_proba(self, X):
        """Class probabilities from the underlying sklearn model."""
        # kernel weights, normalized by sum of weights internally
        return self.model.predict_proba(X)

    def score(self, X, y):
        """Score of the underlying sklearn model on (X, y)."""
        return self.model.score(X, y)
    
    def leave_one_out(self, k_min: int, k_max: int, plot: bool = False):
        """Leave-one-out error on the training set for every k in [k_min, k_max].

        Expects non-negative integer labels (votes are accumulated with
        ``np.bincount``).

        Returns (opt_k, errors): the k with the smallest error (the first one on
        ties) and the error rate for each k. With ``plot=True`` the error curve
        is also shown.
        """
        assert k_min > 0, "k_min should be at least 1"
        assert k_min <= k_max, "k_min should not exceed k_max"
        n = self.X_train_.shape[0]
        assert k_max <= n - 2, "k_max should be <= n-2 for Parzen-kNN LOO."

        ks = np.arange(k_min, k_max + 1)
        errors = np.zeros(len(ks), dtype=float)

        nn = NearestNeighbors(metric=self.metric, p=self.p if self.metric == "minkowski" else None)
        nn.fit(self.X_train_)

        # One neighbour search for the largest k. Results are sorted by distance,
        # so the neighbours for a smaller k are a prefix of each row.
        # k_max + 2 = the point itself + the k_max + 1 neighbours the bandwidth needs.
        distances, indices = nn.kneighbors(self.X_train_, n_neighbors=k_max + 2, return_distance=True)

        # Drop the query point itself from its own neighbour list. With duplicate
        # points it is not guaranteed to sit in column 0, so locate it by index.
        # If it is missing entirely (only possible when more than k_max + 1
        # zero-distance duplicates exist), drop column 0 instead.
        is_self = indices == np.arange(n)[:, None]
        is_self[~is_self.any(axis=1), 0] = True
        others = ~is_self
        d_all = distances[others].reshape(n, k_max + 1)
        labels_all = self.y_train_[indices[others].reshape(n, k_max + 1)]

        num_classes = int(np.max(self.y_train_) + 1)

        for idx_k, k in enumerate(ks):
            d = d_all[:, :k + 1]            # shape (n, k+1)
            labels = labels_all[:, :k + 1]  # shape (n, k+1)

            # Same weighting rule as the one used at prediction time.
            w = self._weights(d)

            # Weighted vote per row: accumulate the weights into class bins.
            y_pred = np.empty(n, dtype=int)
            for i in range(n):
                class_scores = np.bincount(labels[i], weights=w[i], minlength=num_classes)
                y_pred[i] = int(np.argmax(class_scores))

            errors[idx_k] = 1.0 - np.mean(y_pred == self.y_train_)

        best_idx = int(np.argmin(errors))
        opt_k = int(ks[best_idx])
        opt_err = float(errors[best_idx])

        if plot:
            fig = go.Figure()
            fig.add_trace(go.Scatter(x=ks, y=errors, mode="lines", name="LOO empirical risk"))
            fig.add_trace(go.Scatter(
                x=[opt_k], y=[opt_err],
                mode="markers",
                name=f"min @ k={opt_k}",
                marker=dict(size=12, symbol="x")
            ))
            fig.add_vline(x=opt_k, line_dash="dash",
                          annotation_text=f"k* = {opt_k}",
                          annotation_position="top")
            fig.update_layout(
                title="LOO empirical risk vs k (sklearn-based Parzen-kNN)",
                xaxis_title="k",
                yaxis_title="Empirical risk (LOO error rate)",
                hovermode="x unified",
            )
            fig.show()

        return opt_k, errors

In [None]:
# sklearn-based reference model with the same Gaussian kernel and the Euclidean metric (p=2).
ref = SklearnParzenKNN(k=5, kernel=gaussian, metric="minkowski", p=2)
ref.fit(X_train, y_train)

In [None]:
# Leave-one-out selection of k for the reference model.
# NOTE: this rebinds opt_k, replacing the value chosen earlier for knn_clf.
opt_k, _ = ref.leave_one_out(1, 200, plot=True)

In [None]:
# Refit the reference model with the selected k.
ref = SklearnParzenKNN(k=opt_k, kernel=gaussian, metric="minkowski", p=2)
ref.fit(X_train, y_train)

# Time a single prediction pass over the test set.
start = perf_counter_ns()
y_pred_ref = ref.predict(X_test)
end = perf_counter_ns()

# Test-set quality metrics.
acc     = accuracy_score(y_test, y_pred_ref)
f1      = f1_score(y_test, y_pred_ref)
print(f'Accuracy: {acc*100:.2f} %')
print(f'F1:       {f1*100:.2f} %')
# perf_counter_ns() is in nanoseconds; divide by 1e6 to report milliseconds.
print(f'Estimation time: {(end - start)/1e6:.4f} ms.')

In [None]:
# Compare the custom implementation with the sklearn-based reference on the test set.
# Both values are printed as plain counts (previously shape tuples such as "(n,)"),
# and the second line is labelled correctly: y_pred holds test-set predictions.
# NOTE(review): y_pred comes from the earlier knn_clf cell, i.e. it was computed
# with the opt_k chosen for knn_clf and after select_prototypes — confirm this is
# the comparison that is intended.
n_mismatched = int(np.sum(np.asarray(y_pred) != np.asarray(y_pred_ref)))
print('Number of mismatched predictions: ', n_mismatched)
print('Size of the test set: ', y_pred.shape[0])

---