In [3]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.inspection import permutation_importance
import pandas as pd

# Load the dataset. The file is read as tab-separated, "?" cells are treated
# as missing values, and the "utf-8-sig" codec drops a leading byte-order
# mark if the file starts with one.
df = pd.read_csv("dermatology.csv", sep="\t", na_values="?", encoding="utf-8-sig")

# Remove leading/trailing whitespace from the column names.
df.columns = df.columns.str.strip()

# Target = last column; feature subsets are selected by column POSITION:
#   X_val_all   -> columns 0..32
#   X_val_clinc -> columns 0..10
#   X_val_histo -> columns 11..32
# Any column lying between position 33 and the last column is used neither
# as a feature nor as the target.
# NOTE(review): judging by the variable names, 0..10 are presumably the
# clinical attributes and 11..32 the histopathological ones -- confirm
# against the file's column layout.
y_val = df.iloc[:, -1]
X_val_all = df.iloc[:, :33]
X_val_clinc = df.iloc[:, :11]
X_val_histo = df.iloc[:, 11:33]

def getMapOfFeat(X):
    """
    Build a lookup from column position to column name.

    :param X: Object exposing a ``columns`` iterable (e.g. a pandas DataFrame)
    :return: Dict mapping each 0-based column position to the column name at
             that position; empty when ``X`` has no columns
    """
    # enumerate() supplies the running position, replacing the manual counter.
    return dict(enumerate(X.columns))

# Position -> column-name lookups, one per feature subset, so positional
# indices can be translated back into feature names.
feats_all, feats_clinc, feats_histo = (
    getMapOfFeat(frame) for frame in (X_val_all, X_val_clinc, X_val_histo)
)

# Split each feature subset 70/30 into train/test, stratified on the target.
# NOTE(review): no random forest classifier is created anywhere in the visible
# code and `num_of_trees` is not read in view -- presumably it belongs to
# another cell; confirm before removing.

num_of_trees = 300
seed = 42

# All three calls pass the same y_val, test_size, random_state and stratify
# arguments; only the feature frame differs. y_train / y_test are reassigned
# by every call, so after this block they hold the values from the last one.
# NOTE(review): the three splits are expected to select the same rows (same
# seed, same stratification labels), which would make the three y_train /
# y_test results interchangeable -- confirm.
X_train_all, X_test_all, y_train, y_test = train_test_split(X_val_all, y_val, test_size=0.3, random_state=seed, stratify=y_val)
X_train_Clinic, X_test_clinic, y_train, y_test = train_test_split(X_val_clinc, y_val, test_size=0.3, random_state=seed, stratify=y_val)
X_train_histo, X_test_histo, y_train, y_test = train_test_split(X_val_histo, y_val, test_size=0.3, random_state=seed, stratify=y_val)

from sklearn.neighbors import KNeighborsClassifier

# Value passed as `n_neighbors` to every KNeighborsClassifier below.
num_neighbors = 10
# Re-assigns the same value `seed` already received earlier in this file.
# Used for the single reported split and for the permutation importance.
seed = 42
# The accuracy average iterates over range(min_seed, max_seed), so
# max_seed itself is excluded.
min_seed = 0
max_seed = 100

# Display name -> (feature frame, position -> column-name map).
datasets = {
    "All Features": (X_val_all, feats_all),
    "Clinical Features": (X_val_clinc, feats_clinc),
    "Histopathological Features": (X_val_histo, feats_histo)
}

# Values tried for KNeighborsClassifier's `algorithm` parameter.
algorithms = ["brute", "kd_tree", "ball_tree"]

def _mean_accuracy_over_seeds(X_data, y_data, algo):
    """
    Average KNN test accuracy over repeated stratified 70/30 splits.

    One split is drawn for every seed in range(min_seed, max_seed), a fresh
    classifier is fitted on it, and the test accuracies are averaged.
    All intermediate splits and models are local to this function, so the
    sweep cannot overwrite module-level variables such as y_train / y_test.

    :param X_data: Feature frame to split
    :param y_data: Target labels; also used as the stratification key
    :param algo: Value for KNeighborsClassifier's ``algorithm`` parameter
    :return: Mean accuracy over all seeds
    :raises ZeroDivisionError: if min_seed == max_seed (no seeds to average)
    """
    total = 0
    for split_seed in range(min_seed, max_seed):
        X_tr, X_te, y_tr, y_te = train_test_split(
            X_data, y_data,
            test_size=0.3,
            random_state=split_seed,
            stratify=y_data
        )
        knn = KNeighborsClassifier(n_neighbors=num_neighbors, algorithm=algo)
        knn.fit(X_tr, y_tr)
        total += accuracy_score(y_te, knn.predict(X_te))
    return total / (max_seed - min_seed)


# ===============================
# LOOP OVER DATASETS & MODELS
# ===============================
for dataset_name, (X_data, feat_map) in datasets.items():

    print("\n" + "="*60)
    print(f"DATASET: {dataset_name}")
    print("="*60)

    # The reference split depends only on the dataset and `seed`, not on the
    # KNN algorithm, so it is drawn once per dataset instead of once per
    # algorithm.
    X_train, X_test, y_train, y_test = train_test_split(
        X_data, y_val,
        test_size=0.3,
        random_state=seed,
        stratify=y_val
    )

    for algo in algorithms:

        print(f"\n--- KNN ({algo}) ---")

        model = KNeighborsClassifier(n_neighbors=num_neighbors, algorithm=algo)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        acc_score = accuracy_score(y_test, y_pred)

        # -----------------------
        # Permutation Importance
        # -----------------------
        result = permutation_importance(
            model, X_test, y_test,
            n_repeats=10,
            random_state=seed,
            n_jobs=-1
        )

        importances = result.importances_mean

        # Take the five largest mean importances, then order those five by
        # column position, so features are printed in column order rather
        # than ranked by importance.
        top_idx = sorted(np.argsort(importances)[::-1][:5])

        top_features = [feat_map[i] for i in top_idx]

        print(f"Accuracy score: {acc_score:.3f}")
        print("\nTop features:\n", top_features)
        print("\nTop importances:\n", importances[top_idx])

        # -----------------------
        # Average Accuracy Across Seeds
        # -----------------------
        # Runs inside a helper so its per-seed splits stay local; the
        # module-level X_train / X_test / y_train / y_test keep the
        # `seed` split instead of ending up as the last sweep seed's split.
        avg_acc = _mean_accuracy_over_seeds(X_data, y_val, algo)

        print(f"\nAverage Accuracy (seeds {min_seed}-{max_seed-1}): {avg_acc:.3f}")



DATASET: All Features

--- KNN (brute) ---
Accuracy score: 0.964

Top features:
 ['Itching', 'Koebner', 'Follicular', 'Fibrosis', 'Perifollicular']

Top importances:
 [0.03636364 0.01909091 0.02545455 0.02       0.01727273]

Average Accuracy (seeds 0-99): 0.971

--- KNN (kd_tree) ---
Accuracy score: 0.973

Top features:
 ['Itching', 'Koebner', 'Follicular', 'Elongation', 'Spongiosis']

Top importances:
 [0.04272727 0.02       0.02636364 0.01818182 0.01909091]

Average Accuracy (seeds 0-99): 0.970

--- KNN (ball_tree) ---
Accuracy score: 0.973

Top features:
 ['Itching', 'Koebner', 'Follicular', 'Elongation', 'Perifollicular']

Top importances:
 [0.04545455 0.02       0.02545455 0.02272727 0.01727273]

Average Accuracy (seeds 0-99): 0.969

DATASET: Clinical Features

--- KNN (brute) ---
Accuracy score: 0.827

Top features:
 ['Itching', 'Koebner', 'Polygonal', 'Follicular', 'Knee']

Top importances:
 [0.07545455 0.08818182 0.08909091 0.06454545 0.06909091]

Average Accuracy (seeds 0-99)