# This is a sample Jupyter Notebook

Below is an example of a code cell. 
Put your cursor into the cell and press Shift+Enter to execute it and select the next one, or click the 'Run Cell' button.

Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.

To learn more about Jupyter Notebooks in PyCharm, see [help](https://www.jetbrains.com/help/pycharm/ipython-notebook-support.html).
For an overview of PyCharm, go to Help -> Learn IDE features or refer to [our documentation](https://www.jetbrains.com/help/pycharm/getting-started.html).

In [6]:
from sklearn.inspection import permutation_importance
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd

# Load the dermatology table. The file is parsed as tab-separated text, "?"
# entries are read as NaN, and utf-8-sig strips a leading byte-order mark so
# it cannot end up glued to the first column name.
df = pd.read_csv("dermatology.csv", sep="\t", na_values="?", encoding="utf-8-sig")

# Header cells may carry stray whitespace; strip it so names print cleanly.
df.columns = df.columns.str.strip()

# Positional column layout used by every slice below:
#   [0, N_CLINICAL_FEATURES)                -> clinical features
#   [N_CLINICAL_FEATURES, N_MODEL_FEATURES) -> histopathological features
#   last column                             -> class label
# NOTE(review): columns between N_MODEL_FEATURES and the label are left out of
# the models; presumably these are the ones that can hold "?" -> NaN values,
# which KNN could not handle - confirm against the data file.
N_CLINICAL_FEATURES = 11
N_MODEL_FEATURES = 33

# Guard against a truncated or mis-parsed file: with too few columns the label
# column would silently fall inside the feature slice and leak into the models.
assert df.shape[1] > N_MODEL_FEATURES, (
    f"expected more than {N_MODEL_FEATURES} columns, got {df.shape[1]}"
)

y_val = df.iloc[:, -1]
X_val_all = df.iloc[:, :N_MODEL_FEATURES]
X_val_clinc = df.iloc[:, :N_CLINICAL_FEATURES]
X_val_histo = df.iloc[:, N_CLINICAL_FEATURES:N_MODEL_FEATURES]

def getMapOfFeat(X):
    """
    Build a positional-index -> column-name lookup for a feature table.

    :param X: table exposing a ``columns`` iterable (e.g. a pandas DataFrame)
    :return: dict mapping each 0-based column position to its column name,
             in column order; empty when X has no columns
    """
    # enumerate() supplies the running position, so no hand-maintained
    # counter is needed.
    return dict(enumerate(X.columns))

# Position -> column-name lookups, used to turn importance indices back into
# readable feature names when reporting.
feats_all = getMapOfFeat(X_val_all)
feats_clinc = getMapOfFeat(X_val_clinc)
feats_histo = getMapOfFeat(X_val_histo)

seed = 42

# Split all three feature views in ONE call. train_test_split applies the same
# train/test row partition to every array it is given, so the three views are
# guaranteed to stay aligned with the single y_train / y_test pair. (Three
# separate calls only lined up because they happened to share random_state and
# stratify, and each call overwrote y_train / y_test.)
(
    X_train_all, X_test_all,
    X_train_Clinic, X_test_clinic,
    X_train_histo, X_test_histo,
    y_train, y_test,
) = train_test_split(
    X_val_all, X_val_clinc, X_val_histo, y_val,
    test_size=0.3,
    random_state=seed,
    stratify=y_val,
)

# Neighbourhood sizes compared in the evaluation loop.
k_values = [1, 5, 15, 20, 50]

# Seed range [min_seed, max_seed) for the repeated-split average. Defined once
# here because it does not depend on k.
min_seed = 42
max_seed = 52

# NOTE(review): the repeated splits hold out 20% of the rows while the primary
# split used for the per-feature-set report holds out 30% - confirm whether
# this difference is intentional.
avg_test_size = 0.2


def _report_feature_set(label, k, X_train, X_test, y_tr, y_te, feat_names, top_n):
    """
    Fit a k-NN classifier on one feature view and print its test report.

    Prints the test accuracy, then the names and mean permutation importances
    of the ``top_n`` most important features.

    :param label: feature-set name interpolated into the printed messages
    :param k: number of neighbours for KNeighborsClassifier
    :param X_train: training features for this view
    :param X_test: test features for this view
    :param y_tr: training labels
    :param y_te: test labels
    :param feat_names: dict mapping column position -> column name for this view
    :param top_n: how many of the most important features to report
    :return: None (output is printed)
    """
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(X_train, y_tr)
    y_pred = model.predict(X_test)

    # Importance = mean drop in test accuracy when a column is shuffled,
    # averaged over n_repeats shuffles; seeded so reruns print the same numbers.
    perm = permutation_importance(
        model,
        X_test,
        y_te,
        n_repeats=10,
        random_state=seed,
        scoring="accuracy",
    )
    importances = perm.importances_mean

    # Select the top_n positions by importance, then sort the positions
    # themselves so names and scores are listed in column order (not ranked
    # by importance).
    top_idx = sorted(np.argsort(importances)[::-1][:top_n])
    top_features = [feat_names.get(idx) for idx in top_idx]

    print(f"Accuracy score for {label} features: {accuracy_score(y_te, y_pred):.3f}")
    print(f"\nTop features for {label} features: \n", top_features)
    print(f"\nTop importances for {label} features: \n", importances[top_idx])


def _mean_accuracy_over_seeds(k, X, y, seeds, test_size):
    """
    Average k-NN test accuracy over several stratified random splits.

    :param k: number of neighbours for KNeighborsClassifier
    :param X: full feature table
    :param y: full label vector (also used for stratification)
    :param seeds: iterable of random_state values, one split per value
    :param test_size: fraction of rows held out in each split
    :return: arithmetic mean of the per-split accuracies
    """
    scores = []
    for split_seed in seeds:
        X_tr, X_te, y_tr, y_te = train_test_split(
            X,
            y,
            test_size=test_size,
            random_state=split_seed,
            stratify=y,
        )
        model = KNeighborsClassifier(n_neighbors=k)
        model.fit(X_tr, y_tr)
        scores.append(accuracy_score(y_te, model.predict(X_te)))
    return sum(scores) / len(scores)


for k in k_values:
    print("********")
    print()
    print("for k =", k)
    print()
    print("********")

    # (label, train view, test view, position->name map, number of features shown)
    feature_sets = [
        ("all", X_train_all, X_test_all, feats_all, 5),
        ("clinic", X_train_Clinic, X_test_clinic, feats_clinc, 10),
        ("histopathological", X_train_histo, X_test_histo, feats_histo, 10),
    ]
    for label, X_tr_view, X_te_view, feat_names, top_n in feature_sets:
        _report_feature_set(
            label, k, X_tr_view, X_te_view, y_train, y_test, feat_names, top_n
        )

    # Accuracy on all features averaged over several splits, to smooth out the
    # luck of any single train/test partition.
    avg_acc = _mean_accuracy_over_seeds(
        k, X_val_all, y_val, range(min_seed, max_seed), avg_test_size
    )
    print(f'\nAverage Accuracy: {avg_acc:.2f}')


********

for k = 1

********
Accuracy score for all features: 0.955

Top features for all features: 
 ['Scathing', 'Itching', 'Koebner', 'Fibrosis', 'Elongation']

Top importances for all features: 
 [0.02090909 0.02727273 0.03363636 0.02545455 0.01909091]
Accuracy score for clinic features: 0.827

Top features for clinic features: 
 ['Erythema', 'Scathing', 'Definite Borders', 'Itching', 'Koebner', 'Polygonal', 'Follicular', 'Oral', 'Knee', 'Scalp']

Top importances for clinic features: 
 [0.05090909 0.06090909 0.02727273 0.04636364 0.08545455 0.10090909
 0.08454545 0.02909091 0.07090909 0.05090909]
Accuracy score for histopathological features: 0.864

Top features for histopathological features: 
 ['Eosinophils', 'PNL', 'Fibrosis', 'Elongation', 'Thinning', 'Disapperance', 'Spongiosis', 'Follicular.1', 'Perifollicular', 'Band-like']

Top importances for histopathological features: 
 [0.00454545 0.03272727 0.02090909 0.03363636 0.00545455 0.01272727
 0.01909091 0.03       0.04090909 