In [1]:
# for fetching datasets
from ucimlrepo import fetch_ucirepo 

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm

from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import cross_validate

# the custom model
from test_class import PRKNeighborsClassifier

from figure_helpers import generate_score_plot


In [2]:
# for sampling
RANDOM_STATE = 33

In [3]:
uciml_repos_name_id_dict = {
    "Banknote": 267,
    "Glass": 42,
    "Ionosphere": 52,
    "Wine": 109,
    "Parkisons": 174,
    "Sonar": 151,
    "Haberman": 43,
    "Page blocks": 78,
    "Letter recognition": 59,
    "Ecoli": 39,
    "Optical digits": 80,
    "Pen digits": 81,
    "Transfusion": 176,
    "Musk2": 75
}

In [None]:
fig, ax = plt.subplots(nrows=7, ncols=2, sharex=True,  figsize=(10, 30))

axs = ax.ravel()

k_values = [5, 15, 30, 45]

scoring = ('f1_macro', 'accuracy')


for i, dataset_name_id in tqdm(enumerate(uciml_repos_name_id_dict.items())):

    dataset_name, id = dataset_name_id
    print(f"Working on {dataset_name}")

    # fetching the data
    print("fetching data")
    dataset = fetch_ucirepo(id=id)
    X = dataset.data.features
    y = dataset.data.targets

    print(f"Size: {X.shape}")

    # # TODO: remove after testing loops
    # X = X.sample(50)
    # y = y.sample(50)

    # fit with k_values and plot reslts

    print("transforming target")
    if y.dtypes.iloc[0] == 'object':
        y = LabelEncoder().fit_transform(y)
    else:
        y=np.ravel(y)

    mnmx_scaler = MinMaxScaler()
    X = mnmx_scaler.fit_transform(X)

    standard_scores = []
    weighted_scores = []
    enhanced_scores = []

    
    for k in k_values:
        
        print(f"n_k={k}")

        print(f"fitting standard model")
        pr_knn_standard = PRKNeighborsClassifier(
            pr_version='standard',
            base_knn_params={"n_neighbors": k}
            ).fit(X, y)
        
        print(f"fitting weighted model")
        pr_knn_weighted = PRKNeighborsClassifier(
            pr_version='weighted',
            base_knn_params={"n_neighbors": k}
            ).fit(X,y)
        
        print(f"fitting enhanced model")
        pr_knn_enhanced = PRKNeighborsClassifier(
            pr_version='enhanced',
            base_knn_params={"n_neighbors": k}
            ).fit(X,y)

        print("CVing standard model")
        standard_scores.append(
                cross_validate(
                estimator=pr_knn_standard,
                X=X,
                y=y,
                scoring=scoring,
                cv=10
                )
            )
        
        print("CVing weighted model")
        weighted_scores.append(
                cross_validate(
                estimator=pr_knn_weighted,
                X=X,
                y=y,
                scoring=scoring,
                cv=10
                )
        )

        print("CVing enhanced model")
        enhanced_scores.append(
                cross_validate(
                estimator=pr_knn_enhanced,
                X=X,
                y=y,
                scoring=scoring,
                cv=10
                )
        )

    scores = {}

    scores["standard"] = np.array([d['test_f1_macro'] for d in standard_scores])
    scores["enhanced"] = np.array([d['test_f1_macro'] for d in enhanced_scores])
    scores["weighted"] = np.array([d['test_f1_macro'] for d in weighted_scores])

    # plotting the averages
    x = np.arange(len(k_values))
    width = 0.25
    multiplier = 0

    generate_score_plot(
        scores=scores,
        metric= "F1",
        k_values=k_values,
        dataset_name=dataset_name,
        ax=axs[i]
    )


0it [00:00, ?it/s]

Working on Banknote
fetching data
Size: (1372, 4)
transforming target
n_k=5
fitting standard model
fitting weighted model
fitting enhanced model
CVing standard model
CVing weighted model
CVing enhanced model
n_k=15
fitting standard model
fitting weighted model
fitting enhanced model
CVing standard model
CVing weighted model
CVing enhanced model
n_k=30
fitting standard model
fitting weighted model
fitting enhanced model
CVing standard model
CVing weighted model
CVing enhanced model
n_k=45
fitting standard model
fitting weighted model
fitting enhanced model
CVing standard model
CVing weighted model
CVing enhanced model


1it [01:17, 77.18s/it]

Working on Glass
fetching data
Size: (214, 9)
transforming target
n_k=5
fitting standard model
fitting weighted model
fitting enhanced model
CVing standard model




CVing weighted model




CVing enhanced model




n_k=15
fitting standard model
fitting weighted model
fitting enhanced model
CVing standard model




CVing weighted model




CVing enhanced model




n_k=30
fitting standard model
fitting weighted model
fitting enhanced model
CVing standard model




CVing weighted model




CVing enhanced model




n_k=45
fitting standard model
fitting weighted model
fitting enhanced model
CVing standard model




CVing weighted model




CVing enhanced model


2it [01:32, 40.65s/it]

Working on Ionosphere
fetching data
Size: (351, 34)
transforming target
n_k=5
fitting standard model


  y = column_or_1d(y, warn=True)


fitting weighted model
fitting enhanced model
CVing standard model
CVing weighted model
CVing enhanced model
n_k=15
fitting standard model
fitting weighted model
fitting enhanced model
CVing standard model
CVing weighted model
CVing enhanced model
n_k=30
fitting standard model
fitting weighted model
fitting enhanced model
CVing standard model
CVing weighted model
CVing enhanced model
n_k=45
fitting standard model
fitting weighted model
fitting enhanced model
CVing standard model
CVing weighted model
CVing enhanced model


3it [04:49, 112.15s/it]

Working on Wine
fetching data
Size: (178, 13)
transforming target
n_k=5
fitting standard model
fitting weighted model
fitting enhanced model
CVing standard model
CVing weighted model
CVing enhanced model
n_k=15
fitting standard model
fitting weighted model
fitting enhanced model
CVing standard model
CVing weighted model
CVing enhanced model
n_k=30
fitting standard model
fitting weighted model
fitting enhanced model
CVing standard model
CVing weighted model
CVing enhanced model
n_k=45
fitting standard model
fitting weighted model
fitting enhanced model
CVing standard model
CVing weighted model
CVing enhanced model


4it [05:01, 72.65s/it] 

Working on Parkisons
fetching data
Size: (195, 22)
transforming target
n_k=5
fitting standard model
fitting weighted model
fitting enhanced model
CVing standard model
CVing weighted model
CVing enhanced model
n_k=15
fitting standard model
fitting weighted model
fitting enhanced model
CVing standard model
CVing weighted model
CVing enhanced model
n_k=30
fitting standard model
fitting weighted model
fitting enhanced model
CVing standard model
CVing weighted model
CVing enhanced model
n_k=45
fitting standard model
fitting weighted model
fitting enhanced model
CVing standard model
CVing weighted model
CVing enhanced model


5it [06:40, 82.08s/it]

Working on Sonar
fetching data
Size: (208, 60)
transforming target
n_k=5
fitting standard model


  y = column_or_1d(y, warn=True)


fitting weighted model
fitting enhanced model
CVing standard model
CVing weighted model
CVing enhanced model
