In [3]:
import matplotlib.pyplot as plt
import optuna
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler


In [4]:
# Load the iris dataset and assemble features plus the class label into one frame.
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(target=iris.target)
df

In [6]:
def get_split_data():
    """Load iris and return standardized train/validation/test splits.

    The data is split first (80/20 into train+val/test, then 80/20 into
    train/val, both with ``random_state=42``) and the ``StandardScaler`` is
    fitted on the training split only. Fitting the scaler before splitting
    would let validation/test statistics leak into the training features.

    Returns:
        tuple: ``(X_test, X_train, X_val, y_test, y_train, y_val)`` -- note
        the order, which callers unpack positionally.
    """
    data = load_iris()
    X, y = data.data, data.target

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

    # Learn mean/variance from the training rows only, then apply the same
    # transform to the held-out splits.
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)

    return X_test, X_train, X_val, y_test, y_train, y_val

In [7]:
def print_accuracy(y_pred, y_test):
    """Print the accuracy of ``y_pred`` measured against the labels ``y_test``.

    Note the argument order: predictions first, reference labels second.
    """
    score = accuracy_score(y_true=y_test, y_pred=y_pred)
    print(f"Test Accuracy: {score}")

In [8]:
def pretty_print_best(best_params, best_score):
    """Print the best hyperparameters and the best score, one per line."""
    print(
        f"Best parameters: {best_params}",
        f"Best score: {best_score}",
        sep="\n",
    )

In [9]:
def func(trial, X_train, X_val, y_train, y_val):
    """Optuna objective: validation accuracy of a KNN with sampled hyperparameters.

    Args:
        trial: Optuna trial used to sample ``n_neighbors``, ``p``, ``weights``
            and ``metric``.
        X_train, y_train: Data the classifier is fitted on.
        X_val, y_val: Data the accuracy is measured on.

    Returns:
        Accuracy of the fitted classifier on ``(X_val, y_val)``.
    """
    n_neighbors = trial.suggest_int('n_neighbors', 1, 30)
    # ``p`` is always sampled so that ``study.best_params['p']`` exists for callers.
    p = trial.suggest_int('p', 1, 4)
    weights = trial.suggest_categorical('weights', ['uniform', 'distance'])
    # scikit-learn only uses ``p`` when metric='minkowski'; without that choice
    # the sampled ``p`` had no effect on any trial.
    metric = trial.suggest_categorical('metric', ['euclidean', 'manhattan', 'cosine', 'minkowski'])

    knn = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, metric=metric, p=p)
    knn.fit(X_train, y_train)
    return accuracy_score(y_val, knn.predict(X_val))

In [10]:
def get_best_hyperparameters(X_train, X_val, y_train, y_val, n_trials=100, seed=42):
    """Run an Optuna study that maximizes the value returned by ``func``.

    Args:
        X_train, y_train: Passed through to ``func`` as the fitting data.
        X_val, y_val: Passed through to ``func`` as the scoring data.
        n_trials: Number of trials to run (default 100).
        seed: Seed for the TPE sampler so repeated runs suggest the same
            hyperparameters; pass ``None`` for a non-deterministic search.

    Returns:
        tuple: ``(best_params, best_value)`` of the finished study.
    """
    # Without an explicit seed the sampler differs on every run, so the
    # notebook would not reproduce its own results after a kernel restart.
    sampler = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(direction='maximize', sampler=sampler)
    study.optimize(
        func=lambda trial: func(trial, X_train, X_val, y_train, y_val),
        n_trials=n_trials,
        show_progress_bar=True
    )
    return study.best_params, study.best_value


In [11]:
# Build the splits, tune KNN on the train/validation portions, then report the winner.
X_test, X_train, X_val, y_test, y_train, y_val = get_split_data()
best_params, best_score = get_best_hyperparameters(
    X_train=X_train,
    X_val=X_val,
    y_train=y_train,
    y_val=y_val,
)
print("~" * 40)  # visual separator before the summary
pretty_print_best(best_params=best_params, best_score=best_score)

In [12]:

def plot_accuracy_vs_k(X_test, X_train, X_val, y_test, y_train, y_val, weights, metric, p=1, max_k=89):
    """Plot train and test accuracy of KNN for k = 1 .. ``max_k``.

    For each k a ``KNeighborsClassifier`` is fitted on ``(X_train, y_train)``
    and scored on both the training data and ``(X_test, y_test)``.

    Args:
        X_test, y_test: Data used for the "Test Accuracy" curve.
        X_train, y_train: Data used for fitting and the "Train Accuracy" curve.
        X_val, y_val: Accepted for signature symmetry with the other helpers;
            not used by this function.
        weights: ``weights`` argument forwarded to the classifier.
        metric: ``metric`` argument forwarded to the classifier.
        p: ``p`` argument forwarded to the classifier.
        max_k: Largest k to evaluate (default 89, the previous fixed limit).
    """
    # A neighbor query fails when k exceeds the number of fitted samples, so
    # never go past the size of the training split.
    ks = range(1, min(max_k, len(X_train)) + 1)
    train_accuracies = []
    test_accuracies = []

    for k in ks:
        knn = KNeighborsClassifier(n_neighbors=k, weights=weights, metric=metric, p=p)
        knn.fit(X_train, y_train)

        train_accuracies.append(accuracy_score(y_train, knn.predict(X_train)))
        test_accuracies.append(accuracy_score(y_test, knn.predict(X_test)))

    plt.plot(ks, train_accuracies, label='Train Accuracy')
    plt.plot(ks, test_accuracies, label='Test Accuracy')
    plt.xlabel('Number of Neighbors (k)')
    plt.ylabel('Accuracy')
    plt.title('Accuracy vs Number of Neighbors')
    plt.legend()
    plt.show()

In [13]:
# Accuracy curves over k, using the tuned weights / metric / p.
plot_accuracy_vs_k(
    X_test, X_train, X_val, y_test, y_train, y_val,
    weights=best_params['weights'],
    metric=best_params['metric'],
    p=best_params['p'],
)

In [14]:
import numpy as np
from knn.my_knn import Kernel


# Reference hyperparameters (presumably best_params from an earlier tuning run -- confirm):
# {'n_neighbors': 21, 'p': 4, 'weights': 'uniform', 'metric': 'euclidean'}

def LOWESS(X, y, kernel=None, n_neighbors=9, metric='euclidean', p=4):
    """Compute one weight per sample from a leave-one-out KNN prediction.

    For every index ``i`` a ``KNeighborsClassifier`` is fitted on all samples
    except ``i`` and asked to predict sample ``i``. The error is ``0`` when
    the prediction equals ``y[i]`` and ``1`` otherwise.

    Args:
        X: Feature rows, indexable by integer position.
        y: Labels aligned with ``X``.
        kernel: Optional callable applied to the 0/1 error to produce the
            weight. When ``None`` the raw 0/1 error is used as the weight.
        n_neighbors: Number of neighbors for the classifier (default 9).
        metric: Distance metric for the classifier (default ``'euclidean'``).
        p: Minkowski power forwarded to the classifier (default 4);
            scikit-learn only uses it when ``metric='minkowski'``.

    Returns:
        np.ndarray: One weight per sample, in input order.
    """
    # The estimator configuration is loop-invariant; ``fit`` re-trains it from
    # scratch on every call, so a single instance can be reused.
    knn = KNeighborsClassifier(n_neighbors=n_neighbors, metric=metric, p=p)

    weights = []
    for i in range(len(X)):
        # Hold sample i out, fit on the rest, and predict the held-out sample.
        knn.fit(np.delete(X, i, axis=0), np.delete(y, i, axis=0))
        predicted = knn.predict(np.array([X[i]]))[0]
        error = 0 if y[i] == predicted else 1
        weights.append(error if kernel is None else kernel(error))

    return np.array(weights)

In [15]:
# Leave-one-out weights on the raw (unscaled) iris features.
data = load_iris()
X = data.data
y = data.target
weights = LOWESS(X, y, kernel=Kernel.UNIFORM)
print(weights, weights.shape, sep="\n")