In [1]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from numpy import genfromtxt
import math

In [2]:
iris = load_iris()
X_iris = iris.data
y_iris = iris.target

In [3]:
ionosphere = genfromtxt('ionosphere.txt', delimiter=',')
X_ionosphere = ionosphere[:,:-1]  # All rows and all columns except the last one
y_ionosphere = ionosphere[:,-1]   # All rows and only the last column

In [4]:
X_train_iris, X_test_iris, y_train_iris, y_test_iris = train_test_split(X_iris, y_iris, test_size=0.33, random_state=2909)

In [5]:
X_train_ionosphere, X_test_ionosphere, y_train_ionosphere, y_test_ionosphere = train_test_split(X_ionosphere, y_ionosphere, test_size=0.33, random_state=2909)

In [6]:
def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1 - x2)**2))

In [7]:
def nearest_neighbour(X_train, y_train, x_test):
    # Calculate distances between x_test and all samples in X_train
    distances = [euclidean_distance(x_test, x_train) for x_train in X_train]

    # Get the index of the smallest distance
    nn_index = np.argmin(distances)

    # Return the corresponding class label
    return y_train[nn_index]

In [8]:
def test_nearest_neighbour(X_train, y_train, X_test, y_test):
    # Predict the class for each test instance
    y_pred = [nearest_neighbour(X_train, y_train, x_test) for x_test in X_test]

    # Calculate the error rate
    error_rate = np.sum(y_pred != y_test) / len(y_test)

    return error_rate

In [9]:
iris_error_rate = test_nearest_neighbour(X_train_iris, y_train_iris, X_test_iris, y_test_iris)
print(f'Iris dataset error rate: {iris_error_rate * 100:.2f}%')

ionosphere_error_rate = test_nearest_neighbour(X_train_ionosphere, y_train_ionosphere, X_test_ionosphere, y_test_ionosphere)
print(f'Ionosphere dataset error rate: {ionosphere_error_rate * 100:.2f}%')

Iris dataset error rate: 8.00%
Ionosphere dataset error rate: 15.52%


In [10]:
def conformity_measure(X_train, y_train, x_test, y_test):
    same_class_indices = np.where(y_train == y_test)
    diff_class_indices = np.where(y_train != y_test)

    same_class_distances = [euclidean_distance(x_test, x_train) for x_train in X_train[same_class_indices]]
    diff_class_distances = [euclidean_distance(x_test, x_train) for x_train in X_train[diff_class_indices]]

    if len(same_class_distances) == 0:
        same_class_nearest_distance = math.inf
    else:
        same_class_nearest_distance = np.min(same_class_distances)

    if len(diff_class_distances) == 0:
        diff_class_nearest_distance = math.inf
    else:
        diff_class_nearest_distance = np.min(diff_class_distances)

    if same_class_nearest_distance == 0:
        return math.inf
    else:
        return diff_class_nearest_distance / same_class_nearest_distance

In [11]:
def nearest_neighbour_conformal_predictor(X_train, y_train, X_test, y_test):
    # Calculate the conformity measures for all test instances
    alphas = [conformity_measure(X_train, y_train, x_test, y_test) for x_test, y_test in zip(X_test, y_test)]

    # Calculate the p-values for all possible labels for all test instances
    p_values = [np.mean([1 if alpha >= alpha_prime else 0 for alpha_prime in alphas]) for alpha in alphas]

    # Calculate the average false p-value
    avg_false_p_value = np.mean([p for p, y in zip(p_values, y_test) if p < y])

    return avg_false_p_value

In [12]:
iris_avg_false_p_value = nearest_neighbour_conformal_predictor(X_train_iris, y_train_iris, X_test_iris, y_test_iris)
print(f'Iris dataset average false p-value: {iris_avg_false_p_value * 100:.2f}%')

ionosphere_avg_false_p_value = nearest_neighbour_conformal_predictor(X_train_ionosphere, y_train_ionosphere, X_test_ionosphere, y_test_ionosphere)
print(f'Ionosphere dataset average false p-value: {ionosphere_avg_false_p_value * 100:.2f}%')

Iris dataset average false p-value: 32.45%
Ionosphere dataset average false p-value: 64.80%
