In [26]:
from ipynb.fs.full.utils import *

In [27]:
import math

import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
from scipy import stats
from scipy.special import gamma
from sklearn import preprocessing as pre
from sklearn.neighbors import NearestNeighbors
from sklearn.svm import SVC

In [5]:
class K_Nearest_Neighbours():
    """Class-overlap detector built on per-class k-nearest-neighbour density estimates.

    The class keeps no state: every method is a static method, so it can be
    called on the class itself (``K_Nearest_Neighbours.fit(X, y)``) or on an
    instance with the same result.
    """

    def __init__(self):
        pass

    @staticmethod
    def fit(X, y, k=None):
        """Find the ``k`` nearest neighbours of every row of ``X`` within ``X``.

        Parameters
        ----------
        X : array-like, one row per sample.
        y : not used by this method; kept so the signature mirrors ``predict``.
        k : int, optional
            Number of neighbours. When omitted it is
            ``round(0.3 * len(X) ** (4/5))``, clamped to at least 1 so that
            very small inputs do not ask for zero neighbours.

        Returns
        -------
        (distances, indices)
            The two arrays returned by ``NearestNeighbors.kneighbors``.
        """
        if k is None:
            k = max(1, round(0.3 * (len(X)) ** (4 / 5)))
        # compute_average_distance is defined outside this cell; only the third
        # value it returns is used here (as the estimator's radius).
        _, _, max_dist = compute_average_distance(X)
        nbrs = NearestNeighbors(n_neighbors=k, radius=max_dist, algorithm='auto').fit(X)

        distances, indices = nbrs.kneighbors(X, n_neighbors=k, return_distance=True)

        return distances, indices

    @staticmethod
    def predict(X, distances, indices, y, epsilon, Plotting=None):
        """Score every point by its weakest per-class density and keep those above ``epsilon``.

        For each row of ``indices`` the neighbours are grouped by class label.
        Rows whose neighbourhood does not contain every class present in ``y``
        are skipped. For the remaining rows two estimates are computed per
        class (``adapted_lrd`` and ``simplifiedLOFEstimate``, each multiplied
        by ``amount_of_points``), and the minimum over the classes becomes the
        point's LRD / LOF score.

        Parameters
        ----------
        X : ndarray, one row per sample.
        distances, indices : neighbour distances and indices, one row per sample
            (as produced by ``fit``).
        y : array-like of class labels, indexable by the values in ``indices``.
        epsilon : threshold; a point is kept when its score is strictly greater.
        Plotting : optional two-element sequence ``[x, density]``. When truthy,
            two scatter plots (LRD, then LOF) are shown with this reference
            drawn in red.

        Returns
        -------
        (overlap_lof, overlap_lrd)
            Rows of ``X`` whose LOF / LRD score exceeds ``epsilon``; each entry
            is ``None`` when no point qualifies.
        """
        # An ndarray is required for the element-wise `y == label` below; with a
        # plain list that comparison is a scalar False and the class size is 0.
        y = np.asarray(y)

        # Loop-invariant: the sorted class labels and how many samples each has.
        classes = np.unique(y)
        amount_classes = len(classes)
        class_sizes = [np.count_nonzero(y == label) for label in classes]

        final_points = []
        plt_coor = []
        estimates_lof = []
        estimates_lrd = []

        # For all points.
        for index, arr in enumerate(indices):
            # Column 0: class label of each neighbour, column 1: its distance.
            labelled = np.zeros(shape=(len(arr), 2))
            labelled[:, 0] = y[np.asarray(arr, dtype=int)]
            labelled[:, 1] = distances[index]

            # Sort by label, then cut the distance column at every label change,
            # giving one distance array per class present (in sorted label order).
            labelled = labelled[labelled[:, 0].argsort()]
            split_at = np.unique(labelled[:, 0], return_index=True)[1][1:]
            dist_groups = np.split(labelled[:, 1], split_at)

            if len(dist_groups) != amount_classes:  # If not all classes are present.
                continue  # Don't take it into account.

            max_distance_per_class = [max(group) for group in dist_groups]
            length_per_class = [len(group) for group in dist_groups]
            sum_reach_dist_per_class = [sum(group) for group in dist_groups]

            lrd_per_class = []
            lof_per_class = []

            for ind in range(amount_classes):
                if max_distance_per_class[ind] == 0 and sum_reach_dist_per_class[ind] == 0:
                    # All neighbours of this class are at distance 0 (e.g. only
                    # the point itself): the estimates are set to 0.
                    adapted_lrd_estimate = 0
                    simplifiedlof_estimate = 0
                else:
                    dim = len(X[0])
                    amount_of_points_estimate = amount_of_points(length_per_class[ind], class_sizes[ind])

                    adapted_lrd_estimate = adapted_lrd(
                        sum_reach_dist_per_class[ind], length_per_class[ind], dim=dim
                    ) * amount_of_points_estimate

                    simplifiedlof_estimate = simplifiedLOFEstimate(
                        max_distance_per_class[ind], dim=dim
                    ) * amount_of_points_estimate

                lrd_per_class.append(adapted_lrd_estimate)
                lof_per_class.append(simplifiedlof_estimate)

            final_points.append(arr[0])
            estimates_lof.append(min(lof_per_class))
            estimates_lrd.append(min(lrd_per_class))
            plt_coor.append([X[arr[0]]])

        if Plotting:
            # Same figure layout for both estimators; LRD first, then LOF.
            for title, estimates in (
                ("Density Values LRD", estimates_lrd),
                ("Density Values LOF", estimates_lof),
            ):
                plt.scatter(plt_coor, estimates, alpha=0.4)
                plt.scatter(Plotting[0], Plotting[1], alpha=0.4, c="red")
                plt.xlabel('Coordinates')
                plt.ylabel('Custom Density')
                plt.title(title)
                plt.legend(["Estimated Density", "True Density"], fontsize=8)
                plt.show()

        # Keep the indices of the points whose score is strictly above epsilon.
        final_points = np.asarray(final_points)
        final_points_lof = final_points[np.asarray(estimates_lof) > epsilon]
        final_points_lrd = final_points[np.asarray(estimates_lrd) > epsilon]

        overlap_lof = None
        overlap_lrd = None

        if len(final_points_lof) != 0:
            overlap_lof = X[list(final_points_lof)]

        if len(final_points_lrd) != 0:
            overlap_lrd = X[list(final_points_lrd)]

        return overlap_lof, overlap_lrd

    @staticmethod
    def fit_predict(X, y, epsilon):
        """Run ``fit`` with the default ``k`` and then ``predict`` without plotting."""
        distances, indices = K_Nearest_Neighbours.fit(X, y)
        return K_Nearest_Neighbours.predict(X, distances, indices, y, epsilon, Plotting=None)


In [6]:
def adapted_lrd(class_distance, length_per_class, dim):
    """Return ``length_per_class / (2 * Vd * class_distance)``.

    ``Vd`` is ``pi**(dim/2) / gamma(dim/2 + 1)``, i.e. the volume of the unit
    ball in ``dim`` dimensions. ``class_distance`` is the summed neighbour
    distance for one class and ``length_per_class`` the number of neighbours
    that sum was taken over.
    """
    half_dim = dim / 2
    unit_ball_volume = np.pi ** half_dim / gamma(half_dim + 1)
    scaled_mean_distance = class_distance * unit_ball_volume * 2 / length_per_class
    return 1 / scaled_mean_distance

def simplifiedLOFEstimate(max_distance_per_class, dim):
    """Return ``1 / (max_distance_per_class * Vd)``.

    ``Vd`` is ``pi**(dim/2) / gamma(dim/2 + 1)``, i.e. the volume of the unit
    ball in ``dim`` dimensions.
    """
    half_dim = dim / 2
    unit_ball_volume = np.pi ** half_dim / gamma(half_dim + 1)
    scaled_radius = max_distance_per_class * unit_ball_volume
    return 1 / scaled_radius

def amount_of_points(amount_point_class, total_points):
    """Return the fraction ``amount_point_class / total_points`` (true division)."""
    fraction = amount_point_class / total_points
    return fraction

In [30]:
def plot_density_estimates():
    """Plot estimated vs. true overlap density for two 1-D normal samples.

    Seeds NumPy's global RNG with 0, draws 1000 points each from N(0, 1) and
    N(2, 1) (labels +1 / -1), computes 100-nearest-neighbour information and
    calls ``K_Nearest_Neighbours.predict`` with ``Plotting`` set, so the
    estimates are drawn against ``min(pdf1, pdf2)``. Returns nothing.
    """
    np.random.seed(0)

    threshold = 0
    n_per_class = 1000
    loc_a, sd_a = 0, 1
    loc_b, sd_b = 2, 1
    n_neighbours = 100

    sample_a = np.random.normal(loc_a, sd_a, n_per_class)
    sample_b = np.random.normal(loc_b, sd_b, n_per_class)
    X = np.concatenate([sample_a, sample_b]).reshape(-1, 1)
    y = np.concatenate([np.ones(len(sample_a)), -np.ones(len(sample_b))])

    distances, indices = K_Nearest_Neighbours.fit(X, y, n_neighbours)

    # True class-conditional densities evaluated at every sample.
    pdf_a = stats.norm.pdf(x=X, loc=loc_a, scale=sd_a)
    pdf_b = stats.norm.pdf(x=X, loc=loc_b, scale=sd_b)

    # Samples where both densities exceed the threshold, and the reference
    # curve (point-wise minimum of the two densities) at those samples.
    in_both = (pdf_a > threshold) & (pdf_b > threshold)
    reference_x = X[in_both]
    reference_density = np.minimum(pdf_a[in_both], pdf_b[in_both])

    K_Nearest_Neighbours.predict(
        X, distances, indices, y, threshold,
        Plotting=[reference_x, reference_density],
    )

In [31]:
def tunning_k_plots_subplot_density_estimates():
    """Show how the density estimates change as the neighbourhood size grows.

    Seeds NumPy's global RNG once with 0, then for 200 steps:
    draws a fresh sample of 1000 points each from N(0, 1) and N(2, 1)
    (labels +1 / -1), sets ``k = round(C_0 * (len(X)/2) ** (4/5))`` and calls
    ``K_Nearest_Neighbours.predict`` with ``Plotting`` set so the estimates
    are drawn against ``min(pdf1, pdf2)``. ``C_0`` starts at 0.01 and is
    increased by 0.01 after every step. Returns nothing.

    ``stats`` is taken from the notebook's import cell; the unused
    function-local ``import scipy`` of the earlier version has been removed.
    """
    n = 1000
    np.random.seed(0)
    C_0 = 0.01
    epsilon = 0

    # Distribution parameters are constant across the sweep.
    mean1 = 0  # Change those values to have the other plot.
    mean2 = 2
    scale1 = 1
    scale2 = 1

    for _ in range(200):
        x1 = np.random.normal(mean1, scale1, n)
        x2 = np.random.normal(mean2, scale2, n)
        X = np.concatenate([x1, x2]).reshape(-1, 1)
        y = np.concatenate([np.ones(len(x1)), -np.ones(len(x2))])

        k = round(C_0 * (len(X) / 2) ** (4 / 5))
        distances, indices = K_Nearest_Neighbours.fit(X, y, k)

        f1_distribution = stats.norm.pdf(x=X, loc=mean1, scale=scale1)
        f2_distribution = stats.norm.pdf(x=X, loc=mean2, scale=scale2)
        class_ov = (f1_distribution > epsilon) & (f2_distribution > epsilon)
        X_coor = X[class_ov]
        true_density = np.minimum(f1_distribution[class_ov], f2_distribution[class_ov])

        K_Nearest_Neighbours.predict(
            X, distances, indices, y, epsilon,
            Plotting=[X_coor, true_density],
        )
        C_0 = C_0 + 0.01


In [70]:
def tunning_k_plots(X, y, epsilon, n, C_0, mean1, scale1, mean2, scale2):
    """Score one (sample, C_0) configuration by comparing estimated and true overlap.

    The true overlap is the set of samples where both uniform densities
    (``loc=mean*``, ``scale=scale* - mean*``) exceed ``epsilon``. The estimated
    overlaps come from ``K_Nearest_Neighbours.predict`` (LOF and LRD variants).

    Parameters
    ----------
    X, y : samples (one column, as the per-sample comparison below needs a
        single value per row) and their labels.
    epsilon : density threshold, used for both the true and estimated overlap.
    n : not used by this function; kept for signature compatibility.
    C_0 : multiplier in ``k = round(C_0 * (len(X)/2) ** (4/5))`` (at least 1).
    mean1, scale1, mean2, scale2 : lower / upper bounds of the two uniform
        distributions.

    Returns
    -------
    The mean of four values: ``IOU(...)`` on the intervals and the first
    element of ``iou_acc_multiple_dim(...)`` on the points, each for LOF and
    LRD. Both helpers are defined outside this cell.
    """
    # Guard against k rounding down to 0 for small samples / small C_0.
    k = max(1, round(C_0 * (len(X) / 2) ** (4 / 5)))
    distances, indices = K_Nearest_Neighbours.fit(X, y, k)

    f1_distribution = stats.uniform.pdf(x=X, loc=mean1, scale=scale1 - mean1)
    f2_distribution = stats.uniform.pdf(x=X, loc=mean2, scale=scale2 - mean2)
    class_ov = (f1_distribution > epsilon) & (f2_distribution > epsilon)
    overlap = X[class_ov]

    overlap_lof, overlap_lrd = K_Nearest_Neighbours.predict(X, distances, indices, y, epsilon)

    if len(overlap) == 0:  # If no true overlap
        true_interval = (0, 0)
        y_true = [0] * len(X)
    else:  # If overlap
        lower, upper = overlap.min(), overlap.max()
        true_interval = (lower, upper)
        # A sample is a true overlap point when it lies strictly inside the interval.
        y_true = [1 if (data > lower and data < upper) else 0 for data in X]

    # Both estimators are handled the same way: with no estimated overlap the
    # interval is (0, 0) and the point set falls back to the placeholder [0];
    # otherwise the real estimated points are kept for the point-wise score.
    if overlap_lof is None:  # If no estimated overlap
        estimated_interval_lof = (0, 0)
        overlap_lof = [0]
    else:  # If estimated overlap
        estimated_interval_lof = (overlap_lof.min(), overlap_lof.max())

    if overlap_lrd is None:  # If no estimated overlap
        estimated_interval_lrd = (0, 0)
        overlap_lrd = [0]
    else:  # If estimated overlap
        estimated_interval_lrd = (overlap_lrd.min(), overlap_lrd.max())

    IOU_area_lof = IOU(estimated_interval_lof, true_interval)
    IOU_point_lof = iou_acc_multiple_dim(X, overlap_lof, y_true)[0]
    IOU_area_lrd = IOU(estimated_interval_lrd, true_interval)
    IOU_point_lrd = iou_acc_multiple_dim(X, overlap_lrd, y_true)[0]

    return (IOU_area_lof + IOU_point_lof + IOU_area_lrd + IOU_point_lrd) / 4

In [73]:
def make_contour_plot(n_trials=500):
    """Run an Optuna grid study of ``tunning_k_plots`` over sample size and C_0.

    Each trial seeds NumPy's global RNG with 0, draws ``n`` points each from
    U(0, 3) and U(1, 2) (labels +1 / -1) and returns the score of
    ``tunning_k_plots`` with ``epsilon = 0.1``.

    Parameters
    ----------
    n_trials : int, default 500
        Number of grid points to evaluate (passed to ``study.optimize``).

    Returns
    -------
    The ``optuna`` study, e.g. for ``optuna.visualization.plot_contour``.
    """
    def objective(trial):
        n = trial.suggest_int("nvalues", 50, 2000)
        C_0 = trial.suggest_float("C_0 values", 0.01, 2)
        np.random.seed(0)
        mean1 = 0
        mean2 = 1
        scale1 = 3
        scale2 = 2
        epsilon = 0.1
        x1 = np.random.uniform(mean1, scale1, n)
        x2 = np.random.uniform(mean2, scale2, n)
        X = np.concatenate([x1, x2]).reshape(-1, 1)
        y = np.concatenate([np.ones(len(x1)), -np.ones(len(x2))])

        return tunning_k_plots(X, y, epsilon, n, C_0, mean1, scale1, mean2, scale2)

    # GridSampler expects plain Python values (str/int/float/bool/None);
    # .tolist() converts the NumPy scalars produced by np.arange.
    search_space = {
        'nvalues': np.arange(50, 2000, 50).tolist(),
        'C_0 values': np.arange(0.05, 2, 0.01).tolist(),
    }

    # NOTE(review): no direction is given, so Optuna's default applies, while
    # the objective is an averaged IoU-style score. An earlier version had
    # `direction = "maximize"` commented out - confirm which is intended before
    # relying on study.best_trial / best_value.
    study = optuna.create_study(sampler=optuna.samplers.GridSampler(search_space))
    study.optimize(objective, n_trials=n_trials)
    return study

In [2]:
# study = make_contour_plot()
# optuna.visualization.plot_contour(study)