# EVALUATION OF ESTIMATIONS

In [1]:
from iplane.interpolator import Interpolator  # type: ignore
from iplane.random_mixture_collection import PCollectionMixture, DCollectionMixture  # type: ignore
from iplane.random_mixture import RandomMixture  # type: ignore
from iplane.random_empirical import RandomEmpirical  # type: ignore
import iplane.constants as cn  # type: ignore

from collections import namedtuple
from matplotlib import pyplot as plt  # type: ignore
import pandas as pd
import numpy as np
from scipy.stats import norm, multivariate_normal # type: ignore

# Accuracy of Empirical Distributions

The accuracy of an empirical estimation depends on:
1. Number of samples used to construct the empirical distribution
2. Number of dimensions of the variate
3. How samples are selected to assess accuracy
4. For a Gaussian, the covariance matrix

In [47]:
# Record for one evaluation run: the settings that were used
# together with the accuracy statistics that were measured.
EvaluationResult = namedtuple(
    "EvaluationResult",
    (
        "num_sample",
        "num_dimension",
        "variance",
        "covariance",
        "mean_absolute_error",
        "fraction_evaluated",
        "sampling_std",
    ),
)
def evaluateEmpirical(num_sample:int=100,
                      num_dimension:int=2,
                      variance:float=4,
                      covariance:float=0,
                      sampling_std:float=3,
                      num_iteration:int=100,
                     ) -> EvaluationResult:
    """
    Measures how well an interpolated empirical CDF matches the analytic normal CDF.

    Samples are generated from a single-component mixture with zero mean and a
    covariance matrix that has `variance` on the diagonal and `covariance`
    everywhere else. An empirical CDF is built from the samples and interpolated
    at randomly chosen points; each interpolated value is compared with the
    exact CDF from scipy.

    Args:
        num_sample: number of samples used to build the empirical distribution
        num_dimension: number of dimensions of the variate
        variance: value placed on the diagonal of the covariance matrix
        covariance: value placed off the diagonal of the covariance matrix
        sampling_std: each coordinate of an evaluation point is drawn uniformly
            from [-sampling_std, sampling_std]. Despite the name, the value is
            used as-is and is not multiplied by the standard deviation.
        num_iteration: number of evaluation points that are attempted

    Returns:
        EvaluationResult
            mean_absolute_error: mean of |exact CDF - interpolated CDF| over the
                points that were evaluated (nan if none were evaluated)
            fraction_evaluated: fraction of the attempted points that were within
                the interpolator's range and produced a non-nan prediction

    Raises:
        ValueError: num_iteration is not positive
    """
    if num_iteration <= 0:
        raise ValueError("num_iteration must be a positive integer.")
    # Initializations
    mean_arr = np.zeros(num_dimension)
    # The matrix must be float. Building it from an integer `covariance` (the default
    # is 0) produced an integer array, and fill_diagonal then truncated a fractional
    # `variance` (e.g., 0.5 became 0).
    covariance_arr = np.full((num_dimension, num_dimension), covariance, dtype=float)
    np.fill_diagonal(covariance_arr, variance)
    pcollection = PCollectionMixture(
        mean_arr=np.array([mean_arr]),
        covariance_arr=np.array([covariance_arr]),
        weight_arr=np.array([1.0]),
    )
    random_mixture = RandomMixture()
    random_empirical = RandomEmpirical()
    sample_arr = random_mixture.generateSample(pcollection, num_sample=num_sample)
    # The return value is not used. The call is kept in case makeCDF relies on
    # state that it sets -- TODO confirm and remove if it does not.
    _ = random_empirical.estimatePCollection(sample_arr)
    cdf = random_empirical.makeCDF(sample_arr)
    #
    interpolator = Interpolator(variate_arr=cdf.variate_arr, sample_arr=cdf.cdf_arr,
            is_normalize=True, max_distance=1, size_interpolation_set=5)
    errors:list = []
    for _ in range(num_iteration):
        point = np.random.uniform(-sampling_std, sampling_std, (num_dimension,))
        if not interpolator.isWithinRange(point):
            continue
        if num_dimension == 1:
            # Pass the scalar coordinate so that the probability is a scalar
            # rather than a length-1 array.
            probability = norm.cdf(point[0], loc=0, scale=variance**0.5)
        else:
            probability = multivariate_normal.cdf(point, mean=mean_arr, cov=covariance_arr)  # type: ignore
        estimate = interpolator.predict(point)[0]
        if np.isnan(estimate):
            continue
        errors.append(abs(probability - estimate))
    # Avoid numpy's "mean of empty slice" warning when no point could be evaluated.
    mean_absolute_error = np.mean(errors) if errors else np.nan
    return EvaluationResult(
        num_sample=num_sample,
        num_dimension=num_dimension,
        variance=variance,
        covariance=covariance,
        mean_absolute_error=mean_absolute_error,
        fraction_evaluated=len(errors)/num_iteration,
        sampling_std=sampling_std,
    )

# Tests: a single run with a 5-dimensional variate and unit variance
evaluateEmpirical(
    num_sample=100,
    num_dimension=5,
    variance=1,
    sampling_std=1,
)

EvaluationResult(num_sample=100, num_dimension=5, variance=1, covariance=0, mean_absolute_error=np.float64(0.013598213469307745), fraction_evaluated=0.91, sampling_std=1)

Evaluations
1. Univariate, variance = 1. Heatmap with x, y = sample size, sampling_std. Color is fraction of successes, mean absolute error
2. Do the same for multivariates: 2, 4, 8, 16 dimensions