In [None]:
# Import libraries required for EDA
import pandas as pd
import numpy as np
import math
import scipy.stats as st
import os
import warnings
import time

# Import Scikit-learn required libraries
from sklearn.neighbors import KernelDensity

# Import file system libraries
from pathlib import Path

# Import matplotlib library and setup environment for plots
%matplotlib inline
%config InlineBackend.figure_format='retina'
 

# Set pandas DataFrame visualization parameters (None = no truncation of
# columns/rows when a DataFrame is displayed).
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# Set rendering parameters to use TeX font if not working on Juno app.
# The '/private/var/' fragment in the working directory is used as the marker
# for the Juno app; anywhere else the TeX-based rendering is enabled.
if '/private/var/' not in os.getcwd():
    # `rc` was previously used without being imported (NameError on a fresh
    # kernel); import it right where it is needed.
    from matplotlib import rc

    rc('font', **{'family': 'serif', 'serif': ['Computer Modern'], 'size': 11})
    rc('text', usetex=True)

# Get current working directory path for the tool parent folder and print it.
# The path is truncated right after the first occurrence of `parent_folder`.
parent_folder = 'Tool'
current_dir = os.getcwd()
parent_pos = current_dir.find(parent_folder)
if parent_pos < 0:
    # Same exception type as str.index() raised before, with an actionable message.
    raise ValueError("Parent folder '%s' not found in current working directory: %s"
                     % (parent_folder, current_dir))
cwd = str(Path(current_dir[:parent_pos + len(parent_folder)]))
print('Parent working directory: %s' % cwd)

In [None]:
import sys
sys.path.append("..")

from library.irplib import main, irpplots

In [None]:
# Read the training dataset from <cwd>/data/esa-challenge/train_data.csv
train_csv = os.path.join(cwd, 'data', 'esa-challenge', 'train_data.csv')
df = pd.read_csv(train_csv, sep=',', header=0, index_col=None,
                 skipinitialspace=False)

# Order rows by event_id (ascending) and, within each event, by time_to_tca
# (descending); ignore_index=True rebuilds a clean 0..n-1 index.
df = df.sort_values(by=['event_id', 'time_to_tca'],
                    axis='index',
                    ascending=[True, False],
                    ignore_index=True)

# Keep only the last row of every event_id; given the ordering above this is
# the row with the smallest time_to_tca of each event (its last CDM).
df_lastCDM = df.drop_duplicates(subset='event_id', keep='last')

# Show the first rows of the final-CDM dataframe.
df_lastCDM.head(6)

In [None]:
import statsmodels
import statsmodels.api as sm

# NOTE(review): `data` must already exist in the kernel namespace when this
# cell runs -- confirm it is defined in an earlier cell.

# Estimator settings: `efficient=True` with `n_sub` set to one tenth of the
# number of samples in `data` (integer division).
kde_n_sub = len(data) // 10
settings = statsmodels.nonparametric.kernel_density.EstimatorSettings(
    efficient=True, n_sub=kde_n_sub
)

# Build the multivariate KDE with var_type='c' and bandwidth method 'cv_ml',
# using the estimator settings defined above.
dens_u = sm.nonparametric.KDEMultivariate(
    data=data,
    var_type='c',
    bw='cv_ml',
    defaults=settings,
)

# Display the bandwidth selected by the estimator.
dens_u.bw


## 3. - [Kernel Estimator and Bandwidth Selection for Density and its Derivatives](https://cran.microsoft.com/snapshot/2015-07-29/web/packages/kedd/vignettes/kedd.pdf)

### 3.2.1. - Maximum likelihood cross-validation (MLCV)

[Kernel Estimator and Bandwidth Selection for Density and its Derivatives](https://arxiv.org/pdf/2012.06102.pdf)


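$MLCV(b_w) = \left( \frac{1}{n}\cdot\sum_{i=1}^{n}\log \left[ \sum_{j\neq i} K\left( \frac{X_j-X_i}{b_w}\right)\right] - \log\left[(n-1)\cdot b_w\right] \right)$

where $X_1, \dots, X_n$ are the $n$ observations, $K$ is the kernel function and $b_w$ is the bandwidth. The selected bandwidth is the value of $b_w$ that maximizes $MLCV(b_w)$.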

In [None]:
def mlcv(data, bw, print_log=True):
    """Maximum likelihood cross-validation (MLCV) score of a bandwidth.

    Evaluates, with a Gaussian kernel K:

        MLCV(bw) = 1/n * sum_i log( sum_{j != i} K((X_j - X_i) / bw) )
                   - log((n - 1) * bw)

    Parameters
    ----------
    data : array-like
        Observations. Any sequence accepted by ``np.asarray`` (list, tuple,
        NumPy array, pandas Series); values are always accessed by position.
    bw : float
        Bandwidth; must be strictly positive.
    print_log : bool, optional
        When truthy, print the bandwidth and its MLCV score.

    Returns
    -------
    numpy.float64
        The MLCV score. It is ``-inf`` when, for some observation, every
        kernel term underflows to zero (bandwidth far too small).

    Raises
    ------
    ValueError
        If fewer than two observations are given or ``bw`` is not positive.
    """

    # Coerce to a float array so that plain lists/tuples work and pandas
    # objects are indexed by position rather than by label.
    samples = np.asarray(data, dtype=float)

    # Calculate number of data points
    n = len(samples)
    if n < 2:
        raise ValueError('mlcv requires at least two data points (got %d).' % n)
    if not bw > 0:
        raise ValueError('mlcv requires a strictly positive bandwidth (got %r).' % (bw,))

    # Normalisation constant of the Gaussian kernel (hoisted out of the loop)
    norm = 1/(np.sqrt(2*np.pi))

    # Initialize output with the second term of the equation
    output = -np.log((n-1)*bw)

    # Iterate over all the values of the data to compute the argument of the
    # first term's logarithm
    for i in range(n):

        # Leave-one-out sample: every observation except the i-th one
        others = np.delete(samples, i, axis=0)

        # Gaussian kernel evaluated on the scaled differences
        log_arg = norm*np.exp(-0.5*((others - samples[i])/bw)**2)

        # Add this observation's contribution to the output
        output = output + 1/n*np.log(np.sum(log_arg))

    if print_log:
        print('\t - Bandwidth = %.4e \t MLCV = %.4e' % (bw, output))

    return output

def argmax_mlcv(data, bwlim, tol=1e-3, max_iter=50, print_log=True):
    """Compute bandwidth that maximizes MLCV.

    Iterative grid search: at every iteration ``mlcv`` is evaluated on 4
    equally spaced bandwidths, the best one is kept, and a new, narrower grid
    is built around it. The search stops when two consecutive best bandwidths
    differ by at most ``tol`` or after ``max_iter`` iterations.

    Parameters
    ----------
    data : array-like
        Observations, forwarded unchanged to ``mlcv``.
    bwlim : sequence of two floats
        Lower and upper limits of the initial bandwidth grid.
    tol : float, optional
        Convergence tolerance on the change of the best bandwidth.
    max_iter : int, optional
        Maximum number of grid refinements.
    print_log : bool or str, optional
        ``False`` prints nothing; ``True`` or ``'full'`` prints the iteration
        summary and the score of every bandwidth evaluated; any other value
        prints the iteration summary only.

    Returns
    -------
    tuple
        ``(best_bw, max_mlcv)``: the best bandwidth of the last grid evaluated
        and its MLCV score. ``(0, -inf)`` if no iteration is run.
    """

    # Resolve the two verbosity levels once
    show_summary = print_log != False
    show_scores = (print_log == True or print_log == 'full')

    # Define initial bandwidths, step, best bandwidth and maximum MLCV
    bandwidths, step = np.linspace(bwlim[0], bwlim[1], num=4, endpoint=True, retstep=True)

    best_bw = 0
    max_mlcv = -np.inf

    for n_iter in range(max_iter):

        if show_summary:
            print('\nComputing MLCV for the bandwidths in range: [%.5f, %.5f] (iter = %d)' %
                  (bandwidths[0], bandwidths[-1], n_iter))

        # Compute MLCV for every bandwidth
        mlcv_array = np.array([mlcv(data, bw, show_scores) for bw in bandwidths], dtype=float)

        # Get the maximum local MLCV and its corresponding bandwidth. argmax
        # returns the first maximum, and does not rely on `mlcv` returning
        # NumPy scalars as the former list == scalar mask did.
        best_idx = int(np.argmax(mlcv_array))
        max_mlcv = mlcv_array[best_idx]
        best_local_bw = bandwidths[best_idx]

        # Change of the best bandwidth with respect to the previous iteration
        precision = abs(best_bw - best_local_bw)
        if show_summary:
            print((" -> Best local bandwidth = {:.4e}\n"
                   "    Max. local MLCV      = {:.4e}\n"
                   "    Step                 = {:.4e}\n"
                   "    Precission           = {:.4e}")
                  .format(best_local_bw, max_mlcv, step, precision))

        # Adopt the new best bandwidth BEFORE testing convergence, so that the
        # returned bandwidth always corresponds to the returned MLCV.
        best_bw = best_local_bw

        # Stop loop if precision required is met
        if precision <= tol:
            break

        # Define new bandwidth array for another iteration; the lower limit is
        # kept at or above half the best bandwidth.
        bwlim = [max(best_bw - step, best_bw*0.5), best_bw + step]
        bandwidths, step = np.linspace(bwlim[0], bwlim[1], num=4, endpoint=True, retstep=True)

    return best_bw, max_mlcv



In [None]:
# Limits of the initial bandwidth grid explored by the MLCV search
bwlim = [0.1, 2.0]

# Run the search and unpack the (bandwidth, MLCV) pair it returns
mlcv_search = argmax_mlcv(data, bwlim, tol=1e-3, max_iter=50, print_log=True)
best_bw, max_mlcv = mlcv_search

print("Best bandwidth = %.4e / Max. MLCV = %.4e" % mlcv_search)

# NOTE(review): `data` and `print_kde` must already be defined in the kernel
# namespace when this cell runs -- confirm they come from earlier cells.
print_kde(data, best_bw)