# Required modules

In [1]:
import os
import sys
import numpy as np
import pandas as pd
import pickle
import random

In [2]:
# Make the parent of the current working directory importable, so that
# packages living one level up can be imported from this notebook.
module_path = os.path.abspath(os.pardir)
already_registered = module_path in sys.path
if not already_registered:
    sys.path.append(module_path)

In [3]:
from utilities.data_downloader import train_val_test_downloader, choose_one_column

In [4]:
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KernelDensity
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import KernelPCA
from sklearn.pipeline import Pipeline

In [5]:
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
# Apply the 'ggplot' style sheet and a monospace font family to every
# matplotlib figure created after this cell.
plt.style.use('ggplot')
plt.rcParams['font.family']='monospace'

In [6]:
# Seed both Python's and NumPy's global random number generators with the
# same fixed value so that reruns of the notebook start from the same state.
random.seed(42)
np.random.seed(42)

# Analysis Pipeline

In [None]:
def create_KernelPCA_pipeline(n_components:int=3,
                              kernel:str='linear',
                              kernel_kwargs:dict=None
                             )->Pipeline:
    """
    Build an (unfitted) imputation -> scaling -> KernelPCA pipeline.

    Missing values (NaN) are replaced by the per-column median, the
    data is then scaled with RobustScaler (to reduce the impact of
    outliers) and finally projected with KernelPCA. The KernelPCA
    step always uses ``random_state=42``.

    Parameters
    ----------
    n_components : int, default=3
        Number of principal components to keep.
    kernel : str, default='linear'
        Name of the kernel passed to KernelPCA.
    kernel_kwargs : dict, default=None
        Extra keyword arguments forwarded to KernelPCA
        (e.g. ``{'degree': 3}``). ``None`` means no extra arguments.

    Returns
    -------
    pipe : Pipeline
        Pipeline with the steps 'imputer', 'scaler' and 'KernelPCA'.
    """
    extra_options = kernel_kwargs if kernel_kwargs is not None else {}

    median_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
    robust_scaler = RobustScaler()
    kernel_pca = KernelPCA(
        n_components=n_components,
        kernel=kernel,
        random_state=42,
        **extra_options
    )

    steps = [
        ('imputer', median_imputer),
        ('scaler', robust_scaler),
        ('KernelPCA', kernel_pca),
    ]
    return Pipeline(steps)

In [None]:
def apply_KernelPCA_pipeline(pipeline:Pipeline,
                             train:pd.DataFrame,
                             val:pd.DataFrame,
                             test:pd.DataFrame)->tuple:
    """
    Fit the pipeline on `train`, embed all three splits and score them
    with a Gaussian kernel density estimate fitted on the embedded
    train split.

    The KDE bandwidth follows the rule of thumb
    ``1.06 * sigma * n_samples ** (-1/5)``, where ``sigma`` is the mean
    of the per-component standard deviations of the embedded train data.

    Parameters
    ----------
    pipeline : Pipeline
        KernelPCA pipeline to be fitted (it is fitted in place).
    train, val, test : pd.DataFrame
        The three splits of the studied dataset.

    Returns
    -------
    pipeline : Pipeline
        The fitted pipeline.
    df : pd.DataFrame
        Train, val and test rows stacked in that order, keeping their
        original indices. Columns are the components ('0', '1', ...),
        'KDE_score' (log-density returned by ``score_samples``) and
        'sample' ('train', 'val' or 'test').
    """
    embedded_train = pipeline.fit_transform(train)
    n_rows, n_dims = embedded_train.shape

    # Rule-of-thumb bandwidth based on the average spread of the components
    sigma = np.mean(np.std(embedded_train, axis=0)).item()
    bandwidth = 1.06 * sigma * n_rows ** (-1/5)
    density = KernelDensity(kernel='gaussian', bandwidth=bandwidth)
    density.fit(embedded_train)

    # The pipeline is only fitted on train; val and test are just transformed
    splits = {
        'train': (train.index, embedded_train),
        'val': (val.index, pipeline.transform(val)),
        'test': (test.index, pipeline.transform(test)),
    }

    column_names = [str(i) for i in range(n_dims)] + ['KDE_score']
    frames = []
    for label, (index, embedded) in splits.items():
        log_density = density.score_samples(embedded)
        frame = pd.DataFrame(
            data=np.column_stack((embedded, log_density)),
            index=index,
            columns=column_names,
        )
        frame['sample'] = label
        frames.append(frame)

    df = pd.concat(frames, axis=0, ignore_index=False)
    return pipeline, df

# Visualization Utility for 2D and 3D

In [None]:
def visualize_KernelPCA(
    df:pd.DataFrame,
    title:str='Scatter plot',
    savedir:str='./tmp',
    show:bool=False)->None:

    """
    Visualization Utility.
    Only 2D or 3D KernelPCA can be trivially visualized.

    Draws a scatter plot of the components coloured by the sample
    group ('train', 'val', 'test') and saves it as
    ``{savedir}/{title}.pdf``.

    Parameters
    ----------
    df : pd.DataFrame
        Transformed dataframe returned by
        `apply_KernelPCA_pipeline(...)`: component columns
        '0', '1' (and '2'), plus 'KDE_score' and 'sample'.
    title : str, default='Scatter plot'
        Title of the resulting figure; also used as the file name.
    savedir : str, default='./tmp'
        The directory to save the plot in. It is created if missing.
    show : bool, default=False
        Whether to show the plot. If False the figure is closed
        after saving.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If the number of components is not 2 or 3, or if the 'sample'
        column contains a label other than 'train', 'val' or 'test'.
    """

    n_components = len(
        df.drop(['KDE_score', 'sample'], axis=1).columns
    )
    if n_components not in (2, 3):
        raise ValueError(f'Unable to visualize {n_components}D data')

    # Numeric colour codes for the sample groups; `map` (unlike `replace`)
    # always yields a float column and turns unknown labels into NaN.
    sample_codes = {'train': 0.0, 'val': 0.5, 'test': 1.0}
    c = df['sample'].map(sample_codes).to_numpy(dtype=float)
    if np.isnan(c).any():
        raise ValueError(
            f"'sample' column must only contain {list(sample_codes)}"
        )
    colors = ListedColormap(['#636EFA', '#EF553B', '#00CC96'])

    coordinates = [df[str(i)].values for i in range(n_components)]

    fig = plt.figure()
    if n_components == 3:
        ax = fig.add_subplot(projection='3d')
    else:
        ax = fig.add_subplot()
    # vmin/vmax pin the colour scale, so each group keeps its own colour
    # even when one of the groups is absent from `df`.
    scatter = ax.scatter(
        *coordinates,
        c=c, cmap=colors, vmin=0.0, vmax=1.0, s=32, marker='o')

    # legend_elements() yields one handle per distinct code (ascending),
    # so only label the groups that are actually present.
    present_labels = [
        label for label, code in sample_codes.items() if np.any(c == code)
    ]
    ax.set_title(title)
    ax.legend(handles=scatter.legend_elements()[0],
              labels=present_labels,
              bbox_to_anchor=(1.1, 0.5), loc='center left')

    if savedir:
        os.makedirs(savedir, exist_ok=True)
    fig.savefig(f'{savedir}/{title}.pdf', format='pdf', bbox_inches='tight')
    if show:
        plt.show()
    else:
        plt.close(fig)

# Scan over parameters

In [None]:
def scan_KernelPCA(
    train:pd.DataFrame,
    val:pd.DataFrame,
    test:pd.DataFrame,
    n_components:list=[2, 3],
    kernels:list=['linear', 'poly', 'rbf', 'sigmoid', 'cosine'],
    degree:list=[2, 3, 4],
    pvalue:float=0.01,
    savedir:str='./KernelPCA_tmp'):
    """
    Loop over KernelPCA parameters.

    For every combination of `n_components` and `kernels` (and every
    `degree` for the 'poly' kernel) a pipeline is created, fitted on
    `train` and applied to all three splits. Samples whose KDE_score is
    at or below the chosen quantile are reported as outliers; the
    remaining samples (inliers) are plotted.

    Parameters
    ----------
    train, val, test : pd.DataFrame
        The input dataset.
    n_components : list, default=[2, 3]
        Number of principal components to be used.
    kernels : list, default=['linear', 'poly', 'rbf',
                             'sigmoid', 'cosine']
        Kernels to be used.
    degree : list, default=[2, 3, 4]
        Degree of polynomial kernel to be used.
    pvalue : float, default=0.01
        Threshold to define anomalous samples by their KDE_score.
        Note that pvalues are enlarged manually for polynomial
        kernels (the quantile used is ``degree * pvalue``) since
        the distances have more significant range.
    savedir : str, default='./KernelPCA_tmp'
        The directory to save the plots and the result in.
        WARNING: if it already exists, it is deleted with all of its
        contents and recreated.

    Returns
    -------
    dict
        Maps a configuration title (e.g. 'n_components=2, kernel=rbf')
        to a dict with the keys 'pipeline' (fitted pipeline),
        'transformed' (full transformed dataframe), 'outliers'
        (rows at or below the threshold) and 'most_anomalous'
        (index label, as str, of the lowest KDE_score).
        The same dict is pickled to ``{savedir}/result.pickle``.
    """
    # Local import: shutil is not among the notebook's top-level imports,
    # so the directory clean-up would otherwise fail with a NameError.
    import shutil

    # Start from an empty output directory
    if os.path.isdir(savedir):
        shutil.rmtree(savedir)
    os.makedirs(savedir)

    # The list defaults in the signature are only iterated, never mutated.
    result = dict()
    for n in n_components:
        for ker in kernels:
            # Each variant: (extra KernelPCA kwargs, title, quantile level)
            if ker == 'poly':
                variants = [
                    (
                        {'degree': deg},
                        f'n_components={n}, kernel={ker}, degree={deg}',
                        deg * pvalue,
                    )
                    for deg in degree
                ]
            else:
                variants = [
                    (None, f'n_components={n}, kernel={ker}', pvalue)
                ]

            for kernel_kwargs, title, quantile in variants:
                pipeline = create_KernelPCA_pipeline(
                    n, ker, kernel_kwargs=kernel_kwargs
                )
                pipeline, df = apply_KernelPCA_pipeline(
                    pipeline, train, val, test
                )
                threshold = np.quantile(df['KDE_score'].values, q=quantile)
                outliers, inliers = (
                    df[df['KDE_score'] <= threshold],
                    df[df['KDE_score'] > threshold]
                )
                # Positional argmin, then translate back to the index label
                most_anomalous = str(
                    outliers.index[outliers['KDE_score'].values.argmin()]
                )
                # Only inliers are plotted so outliers do not squash the axes
                visualize_KernelPCA(inliers, title=title, savedir=savedir)
                result[title] = {
                    'pipeline' : pipeline,
                    'transformed': df,
                    'outliers': outliers,
                    'most_anomalous': most_anomalous
                }

    with open(f'{savedir}/result.pickle', 'wb') as f:
        pickle.dump(result, f)
    print(f"All results are stored in '{savedir}'.")
    return result

# Extracted Features Analysis

In [7]:
# Load the train / val / test splits of the 'features' dataset.
# NOTE(review): the loader lives in utilities.data_downloader; the cell
# output below reports 810 / 174 / 174 entries.
train, val, test = train_val_test_downloader('features')

Datasets downloaded
 - train : 810 entries
 - val   : 174 entries
 - test  : 174 entries


In [None]:
# Run the KernelPCA scan with default parameters on the features dataset;
# plots and result.pickle are written to ./KernelPCA_features
# (an existing directory with that name is wiped first).
result = scan_KernelPCA(train, val, test, savedir='./KernelPCA_features')

In [None]:
# Summarise the scan: one row per configuration with the index label of
# its lowest-KDE_score sample. A comprehension is used on purpose so that
# no loop variable leaks into the notebook namespace (a loop variable
# named `val` would overwrite the validation DataFrame).
most_anomalous = pd.DataFrame.from_dict(
    {title: entry['most_anomalous'] for title, entry in result.items()},
    orient='index', columns=['most_anomalous'])
most_anomalous

# Rebinned Lightcurves Analysis

In [8]:
# Load the 'interp' splits and pass each one through
# choose_one_column(..., 'lgRate').
# NOTE(review): presumably this keeps only the lgRate column of the
# rebinned lightcurves — confirm in utilities.data_downloader.
train, val, test = (
    choose_one_column(split, 'lgRate')
    for split in train_val_test_downloader('interp')
)

Datasets downloaded
 - train : 810 entries
 - val   : 174 entries
 - test  : 174 entries


In [None]:
# Run the KernelPCA scan with default parameters on the rebinned
# lightcurves; plots and result.pickle are written to ./KernelPCA_interp
# (an existing directory with that name is wiped first).
result = scan_KernelPCA(train, val, test, savedir='./KernelPCA_interp')

In [None]:
# Summarise the scan: one row per configuration with the index label of
# its lowest-KDE_score sample. A comprehension is used on purpose so that
# no loop variable leaks into the notebook namespace (a loop variable
# named `val` would overwrite the validation DataFrame).
most_anomalous = pd.DataFrame.from_dict(
    {title: entry['most_anomalous'] for title, entry in result.items()},
    orient='index', columns=['most_anomalous'])
most_anomalous