# Required modules

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import pickle
import random

from types import MethodType

In [None]:
# Put the parent of the current working directory on the import path
# (presumably where the project's `utilities` package lives -- confirm).
# The membership test keeps re-running this cell from appending duplicates.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

In [None]:
from utilities.data_downloader import train_val_test_downloader, choose_one_column
from utilities.plots import visualize_latent

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KernelDensity
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import KernelPCA
from sklearn.pipeline import Pipeline

In [None]:
# Seed both Python's `random` module and NumPy's global generator with the
# same fixed value, so code drawing from either one repeats from run to run.
random.seed(42)
np.random.seed(42)

# Analysis Pipeline

In [None]:
def roc_auc_scoring(estimator, X, y) -> float:
    """
    ROC AUC scorer with the ``(estimator, X, y)`` call signature.

    ``estimator.score_samples(X)`` is negated before ranking: anomalous
    samples have a smaller log-likelihood, so after the sign flip a larger
    value means "more anomalous", which is the orientation expected for the
    positive class in ``y``.

    Returns the value computed by ``roc_auc_score(y, negated_scores)``.
    """
    anomaly_score = -estimator.score_samples(X)
    return roc_auc_score(y, anomaly_score)

In [None]:
def make_pipe():
    """
    Builds the (unfitted) analysis pipeline.

    Outer steps, in order:
      * 'imputer' -- replaces NaN entries with the column median;
      * 'scaler'  -- RobustScaler with default settings;
      * 'model'   -- GridSearchCV over an inner KernelPCA -> Gaussian KDE
                     pipeline, scored by ``roc_auc_scoring`` with ``cv=2``.

    NOTE(review): per the scikit-learn docs an integer ``cv`` with a
    non-classifier estimator uses plain (unstratified) KFold -- confirm that
    both label classes end up in each fold, otherwise ROC AUC is undefined.
    NOTE(review): ``gamma`` is documented as ignored by the 'cosine' kernel,
    so the cosine candidates in the grid presumably repeat each other.
    """
    # Inner estimator tuned by the grid search: projection, then density.
    latent_density = Pipeline([
        ('KernelPCA', KernelPCA(random_state=42)),
        ('KDE', KernelDensity(kernel='gaussian', bandwidth='silverman')),
    ])

    # Only the KernelPCA step is tuned; the KDE keeps its fixed settings.
    search_space = {
        'KernelPCA__n_components': (1, 2, 3),
        'KernelPCA__kernel': ('rbf', 'sigmoid', 'cosine'),
        'KernelPCA__gamma': (0.1, 0.3, 1.0, 3.0),
    }

    grid_search = GridSearchCV(
        latent_density,
        param_grid=search_space,
        scoring=roc_auc_scoring,
        cv=2,
    )

    preprocessing = [
        ('imputer', SimpleImputer(missing_values=np.nan, strategy='median')),
        ('scaler', RobustScaler()),
    ]
    return Pipeline(preprocessing + [('model', grid_search)])

In [None]:
def transform_dataset(pipe: Pipeline,
                      train: pd.DataFrame,
                      val: pd.DataFrame,
                      test: pd.DataFrame,
                      labels: pd.DataFrame) -> pd.DataFrame:
    """
    Builds one dataframe holding the transformed features of all splits.

    Each split is passed through ``pipe.transform``; the output columns are
    named ``feature_<n>`` and the split's original index is kept. A 'sample'
    column records the origin of every row ('train', 'val' or 'test'). The
    three blocks are stacked row-wise and ``labels`` is then attached
    column-wise, aligned on the index.
    """
    tagged_blocks = []
    for tag, split in (('train', train), ('val', val), ('test', test)):
        latent = pd.DataFrame(data=pipe.transform(split), index=split.index)
        latent = latent.add_prefix('feature_')
        latent['sample'] = tag
        tagged_blocks.append(latent)

    stacked = pd.concat(tagged_blocks, axis=0, ignore_index=False)
    return pd.concat((stacked, labels), axis=1, ignore_index=False)

# Extracted Features Analysis

In [None]:
# Fetch the 'features' dataset; the helper's return value is unpacked into
# three splits plus a `labels` table.
train, val, test, labels = train_val_test_downloader('features')
# Training targets: the 'FlaresFlag' column of `labels`, taken for the rows
# whose index appears in `train`.
ytrain = labels.loc[train.index, 'FlaresFlag']

In [None]:
# Build the imputer -> scaler -> grid-search pipeline and fit it on the
# training split. `ytrain` is needed because the grid search ranks its
# candidates with `roc_auc_scoring`, which compares scores to true labels.
pipe = make_pipe()
pipe.fit(train, ytrain)

In [None]:
def transform(self, dataframe: pd.DataFrame):
    """
    Projects ``dataframe`` with the first step of the best inner estimator.

    The input goes through the pipeline's first two steps (``self[0]`` then
    ``self[1]``) and then through step 0 of the grid search's
    ``best_estimator_``. The last step of ``best_estimator_`` is not applied,
    so the result is that step-0 transformer's output rather than a score.
    """
    first_step, second_step = self[0], self[1]
    projector = self._final_estimator.best_estimator_[0]

    preprocessed = second_step.transform(first_step.transform(dataframe))
    return projector.transform(preprocessed)
# Bind `transform` to this particular `pipe` object, so that
# `pipe.transform(...)` runs it with `pipe` as `self`. Only this instance is
# patched; the Pipeline class is left untouched.
pipe.transform = MethodType(transform, pipe)

In [None]:
# Transform all three splits with the fitted pipeline and join them with the
# labels into a single table (used for the latent-space plot further down).
df = transform_dataset(pipe, train, val, test, labels)

In [None]:
# Score of the best parameter combination found by the grid search, which
# sits in the pipeline's 'model' step.
pipe.named_steps['model'].best_score_

In [None]:
# Parameter combination selected by the grid search in the 'model' step.
pipe.named_steps['model'].best_params_

In [None]:
# Make sure the output directory for figures exists. A single call with
# `exist_ok=True` replaces the check-then-create pair, which could fail if
# the directory appeared between the check and the `mkdir`.
os.makedirs('./Figures', exist_ok=True)

In [None]:
# Plot the transformed samples held in `df`, with ./Figures passed as the
# save directory. What exactly is drawn and written is decided by
# `visualize_latent`, which is defined in `utilities.plots` (not shown here).
visualize_latent(df, title='best KernelPCA on LC features',
                 savedir='./Figures', show=True)

# Rebinned Lightcurves Analysis

In [None]:
# Load the 'interp' dataset and pass every returned item through
# `choose_one_column(..., 'lgRate')`. This rebinds `train`, `val` and `test`.
# NOTE(review): the 'features' call above unpacks four items, whereas exactly
# three are expected here -- confirm the helper's return arity for 'interp'.
train, val, test = (
    choose_one_column(split, 'lgRate')
    for split in train_val_test_downloader('interp')
)