In [1]:
import os
import sys
import numpy as np
import pandas as pd
import random

In [2]:
# Put the parent of the current working directory on the import path
# (presumably where the local `utilities` package lives — confirm).
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

In [3]:
from utilities.data_downloader import train_val_test_downloader, choose_one_column
from utilities.plots import plt, COLORMAP, visualize_latent
from utilities.dtw import DTWdistanceMatrix

In [4]:
from sklearn.cluster import DBSCAN
from sklearn.manifold import Isomap
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline

In [5]:
# Seed both global RNGs (stdlib `random` and legacy numpy) with the same value.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Analysis Pipeline

In [6]:
def make_pipe(eps:float, metric:str):
    """
    Build the DBSCAN clustering pipeline.

    Parameters
    ----------
    eps : float
        Value forwarded to ``DBSCAN(eps=...)``.
    metric : str
        Value forwarded to ``DBSCAN(metric=...)``. When it equals
        ``'precomputed'`` the scaling step is left out, so the input
        reaches DBSCAN unchanged.

    Returns
    -------
    Pipeline
        ``'scaler'`` (RobustScaler) followed by ``'DBSCAN'``, or just the
        ``'DBSCAN'`` step when ``metric == 'precomputed'``.
    """
    steps = []
    if metric != 'precomputed':
        steps.append(('scaler', RobustScaler()))
    steps.append(('DBSCAN', DBSCAN(eps=eps, metric=metric)))
    return Pipeline(steps)

# DTW distance on raw dataset

In [7]:
# Fetch the 'original' splits, then keep only the 'lgRate' column of each one.
train, val, test, labels = train_val_test_downloader('original')
train, val, test = (split['lgRate'] for split in (train, val, test))

Datasets downloaded
 - train  : 810 entries
 - val    : 174 entries
 - test   : 174 entries
 - labels : 1158 entries


In [8]:
def _numeric_only(entry):
    """Return the elements of ``entry`` whose type is numeric, as a float array."""
    kept = [value for value in entry
            if np.issubdtype(type(value), np.number)]
    return np.array(kept, dtype=float)


# One float array per sample, collected in train -> val -> test order.
X = [_numeric_only(entry)
     for split in (train, val, test)
     for entry in split.values]

In [None]:
# Distance matrix over all samples in X, computed by the project's DTW helper
# (presumably pairwise DTW distances — confirm in utilities.dtw).
# NOTE: slow step; the recorded progress bar runs at ~11 it/s over 1158 items.
distance_matrix = DTWdistanceMatrix(X)

 23%|█████████                               | 262/1158 [00:14<01:17, 11.51it/s]

In [None]:
# manifold representation: Isomap embedding fitted on the precomputed distances
n_components=3
isomap = Isomap(
    n_components=n_components,
    eigen_solver='dense',
    metric='precomputed',
)
isomap.fit(distance_matrix)

feature_names = ['feature_{}'.format(idx) for idx in range(n_components)]
features = pd.DataFrame(
    data=isomap.embedding_,
    index=labels.index,
    columns=feature_names,
)

In [None]:
n_samples = len(X)
# Strict lower triangle (j < i), walked row by row: each unordered pair of
# samples appears exactly once and the diagonal is excluded. Vectorised
# indexing replaces a Python double loop over every (i, j) pair.
lower_rows, lower_cols = np.tril_indices(n_samples, k=-1)
pairwise_distances = np.asarray(distance_matrix)[lower_rows, lower_cols]

In [None]:
# log10 of a zero distance is -inf, and plt.hist cannot bin a non-finite
# range, so only strictly positive distances are histogrammed.
positive_distances = pairwise_distances[pairwise_distances > 0]
plt.hist(np.log10(positive_distances), bins=32)
plt.xlabel('log10(pairwise distance)')
plt.ylabel('number of pairs')
plt.title('Distribution of pairwise distances')
plt.show()

In [None]:
# 1000 log-spaced eps candidates covering 1e-1 .. 1e3
epsilon = np.logspace(start=-1, stop=3, num=1000)

In [None]:
# For every eps: number of clusters found and number of samples marked as noise.
# DBSCAN labels noise as -1. Count it explicitly rather than assuming it is
# the first entry of np.unique(): when no sample is noise, the first entry is
# cluster 0, which would report a whole cluster as "noisy" and undercount the
# clusters by one.
cluster_counts = []
noise_counts = []

for eps in epsilon:
    pipe = make_pipe(eps=eps, metric='precomputed')
    preds = pipe.fit_predict(distance_matrix)
    is_noise = preds == -1
    cluster_counts.append(np.unique(preds[~is_noise]).size)
    noise_counts.append(int(is_noise.sum()))

# Build the arrays once instead of re-allocating with np.append per iteration.
n_classes = np.array(cluster_counts, dtype=int)
n_noisy = np.array(noise_counts, dtype=int)

In [None]:
fig, axes = plt.subplots(ncols=2, figsize=(8, 4))

# (values, colour, title) for the left and right panel respectively
panels = (
    (n_classes, COLORMAP(0.3), 'n_classes vs eps'),
    (n_noisy, COLORMAP(0.7), 'noisy samples vs eps'),
)
for ax, (values, color, title) in zip(axes, panels):
    ax.scatter(epsilon, values, marker='s', s=4, color=color)
    ax.set_xscale('log')
    ax.set_title(title)

plt.show()

In [None]:
n_scan = 1000
# eps grid for scoring: n_scan log-spaced values in 1e1 .. 1e2
epsilon_scan = np.logspace(start=1, stop=2, num=n_scan)
# one score per sample, starting at zero
scores = pd.DataFrame({'score': np.zeros(n_samples)}, index=labels.index)

In [None]:
# A sample gains 1/n_scan for every eps at which DBSCAN labels it -1, so the
# final score is the fraction of the eps grid on which it was flagged.
step = 1/n_scan
for eps in epsilon_scan:
    pipe = make_pipe(eps=eps, metric='precomputed')
    preds = pipe.fit_predict(distance_matrix)
    flagged = preds == -1
    scores.loc[flagged, :] += step

In [None]:
# Side-by-side join (on the index) of embedding features, scores and labels.
df = pd.concat([features, scores, labels], axis=1)

In [None]:
figures_dir = './Figures'
# exist_ok=True replaces the isdir()-then-mkdir() pair, which can fail if the
# directory appears between the check and the creation.
os.makedirs(figures_dir, exist_ok=True)

visualize_latent(df, savedir=figures_dir,
                 title='Isomap embedding of the DTW-defined manifold', show=True)

In [None]:
# ROC AUC of the score column against the 'FlaresFlag' labels
auc = roc_auc_score(df['FlaresFlag'].values, df['score'].values)
print('ROC_AUC Score:\n\t', round(auc, 2))

In [None]:
# top-10 anomalous (sorted ascending, so the highest score is the last row)
df.sort_values(by='score', ascending=True).tail(n=10)

# Euclidean distance on rebinned dataset

In [None]:
# Fetch the 'interp' splits, then reduce each one to its 'lgRate' column.
train, val, test, labels = train_val_test_downloader('interp')
train, val, test = (
    choose_one_column(split, 'lgRate') for split in (train, val, test)
)

In [None]:
# Stack the three splits row-wise, keeping their original index labels.
X = pd.concat([train, val, test], axis=0)

In [None]:
# 1000 log-spaced eps candidates covering 1e-1 .. 1e1
epsilon = np.logspace(start=-1, stop=1, num=1000)

In [None]:
# For every eps: number of clusters found and number of samples marked as noise.
# DBSCAN labels noise as -1. Count it explicitly rather than assuming it is
# the first entry of np.unique(): when no sample is noise, the first entry is
# cluster 0, which would report a whole cluster as "noisy" and undercount the
# clusters by one.
cluster_counts = []
noise_counts = []

for eps in epsilon:
    pipe = make_pipe(eps=eps, metric='euclidean')
    preds = pipe.fit_predict(X)
    is_noise = preds == -1
    cluster_counts.append(np.unique(preds[~is_noise]).size)
    noise_counts.append(int(is_noise.sum()))

# Build the arrays once instead of re-allocating with np.append per iteration.
n_classes = np.array(cluster_counts, dtype=int)
n_noisy = np.array(noise_counts, dtype=int)

In [None]:
fig, axes = plt.subplots(ncols=2, figsize=(8, 4))

# (values, colour, title) for the left and right panel respectively
panels = (
    (n_classes, COLORMAP(0.3), 'n_classes vs eps'),
    (n_noisy, COLORMAP(0.7), 'noisy samples vs eps'),
)
for ax, (values, color, title) in zip(axes, panels):
    ax.scatter(epsilon, values, marker='s', s=4, color=color)
    ax.set_xscale('log')
    ax.set_title(title)

plt.show()

In [None]:
n_scan = 1000
# eps grid for scoring: n_scan log-spaced values in 1e0 .. 1e1
epsilon_scan = np.logspace(0, 1, n_scan)
# One zero-initialised score per label row. The table is sized from this
# section's own `labels` rather than `n_samples`, which is only defined in
# the earlier DTW section and would be stale (or missing) if this section is
# run on its own or the two datasets differ in length.
scores = pd.DataFrame(data=0.0, index=labels.index, columns=['score'])

In [None]:
# A sample gains 1/n_scan for every eps at which DBSCAN labels it -1, so the
# final score is the fraction of the eps grid on which it was flagged.
step = 1/n_scan
for eps in epsilon_scan:
    pipe = make_pipe(eps=eps, metric='euclidean')
    preds = pipe.fit_predict(X)
    flagged = preds == -1
    scores.loc[flagged, :] += step

In [None]:
# Side-by-side join (on the index) of scores and labels.
df = pd.concat([scores, labels], axis=1)

In [None]:
# ROC AUC of the score column against the 'FlaresFlag' labels
auc = roc_auc_score(df['FlaresFlag'].values, df['score'].values)
print('ROC_AUC Score:\n\t', round(auc, 2))

In [None]:
# top-10 anomalous (sorted ascending, so the highest score is the last row)
df.sort_values(by='score', ascending=True).tail(n=10)