# Basic Initialization on Dataset

More fun datasets to experiment with are available at https://github.com/deric/clustering-benchmark/tree/master/src/main/resources/datasets/artificial

In [57]:
# Animation class definition
%matplotlib inline
import numpy as np
from matplotlib import pyplot as plt
from matplotlib import animation
from IPython.display import HTML
from sklearn.neighbors import LocalOutlierFactor

class LOFAnimation:
    """Animate Local Outlier Factor (LOF) results over a range of ``n_neighbors``.

    Each frame fits a ``LocalOutlierFactor`` with a different ``n_neighbors``
    on the same 2-column data set, circles the points whose LOF exceeds
    ``min_lof`` (circle area proportional to the LOF) and labels each of
    them with its LOF rounded to two decimals.

    Parameters
    ----------
    data : array-like of shape (n_samples, 2)
        Points to plot and score. Converted with ``np.asarray``.
    n_range : iterable of int
        One animation frame is produced per value; each value is used as
        ``n_neighbors``.
    min_lof : float
        A point is drawn as an outlier when its LOF is strictly greater
        than this threshold.

    Raises
    ------
    ValueError
        If ``data`` is not two-dimensional with exactly two columns.
    """

    def __init__(self, data, n_range=range(1,25), min_lof=1.5):
        # Accept any array-like (lists, tuples, ...) rather than requiring an
        # object that already exposes ``.shape``.
        data = np.asarray(data)
        if data.ndim != 2 or data.shape[1] != 2:
            raise ValueError("Input must be 2D")
        self.data = data
        self.n_range = n_range
        self.min_lof = min_lof
        # Artists are created lazily by setup_plot(); define the attributes
        # first so setup_plot()/animate() can tell whether they exist yet.
        self.scatter = None
        self.outliers = None
        self.annotations = []
        self.fig, self.ax = plt.subplots(figsize=(8, 8))
        self.anim = animation.FuncAnimation(
            self.fig, self.animate,
            init_func=self.setup_plot,
            frames=self.n_range,
            interval=500,
            blit=True
        )

    def _clear_annotations(self):
        """Remove every text artist added by the previous frame."""
        for ann in self.annotations:
            ann.remove()
        self.annotations = []

    def setup_plot(self):
        """Draw the static scatter and an empty outlier layer.

        Used as the animation's ``init_func``. It is written to be safe to
        call repeatedly (matplotlib may invoke ``init_func`` more than once,
        e.g. again when the animation is saved): the artists are created
        only on the first call and merely reset afterwards, so points are
        not stacked on top of each other and stale labels are removed.

        Returns the list of artists the animation should redraw.
        """
        self._clear_annotations()
        if self.scatter is None:
            self.scatter = self.ax.scatter(
                self.data[:, 0], self.data[:, 1], s=10, facecolors='c')
            self.outliers = self.ax.scatter(
                [], [], facecolors='none', edgecolors='r')
        else:
            # Back to the "no outliers shown" state of a fresh plot.
            self.outliers.set_offsets(np.empty((0, 2)))
        return [self.scatter, self.outliers]

    def animate(self, n):
        """Render the frame for ``n_neighbors = n``.

        Fits LOF on the stored data, moves/resizes the outlier circles and
        rebuilds the text labels. Returns every artist touched by the frame,
        including the labels, so they are redrawn when blitting is active.
        """
        if self.outliers is None:
            # animate() was called directly, before init_func ran.
            self.setup_plot()

        clf = LocalOutlierFactor(n_neighbors=n)
        clf.fit(self.data)
        # scikit-learn stores the *negated* LOF; flip the sign back.
        lof = -clf.negative_outlier_factor_
        out_idx = lof > self.min_lof
        outliers = self.data[out_idx, :]
        outlier_lof = lof[out_idx]

        self.outliers.set_offsets(outliers)
        # Marker area scales with the LOF so stronger outliers stand out.
        self.outliers.set_sizes(150 * outlier_lof)

        self._clear_annotations()
        self.annotations.extend([
            self.ax.text(
                0.01, 0.06,
                f'n_neighbors = {n}',
                transform=self.ax.transAxes),
            self.ax.text(
                0.01, 0.02,
                f'(min LOF to consider outlier = {self.min_lof})',
                transform=self.ax.transAxes)
        ])
        for score, xy in zip(outlier_lof, outliers):
            try:
                label = self.ax.annotate(str(round(score, 2)), xy)
            except Exception as exc:
                # Best effort: a label that cannot be placed must not abort
                # the whole animation, but the failure is still reported.
                print(f'Could not annotate outlier at {xy} (LOF={score}): {exc}')
            else:
                self.annotations.append(label)
        return [self.scatter, self.outliers, *self.annotations]

    def save(self, *args, **kwargs):
        """Forward all arguments to the underlying animation's ``save``."""
        return self.anim.save(*args, **kwargs)

    def show(self, *args, **kwargs):
        """Return an IPython ``HTML`` object embedding the animation.

        Arguments are forwarded to the animation's ``to_html5_video``.
        NOTE(review): matplotlib needs a working movie writer (typically
        ffmpeg) to encode the video; confirm it is installed.
        """
        return HTML(self.anim.to_html5_video(*args, **kwargs))

In [58]:
# Blobs with 3 centers

from sklearn.datasets import make_blobs

# `centers` is not passed, so make_blobs' default number of centers is used.
# The returned labels are not needed for LOF; they get an explicit throwaway
# name instead of `_`, which IPython uses for the last cell output.
data, _labels = make_blobs(n_samples=100, random_state=0)
anim = LOFAnimation(data, n_range=range(5, 30), min_lof=1.2)
# Close the animation's own figure explicitly (rather than "whatever figure
# is current") so only the video below is rendered, not a static figure.
plt.close(anim.fig)
# Must stay the last expression of the cell so the notebook displays it.
anim.show()

In [None]:
# TODO: Data with outliers in the middle (this cell is not implemented yet)



In [61]:
# Raw time-series data from GW150914

from gwpy.timeseries import TimeSeries

fn = 'data/H-H1_GWOSC_4KHZ_R1-1126257415-4096.hdf5'
strain = TimeSeries.read(fn,format='hdf5.losc')
# Downsample by keeping every 1000th sample (plain striding, no filtering).
data = np.array(strain.data)[::1000]
# Build (n, 2) points for LOFAnimation: column 0 is the index into the
# downsampled series, column 1 is the strain value at that index.
# NOTE(review): LOF works on distances in this 2-D space; if the index and
# strain columns have very different scales, one of them will dominate the
# result. Confirm this is intended, or rescale the columns first.
scatter_data = np.stack((np.arange(data.size), data), axis=-1)

anim = LOFAnimation(scatter_data, n_range=range(5, 30), min_lof=1.05)
# Close the animation's own figure explicitly (rather than "whatever figure
# is current") so only the video below is rendered, not a static figure.
plt.close(anim.fig)
# Must stay the last expression of the cell so the notebook displays it.
anim.show()