In [1]:
%matplotlib inline
import numpy as np

from matplotlib import pyplot as plt
from matplotlib import animation
from IPython.display import HTML
from tqdm.notebook import tqdm

from ts_outlier_detection import *
from ts_outlier_detection.plotting import *
from utils import *

In [2]:
from pandas import read_csv

# Name of the CSV column that is extracted from every table below.
time_header = 'GPStime'

# Glitch class labels; used both as dictionary keys and in printed messages.
fast_scattering, koi_fish, scattered_light = (
    'Fast Scattering',
    'Koi Fish',
    'Scattered Light',
)

# One CSV table per (detector, glitch class) pair.
h1_fast_scattering_df = read_csv('data/H1_ Fast scattering - Sheet1 (1).csv')
h1_koi_fish_df = read_csv('data/H1_ Koi Fish - gspy (7) (2).csv')
h1_scattered_light_df = read_csv('data/H1_ Scattered light - gspy (1).csv')
l1_fast_scattering_df = read_csv('data/L1_ Fast-scattering - gspy (6) (1).csv')
l1_koi_fish_df = read_csv('data/L1_ Koi Fish - Sheet1 (1).csv')
l1_scattered_light_df = read_csv('data/L1_ Scattered Light - gspy (1) (1).csv')

# Per-detector mapping: glitch class -> NumPy array of that table's time column.
H1_GLITCHES = {
    label: frame[time_header].to_numpy()
    for label, frame in [
        (fast_scattering, h1_fast_scattering_df),
        (koi_fish, h1_koi_fish_df),
        (scattered_light, h1_scattered_light_df),
    ]
}

L1_GLITCHES = {
    label: frame[time_header].to_numpy()
    for label, frame in [
        (fast_scattering, l1_fast_scattering_df),
        (koi_fish, l1_koi_fish_df),
        (scattered_light, l1_scattered_light_df),
    ]
}

# Detector name -> that detector's glitch mapping.
ALL_GLITCHES = dict(H1=H1_GLITCHES, L1=L1_GLITCHES)

# Report how many entries were loaded for each detector / glitch class.
for det, glitches in ALL_GLITCHES.items():
    for kind, times in glitches.items():
        print(f'{det} has {times.size} {kind} glitches')

H1 has 999 Fast Scattering glitches
H1 has 100 Koi Fish glitches
H1 has 999 Scattered Light glitches
L1 has 999 Fast Scattering glitches
L1 has 100 Koi Fish glitches
L1 has 999 Scattered Light glitches


In [None]:
## Koi Fish

# Flatten the per-detector Koi Fish entries into (detector, time) pairs,
# announcing each detector as it is processed.
koi_fish_times = []
for det, glitches in ALL_GLITCHES.items():
    print(f'Fetching {koi_fish} events from {det}')
    koi_fish_times += ((det, glitch) for glitch in glitches[koi_fish])

from random import randint
from _thread import start_new_thread
from time import sleep, time

# Shared result containers filled in by get_koi_fish:
#   koi_fish_ts  - [series, gps_time] pairs for every successful fetch
#   failed_times - gps_time of every event that could not be fetched
koi_fish_ts, failed_times = [], []


def get_koi_fish(det, gps_time, start, retries=2):
    """Fetch one processed event and record it in ``koi_fish_ts``.

    The event is requested through ``get_processed_event`` with ``length=2``
    and ``bp=[(20, 300)]``; element ``[0]`` of the result is stored together
    with ``gps_time``. Any exception is reported on stdout. While retries
    remain, the function waits a random 5-15 seconds and tries again;
    once they are exhausted, ``gps_time`` is appended to ``failed_times``.

    Args:
        det: Detector identifier forwarded to ``get_processed_event``.
        gps_time: Event time forwarded to ``get_processed_event`` and stored
            alongside the fetched series.
        start: Reference timestamp (as returned by ``time()``) used only to
            print the elapsed time after a successful fetch.
        retries: Number of additional attempts allowed after a failure.

    Returns:
        None. Results are communicated through the module-level lists.
    """
    attempts_left = retries
    while True:
        try:
            series = get_processed_event(det, gps_time, length=2, bp=[(20, 300)])[0]
            koi_fish_ts.append([series, gps_time])
            elapsed = round(time()-start, 2)
            print(f'Fetched {len(koi_fish_ts)} events after {elapsed} seconds')
            return
        except Exception as e:
            print(f'Error fetching event from {det} at {gps_time}: {e}')
            if attempts_left > 0:
                print(f'Retrying {attempts_left-1} more times')
                # Random back-off before the next attempt.
                sleep(randint(5, 15))
                attempts_left -= 1
                continue
            print('No more retries')
            failed_times.append(gps_time)
            return

# threading.Thread is used instead of _thread.start_new_thread so the workers
# can be joined before their results are consumed.
from threading import Thread

# Launch one fetch per (detector, time) pair, staggered by 10 seconds.
start = time()
fetch_threads = []
for det, gps_time in koi_fish_times:
    # retries=0: a failed fetch goes straight to failed_times, no retry.
    worker = Thread(target=get_koi_fish, args=(det, gps_time, start, 0))
    worker.start()
    fetch_threads.append(worker)
    sleep(10)

# Wait for every outstanding fetch. Without this, a fetch that takes longer
# than the 10 s stagger could append to koi_fish_ts after the split below has
# already been taken, silently dropping that event from both sets.
for worker in fetch_threads:
    worker.join()

# Split the fetched events: the first 80 % for training, the rest for testing.
# Note that koi_fish_ts is in completion order, not catalogue order.
training_size = 0.8
split_idx = int(training_size*len(koi_fish_ts))
training_set = koi_fish_ts[:split_idx]
test_set = koi_fish_ts[split_idx:]

Fetching Koi Fish events from H1
Fetching Koi Fish events from L1
Fetched 1 events after 4.03 seconds
Fetched 2 events after 11.3 seconds
Fetched 3 events after 21.21 seconds
Fetched 4 events after 41.64 seconds
Fetched 5 events after 51.2 seconds
Error fetching event from H1 at 1249590821.665: HTTPSConnectionPool(host='www.gw-openscience.org', port=443): Max retries exceeded with url: /archive/links/O3a_4KHZ_R1/H1/1249590820/1249590824/json/ (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7fab507c58b0>: Failed to establish a new connection: [Errno 11] Resource temporarily unavailable'))
No more retries
Fetched 6 events after 61.24 seconds
Fetched 7 events after 71.37 seconds
Fetched 8 events after 81.5 seconds
Fetched 9 events after 91.26 seconds
Fetched 10 events after 101.23 seconds
Fetched 11 events after 111.25 seconds
Fetched 12 events after 122.9 seconds
Fetched 13 events after 131.74 seconds
Fetched 14 events after 141.22 seconds
Fetched 15 events

In [None]:
### Parameters to optimize (dims, n_neighbors, event_length)
params = np.array([3, 4, 1024]) # initial values
learning_rate = np.array([3, 3, 30])
###

## Gradient descent
# For every mini-batch, repeatedly estimate the gradient of the loss with
# respect to the integer parameters and step against it, until the truncated
# step is zero for all parameters or max_iter steps have been taken.
rng = np.random.default_rng(42)
loss_function = diff_loss
epochs = 3
batch_size = 5
max_iter = 100
for epoch in range(epochs):
    print(f'Starting training epoch {epoch+1}/{epochs}')
    rng.shuffle(training_set)
    avg_losses = []

    for i in tqdm(range(0, len(training_set), batch_size)):
        # The last batch may hold fewer than batch_size events, so averages
        # below use the actual batch length.
        batch = training_set[i:i+batch_size]
        n_events = len(batch)

        for _ in range(max_iter):
            grad = np.zeros(len(params))
            batch_loss = 0

            for ts, actual in batch:
                data = ts.value
                times = ts.times.value

                def loss(d, n, e):
                    # NOTE(review): the class name is spelled
                    # `TemporalOutlierfactor` (lower-case "f") - confirm this
                    # matches the name exported by ts_outlier_detection.
                    ctof = TemporalOutlierfactor(dims=d, n_neighbors=n, event_length=e)
                    ctof.fit(data, times)
                    return loss_function(actual, times[ctof.get_outlier_indices()])

                batch_loss += loss(*params)
                grad += estimate_gradient(loss, params)

            grad /= n_events
            avg_losses.append(batch_loss/n_events)
            # Parameters are integers, so the step is truncated toward zero.
            delta = (learning_rate * grad).astype(int)
            if not delta.any():
                # No parameter would change any more for this batch.
                break
            # NOTE(review): nothing keeps params positive after this update -
            # confirm whether the detector needs a lower bound.
            params -= delta

    # One point per update step (several steps may belong to the same batch).
    fig, ax = plt.subplots(1, 1, figsize=(10,6))
    ax.set_title(f'Epoch {epoch+1} losses')
    ax.plot(np.arange(len(avg_losses)), avg_losses, 'k.')
    ax.set_xlabel('Update step')
    ax.set_ylabel('Average loss over batch')
    ax.grid(True)
    print(f'Current optimal parameters: {params}')