In [1]:
import os
import sys
import pandas as pd
import numpy as np
import re

In [2]:
scripts_path = os.path.abspath(os.path.join('../scripts'))
if scripts_path not in sys.path:
    sys.path.insert(0,scripts_path)

In [3]:
TRAIN_SET = '../data/first-train.slist'
TRUE_VALUES_SET = '../data/2018.csv'
SPS_INDEX = 2
START_TIME_INDEX = 3
TIME_WINDOW = 10
WAIT_TIME_WINDOW = 1

In [4]:
header = pd.read_csv(TRAIN_SET, sep='\t', nrows=0).columns[0].split(', ')

# Number of samples per second
sps = int(re.findall(r'\d+', header[SPS_INDEX])[0])
# Starting time
startTime = pd.to_datetime(header[START_TIME_INDEX])

In [5]:
X = pd.read_csv(TRAIN_SET, sep='\t', header=0, names=["1", "2", "3", "4", "5", "6"])
X = X.to_numpy().reshape(-1)
X = X[~(np.isnan(X))]
X = X.reshape(X.shape[0], 1)
X

array([[-6731.],
       [-6694.],
       [-6757.],
       ...,
       [-3503.],
       [-3296.],
       [-3129.]])

In [6]:
# Compute ending time
number_of_seconds = (X.shape[0] - 1) / sps # First sample is at starting time -> subtract 1
endTime = startTime + pd.to_timedelta(number_of_seconds, unit='s')

In [88]:
catalog = pd.read_csv(TRUE_VALUES_SET)
catalog["origintime"] = pd.to_datetime(catalog["origintime"])
catalog = catalog[(catalog["origintime"] >= startTime) & (catalog["origintime"] <= endTime)]
catalog.sort_values("origintime")
catalog

Unnamed: 0,event_id,origintime,magnitude,magnitude_source,max_mmi,latitude,longitude,depth_km,err_lat,err_lon,err_depth,err_origintime,state,county,status
730,0,2018-04-09 00:59:58,1.8,OGS,0,36.45094,-98.79978,5.645,0.6,0.5,0.8,0.34,,MAJOR,
731,0,2018-04-09 09:01:57,2.1,OGS,0,36.21448,-97.57076,4.829,0.3,0.3,0.9,0.36,,GARFIELD,
732,0,2018-04-09 09:26:31,3.4,OGS,0,36.21511,-97.56857,5.189,0.2,0.3,0.8,0.34,,GARFIELD,
733,0,2018-04-09 09:50:09,2.2,OGS,0,36.29543,-97.53089,5.152,0.4,0.4,1.2,0.41,,GARFIELD,
734,0,2018-04-09 10:22:20,4.0,OGS,6,36.21847,-97.5735,4.274,0.2,0.3,0.8,0.33,,GARFIELD,
735,0,2018-04-09 11:05:28,1.9,OGS,0,36.21376,-97.55272,6.744,0.3,0.3,1.0,0.35,,GARFIELD,
736,0,2018-04-09 12:33:19,1.9,OGS,0,35.38129,-98.0902,5.0,0.7,0.7,1.8,0.33,,CANADIAN,
737,0,2018-04-09 12:33:19,1.9,OGS,0,35.37896,-98.08983,3.575,0.9,0.7,2.1,0.37,,CANADIAN,
738,0,2018-04-09 14:04:43,2.9,OGS,0,36.21164,-97.55554,3.587,0.2,0.3,0.8,0.33,,GARFIELD,
739,0,2018-04-09 18:37:00,2.4,OGS,0,36.32548,-97.53697,4.454,0.3,0.3,0.9,0.36,,GARFIELD,


In [111]:
y = np.zeros((X.shape[0], 1))
y.shape

(34560001, 1)

In [110]:
for date in catalog["origintime"]:
    seconds_to_hq = (date - startTime).seconds
    index_in_data = seconds_to_hq * sps
    y[int(index_in_data / X.shape[0])] = 1

3598
32517
33991
35409
37340
39928
45199
45199
50683
67020
69241
71486
73820
77201
61069
76843
77961
80346
12727
41191
41761
50054
64943
83297
85023
5602
16947
24544
49996
72734
86167


In [8]:
X

array([[-6731.],
       [-6694.],
       [-6757.],
       ...,
       [-3503.],
       [-3296.],
       [-3129.]])

In [22]:
time_hq = pd.to_datetime("2018-04-09 10:22:18")
second_to_hq = (time_hq - startTime).seconds

In [37]:
time_window = X[0:6000]

In [38]:
time_window.shape

(6000, 1)

In [39]:
time_window = time_window - np.mean(time_window)

In [68]:
change_of_sign = time_window[1:] * time_window[:-1]
#change_of_sign[change_of_sign < -100].shape[0]
(change_of_sign < -100000).sum()

5

In [75]:
time_window_hq = X[second_to_hq * sps: second_to_hq * sps + 6000]
time_window_hq = time_window_hq - np.mean(time_window_hq)
change_of_sign = time_window_hq[1:] * time_window_hq[:-1]
#change_of_sign[change_of_sign < -100].shape[0]
(change_of_sign < -100000).sum()

551

In [84]:
def count_signs_per_row(X, amplitude):
    first_kernel = X[:,1:]
    print(first_kernel.shape)
    second_kernel = X[:,:-1]
    print(second_kernel.shape)
    change_of_sign = first_kernel * second_kernel
    return np.sum(change_of_sign < -amplitude, axis=1)

In [86]:
count_signs_per_row(time_window_hq.reshape(1, time_window_hq.shape[0]), 100000)

(1, 5999)
(1, 5999)


array([551])

In [87]:
count_signs_per_row(time_window.reshape(1, time_window.shape[0]), 100000)

(1, 5999)
(1, 5999)


array([5])

In [47]:
def aggregate_window(X):
    N = X.shape[0]
    window_size = TIME_WINDOW*sps
    drop_size = WAIT_TIME_WINDOW*sps
    total_window_size = window_size + drop_size    
    number_window = np.math.floor(N / total_window_size)
    X = X[:number_window*total_window_size]
    X_dropped = np.array(np.split(X.reshape(-1), number_window))[:, :window_size]
    
    return np.mean(X_dropped, axis=1)