In [1]:
import os
import sys
import pandas as pd
import numpy as np
import re

In [2]:
# Make the helper modules stored in ../scripts importable from this notebook.
scripts_path = os.path.abspath('../scripts')
if scripts_path not in sys.path:
    # Put it first so these modules take precedence over same-named ones.
    sys.path.insert(0, scripts_path)

In [3]:
from station_location import *

In [4]:
# Trace file: its first line is parsed as a ', '-separated header and the
# remaining lines are read as tab-separated samples.
TRAIN_SET = '../data/first-train.slist'
# Event catalog (CSV) providing the "origintime" column used for labelling.
TRUE_VALUES_SET = '../data/2018.csv'
# Positions, within the split header line, of the antenna identifier,
# the sampling-rate field and the start-time field.
ANTENNA_INDEX = 0
SPS_INDEX = 2
START_TIME_INDEX = 3
# Window lengths; compute_X_and_y multiplies both by the sampling rate, so
# they are expressed in seconds. TIME_WINDOW is the analysed part of each
# window, WAIT_TIME_WINDOW the leading part that is dropped.
TIME_WINDOW = 59
WAIT_TIME_WINDOW = 1

In [5]:
# Read only the first line of the trace file (nrows=0 parses no data rows)
# and split it into its ', '-separated fields.
# NOTE(review): columns[0] assumes the header line contains no tab - confirm.
first_line = pd.read_csv(TRAIN_SET, sep='\t', nrows=0).columns[0]
header = first_line.split(', ')

# Sampling rate: the first run of digits found in the SPS field.
sps_digits = re.findall(r'\d+', header[SPS_INDEX])
sps = int(sps_digits[0])

# Time of the first sample.
startTime = pd.to_datetime(header[START_TIME_INDEX])

# Antenna identifier: second space-separated token of its field, split on '_'.
# Parts 0, 1 and 3 are used as network, station and channel.
antenna = header[ANTENNA_INDEX].split(' ')[1].split('_')
network, station, channel = antenna[0], antenna[1], antenna[3]

# get_location is presumably provided by the station_location star import.
latitude, longitude = get_location(network, station, channel)

In [6]:
# Show the station coordinates returned by get_location.
latitude, longitude

(35.79657, -97.454857)

In [7]:
# header=0 together with explicit names discards the file's first line (the
# metadata header) and labels the six columns "1".."6". The table is then
# flattened row by row into a single 1-D trace.
X = (
    pd.read_csv(TRAIN_SET, sep='\t', header=0, names=["1", "2", "3", "4", "5", "6"])
    .to_numpy()
    .ravel()
)
# Drop the NaN cells (presumably padding of incomplete rows - TODO confirm).
X = X[~np.isnan(X)]
X

array([-6731., -6694., -6757., ..., -3503., -3296., -3129.])

In [8]:
# Compute ending time: the first sample is taken at startTime, so the last
# one comes (number of samples - 1) sampling intervals later.
last_sample_offset = X.shape[0] - 1
total_number_of_seconds = last_sample_offset / sps
endTime = startTime + pd.to_timedelta(total_number_of_seconds, unit='s')

In [9]:
# Load the event catalog and parse the origin times as timestamps.
catalog = pd.read_csv(TRUE_VALUES_SET)
catalog["origintime"] = pd.to_datetime(catalog["origintime"])
# Keep only the events that happened while the trace was being recorded.
catalog = catalog[(catalog["origintime"] >= startTime) & (catalog["origintime"] <= endTime)]
# sort_values returns a new frame instead of sorting in place, so the result
# has to be assigned for the catalog to actually be in chronological order.
catalog = catalog.sort_values("origintime")
catalog

Unnamed: 0,event_id,origintime,magnitude,magnitude_source,max_mmi,latitude,longitude,depth_km,err_lat,err_lon,err_depth,err_origintime,state,county,status
730,0,2018-04-09 00:59:58,1.8,OGS,0,36.45094,-98.79978,5.645,0.6,0.5,0.8,0.34,,MAJOR,
731,0,2018-04-09 09:01:57,2.1,OGS,0,36.21448,-97.57076,4.829,0.3,0.3,0.9,0.36,,GARFIELD,
732,0,2018-04-09 09:26:31,3.4,OGS,0,36.21511,-97.56857,5.189,0.2,0.3,0.8,0.34,,GARFIELD,
733,0,2018-04-09 09:50:09,2.2,OGS,0,36.29543,-97.53089,5.152,0.4,0.4,1.2,0.41,,GARFIELD,
734,0,2018-04-09 10:22:20,4.0,OGS,6,36.21847,-97.5735,4.274,0.2,0.3,0.8,0.33,,GARFIELD,
735,0,2018-04-09 11:05:28,1.9,OGS,0,36.21376,-97.55272,6.744,0.3,0.3,1.0,0.35,,GARFIELD,
736,0,2018-04-09 12:33:19,1.9,OGS,0,35.38129,-98.0902,5.0,0.7,0.7,1.8,0.33,,CANADIAN,
737,0,2018-04-09 12:33:19,1.9,OGS,0,35.37896,-98.08983,3.575,0.9,0.7,2.1,0.37,,CANADIAN,
738,0,2018-04-09 14:04:43,2.9,OGS,0,36.21164,-97.55554,3.587,0.2,0.3,0.8,0.33,,GARFIELD,
739,0,2018-04-09 18:37:00,2.4,OGS,0,36.32548,-97.53697,4.454,0.3,0.3,0.9,0.36,,GARFIELD,


In [10]:
def count_signs_per_row(X, amplitude):
    """Count the strong sign changes in every row of a 2-D array.

    Each row is centred on its own mean. Two neighbouring samples count as a
    sign change when their product is below ``-amplitude``, i.e. they have
    opposite signs and the product of their magnitudes exceeds ``amplitude``.

    Parameters
    ----------
    X : 2-D numpy array, one signal per row.
    amplitude : threshold applied to the negated product of neighbours.

    Returns
    -------
    Integer array of shape ``(X.shape[0], 1)`` holding the count of each row.

    The shape of ``X`` is printed as a side effect.
    """
    print(X.shape)
    centred = X - np.mean(X, axis=1, keepdims=True)
    # Product of every sample with its right-hand neighbour.
    neighbour_products = centred[:, :-1] * centred[:, 1:]
    strong_flips = neighbour_products < -amplitude
    return strong_flips.sum(axis=1)[:, np.newaxis]

In [11]:
def compute_X_and_y(X, sps, catalog, amplitude, start_time=None):
    """Cut the trace into fixed-length windows and build one feature and one label per window.

    The trace is split into consecutive windows of
    ``(TIME_WINDOW + WAIT_TIME_WINDOW) * sps`` samples; trailing samples that
    do not fill a whole window are discarded. The first
    ``WAIT_TIME_WINDOW * sps`` samples of every window are dropped before the
    feature is computed with ``count_signs_per_row``.

    Parameters
    ----------
    X : 1-D numpy array of samples, the first one taken at ``start_time``.
    sps : number of samples per second (assumed to be an integer, since it
        is used to build slice bounds).
    catalog : table whose ``"origintime"`` column yields the event timestamps.
    amplitude : threshold forwarded to ``count_signs_per_row``.
    start_time : timestamp of the first sample. Defaults to the
        notebook-level ``startTime``.

    Returns
    -------
    X_time_window : ``(n_windows, 1)`` array returned by ``count_signs_per_row``.
    y : ``(n_windows, 1)`` float array, 1 for every window that contains at
        least one event origin time, 0 otherwise.
    time_stamps : list with the start time of every window.
    """
    if start_time is None:
        start_time = startTime

    N = X.shape[0]
    window_size = TIME_WINDOW * sps
    drop_size = WAIT_TIME_WINDOW * sps
    total_window_size = window_size + drop_size
    # Integer floor division replaces np.math.floor: np.math was only an
    # alias of the stdlib math module and no longer exists in recent NumPy.
    number_window = N // total_window_size
    # reshape (unlike np.split) also copes with a trace shorter than one window.
    X_time_window = X[:number_window * total_window_size].reshape(number_window, total_window_size)

    y = np.zeros((number_window, 1))

    for date in catalog["origintime"]:
        seconds_to_hq = (date - start_time).total_seconds()
        if seconds_to_hq < 0:
            # Event before the recording: a negative index would wrap around
            # and label a window at the end of the trace.
            continue
        index = int(seconds_to_hq * sps / total_window_size)
        # Events falling in the discarded tail have no window to label.
        if index < number_window:
            y[index] = 1

    # Drop the waiting period at the start of every window, then reduce each
    # window to its sign-change count.
    X_time_window = count_signs_per_row(X_time_window[:, drop_size:], amplitude)

    seconds_per_window = total_window_size / sps
    time_stamps = [
        start_time + pd.to_timedelta(i * seconds_per_window, unit='s')
        for i in range(number_window)
    ]

    return X_time_window, y, time_stamps

In [12]:
# Build the per-window features, labels and start times for the whole trace;
# 100000 is the amplitude threshold handed to count_signs_per_row.
f_X, f_y, time_stamps = compute_X_and_y(X, sps, catalog, 100000)

(5760, 5900)


In [13]:
# Sign-change counts of the windows where at least one change was counted.
f_X[f_X > 0]

array([  5,  45,   5, ...,  12,  15, 134])

In [14]:
# Boolean mask of the windows with a non-zero sign-change count.
f_X > 0

array([[ True],
       [ True],
       [ True],
       ..., 
       [ True],
       [ True],
       [ True]], dtype=bool)

In [15]:
# Sanity check: one feature row per window.
f_X.shape

(5760, 1)

In [16]:
# Sanity check: one start time per window.
np.array(time_stamps).shape

(5760,)

In [17]:
# Start times of the windows whose sign-change count exceeds 25.
# The mask is built once and flattened from (n, 1) to (n,), rather than
# re-evaluating `f_X > 25` over the whole array for every index.
above_threshold = (f_X > 25).ravel()
tt = [stamp for stamp, flagged in zip(time_stamps, above_threshold) if flagged]

In [18]:
# Start times of the windows labelled as containing an event.
# The mask is built once and flattened from (n, 1) to (n,), rather than
# re-evaluating `f_y > 0` over the whole array for every index.
has_event = (f_y > 0).ravel()
tt_y = [stamp for stamp, labelled in zip(time_stamps, has_event) if labelled]

In [19]:
# For every window labelled as containing an event:
# ((label, sign-change count), window start time).
list(zip(zip(f_y[f_y > 0], f_X[f_y > 0]), tt_y))

[((1.0, 25), Timestamp('2018-04-09 00:59:00')),
 ((1.0, 9), Timestamp('2018-04-09 09:01:00')),
 ((1.0, 779), Timestamp('2018-04-09 09:26:00')),
 ((1.0, 17), Timestamp('2018-04-09 09:50:00')),
 ((1.0, 351), Timestamp('2018-04-09 10:22:00')),
 ((1.0, 70), Timestamp('2018-04-09 11:05:00')),
 ((1.0, 41), Timestamp('2018-04-09 12:33:00')),
 ((1.0, 191), Timestamp('2018-04-09 14:04:00')),
 ((1.0, 207), Timestamp('2018-04-09 18:37:00')),
 ((1.0, 26), Timestamp('2018-04-09 19:14:00')),
 ((1.0, 40), Timestamp('2018-04-09 19:51:00')),
 ((1.0, 201), Timestamp('2018-04-09 20:30:00')),
 ((1.0, 0), Timestamp('2018-04-09 21:26:00')),
 ((1.0, 0), Timestamp('2018-04-10 16:57:00')),
 ((1.0, 4), Timestamp('2018-04-10 21:20:00')),
 ((1.0, 2), Timestamp('2018-04-10 21:39:00')),
 ((1.0, 86), Timestamp('2018-04-10 22:19:00')),
 ((1.0, 31), Timestamp('2018-04-11 03:32:00')),
 ((1.0, 1), Timestamp('2018-04-11 11:26:00')),
 ((1.0, 13), Timestamp('2018-04-11 11:36:00')),
 ((1.0, 168), Timestamp('2018-04-11 13:54

In [20]:
# For every window whose sign-change count exceeds 25:
# ((label, sign-change count), window start time).
list(zip(zip(f_y[f_X > 25], f_X[f_X > 25]), tt))

[((0.0, 45), Timestamp('2018-04-09 00:01:00')),
 ((0.0, 53), Timestamp('2018-04-09 00:08:00')),
 ((0.0, 117), Timestamp('2018-04-09 00:10:00')),
 ((0.0, 144), Timestamp('2018-04-09 00:11:00')),
 ((0.0, 43), Timestamp('2018-04-09 00:13:00')),
 ((0.0, 28), Timestamp('2018-04-09 00:14:00')),
 ((0.0, 35), Timestamp('2018-04-09 00:23:00')),
 ((0.0, 28), Timestamp('2018-04-09 00:24:00')),
 ((0.0, 32), Timestamp('2018-04-09 00:26:00')),
 ((0.0, 68), Timestamp('2018-04-09 00:31:00')),
 ((0.0, 937), Timestamp('2018-04-09 00:35:00')),
 ((0.0, 174), Timestamp('2018-04-09 00:36:00')),
 ((0.0, 31), Timestamp('2018-04-09 00:39:00')),
 ((0.0, 26), Timestamp('2018-04-09 00:45:00')),
 ((0.0, 27), Timestamp('2018-04-09 00:47:00')),
 ((0.0, 43), Timestamp('2018-04-09 01:00:00')),
 ((0.0, 444), Timestamp('2018-04-09 01:01:00')),
 ((0.0, 68), Timestamp('2018-04-09 01:14:00')),
 ((0.0, 49), Timestamp('2018-04-09 01:15:00')),
 ((0.0, 195), Timestamp('2018-04-09 01:16:00')),
 ((0.0, 179), Timestamp('2018-04-0

In [21]:
# Shape of the feature array (windows x 1).
f_X.shape

(5760, 1)

In [22]:
# Shape of the label array; it should match the feature array.
f_y.shape

(5760, 1)

In [23]:
# Number of windows labelled as containing at least one catalogued event.
f_y[f_y == 1].shape

(30,)

In [24]:
# Moment at which the manual check below starts looking at the trace.
time_hq = pd.to_datetime("2018-04-09 10:22:18")
# Timedelta.seconds only holds the part of the offset below one day, so it
# would be wrong for any time that is not on the first day of the trace.
# total_seconds() covers the whole offset; int() keeps it usable as an index.
second_to_hq = int((time_hq - startTime).total_seconds())

In [25]:
# First 6000 samples of the trace, used for the manual sign-change count in
# the following cells.
time_window = X[0:6000]

In [26]:
# Confirm the segment length.
time_window.shape

(6000,)

In [27]:
# Remove the mean so that sign changes are measured around zero.
time_window = time_window - np.mean(time_window)

In [28]:
# A negative product of two neighbouring mean-removed samples marks a sign
# change; count those whose product is below -100000.
change_of_sign = time_window[:-1] * time_window[1:]
np.sum(change_of_sign < -100000)

5

In [29]:
# Same measurement on the 6000 samples that start at time_hq.
hq_start = second_to_hq * sps
time_window_hq = X[hq_start:hq_start + 6000]
time_window_hq = time_window_hq - time_window_hq.mean()
# Count the neighbouring pairs whose product is below -100000.
change_of_sign = time_window_hq[:-1] * time_window_hq[1:]
np.sum(change_of_sign < -100000)

551

In [30]:
# Cross-check: the helper applied to the same segment, passed as a one-row
# matrix, should reproduce the manual count computed above.
count_signs_per_row(time_window_hq.reshape(1, time_window_hq.shape[0]), 100000)

(1, 6000)


array([[551]])

In [31]:
# Cross-check of the helper against the manual count on the first segment.
count_signs_per_row(time_window.reshape(1, time_window.shape[0]), 100000)

(1, 6000)


array([[5]])