# Survey ASDF database for daily activity for each station

In [None]:
import os
import sys
import time

In [None]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.dates
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
from collections import defaultdict
import datetime
import pytz
import dateutil
from dateutil import rrule

In [None]:
# Make the directory three levels above the current working directory importable
# (presumably the repository root -- confirm) so that the `seismic` package below
# can be resolved when it is not installed.
package_root = os.path.abspath('../../..')
if package_root not in sys.path:
    sys.path.append(package_root)
# This import relies on the sys.path entry added above, so it must stay after it.
from seismic.ASDFdatabase import FederatedASDFDataSet
import obspy

In [None]:
from tqdm.auto import tqdm

In [None]:
# Networks to survey. Alternative, wider selection:
# TARGET_NETWORKS = ['AU', '7B', '7D', '7E', '7F', '7G', '7J', '7W', '7X', 'OA', 'S']
TARGET_NETWORKS = ['AQ']

# Survey results are cached in a pickle file named after the selected networks.
pkl_filename = "{}_station_survey.pkl".format(','.join(TARGET_NETWORKS))

# The survey is only regenerated when no cached pickle file exists yet.
regenerate_data = not os.path.exists(pkl_filename)

In [None]:
# Full date ranges for temporary deployments, keyed by network code.
# Each value is a (start, end) pair of obspy UTCDateTime instances.
TEMP_DEPLOYMENTS = {
    net_code: (obspy.UTCDateTime(first_seen), obspy.UTCDateTime(last_seen))
    for net_code, first_seen, last_seen in (
        ('7B', '1993-05-03T03:00:58.000000Z', '1995-08-10T01:25:16.000000Z'),
        ('7D', '2012-01-01T00:01:36.000000Z', '2014-03-27T15:09:51.000000Z'),
        ('7E', '1998-05-22T02:27:38.000000Z', '1998-07-25T02:09:54.000000Z'),
        ('7F', '2012-12-31T23:59:59.000000Z', '2014-11-15T00:43:14.000000Z'),
        ('7G', '2014-01-01T00:00:06.000000Z', '2016-02-09T21:04:29.000000Z'),
        ('7J', '2006-02-04T03:12:15.000000Z', '2007-05-30T23:59:59.000000Z'),
        ('7W', '2008-08-27T03:27:12.000000Z', '2011-05-24T00:34:59.000000Z'),
        ('7X', '2009-06-16T03:42:00.000000Z', '2011-04-01T23:18:49.000000Z'),
        ('OA', '2017-09-13T23:59:13.000000Z', '2018-11-28T01:11:14.000000Z'),
        ('AQ', '2015-11-28T04:20:00.000000Z', '2018-01-14T01:18:49.000000Z'),
    )
}

In [None]:
if regenerate_data:
    import sqlite3
    import time

    ds = FederatedASDFDataSet.FederatedASDFDataSet(
        "/g/data/ha3/Passive/SHARED_DATA/Index/asdf_files.txt", variant='db')

    # Fetch every network/station known to the dataset as a list of tuples
    # [net, sta, start_time, end_time], where the times are obspy UTCDateTime
    # instances. The query is attempted a limited number of times because it can
    # fail with a sqlite3.DatabaseError; the final failure is re-raised.
    max_attempts = 10
    for attempt in range(1, max_attempts + 1):
        try:
            all_codes = list(ds.local_net_sta_list())
        except sqlite3.DatabaseError:
            remaining = max_attempts - attempt
            if remaining == 0:
                raise
            print("Retrying station list load ({} tries remaining)...".format(remaining))
            time.sleep(1)
        else:
            print("Success! ({} {})".format(attempt, 'try' if attempt == 1 else 'tries'))
            break

In [None]:
def reset_ds():
    """
    Open a fresh FederatedASDFDataSet on the shared ASDF index file.

    :return: Newly constructed dataset instance
    """
    index_file = "/g/data/ha3/Passive/SHARED_DATA/Index/asdf_files.txt"
    return FederatedASDFDataSet.FederatedASDFDataSet(index_file, variant='db')

In [None]:
if regenerate_data:
    # Restrict the survey to stations whose network code (first tuple element)
    # is one of the networks of interest.
    targets = [x for x in all_codes if x[0] in TARGET_NETWORKS]
    # Expression values nested inside an `if` body are not echoed by the notebook,
    # so the station count is printed explicitly.
    print("Selected {} target stations".format(len(targets)))

In [None]:
# Build (or reload) the daily trace-count survey.
#
# When `regenerate_data` is set, every target station is queried channel by channel
# and day by day, and the resulting rows are written to `pkl_filename`. Otherwise
# the rows are read back from that pickle file. Either way the rows end up in `result`.
PY2 = sys.version_info[0] < 3
# Persist results to pickle file, since they take a couple of hours to generate.
if PY2:
    import cPickle as pkl
else:
    import pickle as pkl

if regenerate_data: # This takes > 4 hours to run, only run when really needed.
    print("Regenerating data from scratch, this could take hours...")

    import sqlite3

    # Catalog the number of traces on a daily basis for all stations and channels
    one_day_sec = 3600*24
    millisec = 0.001

    # Format of each result row: NET, STA, LOC, CHA, DATE, NUM_TRACES
    temp_result = []
    pbar = tqdm(total=len(targets))
    for r in targets:
        # The progress bar is advanced before the station is processed.
        pbar.update()
        net = r[0]
        sta = r[1]
        stn_start_time = r[2]
        stn_end_time = r[3]
        # Retry until the query succeeds: on a database error the dataset is
        # reopened via reset_ds() and the query is repeated after a 5 second pause.
        # NOTE(review): there is no upper bound on the number of retries here.
        while True:
            try:
                stn_channels = ds.get_stations(stn_start_time, stn_end_time, network=net, station=sta)
                break
            except sqlite3.DatabaseError:
                print("WARNING: database error (1) accessing {}.{}, sleeping then retrying...".format(net, sta))
                ds = reset_ds()
                time.sleep(5)
                continue
        # Elements 2 and 3 of each returned row are used as location and channel codes.
        loc_chan = [(s[2], s[3]) for s in stn_channels]
        for loc, cha in loc_chan:
            # TODO: Parallelize this loop by breaking up into N time epochs
            pbar.set_description('Processing {}.{}.{}.{}'.format(net, sta, loc, cha))
            # Same unbounded retry-with-reset pattern as above.
            while True:
                try:
                    start_time, end_time = ds.get_global_time_range(net, sta, loc, cha)
                    # NOTE(review): an end year of 2032 is overridden with 2017,
                    # presumably to work around a known bad timestamp in the index --
                    # confirm. This relies on in-place attribute assignment on the
                    # returned time object; verify that the installed obspy version
                    # still supports it.
                    if end_time.year == 2032:
                        end_time.year = 2017
                    break
                except sqlite3.DatabaseError:
                    print("WARNING: database error (2) accessing {}.{}, sleeping then retrying...".format(loc, cha))
                    ds = reset_ds()
                    time.sleep(5)
                    continue
            # Truncate both ends to whole days: start_day is midnight of the first
            # day, end_day is midnight following the last day.
            start_day = obspy.UTCDateTime(start_time.year, start_time.month, start_time.day)
            end_day = obspy.UTCDateTime(end_time.year, end_time.month, end_time.day) + one_day_sec
            # Each query window spans one day less one millisecond.
            day_begin = start_day
            day_end = day_begin + one_day_sec - millisec
            while True:
                try:
                    count = ds.get_waveform_count(net, sta, loc, cha, day_begin, day_end)
                except sqlite3.DatabaseError:
                    print("WARNING: database error (3) accessing {}, sleeping then retrying...".format(str(day_begin)))
                    ds = reset_ds()
                    time.sleep(5)
                    # day_begin has not been advanced, so the same day is retried.
                    continue
                temp_result.append([net, sta, loc, cha, day_begin, count])
                # The exit test runs after the append, so the first window that
                # extends past end_day is still recorded.
                # NOTE(review): this yields one row for the window starting at
                # end_day itself -- confirm the trailing day is intended.
                if day_end > end_day:
                    break
                else:
                    day_begin += one_day_sec
                    day_end = day_begin + one_day_sec - millisec
    result = temp_result
    pbar.close()
    
    with open(pkl_filename, 'wb') as f:
        pkl.dump(result, f)

else:
    assert os.path.exists(pkl_filename)
    print("Loading data from pickle file {}".format(pkl_filename))
    # NOTE: unpickling can execute arbitrary code; only load survey files that were
    # produced by this notebook.
    with open(pkl_filename, 'rb') as f:
        result = pkl.load(f)

In [None]:
# Echo the number of survey rows held in `result`.
len(result)

In [None]:
def pandas_timestamp_to_plottable_datetime(data):
    """
    Turn a series of float UTC timestamps into datetimes that matplotlib can plot.

    :param data: Pandas series of float timestamps
    :type data: pandas.Series
    :return: Array of Python datetimes
    :rtype: numpy.array(datetime)
    """
    # Interpret each float as seconds since the epoch, in UTC.
    as_datetimes = data.transform(datetime.datetime.utcfromtimestamp)
    # Normalise to a millisecond datetime dtype before extracting Python datetimes.
    as_millis = as_datetimes.astype('datetime64[ms]')
    return as_millis.dt.to_pydatetime()

In [None]:
# Tabulate the survey rows, one named column per field of a result row.
survey_columns = ['net', 'sta', 'loc', 'cha', 'date_utc', 'num_traces']
df_all = pd.DataFrame(np.array(result), columns=survey_columns)

In [None]:
# Derive a plain float column from the obspy dates so that rows can be sorted
# by date and the values used directly as matplotlib x-coordinates.
df_all['date_flt'] = date_plottable = df_all['date_utc'].transform(float)

In [None]:
# Drop rows from earlier historical deployments that reused certain network codes.
# Each entry pairs a network code with the earliest date to keep for that network,
# which removes singular events lying outside the date range of the rest of the
# network.
DATE_FILTER = (
    ('7D', pd.Timestamp(datetime.datetime(2010, 1, 1))),
    ('7G', pd.Timestamp(datetime.datetime(2010, 1, 1))),
)
rows_before = len(df_all)
for net_code, earliest in DATE_FILTER:
    cutoff = earliest.timestamp()
    too_early = (df_all['net'] == net_code) & (df_all['date_flt'] < cutoff)
    df_all = df_all[~too_early]
rows_after = len(df_all)
print('Removed {} events due to timestamps'.format(rows_before - rows_after))

In [None]:
# Order rows by network, station and channel, then chronologically within each.
sort_keys = ['net', 'sta', 'cha', 'date_flt']
df_all = df_all.sort_values(by=sort_keys)

In [None]:
# Collect every distinct (network, station) pair present in the survey table.
all_pairs = [tuple(key) for key, _ in df_all.groupby(['net', 'sta'])]

In [None]:
def plot_station_uptime(df, netcode, statcode):
    """
    Plot daily data availability for each location/channel of a single station.

    One single-row image strip is drawn per (location, channel) pair, coloured by
    whether any traces were counted for each row of that channel. All strips share
    a common time axis. The figure is left as the current matplotlib figure so the
    caller can save or show it.

    :param df: Survey table with columns 'net', 'sta', 'loc', 'cha', 'date_flt'
        and 'num_traces'; 'date_flt' is treated as seconds since the epoch in UTC
    :type df: pandas.DataFrame
    :param netcode: Network code of the station to plot
    :type netcode: str
    :param statcode: Station code of the station to plot
    :type statcode: str
    :return: None. No figure is created when no rows match the station.
    """
    # Fixed properties
    barprops = dict(aspect='auto', cmap='RdYlGn', interpolation='bilinear', alpha=0.8)

    sta_mask = (df['net'] == netcode) & (df['sta'] == statcode)
    if not np.any(sta_mask):
        return
    df_sta = df.loc[sta_mask]
    # Distinct (location, channel) pairs, in sorted order: one subplot each.
    loc_cha = sorted({(loc, cha) for loc, cha in df_sta[['loc', 'cha']].values})
    num_plots = len(loc_cha)
    fig_height = 1*num_plots + 2
    plt.figure(figsize=(16, fig_height))

    # Make sure we have a common time axis for all subplots. Temporary deployments
    # use the full deployment date range; other networks use the station's own range.
    stn_min_date = df_sta['date_flt'].min()
    stn_max_date = df_sta['date_flt'].max()
    if netcode in TEMP_DEPLOYMENTS:
        deployment_dates = TEMP_DEPLOYMENTS[netcode]
        min_date = float(deployment_dates[0])
        max_date = float(deployment_dates[1])
    else:
        min_date = stn_min_date
        max_date = stn_max_date

    for i, (loc, cha) in enumerate(loc_cha):
        mask = (df_sta['loc'] == loc) & (df_sta['cha'] == cha)
        if num_plots > 1:
            plt.subplot(num_plots, 1, i + 1)
        df_masked = df_sta.loc[mask]
        # 1.0 where at least one trace was counted, 0.0 otherwise.
        data_avail = np.where(df_masked['num_traces'] > 0, 1.0, 0.0)
        im = plt.imshow(data_avail.reshape((1, -1)), **barprops)
        plt.ylabel(cha, fontsize=12)
        # Only the bottom subplot keeps its x tick labels.
        if i < num_plots - 1:
            plt.xticks([])
        plt.yticks([])
        plt.xlim(min_date, max_date)
        # Stretch the image horizontally so that it spans this channel's first to
        # last date on the shared time axis, keeping the vertical extent unchanged.
        extent = im.get_extent()
        im.set_extent((df_masked['date_flt'].iloc[0], df_masked['date_flt'].iloc[-1], extent[2], extent[3]))

    if num_plots > 1:
        title_func = plt.suptitle
    else:
        title_func = plt.title
    title_func('.'.join([netcode, statcode]) + ' [green=up, red=down]', fontsize=16)

    # Add explicit ticks at the axis limits and, where they differ from those
    # limits, at the station's own first and last dates.
    xticks = [min_date]
    if stn_min_date != min_date:
        xticks.extend([stn_min_date])
    xticks.extend(plt.xticks()[0])
    if stn_max_date != max_date:
        xticks.extend([stn_max_date])
    xticks.extend([max_date])
    xticks = sorted(xticks)
    plt.xticks(xticks)
    plt.xlim(min_date, max_date)
    plt.gcf().autofmt_xdate()

    def _format_utc_date(tick_value, _tick_index):
        # The x-values are UTC timestamps, so they are formatted with gmtime:
        # localtime would shift labels according to the machine's timezone.
        return time.strftime("%Y-%m-%d", time.gmtime(tick_value))

    plt.gca().xaxis.set_major_formatter(mtick.FuncFormatter(_format_utc_date))

In [None]:
# When SAVE_FIGS is set, each station's plot is written to a PNG file in
# `output_folder`; otherwise it is shown interactively.
SAVE_FIGS = True
output_folder = 'survey'

if not os.path.exists(output_folder):
    os.makedirs(output_folder)

pbar = tqdm(total=len(all_pairs))
for net_code, sta_code in all_pairs:
    full_code = '.'.join([net_code, sta_code])
    pbar.update()
    pbar.set_description(full_code)
    plot_station_uptime(df_all, net_code, sta_code)
    if not SAVE_FIGS:
        plt.show()
        continue
    outfile = os.path.join(output_folder, full_code + "_survey.png")
    plt.savefig(outfile, dpi=300)
    # Close the figure once saved so figures do not accumulate across stations.
    plt.close()
pbar.close()