# Examine an event catalog and visualize how events from selected networks are represented

In [None]:
import os
import sys
import math
import numpy as np
import pandas as pd
# Widen pandas' display limits so large pick tables render in full in the notebook.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', None)
# None means "never truncate cell contents". The legacy sentinel -1 has been
# deprecated since pandas 1.0 and is rejected by later releases.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', 240)

In [None]:
import datetime
import matplotlib
import matplotlib.pyplot as plt
import scipy
from scipy.stats import kde
# Default to large 16:9 figures and raise the threshold at which matplotlib
# warns about the number of simultaneously open figures.
matplotlib.rcParams.update({
    'figure.figsize': (16.0, 9.0),
    'figure.max_open_warning': 100,
})

In [None]:
# Progress bar helper to indicate that slow tasks have not stalled
from tqdm.auto import tqdm

In [None]:
import requests

In [None]:
# Import from the public IPython.display module; pulling these helpers out of
# IPython.core.display is deprecated.
from IPython.display import display, HTML
# Let notebook cells use 90% of the browser width so wide tables and plots fit.
display(HTML("<style>.container { width:90% !important; }</style>"))

In [None]:
# Location of the pick catalog file read by this notebook.
PICKS_PATH = r"C:\data_cache\Picks\20190320\ensemble.p.txt"

# Explicit column types handed to pandas when parsing the catalog, so that
# identifier columns stay as Python objects and numeric columns get a fixed
# width. Columns are listed in the same order as before.
dtype = dict((
    ('#eventID', object),
    ('originTimestamp', np.float64),
    ('mag', np.float64),
    ('originLon', np.float64),
    ('originLat', np.float64),
    ('originDepthKm', np.float64),
    ('net', object),
    ('sta', object),
    ('cha', object),
    ('pickTimestamp', np.float64),
    ('phase', object),
    ('stationLon', np.float64),
    ('stationLat', np.float64),
    ('az', np.float64),
    ('baz', np.float64),
    ('distance', np.float64),
    ('ttResidual', np.float64),
    ('snr', np.float64),
    ('qualityMeasureCWT', np.float64),
    ('domFreq', np.float64),
    ('qualityMeasureSlope', np.float64),
    ('bandIndex', np.int64),
    ('nSigma', np.int64),
))

In [None]:
# Load the space-delimited pick catalog (first line is the header row).
# The separator is passed by keyword: every read_csv argument after the path
# is keyword-only in current pandas, so the positional form no longer works.
df_raw_picks = pd.read_csv(PICKS_PATH, sep=' ', header=0, dtype=dtype)
# Cell output: number of records loaded.
len(df_raw_picks)

In [None]:
# Query time period for source dataset
import obspy

# Earliest and latest event origin time found in the loaded catalog.
origin_ts = df_raw_picks['originTimestamp']
start_time = obspy.UTCDateTime(origin_ts.min())
end_time = obspy.UTCDateTime(origin_ts.max())
print((str(start_time), str(end_time)))

In [None]:
def filterNetworkDataframe(df_all, netcode):
    """Return the rows of ``df_all`` whose 'net' column equals ``netcode``.

    :param df_all: Table containing a 'net' column.
    :param netcode: Network code to keep.
    :return: The matching rows, with their original index labels.
    """
    return df_all[df_all['net'] == netcode]

In [None]:
# Network code whose stations are examined in the cells that follow.
TARGET_NET = 'AU'
# Restrict the full pick table to that network.
df_net = filterNetworkDataframe(df_all=df_raw_picks, netcode=TARGET_NET)

In [None]:
# Sorted list of the distinct station codes present in the selected network.
STATIONS = sorted(df_net['sta'].unique())
# Cell output: number of distinct stations.
len(STATIONS)

## Determine record count per station and plot

In [None]:
# Count the records per station code. We're mainly interested in stations
# with a low record count.
# NOTE(review): the count is taken on df_net, i.e. within the selected network
# only. Records of the same station code filed under another network code
# (e.g. II or IR) are NOT included; count on df_raw_picks instead if the
# cross-network total is what is wanted.
df = df_net
# One vectorized pass over the table instead of rescanning it per station.
sta_counts = df['sta'].value_counts().reindex(STATIONS, fill_value=0)
record_count = sta_counts.to_dict()

In [None]:
# Tabulate the per-station record counts, one row per station.
df = pd.DataFrame({'sta': list(record_count.keys()),
                   'count': list(record_count.values())},
                  columns=['sta', 'count'])

In [None]:
def plotStationIncidence(df_, subplots=3, title=None, savefile=None):
    """Draw per-station record counts as horizontal bars in side-by-side panels.

    The rows of ``df_`` are split, in their given order, into ``subplots``
    consecutive chunks, one chunk per panel. All panels share the same x-axis
    range so bar lengths are comparable across panels.

    :param df_: Table with a 'sta' column (bar labels) and a 'count' column (bar lengths).
    :param subplots: Number of panels to spread the rows over.
    :param title: Optional figure-level title.
    :param savefile: Optional file name; when given, the figure is saved there at 200 dpi.
    """
    plt.figure(figsize=(32, 12))
    rows_per_panel = int(math.ceil(len(df_) / float(subplots)))
    max_count = df_['count'].max()
    for panel in range(subplots):
        first_row = panel * rows_per_panel
        # Reverse each chunk so that its first row is drawn as the topmost bar.
        chunk = df_.iloc[first_row:first_row + rows_per_panel, :].iloc[::-1]
        plt.subplot(1, subplots, panel + 1)
        plt.barh(chunk['sta'], chunk['count'])
        plt.xlim((0, max_count))
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=12)
        plt.xlabel("Record Count", fontsize=16)
        # Only the leftmost panel carries the y-axis label.
        if panel == 0:
            plt.ylabel("Station Code", fontsize=16)
        plt.gca().xaxis.grid(color="#80808080", linestyle="--")
    if title is not None:
        plt.suptitle(title, fontsize=20, y=0.92)
    if savefile is not None:
        plt.savefig(savefile, dpi=200)

In [None]:
# Two orderings of the same table: by descending record count, and by station code.
df_count = df.sort_values(by='count', ascending=False)
df_sta = df.sort_values(by='sta')

In [None]:
# Stations ranked by record count. To also write the figure to disk, pass
# savefile='{}_station_incidence_pwave_event_ensemble_SORTED'.format(TARGET_NET).
sorted_title = "Sorted incidence of {} station records in ensemble.p.txt".format(TARGET_NET)
plotStationIncidence(df_count, title=sorted_title)

In [None]:
# Stations in alphabetical order. To also write the figure to disk, pass
# savefile='{}_station_incidence_pwave_event_ensemble_ALPHA'.format(TARGET_NET).
alpha_title = "Alphabetic incidence of {} station records in ensemble.p.txt".format(TARGET_NET)
plotStationIncidence(df_sta, title=alpha_title)

## Determine date range per station and plot

In [None]:
# Earliest and latest event origin timestamp per station, as
# {station code: (min timestamp, max timestamp)}.
df = df_net
# Aggregate in one grouped pass instead of masking the full table once per
# station. Stations without any valid timestamp reduce to NaN and are dropped.
origin_span = (df.groupby('sta')['originTimestamp']
                 .agg(['min', 'max'])
                 .reindex(STATIONS)
                 .dropna())
date_record = {sta_code: (first_ts, last_ts)
               for sta_code, first_ts, last_ts in origin_span.itertuples(name=None)}

In [None]:
# One row per station holding its first and last event origin timestamp.
df_date = pd.DataFrame(
    [(sta_code, span[0], span[1]) for sta_code, span in date_record.items()],
    columns=['sta', 'start_date', 'end_date'])

In [None]:
# Add human-readable (obspy UTCDateTime) counterparts of the raw timestamp columns.
for raw_col in ('start_date', 'end_date'):
    df_date[raw_col + '_hr'] = df_date[raw_col].apply(obspy.UTCDateTime)

In [None]:
# Cell output: number of stations that have a valid date range.
df_date.shape[0]

In [None]:
def pandasTimestampToPlottableDatetime(data):
    """Convert epoch-second timestamps to day-resolution datetimes for plotting.

    :param data: pandas Series of timestamps in seconds since the Unix epoch.
    :return: Series with the same index holding naive UTC datetimes truncated
        to midnight of their day. Missing timestamps come back as NaT.
    """
    # Truncate with .dt.floor rather than casting to 'datetime64[D]': pandas 2.x
    # only accepts s/ms/us/ns resolutions in Series.astype. to_datetime also
    # replaces the per-element datetime.utcfromtimestamp call, which is
    # deprecated since Python 3.12 and raises on NaN.
    return pd.to_datetime(data, unit='s').dt.floor('D')

In [None]:
# Plot one horizontal bar per station, spanning the dates of its first and
# last catalogued event. Only stations that have valid dates are in df_date.
# TODO: Change the bar colors to reflect the network codes.
df_plot = df_date[::-1]
start = pandasTimestampToPlottableDatetime(df_plot['start_date']).values
end = pandasTimestampToPlottableDatetime(df_plot['end_date']).values
# Bar length expressed in days.
dur = (end - start)/np.timedelta64(1, 'D')

plt.figure(figsize=(24, 32))
plt.barh(df_plot['sta'], dur, left=start)
time_formatter = matplotlib.dates.DateFormatter("%Y-%m-%d")
years = matplotlib.dates.YearLocator()   # every year
months = matplotlib.dates.MonthLocator()  # every month
# Configure the axes that already hold the bars. plt.gca() is required here:
# an argument-less plt.axes() adds a NEW, empty full-window axes on current
# matplotlib versions, so the tick settings would end up on the wrong axes.
ax = plt.gca()
ax.xaxis.set_major_formatter(time_formatter)
ax.xaxis.set_major_locator(years)
ax.xaxis.set_minor_locator(months)
plt.xlabel("Date range", fontsize=16)
plt.ylabel("Station Code", fontsize=16)
ax.xaxis.grid(color="#80808080", linestyle="--")
ax.yaxis.grid(color="#80808020", linestyle=":")
plt.title("Record date ranges per station in ensemble.p.txt", fontsize=20)
# Overall time span covered by the plotted stations, annotated in the top-left corner.
time_range = (datetime.datetime.utcfromtimestamp(df_plot['start_date'].min()), datetime.datetime.utcfromtimestamp(df_plot['end_date'].max()))
plt.text(0.01, 0.99, "Channel selection: ALL", transform=ax.transAxes, fontsize=12)
plt.text(0.01, 0.98, "Start date: {}".format(str(time_range[0])), transform=ax.transAxes, fontsize=12)
plt.text(0.01, 0.97, "  End date: {}".format(str(time_range[1])), transform=ax.transAxes, fontsize=12)
plt.xticks(fontsize=14, rotation=30, horizontalalignment='right')
# Manual toggle: change to True to write the figure to disk.
if False:
    plt.savefig('{}_station_dates_pwave_event_ensemble.png'.format(TARGET_NET), dpi=200)

## Examine the frequency of station events as a function of time

In [None]:
def plotStationEventFrequency(df, sta_code, save_file=False):
    """Plot how many catalog records a station has per calendar month.

    Two curves share one figure: the raw number of records per month, and a
    Gaussian kernel density estimate (KDE) of the origin times, rescaled so
    that its peak equals the highest monthly count. The KDE curve is omitted
    when the station has fewer than two distinct origin times.

    :param df: Table with at least 'sta' and 'originTimestamp' columns;
        timestamps are interpreted as seconds since the Unix epoch.
    :param sta_code: Station code to select from ``df``.
    :param save_file: If True, save the figure to
        event_freq/Record_freq_<sta_code>.png and close it; otherwise show it.
    """
    # Local import of the public name; the scipy.stats.kde namespace is deprecated.
    from scipy.stats import gaussian_kde

    # Work on a standalone, chronologically sorted Series. Nothing is assigned
    # onto a slice of the caller's table, which avoids SettingWithCopyWarning.
    origin_times = df.loc[df['sta'] == sta_code, 'originTimestamp'].dropna().sort_values()
    if origin_times.empty:
        print("Nothing to plot for {}".format(sta_code))
        return

    # Count records per calendar month. "%Y-%m" labels sort chronologically.
    year_month = pd.to_datetime(origin_times, unit='s').dt.strftime("%Y-%m")
    monthly_counts = year_month.value_counts().sort_index()
    month_times = [datetime.datetime.strptime(label, "%Y-%m") for label in monthly_counts.index]

    # The two curves have different x-values (per record vs. per month), so
    # they are simply drawn on the same date axis.
    plt.figure(figsize=(16, 9))
    # gaussian_kde needs at least two distinct samples; with fewer, the
    # covariance is singular and it raises, which would abort a batch run.
    if origin_times.nunique() > 1:
        density = gaussian_kde(origin_times.values)(origin_times.values)
        kde_scaled = density / np.max(density) * monthly_counts.max()
        plt.plot(pandasTimestampToPlottableDatetime(origin_times), kde_scaled, '--',
                 color='C0', alpha=0.6, linewidth=2, label='KDE')
    plt.plot(month_times, monthly_counts.values, color='C1', alpha=0.8, linewidth=2,
             label='Raw monthly event count')

    # Configure the axes that hold the curves. An argument-less plt.axes()
    # would add a new empty axes on current matplotlib versions.
    ax = plt.gca()
    ax.xaxis.set_major_formatter(matplotlib.dates.DateFormatter("%Y-%m"))
    ax.xaxis.set_major_locator(matplotlib.dates.YearLocator())    # every year
    ax.xaxis.set_minor_locator(matplotlib.dates.MonthLocator())   # every month
    plt.xticks(rotation=45)
    # Anchor the y-axis at zero while keeping the automatic upper limit.
    plt.ylim((0, plt.ylim()[1]))
    plt.grid(color='#80808080', linestyle=':')
    plt.xlabel('Year and month', fontsize=14)
    plt.ylabel('Number of events', fontsize=14)
    plt.xticks(fontsize=14, horizontalalignment='right', verticalalignment='top')
    plt.yticks(fontsize=14)
    plt.legend(fontsize=12)
    plt.text(0.01, 0.96, 'Event source: ' + PICKS_PATH, transform=ax.transAxes, fontsize=10, alpha=0.7)
    plt.title('Catalogued event count by month for station {}'.format(sta_code), fontsize=18)
    if save_file:
        subfolder = 'event_freq'
        os.makedirs(subfolder, exist_ok=True)
        fname = os.path.join(subfolder, 'Record_freq_{}.png'.format(sta_code))
        plt.savefig(fname, dpi=150)
        plt.close()
    else:
        plt.show()

In [None]:
# TODO: Change count plots to be vertical bar charts, stacked with the contributions from each network with the given station code.
# The full pick table (all networks) is passed, so records of a station code
# are included regardless of the network code they are filed under.
# The context manager closes the progress bar when the loop ends, and the bar
# is advanced only after a station has actually been processed.
with tqdm(total=len(STATIONS)) as pbar:
    for sta in STATIONS:
        plotStationEventFrequency(df_raw_picks, sta, save_file=False)
        pbar.update()

## Determine the date ranges for AU permanent stations belonging to different network codes

In [None]:
def printStationsNetworkDates(df, sta_code):
    """Print the origin-time range of one station code under each network code.

    A separator line is printed first, followed by one line of the form
    "NET.STA: first -- last" per network code. Networks are listed in the
    order in which they first appear in ``df``. Nothing is printed when the
    station code does not occur in ``df``.

    :param df: Table with 'sta', 'net' and 'originTimestamp' columns.
    :param sta_code: Station code to report on.
    """
    df_sta = df.loc[df['sta'] == sta_code]
    # Reduce the raw float timestamps first and convert only the two extremes
    # per network, rather than building a UTCDateTime object for every row.
    span_by_net = df_sta.groupby('net', sort=False)['originTimestamp'].agg(['min', 'max'])
    if span_by_net.empty:
        return
    print('-------------------------------------------------------------------')
    for net_code, first_ts, last_ts in span_by_net.itertuples(name=None):
        min_date = obspy.UTCDateTime(first_ts)
        max_date = obspy.UTCDateTime(last_ts)
        print('{}: {} -- {}'.format('.'.join([net_code, sta_code]), min_date, max_date))

In [None]:
# Sort the selected network's records by origin time, then report the date
# range per network code for every station.
df_net_chrono = df_net.sort_values(by='originTimestamp')
for sta in STATIONS:
    printStationsNetworkDates(df=df_net_chrono, sta_code=sta)