In [None]:
import os
import pickle as pkl
import base64
import json

import numpy as np
import rf
import rf.imaging
import matplotlib.pyplot as plt
import scipy
from scipy import signal
import obspy
import seaborn as sns
import pandas as pd
from tqdm.auto import tqdm

In [None]:
# Bring in interactive widgets capability. See https://towardsdatascience.com/interactive-controls-for-jupyter-notebooks-f5c94829aee6
import ipywidgets as widgets
from ipywidgets import interact, interact_manual

In [None]:
import seismic.receiver_fn.rf_util as rf_util
import seismic.receiver_fn.rf_plot_utils as rf_plot_utils

## Read source file

In [None]:
# Input HDF5 file holding the event waveforms. The path is assembled with os.path.join
# so that the data directory resolves on POSIX systems as well as on Windows.
src_file = os.path.join("..", "DATA", "OA_event_waveforms_for_rf_20170911T000036-20181128T230620_ZRT_td_rev3_qual.h5")
# src_file = os.path.join("..", "DATA", "OA_event_waveforms_for_rf_20170911T000036-20181128T230620_LQT_td_rev3_qual.h5")
# Station code whose traces are loaded, labelled and used for training below.
training_station = 'BT23'

In [None]:
# Read the contents of src_file through rf_util.read_h5_rf, restricted to network 'OA'
# and to the training station selected above.
oa_all = rf_util.read_h5_rf(src_file, network='OA', station=training_station)

In [None]:
# Display the type of the container returned by read_h5_rf (inspection only).
type(oa_all)

## Select training data

In [None]:
# Channel code of the traces used for the rest of the notebook.
channel = 'HHR'
# channel = 'HHQ'

# Convert the loaded data to a dictionary keyed by station code, then keep only the
# training station's entry (itself indexed by channel code further down).
db = rf_util.rf_to_dict(oa_all)
oa_trainer = db[training_station]

In [None]:
# Number of traces available for the selected channel of the training station.
len(oa_trainer[channel])

## Add additional statistics for prediction of trace quality

In [None]:
# Run rf_util.compute_extra_rf_stats on the training station data.
# NOTE(review): presumably this adds the 'amp_20pc', 'amp_80pc', 'mean_cplx_amp' and 'rms_amp'
# trace stats that are read as "extra metrics" later in this notebook - confirm in rf_util.
rf_util.compute_extra_rf_stats(oa_trainer)

## Display ranges of metadata and quality metrics

In [None]:
def get_metadata_series(traces, field):
    """Collect a single metadata field from every trace.

    :param traces: Iterable of traces, each exposing a ``stats`` object with a ``get`` method.
    :param field: Name of the stats entry to read.
    :return: List holding ``stats.get(field)`` for each trace, in iteration order.
    """
    return [trace.stats.get(field) for trace in traces]

In [None]:
# Extract metadata and quality data on all traces for the target channel
channel_data = oa_trainer[channel]

# One list per stats field, in trace order. The field names in the tuple line up
# one-to-one with the variable names on the left-hand side.
(snr, entropy, coherence, distance, inclination, magnitude, depth, amax,
 amp_20pc, amp_80pc, mean_cplx_amp, rms_amp) = [
    get_metadata_series(channel_data, field)
    for field in ('snr', 'entropy', 'max_coherence', 'distance', 'inclination',
                  'event_magnitude', 'event_depth', 'amax',
                  # Extra metrics
                  'amp_20pc', 'amp_80pc', 'mean_cplx_amp', 'rms_amp')
]

# Group IDs, with missing (None) group IDs replaced by -1
rf_group = [-1 if g is None else g
            for g in get_metadata_series(channel_data, 'rf_group')]



In [None]:
# (values, label) pairs, one per metric, used to plot the histogram distributions.
dist_array = [
    (snr, "SNR"),
    (entropy, "Entropy"),
    (coherence, "Coherence"),
    (distance, "Distance"),
    (inclination, "Inclination"),
    (magnitude, "Magnitude"),
    (amax, "Max amplitude"),
    (amp_20pc, "Amplitude 20th perc."),
    (amp_80pc, "Amplitude 80th perc."),
    (mean_cplx_amp, "Mean amplitude"),
    (rms_amp, "RMS amplitude"),
    (rf_group, "Group ID"),
]
# Alternative without the extra amplitude metrics:
# dist_array = [(snr, "SNR"), (entropy, "Entropy"), (coherence, "Coherence"), (distance, "Distance"),
#               (inclination, "Inclination"), (magnitude, "Magnitude"), (amax, "Max amplitude"), (rf_group, "Group ID")]

In [None]:
# Plot histograms of metrics on a 3x4 grid, one panel per entry of dist_array.
# NOTE: sns.distplot is deprecated in recent seaborn releases; it is kept here so the
# notebook keeps running in the environment it was written for.
plt.figure(figsize=(20, 12))
plt.subplot(3, 4, 1)
for panel, (values, label) in enumerate(dist_array, start=1):
    ax = plt.subplot(3, 4, panel)
    sns.distplot(values, bins=20, ax=ax)
    # y < 1 places the title inside the top of the panel
    plt.title(label + " distribution", y=0.88, fontweight='bold')
plt.show()

In [None]:
# Examine co-plots to look for discriminating variables.
# Numeric metrics are stored as-is. Event attributes are reduced to two string categories:
# each such column starts out as the upper bin (scalar broadcast to every row) and is then
# overwritten with the lower bin wherever the value is below the threshold.
df = pd.DataFrame({
    "SNR": snr,
    "Entropy": entropy,
    "Coherence": coherence,
    "Max_amp": amax,
    "Amp_20pc": amp_20pc,
    "Amp_80pc": amp_80pc,
    "RMS_amp": rms_amp,
    "Mean_amp": mean_cplx_amp,
    "Magnitude": ">=6",
    "Distance": ">=60",
    "Depth": ">=80km",
    "Inclination": ">=20",
    "Group_id": rf_group,
    # Manual label; filled in from file or interactively further down
    "Quality": "unknown",
})
df.loc[np.asarray(magnitude) < 6.0, "Magnitude"] = "<6"
df.loc[np.asarray(distance) < 60.0, "Distance"] = "<60"
df.loc[np.asarray(depth) < 80.0, "Depth"] = "<80km"
df.loc[np.asarray(inclination) < 20.0, "Inclination"] = "<20"

In [None]:
# Reload previously saved manual quality labels, if any, so labelling can resume
# from where it was left off.
qual_file = training_station + "_quality_labels_ZRT.csv"
# qual_file = training_station + "_quality_labels_LQT.csv"
if os.path.isfile(qual_file):
    # The file is read as header-less CSV: first column is the row index, second is the label.
    loaded_quality = pd.read_csv(qual_file, index_col=0, header=None)
    # Take the label column as a Series and align it explicitly on df's index. Rows that are
    # absent from the file are marked 'unknown' rather than left as NaN, because the rest of
    # the notebook recognises unlabelled rows only by the string 'unknown'.
    df['Quality'] = loaded_quality.iloc[:, 0].reindex(df.index).fillna('unknown')

### Use interactive widget to manually label the quality of the traces

In [None]:
print("Quality guide:")
print("'a' = low signal before onset, higher signal after onset with some multiples visible")
print("'b' = signal similar before and after onset, cannot make out multiples with much confidence")
print("Create labels by entering 10 character string of 'a's and 'b's according to quality, ordered from bottom to top trace.")
# Create labels for quality. Note that rf plots are numbered from the bottom up, whereas the Pandas table is displayed ordered from the top down.
# Traces are presented in batches of 10. Batches without any 'unknown' label are skipped,
# and entering 'quit' stops the session while keeping the labels entered so far.
quality_updated = False
# Positional index of the label column, so labels can be written with a single .iloc call
# on df itself (a chained df['Quality'].iloc[...] = ... may write to a temporary copy).
quality_col = df.columns.get_loc('Quality')
for i in range(0, len(df), 10):
    existing_qual = df['Quality'].iloc[i:i+10].values
    if 'unknown' not in existing_qual:
        continue
    rf_slice = rf.RFStream(channel_data[i:i+10])
    rf_plot_utils.plot_rf_stack(rf_slice, trace_height=0.4)
    plt.show()
    get_labels = ''
    stop_labelling = False
    # Keep prompting until one valid label per displayed trace has been entered
    while len(get_labels) != len(rf_slice):
        get_labels = input("Enter labels: ")
        if get_labels.lower() == 'quit':
            stop_labelling = True
            break
        if len(get_labels) != len(rf_slice):
            print("Wrong number of labels, try again!")
        elif set(get_labels) - {'a', 'b'}:
            # Only the two categories of the quality guide are meaningful downstream
            print("Labels must be 'a' or 'b', try again!")
            get_labels = ''
    if stop_labelling:
        break
    df.iloc[i:i+len(get_labels), quality_col] = list(get_labels)
    quality_updated = True
    display(df.iloc[i:i+10])

if quality_updated:
    # Written without a header row, matching how the label file is read back (header=None)
    df['Quality'].to_csv(qual_file, header=False)
else:
    # Nothing new was labelled: show a reproducible random sample of the table instead
    display(df.sample(min(20, len(df)), random_state=0))

In [None]:
# Assign quality category to trace metadata. Row k of df corresponds to trace k of channel_data.
quality_labels = df['Quality']
for position, trace in enumerate(channel_data):
    trace.stats.quality = quality_labels.iloc[position]

### Plot labelled data to find metrics to discriminate trace quality

In [None]:
# Names of the numeric df columns used both as the pair-plot variables and as the
# feature vector of the classifier trained later in this notebook.
stats_metrics = ["SNR", "Entropy", "Coherence", "Max_amp", "Amp_20pc", "Amp_80pc", "RMS_amp", "Mean_amp"]

In [None]:
def metrics_pairplot(df, plot_vars, hue_by='Quality', title=''):
    """Draw a seaborn pair plot of the given metrics, coloured by a categorical column.

    :param df: DataFrame containing the columns named in ``plot_vars`` and the ``hue_by`` column.
    :param plot_vars: List of column names to plot against each other.
    :param hue_by: Name of the column used to colour the points.
    :param title: Figure title.

    For the 'Quality' and 'Prediction' columns the hue levels are given a fixed order so
    that colours stay consistent between plots; 'unknown' is included as a level only when
    it actually occurs in the hue column. Other columns use seaborn's default ordering.
    """
    hue_order = None
    if hue_by in ('Quality', 'Prediction'):
        # Test the column *values*: the `in` operator on a pandas Series checks the index
        # labels, so it would never find the string 'unknown'.
        has_unknown = (df[hue_by] == 'unknown').any()
        hue_order = ['unknown', 'b', 'a'] if has_unknown else ['b', 'a']
    sns.pairplot(df, hue=hue_by, hue_order=hue_order, vars=plot_vars)
    plt.suptitle(title, y=1.01, fontsize=20)

In [None]:
@interact_manual
def _metrics_pairplot(hue_by=['Quality', 'Magnitude', 'Distance', 'Depth', 'Inclination', 'Group_id']):
    """Widget callback: pair plot of the quality metrics, coloured by the selected column.

    The list default is not used as a value; ipywidgets turns it into a dropdown of choices.
    """
    metrics_pairplot(df, stats_metrics, hue_by=hue_by,
                     title="Pairwise quality metrics scatter plot")

## Try to manually select metadata metrics for filtering to the Quality A set of events

In [None]:
# Compare two hand-tuned threshold filters against the manual quality 'a' labels.
# For each filter the traces that pass are the "predicted positives"; reported are the
# positive predictive value (share of passed traces that are quality 'a') and the false
# omission rate (share of rejected traces that are quality 'a').
num_total = len(channel_data)


def _as_percent(numerator, denominator):
    """Return 100*numerator/denominator, or NaN if the denominator is zero.

    A zero denominator occurs when a filter passes no traces (PPV) or all traces (FOR).
    """
    return 100.0*numerator/denominator if denominator else float('nan')


# Manually labelled quality 'a' traces, ordered by back azimuth
rf_data = [tr for tr in channel_data if tr.stats.quality == 'a']
rf_data = sorted(rf_data, key=lambda v: v.stats.back_azimuth)
rf_stream_A = rf.RFStream(rf_data)
print("Quality A: {} events".format(len(rf_stream_A)))
quality_A_ids = [tr.stats.event_id for tr in rf_stream_A]
not_quality_A_ids = [tr.stats.event_id for tr in channel_data if tr.stats.event_id not in quality_A_ids]

# Filter 1: thresholds on SNR, entropy and coherence
rf_data = [tr for tr in channel_data if tr.stats.snr >= 2.0 and tr.stats.entropy >= 3.5 and tr.stats.max_coherence >= 0.25]
rf_data = sorted(rf_data, key=lambda v: v.stats.back_azimuth)
rf_stream_stats_filtered = rf.RFStream(rf_data)
num_filtered = len(rf_stream_stats_filtered)
print("Stats filtered: {} events".format(num_filtered))
stats_filtered_ids = [tr.stats.event_id for tr in rf_stream_stats_filtered]
true_positives = [ev_id for ev_id in stats_filtered_ids if ev_id in quality_A_ids]
false_negatives = [ev_id for ev_id in quality_A_ids if ev_id not in stats_filtered_ids]
num_true_positive = len(true_positives)
num_false_negative = len(false_negatives)
num_predicted_positive = len(stats_filtered_ids)
num_predicted_negative = num_total - num_predicted_positive

# Determine how many of the events in stats_filtered_ids are Quality A events
print("{}/{} correct filtered events (snr, entropy, coherence) (Positive predictive value = {:.2f}%, False omission rate = {:.2f}%)"
      .format(num_true_positive, num_filtered, _as_percent(num_true_positive, num_predicted_positive),
              _as_percent(num_false_negative, num_predicted_negative)))

# Filter 2: repeat using amplitude metrics
rf_data = [tr for tr in channel_data if tr.stats.amax <= 0.3 and tr.stats.amp_20pc <= 0.03 and tr.stats.amp_80pc <= 0.1]
rf_data = sorted(rf_data, key=lambda v: v.stats.back_azimuth)
rf_stream_stats2_filtered = rf.RFStream(rf_data)
num2_filtered = len(rf_stream_stats2_filtered)
print("Stats2 filtered: {} events".format(num2_filtered))
stats2_filtered_ids = [tr.stats.event_id for tr in rf_stream_stats2_filtered]
true_positives = [ev_id for ev_id in stats2_filtered_ids if ev_id in quality_A_ids]
false_negatives = [ev_id for ev_id in quality_A_ids if ev_id not in stats2_filtered_ids]
num_true_positive = len(true_positives)
num_false_negative = len(false_negatives)
num_predicted_positive = len(stats2_filtered_ids)
num_predicted_negative = num_total - num_predicted_positive

print("{}/{} filtered events (Max. amp, 20%, 80%) are quality A events (Positive predictive value = {:.2f}%, False omission rate = {:.2f}%)"
      .format(num_true_positive, num2_filtered, _as_percent(num_true_positive, num_predicted_positive),
              _as_percent(num_false_negative, num_predicted_negative)))

# The performance stats printed above show what a human achieves trying to tune data selection criteria manually.

## See how well a neural network classifier works in comparison

### Use simple stats for feature vector

In [None]:
from sklearn.neural_network import MLPClassifier
# from sklearn import preprocessing
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix

In [None]:
# Build the classifier inputs from the manually labelled rows only.
known_quality_mask = (df['Quality'] != 'unknown')
# Feature matrix: the stats_metrics columns, with missing values replaced by 0.
# fillna returns a new frame, so df itself is left untouched and no in-place write is made
# on a slice of df.
X = df.loc[known_quality_mask, stats_metrics].fillna(0)
# Target labels ('a' / 'b') for the same rows
y = df.loc[known_quality_mask, 'Quality']

In [None]:
# Hold out 40% of the labelled samples as a test set; random_state fixes the split
# so that it is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

In [None]:
# scaler = preprocessing.StandardScaler().fit(X_train)
# X_train_transformed = scaler.transform(X_train)

In [None]:
# This perceptron network has been simplified back to the bare bones so that it corresponds to a linear predictor,
# as higher order complexity and non-linear activation functions gave no improvement in accuracy.
# It has a single hidden layer with one neuron and identity activation; random_state is fixed for reproducibility.
clf_simple = MLPClassifier(solver='lbfgs', alpha=1e-4, max_iter=1000, activation='identity',
                           hidden_layer_sizes=(1,), random_state=3772, tol=1e-4)

In [None]:
# Run 5-fold cross-validation on the training split to tune hyperparameters.
# The test split is not touched here.
scores = cross_val_score(clf_simple, X_train, y_train, cv=5)
print(scores)
# Report the mean fold score, with a spread of two standard deviations across folds
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std()*2))

In [None]:
# With tuned hyperparameters, train on full training set.
clf_simple.fit(X_train, y_train)
# loss_ is the loss value recorded by the fitted classifier
print("Final loss: %0.4f" % clf_simple.loss_)

In [None]:
# Score the trained classifier on the held-out test split
final_score = clf_simple.score(X_test, y_test)
print("Final accuracy: %0.3f" % final_score)
# We get decent performance with a trivial network (1 neuron) with trivial activation f(x) = x,
# which means that simply a linear combination of the feature vector is sufficient to determine
# classification.

### Apply quality classifier to whole dataset

In [None]:
# Predict the quality class of every labelled row, together with the probability the
# classifier assigns to the class it predicted (stored as 'Confidence').
full_prediction = clf_simple.predict(X)
confidences = clf_simple.predict_proba(X)
# The columns of predict_proba follow clf_simple.classes_, so the column of each predicted
# label is looked up there instead of assuming a fixed 'a' -> 0, 'b' -> 1 order.
# The builtin int replaces the np.int alias, which has been removed from NumPy.
class_columns = {label: col for col, label in enumerate(clf_simple.classes_)}
confidence_index = np.array([class_columns[label] for label in full_prediction], dtype=int)
df.loc[known_quality_mask, 'Prediction'] = full_prediction
df.loc[known_quality_mask, 'Confidence'] = confidences[np.arange(confidence_index.size), confidence_index]

In [None]:
# Show the order of metrics so that weightings can be interpreted in relation to metrics
print(stats_metrics)

# Display the coefficients of the trained classifier, keeping the values needed to
# reproduce the prediction by hand.
# Hidden layer: weight vector of its first neuron (one weight per metric) and its bias
A0 = clf_simple.coefs_[0].T[0]
b0 = clf_simple.intercepts_[0][0]
print("Hidden layer:")
print("  weightings: {}".format(A0))
print("        bias: {}".format(clf_simple.intercepts_[0]))

# Output layer: weight applied to the first hidden neuron's output, and the output bias
A1 = clf_simple.coefs_[1][0][0]
b1 = clf_simple.intercepts_[1][0]
print("Output layer:")
print("  weightings: {}".format(clf_simple.coefs_[1][0]))
print("        bias: {}".format(clf_simple.intercepts_[1]))

In [None]:
# Apply the two affine layers by hand: hidden layer (A0, b0) followed by output layer (A1, b1).
# The result is a single scalar per labelled row, stored alongside the predictions.
prediction_metric = A1 * (X.values @ A0 + b0) + b1
df.loc[known_quality_mask, 'Prediction metric'] = prediction_metric
# Show a reproducible random sample of the table
df.sample(20, random_state=3772)

In [None]:
# Plot whole prediction dataset
@interact_manual
def _metrics_pairplot(hue_by=['Prediction', 'Quality']):
    """Widget callback: pair plot of all rows, coloured by predicted or manual quality.

    The list default is not used as a value; ipywidgets turns it into a dropdown of choices.
    """
    metrics_pairplot(df, stats_metrics, hue_by=hue_by,
                     title="Pairwise predicted quality scatter plot - full data")

In [None]:
# Plot prediction just on test dataset
@interact_manual
def _metrics_pairplot(hue_by=['Prediction', 'Quality']):
    """Widget callback: pair plot of the test-split rows, coloured by predicted or manual quality.

    The list default is not used as a value; ipywidgets turns it into a dropdown of choices.
    """
    # X_test.index holds index *labels* of df, so rows are selected with label-based .loc;
    # positional .iloc only gives the same rows while df has a default 0..n-1 index.
    test_rows = df.loc[sorted(X_test.index)]
    metrics_pairplot(test_rows, stats_metrics, hue_by, title="Pairwise predicted quality scatter plot - test data")

In [None]:
# Display confusion matrix and verify how to compute accuracy from it.
# Only manually labelled rows are compared; rows are the manual labels and columns the
# predictions, both in the order ['b', 'a'].
cm = confusion_matrix(df.loc[known_quality_mask, 'Quality'], df.loc[known_quality_mask, 'Prediction'], labels=['b', 'a'])
print(cm)
# Total number of compared samples
print(np.sum(cm))
# Accuracy = correctly classified samples (the diagonal) over all samples
print("Accuracy: %0.3f" % (np.sum(np.diag(cm))/float(np.sum(cm))))

In [None]:
# Look at how good is the DBSCAN grouping as an indicator of trace quality.
# Traces in group 0 are treated as predicted quality 'a', all other groups as 'b'.
primary_group_mask = (df['Group_id'] == 0)
# Build the label series in one step rather than overwriting a copy of the numeric
# Group_id column with strings, which relies on implicit dtype changes.
dbscan_group = pd.Series(np.where(primary_group_mask, 'a', 'b'), index=df.index)
# Rows are the manual labels and columns the group-based labels, both ordered ['b', 'a']
cm_dbscan = confusion_matrix(df.loc[known_quality_mask, 'Quality'], dbscan_group[known_quality_mask], labels=['b', 'a'])
print(cm_dbscan)
print(np.sum(cm_dbscan))
print("Accuracy: %0.3f" % (np.sum(np.diag(cm_dbscan))/float(np.sum(cm_dbscan))))
# Result here indicates DBSCAN grouping is not a strong predictor of subjective trace quality

In [None]:
# Look at how good SNR alone is as an indicator of trace quality.
# Traces with SNR >= 1.5 are treated as predicted quality 'a', all others (including
# missing SNR values, for which the comparison is False) as 'b'.
high_snr_mask = (df['SNR'] >= 1.5)
# Build the label series in one step rather than overwriting a copy of the numeric
# SNR column with strings, which relies on implicit dtype changes.
snr_series = pd.Series(np.where(high_snr_mask, 'a', 'b'), index=df.index)
# Rows are the manual labels and columns the SNR-based labels, both ordered ['b', 'a']
cm_snr = confusion_matrix(df.loc[known_quality_mask, 'Quality'], snr_series[known_quality_mask], labels=['b', 'a'])
print(cm_snr)
print(np.sum(cm_snr))
print("Accuracy: %0.3f" % (np.sum(np.diag(cm_snr))/float(np.sum(cm_snr))))
# Result here indicates SNR alone is quite a good indicator of quality

## Persist classifier model to file

In [None]:
# Serialise the trained classifier to a JSON file. Attributes that are not JSON-friendly are
# pickled and stored as base64 text.
# NOTE: the pickled fields must only ever be unpickled from files that are trusted.
model_file = training_station + "_classifier_ZRT.json"
# model_file = training_station + "_classifier_LQT.json"
model = {
    # Constructor parameters of the classifier
    'params': clf_simple.get_params(),
    # Fitted per-layer weights and biases
    'coeffs': base64.b64encode(pkl.dumps(clf_simple.coefs_, pkl.HIGHEST_PROTOCOL)).decode('utf-8'),
    'biases': base64.b64encode(pkl.dumps(clf_simple.intercepts_, pkl.HIGHEST_PROTOCOL)).decode('utf-8'),
    # Private scikit-learn attribute holding the fitted label binarizer
    'binarizer': base64.b64encode(pkl.dumps(clf_simple._label_binarizer, pkl.HIGHEST_PROTOCOL)).decode('utf-8'),
    'classes': clf_simple.classes_.tolist(),
    'out_activation': clf_simple.out_activation_,
    'n_outputs': clf_simple.n_outputs_,
    'n_layers': clf_simple.n_layers_,
}
with open(model_file, 'w') as f:
    json.dump(model, f, indent=4)