In [None]:
import os
import pickle as pkl

import numpy as np
import rf
import rf.imaging
import matplotlib.pyplot as plt
import scipy
from scipy import signal
import obspy
import seaborn as sns
import pandas as pd
from tqdm.auto import tqdm

In [None]:
# Bring in interactive widgets capability. See https://towardsdatascience.com/interactive-controls-for-jupyter-notebooks-f5c94829aee6
import ipywidgets as widgets
from ipywidgets import interact, interact_manual

In [None]:
import seismic.receiver_fn.rf_util as rf_util
import seismic.receiver_fn.rf_plot_utils as rf_plot_utils

## Choose RF type

In [None]:
# Receiver function variant to work on. The value is substituted into the source file name and
# into the names of the label CSV and pickled classifier written by this notebook; its first
# three characters ('ZRT' or 'LQT') decide which channel codes are searched for below.
rf_type = 'ZRT_td'
# rf_type = 'ZRT_fd'
# rf_type = 'LQT_td'
# rf_type = 'LQT_fd'

## Choose training station

In [None]:
# Station code whose traces are loaded and labelled for training. It is also used as the prefix
# of the label CSV and pickled classifier file names.
# training_station = 'BT23'
training_station = 'BL05'

## Read source file

In [None]:
# Build the path with os.path.join so the notebook is not tied to Windows path separators
# (the original raw strings hard-coded backslashes). On Windows the resulting path is unchanged.
data_dir = os.path.join("..", "DATA")
# src_file = os.path.join(data_dir, "OA_rfs_20170911T000036-20181128T230620_{}_rev5_qual.h5".format(rf_type))
# oa_all = rf_util.read_h5_rf(src_file, network='OA', station=training_station, loc='0M')
src_file = os.path.join(data_dir, "7W_rfs_20080827T000136-20101231T235620_{}_qual.h5".format(rf_type))
# Load only the training station's receiver functions from network 7W.
data_all = rf_util.read_h5_rf(src_file, network='7W', station=training_station)

In [None]:
# Show the container type returned by rf_util.read_h5_rf.
type(data_all)

## Select training data

In [None]:
# Convert the loaded stream to a station-keyed mapping and take the training station's entry.
db = rf_util.rf_to_dict(data_all)
db_trainer = db[training_station]

# Candidate channel codes, keyed by the first three characters of the RF type.
channels_by_prefix = {'ZRT': ['HHR', 'BHR'], 'LQT': ['HHQ', 'BHQ']}
prospective_channels = channels_by_prefix.get(rf_type[0:3], [])

# Use the first candidate that exists for this station; None if no candidate is present.
channel = next((code for code in prospective_channels if code in db_trainer), None)
print("Selected channel: {}".format(channel))

In [None]:
# Number of traces available on the selected channel.
len(db_trainer[channel])

## Display ranges of metadata and quality metrics

In [None]:
def get_metadata_series(traces, field):
    """Collect a single metadata field from every trace.

    :param traces: Iterable of trace objects whose `stats` attribute supports `get(key)`.
    :param field: Key of the stats entry to look up.
    :return: List holding `stats.get(field)` for each trace, in iteration order.
    """
    return [trace.stats.get(field) for trace in traces]

In [None]:
# Gather per-trace metadata and quality metrics for the selected channel.
channel_data = db_trainer[channel]

# Note: m0/m1/m2 use the '*_delta' stats entries; swap in 'm0_ratio', 'm1_ratio', 'm2_ratio'
# below to examine the ratio variants instead.
(snr, entropy, m0, m1, m2, coherence, distance, inclination, magnitude, depth) = (
    get_metadata_series(channel_data, field)
    for field in ('snr', 'entropy', 'm0_delta', 'm1_delta', 'm2_delta', 'max_coherence',
                  'distance', 'inclination', 'event_magnitude', 'event_depth')
)

# Traces with no group assigned get the group ID -1.
rf_group = [-1 if g is None else g for g in get_metadata_series(channel_data, 'rf_group')]

(log10_amax, delta_log10_amp_20pc, delta_log10_amp_80pc,
 delta_mean_log10_cplx_amp, delta_log10_rms_amp) = (
    get_metadata_series(channel_data, field)
    for field in ('log10_amax', 'delta_log10_amp_20pc', 'delta_log10_amp_80pc',
                  'delta_mean_log10_cplx_amp', 'delta_log10_rms_amp')
)


In [None]:
# Pair every metric series with the label shown on its histogram panel.
dist_array = list(zip(
    [snr, entropy, m0, m1, m2,
     coherence, distance, inclination, magnitude, log10_amax,
     delta_log10_amp_20pc, delta_log10_amp_80pc, delta_mean_log10_cplx_amp, delta_log10_rms_amp,
     rf_group],
    ["SNR", "Entropy", "M0", "M1", "M2",
     "Coherence", "Distance", "Inclination", "Magnitude", "Log max amp.",
     "Del20pc log amp.", "Del80pc log amp.", "DelMean log amp.", "DelLog RMS amp.",
     "Group ID"],
))

In [None]:
# Draw one distribution panel per metric on a 3 x 5 grid.
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11; kept here to preserve the output.
fig = plt.figure(figsize=(20, 12))
for panel, (data, name) in enumerate(dist_array, start=1):
    ax = fig.add_subplot(3, 5, panel)
    sns.distplot(data, bins=20, ax=ax)
    # y < 1 places the title inside the top of the panel.
    ax.set_title(name + " distribution", y=0.88, fontweight='bold')
plt.show()

In [None]:
# Build the table of metrics used to look for discriminating variables. The categorical
# columns start at their "at or above threshold" label (scalars are broadcast to every row)
# and are switched below for the traces that fall under each threshold.
df = pd.DataFrame({
    "SNR": snr,
    "Entropy": entropy,
    "M0": m0,
    "M1": m1,
    "M2": m2,
    "Coherence": coherence,
    "Log_max_amp": log10_amax,
    "Del20pc_log_amp": delta_log10_amp_20pc,
    "Del80pc_log_amp": delta_log10_amp_80pc,
    "DelLog_RMS_amp": delta_log10_rms_amp,
    "DelMean_log_amp": delta_mean_log10_cplx_amp,
    "Magnitude": ">=6",
    "Distance": ">=60",
    "Depth": ">=80km",
    "Inclination": ">=20",
    "Group_id": rf_group,
    "Quality": "unknown",
})
for column, values, threshold, below_label in (("Magnitude", magnitude, 6.0, "<6"),
                                               ("Distance", distance, 60.0, "<60"),
                                               ("Inclination", inclination, 20.0, "<20"),
                                               ("Depth", depth, 80.0, "<80km")):
    df.loc[np.array(values) < threshold, column] = below_label

In [None]:
# Restore previously saved manual labels if a label file exists for this station and RF type.
qual_file = training_station + "_quality_labels_{}.csv".format(rf_type)
if os.path.isfile(qual_file):
    print("Loading {}".format(qual_file))
    loaded_quality = pd.read_csv(qual_file, index_col=0, header=None)
    # Take the label column as a Series and align it to the rows of df. Rows that are absent
    # from the file (or have an empty label) are set to 'unknown' rather than left as NaN:
    # NaN would not be offered for labelling, yet would pass the `!= 'unknown'` mask used to
    # select training data.
    df['Quality'] = loaded_quality.iloc[:, 0].reindex(df.index).fillna('unknown')
else:
    df['Quality'] = 'unknown'

### Use an interactive prompt to manually label the quality of the traces

In [None]:
print("Quality guide:")
print("'a' = low signal before onset, higher signal after onset with some multiples visible")
print("'b' = signal similar before and after onset, cannot make out multiples with much confidence")
print("Create labels by entering 10 character string of 'a's and 'b's according to quality, ordered from bottom to top trace.")
# Create labels for quality. Note that rf plots are numbered from the bottom up, whereas the Pandas table is displayed ordered from the top down.
# Traces are presented in batches of 10; batches with no 'unknown' entries are skipped.
# Entering 'quit' stops labelling; labels entered in earlier batches are still saved.
quality_updated = False
# Positional index of the label column, so single cells can be written directly on df.
quality_col = df.columns.get_loc('Quality')
for i in range(0, len(df), 10):
    existing_qual = df['Quality'].iloc[i:i+10].values
    if 'unknown' not in existing_qual:
        continue
    rf_slice = rf.RFStream(channel_data[i:i+10])
    rf_plot_utils.plot_rf_stack(rf_slice, trace_height=0.4, time_window=(-20, 25), fig_width=10.0)
    plt.show()
    get_labels = ''
    quit_requested = False  # renamed from `quit`, which shadowed the builtin
    while len(get_labels) != len(rf_slice):
        get_labels = input("Enter labels: ")
        if get_labels.lower() == 'quit':
            quit_requested = True
            break
        if len(get_labels) != len(rf_slice):
            print("Wrong number of labels, try again!")
        elif set(get_labels) - {'a', 'b'}:
            # Any other character would become an extra class label downstream.
            print("Labels must be 'a' or 'b' only, try again!")
            get_labels = ''  # force another prompt
    if quit_requested:
        break
    for j, qual in enumerate(get_labels):
        # Write through df itself; `df['Quality'].iloc[...] = ...` is chained assignment and
        # may modify a temporary copy instead of the table.
        df.iloc[i + j, quality_col] = qual
    quality_updated = True
    display(df.iloc[i:i+10])

if quality_updated:
    # The label file is read back with header=None, so write it without a header row
    # (pandas >= 0.24 would otherwise emit one for a Series).
    df['Quality'].to_csv(qual_file, header=False)
else:
    # Nothing new was labelled: show a reproducible sample of the table instead.
    display(df.sample(min(10, len(df)), random_state=0))

In [None]:
# Assign quality category to trace metadata
for i, tr in enumerate(channel_data):
    tr.stats.quality = df['Quality'].iloc[i]

### Plot labelled data to find metrics to discriminate trace quality

In [None]:
# Numeric metric columns of df. They are the variables of the pair plot and the columns
# selected as the classifier's feature vector.
stats_metrics = ["SNR", "Entropy", "M0", "M1", "M2", "Coherence", "Log_max_amp",
                 "Del20pc_log_amp", "Del80pc_log_amp", "DelLog_RMS_amp", "DelMean_log_amp"]

In [None]:
def metrics_pairplot(df, plot_vars, hue_by='Quality', title=''):
    """Draw a seaborn pair plot of the selected metric columns, coloured by a categorical column.

    :param df: Table holding the columns in `plot_vars`, the `hue_by` column and, when
        `hue_by` is 'Quality' or 'Prediction', a 'Quality' column.
    :param plot_vars: Names of the columns to plot against each other.
    :param hue_by: Name of the column that determines the colour of each point.
    :param title: Figure title.
    """
    hue_order = None
    if hue_by in ('Quality', 'Prediction'):
        # `value in series` tests the index labels of a pandas Series, not its values, so the
        # values must be checked explicitly; otherwise 'unknown' is never added to the hue order.
        has_unknown = 'unknown' in df['Quality'].values
        hue_order = ['unknown', 'b', 'a'] if has_unknown else ['b', 'a']
    sns.pairplot(df, hue=hue_by, hue_order=hue_order, vars=plot_vars)
    plt.suptitle(title, y=1.01, fontsize=20)
#     plt.show()

In [None]:
# @interact_manual
# def _metrics_pairplot(hue_by=['Quality', 'Magnitude', 'Distance', 'Depth', 'Inclination', 'Group_id']):
#     metrics_pairplot(df, stats_metrics, hue_by, title="Pairwise quality metrics scatter plot")

In [None]:
# Pair plot of all metric columns, coloured by the manual quality label.
metrics_pairplot(df, plot_vars=stats_metrics, hue_by='Quality')

## Train a neural network classifier to discriminate quality

### Use simple stats for feature vector

In [None]:
from sklearn.neural_network import MLPClassifier
# from sklearn import preprocessing
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix

In [None]:
# Restrict the classifier's data set to traces that have a manual label.
known_quality_mask = (df['Quality'] != 'unknown')
# Missing metric values are replaced by 0. fillna returns a new frame, which avoids assigning
# through a boolean mask into a selection taken from df (a SettingWithCopyWarning source).
X = df.loc[known_quality_mask, stats_metrics].fillna(0)
y = df['Quality'].loc[known_quality_mask]

In [None]:
# Hold out 40% of the labelled traces for the final accuracy check; the fixed random_state
# keeps the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

In [None]:
# Show (rows, columns) of the training and test feature matrices.
print(X_train.shape)
print(X_test.shape)

In [None]:
# scaler = preprocessing.StandardScaler().fit(X_train)
# X_train_transformed = scaler.transform(X_train)

In [None]:
# This perceptron network has been simplified back to the bare bones so that it corresponds to a linear predictor
# (a single small hidden layer with the identity activation), as higher order complexity and non-linear
# activation functions gave no improvement in accuracy. random_state is fixed so training is repeatable.
clf_simple = MLPClassifier(solver='lbfgs', alpha=1e-4, max_iter=1000, activation='identity',
                           hidden_layer_sizes=(4,), random_state=3772, tol=1e-4)

In [None]:
# Run 5-fold cross-validation on the training split to tune hyperparameters.
scores = cross_val_score(clf_simple, X_train, y_train, cv=5)
print(scores)
# Report the mean fold score together with a spread of two standard deviations.
print("Accuracy: %0.3f (+/- %0.3f)" % (scores.mean(), scores.std()*2))

In [None]:
# With tuned hyperparameters, train on full training set and report the loss
# stored on the fitted classifier.
clf_simple.fit(X_train, y_train)
print("Final loss: %0.4f" % clf_simple.loss_)

In [None]:
# Accuracy on the held-out test split, which was not used for fitting or cross-validation.
final_score = clf_simple.score(X_test, y_test)
print("Final accuracy: %0.3f" % final_score)
# We get decent performance with a very small network (one hidden layer of 4 neurons, as set by
# hidden_layer_sizes=(4,)) with trivial activation f(x) = x, which means that simply a linear
# combination of the feature vector is sufficient to determine classification.

### Apply quality classifier to whole dataset

In [None]:
# Feature matrix for every trace, labelled or not, with missing metric values replaced by 0.
# fillna builds a new frame, so nothing is assigned into a column selection taken from df
# (the original mask assignment could trigger SettingWithCopyWarning).
X_full = df[stats_metrics].fillna(0)

In [None]:
# Predict a quality class for every trace and record the probability of the predicted class.
full_prediction = clf_simple.predict(X_full)
confidences = clf_simple.predict_proba(X_full)
# Column of `confidences` to read for each row: 0 by default, 1 where the prediction is 'b'.
# Uses the builtin int as dtype; the np.int alias was removed in NumPy 1.24.
# NOTE(review): assumes predict_proba columns are ordered ('a', 'b') - confirm via clf_simple.classes_.
confidence_index = np.zeros(full_prediction.shape, dtype=int)
confidence_index[(full_prediction == 'b')] = 1
df['Prediction'] = full_prediction
df['Confidence'] = confidences[range(confidence_index.size), confidence_index]

In [None]:
# # Plot whole prediction dataset
# @interact_manual
# def _metrics_pairplot(hue_by=['Prediction', 'Quality']):
#     metrics_pairplot(df, stats_metrics, hue_by, title="Pairwise predicted quality scatter plot - full data")

In [None]:
# # Plot prediction just on test dataset
# @interact_manual
# def _metrics_pairplot(hue_by=['Prediction', 'Quality']):
#     metrics_pairplot(df.iloc[sorted(X_test.index)], stats_metrics, hue_by, title="Pairwise predicted quality scatter plot - test data")

In [None]:
# Confusion matrix of manual label versus prediction over the labelled traces, and the
# accuracy derived from it (sum of the diagonal divided by the total count).
labelled_rows = df.loc[known_quality_mask]
cm = confusion_matrix(labelled_rows['Quality'], labelled_rows['Prediction'], labels=['b', 'a'])
print(cm)
n_labelled = np.sum(cm)
print(n_labelled)
print("Accuracy: %0.3f" % (np.trace(cm) / float(n_labelled)))

In [None]:
# Look at how good is the DBSCAN grouping as an indicator of trace quality.
# Group 0 is mapped to class 'a' and every other group ID to 'b'. The labels are built as a
# new string Series: writing 'a'/'b' into a copy of the numeric Group_id column depends on
# silent dtype upcasting, which recent pandas versions deprecate.
primary_group_mask = (df['Group_id'] == 0)
dbscan_group = pd.Series(np.where(primary_group_mask, 'a', 'b'), index=df.index)
cm_dbscan = confusion_matrix(df.loc[known_quality_mask, 'Quality'], dbscan_group[known_quality_mask], labels=['b', 'a'])
print(cm_dbscan)
print(np.sum(cm_dbscan))
print("Accuracy: %0.3f" % (np.sum(np.diag(cm_dbscan))/float(np.sum(cm_dbscan))))
# Result here indicates DBSCAN grouping is not a strong predictor of subjective trace quality

In [None]:
# Look at how good SNR alone is as an indicator of trace quality.
# Traces with SNR >= 2 are classed 'a', all others (including missing SNR) 'b'. The labels are
# built as a new string Series instead of being written into a copy of the float SNR column,
# which would depend on silent dtype upcasting that recent pandas versions deprecate.
high_snr_mask = (df['SNR'] >= 2)
snr_series = pd.Series(np.where(high_snr_mask, 'a', 'b'), index=df.index)
cm_snr = confusion_matrix(df.loc[known_quality_mask, 'Quality'], snr_series[known_quality_mask], labels=['b', 'a'])
print(cm_snr)
print(np.sum(cm_snr))
print("Accuracy: %0.3f" % (np.sum(np.diag(cm_snr))/float(np.sum(cm_snr))))
# Result here indicates SNR alone is quite a good indicator of quality

In [None]:
# Look at how good coherence alone is as an indicator of trace quality.
# Traces with coherence >= 0.25 are classed 'a', all others (including missing values) 'b'.
# The labels are built as a new string Series instead of being written into a copy of the
# float Coherence column, which would depend on silent dtype upcasting that recent pandas
# versions deprecate.
high_coh_mask = (df['Coherence'] >= 0.25)
coh_series = pd.Series(np.where(high_coh_mask, 'a', 'b'), index=df.index)
cm_coh = confusion_matrix(df.loc[known_quality_mask, 'Quality'], coh_series[known_quality_mask], labels=['b', 'a'])
print(cm_coh)
print(np.sum(cm_coh))
print("Accuracy: %0.3f" % (np.sum(np.diag(cm_coh))/float(np.sum(cm_coh))))

## Persist classifier model to file

In [None]:
# Serialise the trained classifier to a pickle file named after the station and RF type.
# Reminder: pickle files should only ever be loaded from trusted sources.
model_file = "{}_classifier_{}.pkl".format(training_station, rf_type)
with open(model_file, 'wb') as model_fh:
    pkl.dump(clf_simple, model_fh)