In [None]:
%%javascript
// Replace the output area's scroll check with a function that returns false
// for any line count (presumably so long cell outputs are never collapsed
// into a scrolling box — confirm against the installed notebook front end).
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

# Workflow Control
This notebook provides an end-to-end procedure for implementing the method described in "Unsupervised Deep Clustering of Seismic Data: Monitoring the Ross Ice Shelf, Antarctica."

<img src="RISArrayMap.jpg" alt="RISArrayMap" width="600"/>

<a id="contents"></a>
***
## <u>Table of Contents</u>

1. [Initialize Project Environment](#section1)
2. [Seismic Pre-Processing](#section2)
3. [Set Universal Experiment Parameters](#section3)
4. [Autoencoder (AEC)](#section4)
5. [Gaussian Mixture Model (GMM)](#section5)
6. [Deep Embedded Clustering (DEC)](#section6)
7. [Compare GMM & DEC](#section7)
8. [Environmental Data Exploration](#section8)

Appendices  
A. [Evaluate Optimal Number of Clusters](#appendixA)

In [None]:
import datetime
import os
import sys

from IPython.display import Markdown as md
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, Subset
from torchinfo import summary
from torchvision import transforms

from RISCluster import models, plotting, utils
from RISCluster.networks import AEC, DEC
from RISProcess.io import config

<a id="section1"></a>
***
## <u>1 Initialize Project Environment</u>
The default project structure is:<br>
`/Project Folder
├── Config
├── Data
│   ├── Meteo
│   ├── Ice
│   ├── Seismo
│   │   ├── MSEED
│   │   └── StationXML
└── Outputs
`
<br>Note that the raw seismic data from 1-Dec-2014 to 1-Dec-2016 is nearly 1 TB. It may be practical to split out the project's `Data` folder onto a disk with more storage.  If so, set the path to the data storage below.

In [None]:
# Main project folder to save outputs:
project_folder = '.'
# Path to configuration files:
path_config = f"{project_folder}/Config"
# Path to folder containing data, including HDF file for ML workflow:
path_data = f"{project_folder}/Data"
# Path to raw seismic data. The in-project default is commented out; the
# active value is a machine-specific absolute path and must be edited to
# point at the local storage location before running.
# path_data_seismo = f"{path_data}/Seismo"
path_data_seismo = "/zdata2/data/wfjenkin/RIS_Seismic"
# Path to save workflow outputs (ML models, figures, results, etc.)
path_output = f"{project_folder}/Outputs"
# Path to HDF dataset:
fname_dataset = f"{path_data}/RISData_20210713.h5"
# Path to save paper-ready figures:
figure_savepath = f"{path_output}/Figures"

# Hand every project folder to the environment initializer
# (presumably creates any that are missing — confirm in utils.init_project_env).
utils.init_project_env([path_config, path_data, path_data_seismo, path_output, figure_savepath])

<a href="#contents">Return to Top</a>
<a id="section2"></a>
***
## <u>2 Seismic Pre-Processing</u>
Four workflows are provided for obtaining and pre-processing seismic data.  The recommended workflow makes use of sections 2.1, 2.3, and 2.4; section 2.2 is not required for the rest of the workflow, but instead provides a pipeline to save pre-processed data to disk.

### 2.1 Download Data
In this workflow, seismic data is downloaded using the FDSN mass data downloader. Data set parameters are written to a configuration file saved in the configuration folder (`path_config`); MSEED data are saved to `datapath/MSEED`; and station XML data are saved to `datapath/StationXML`.  The MSEED data are saved according to the following convention:
`Network.Station..Channel__YYYYMMDDTHHMMSSZ__YYYYMMDDTHHMMSSZ.mseed`

In [None]:
# Download settings. The 'start'/'stop' window below spans two days
# (2014-12-01 to 2014-12-03); widen it to fetch more of the deployment.
parameters = {
    'start': '20141201T0000',
    'stop': '20141203T0000',
    'mode': 'download',
    'datapath': path_data_seismo,
    'network': 'XH',
    'station': '*',
    'channel': 'HH*',
}
# Create the configuration under path_config ('w' is presumably write mode);
# the returned value is interpolated into the command displayed below.
config_file = config('w', path=path_config, parameters=parameters)
print("Run the following in Terminal:")
# The Markdown object is the cell's last expression, so it is rendered.
md(f"`dlfdsn {config_file}`")

### 2.2 Pre-process Data
In this workflow, raw seismic data is read from `datapath`, processed, and saved to `writepath` according to the following file naming conventions:<br>
`MSEED/Network/Station/Network.Station.Channel.YYYY.DAY.mseed`

For the input data, two file formats are available.
<br>**Format 1:**
<br>`Network.Station.Channel.YYYY.DAY.mseed`
<br>**Format 2:**
<br>`Network.Station..Channel__YYYYMMDDTHHMMSSZ__YYYYMMDDTHHMMSSZ.mseed`

In [None]:
# Alternative configuration kept for reference (disabled): covers
# 20141201-20161203 with the 'z' detector settings enabled and 4 workers.
# parameters = {
#     'start': '20141201T0000',
#     'stop': '20161203T0000',
#     'mode': 'preprocess',
#     'sourcepath': path_data_seismo,
#     'name_format': 2,
#     'writepath': f"{path_data_seismo}/Preprocessed",
#     'parampath': f"{path_data_seismo}/Preprocessed",
#     'network': 'XH',
#     'channel': 'HHZ',
#     'taper': 60,
#     'prefeed': 60,
#     'fs2': 50,
#     'cutoff': '3, 20',
#     'output': 'acc',
#     'prefilt': '0.004, 0.01, 500, 1000',
#     'waterlevel': 14,
#     'detector': 'z',
#     'on': 8,
#     'off': 4,
#     'num_workers': 4
# }

# Active configuration: pre-process 20161119-20161125 only, reading raw
# files (name format 2) from the raw-data folder and writing results and
# parameters to its 'Preprocessed' subfolder. Detector keys are disabled.
parameters = {
    'start': '20161119T0000',
    'stop': '20161125T0000',
    'mode': 'preprocess',
    'sourcepath': f"{path_data_seismo}",
    'name_format': 2,
    'writepath': f"{path_data_seismo}/Preprocessed",
    'parampath': f"{path_data_seismo}/Preprocessed",
    'network': 'XH',
    'channel': 'HHZ',
    'taper': 60,
    'prefeed': 60,
    'fs2': 50,
    'cutoff': '3, 20',
    'output': 'acc',
    'prefilt': '0.004, 0.01, 500, 1000',
    'waterlevel': 14,
#     'detector': 'z',
#     'on': 8,
#     'off': 4,
    # NOTE(review): worker count is machine-specific — adjust to the host.
    'num_workers': 17
}
# Create the configuration under path_config and display the command to run.
config_file = config('w', path=path_config, parameters=parameters)
print("Run the following in Terminal:")
md(f"`process {config_file}`")

### 2.3 Detect Events & Build Catalogue
In this workflow, raw seismic data in `datapath` are processed in 24-hour segments, and an event detection algorithm is applied. The results of the event detector are compiled into a catalogue that is saved to disk at `writepath`. This catalogue serves as a useful pointer for follow-on processing of events of interest, rather than continuous data.

#### 2.3.1 Build Unsorted Catalogue

In [None]:
# Alternative configuration kept for reference (disabled): 'full' mode on
# raw files (name format 2) for 20141201-20141203 with the 'z' detector.
# parameters = {
#     'start': '20141201T0000',
#     'stop': '20141203T0000',
#     'mode': 'full',
#     'sourcepath': path_data_seismo,
#     'name_format': 2,
#     'writepath': path_data,
#     'parampath': path_data,
#     'network': 'XH',
#     'channel': 'HHZ',
#     'taper': 60,
#     'prefeed': 60,
#     'fs2': 50,
#     'cutoff': '3, 20',
#     'output': 'acc',
#     'prefilt': '0.004, 0.01, 500, 1000',
#     'waterlevel': 14,
#     'detector': 'z',
#     'on': 8,
#     'off': 4,
#     'num_workers': 4
# }

# Active configuration: run detection only, reading from the 'Preprocessed'
# folder written by Section 2.2 (hence name format 1) and writing the
# catalogue and parameters to path_data.
parameters = {
    'start': '20141203T0000',
    'stop':  '20161121T0000',
    'mode': 'detect',
    'sourcepath': f"{path_data_seismo}/Preprocessed",
    'name_format': 1,
    'writepath': path_data,
    'parampath': path_data,
    'network': 'XH',
    'station': '*',
    'channel': 'HHZ',
    'taper': 60,
    'prefeed': 60,
    'detector': 'recursive',
    'STA': 0.5,
    'LTA': 30,
    'on': 15,
    'off': 10,
    # NOTE(review): worker count is machine-specific — adjust to the host.
    'num_workers': 32
}

# Create the configuration under path_config and display the command to run.
config_file = config('w', path=path_config, parameters=parameters)
print("Run the following in Terminal:")
md(f"`process {config_file}`")

#### 2.3.2 Clean Catalogue
Remove duplicate detections, and if desired, detections that occur within a window (s) following an initial detection.

In [None]:
# Window length passed to `cleancat --window` (seconds, per the text above).
window = 5
# Shell call: read catalogue.csv from path_data and write the cleaned copy to
# catalogue2.csv, which Section 2.4 uses as its 'catalogue' input.
!cleancat {path_data + '/catalogue.csv'} --dest {path_data + '/catalogue2.csv'} --window $window

### 2.4 Build HDF Database from Catalogue
In this workflow, the catalogue of detections at the path given by `catalogue` is used to process raw seismic data in `datapath`. In addition to pre-processing, the traces, spectrograms, and metadata of the detections are saved to an HDF database located at `writepath`. Because this workflow is implemented in parallel and results are returned asynchronously, a new catalogue is saved to `writepath.csv` that corresponds to the indexing within the HDF dataset. The index within `writepath.csv` corresponds to the original catalogue at `catalogue`.

In [None]:
# Build the HDF dataset from the cleaned catalogue: raw files (name format 2)
# are read from the raw-data folder, output goes to fname_dataset, and the
# catalogue is the catalogue2.csv produced in Section 2.3.2.
parameters = {
    'start': '20141203T0000',
    'stop': '20161121T0000',
    'mode': 'cat2h5',
    'sourcepath': path_data_seismo,
    'name_format': 2,
    'writepath': fname_dataset,
    'catalogue': os.path.join(path_data,'catalogue2.csv'),
    'parampath': path_data,
    'network': 'XH',
    'channel': 'HHZ',
    'taper': 10,
    'prefeed': 10,
    'fs2': 50,
    'cutoff': '3, 20',
    'T_seg': 4,
    'NFFT': 256,
    'tpersnap': 0.4,
    'overlap': 0.9,
    # Instrument-response and detector keys are disabled for this mode.
#     'output': 'acc',
#     'prefilt': '0.004, 0.01, 500, 1000',
#     'waterlevel': 14,
#     'detector': 'z',
#     'on': 8,
#     'off': 4,
    # Same value as `window` used by cleancat in Section 2.3.2.
    'det_window': 5,
    'num_workers': 1
}
# Create the configuration under path_config and display the command to run.
config_file = config('w', path=path_config, parameters=parameters)
print("Run the following in Terminal:")
md(f"`process {config_file}`")

<a href="#contents">Return to Top</a>
<a id="section3"></a>
***
## <u>3 Set Universal Experiment Parameters</u>

In [None]:
# Experiment name; stored under 'exp_name' in the `universal` settings below.
exp_name = "FullArray"

# Get the number of samples in the dataset.
!query_H5size $fname_dataset

# Image Sample Indexes for Example Waveforms:
# (dataset sample indexes; reused for the detection-example figure and
# passed to the AEC/GMM configurations as 'img_index')
img_index = [35578, 361499, 328177, 371888]

# Generate new sample index for data set?
# When True, the next cell runs the index generator.
genflag = False

In [None]:
# Only regenerate the sample index when requested via `genflag`.
if genflag:
    # Value passed to the generator; it matches the 'M=50000' in the index
    # file name referenced by universal['indexpath'] in the next cell.
    # NOTE: `M` is reassigned later in the notebook (t-SNE cells).
    M = 50000
    !GenerateSampleIndex $M $fname_dataset $path_data

In [None]:
# Settings shared by every call to utils.config_training in this notebook.
universal = {
    'exp_name': exp_name,
    'fname_dataset': fname_dataset,
    'savepath': path_output,
    # Index file name is hard-coded for M=50000 (see the generator cell above).
    'indexpath': os.path.join(path_data, 'TraValIndex_M=50000.pkl'),
    'configpath': path_config
}
# Device number handed to utils.set_device and to each Configuration below.
# NOTE(review): machine-specific — change to a device that exists on the host.
device_no = 1
device = utils.set_device(device_no)
# Transform name used for AEC training (later configs repeat the literal).
transform = 'vec_norm'

<a href="#contents">Return to Top</a>
<a id="section4"></a>
***
## <u>4 Autoencoder (AEC)</u>

### 4.1 AEC Architecture

In [None]:
# Display a torchinfo summary of a freshly constructed AEC for an input of
# size (1, 1, 87, 100).
summary(AEC(), (1, 1, 87, 100))

### 4.2 Configure AEC training

In [None]:
# Fixed settings for AEC training.
parameters = {
    'model': 'AEC',
    'mode': 'train',
    'n_epochs': 500,
    'show': False,
    'send_message': False,
    'early_stopping': True,
    'patience': 10,
    'transform': transform,
    # List rendered as a comma-separated string (surrounding brackets removed).
    'img_index': str(img_index)[1:-1],
    'tb': True,
    'tbport': 6999,
    'workers': 8,
    'loadmode': 'ram',
    'datafiletype': 'h5'
}
# Comma-separated candidate values (presumably one run per combination, as
# Section 4.5 refers to "various hyperparameter runs" — confirm in utils).
hyperparameters = {
    'batch_size': '64, 128, 256, 512',
    'lr': '0.0001, 0.001, 0.01'
}
# Build the training configuration; `init_path` is what the training command
# in Section 4.4 is given.
init_path = utils.config_training(universal, parameters, hyperparameters)
# Load the same configuration into an object for in-notebook plotting
# (used by plotting.compare_images in Section 4.6.3), with display enabled.
config_AEC = utils.Configuration(init_path)
config_AEC.load_config()
config_AEC.set_device(device_no)
config_AEC.show = True

### 4.3 View Detection Examples

In [None]:
# Plot the example detections selected by `img_index` from the HDF dataset.
fig = plotting.view_detections(fname_dataset, img_index)

In [None]:
# Export the figure from the previous cell (300 dpi, white background).
fig.savefig(f"{figure_savepath}/DetectionExamples.eps", dpi=300, facecolor='w')

### 4.4 Train AEC

In [None]:
# Display the shell command that launches AEC training with the most recently
# generated `init_path`. The launcher name is `runDC`, the same command shown
# by every other "Run the following in Terminal" cell in this notebook.
print("Run the following in Terminal:")
md(f"`runDC {init_path}`")

<a id="BestAEC"></a>
### 4.5 Select Best AEC Run
Use Tensorboard to view outputs from the various hyperparameter runs.

In [None]:
# Hyperparameters of the chosen run; they are embedded in the run folder name.
batch_size = 64
LR = 0.001

# Experiment folder name. NOTE(review): this serial is specific to one
# training session — replace it with the serial of your own experiment.
expserial = 'Exp20210727T192309'
runserial = f'Run_BatchSz={batch_size}_LR={LR}'
# exp_path = f"{path_output}/Models/AEC/{expserial}/{runserial}"
exp_path_AEC = os.path.join(path_output, 'Models', 'AEC', expserial, runserial)

# Path to the final AEC parameters; reused by the predict, GMM and DEC configs.
weights_AEC = os.path.join(exp_path_AEC, 'AEC_Params_Final.pt')
print(weights_AEC)

Return to [Section 6.2](#ConfigDCM)<br>
Return to [Section 7](#section7)

### 4.6 Evaluate AEC Training Performance

#### 4.6.1 Load Data and Model Parameters

In [None]:
# Open the HDF dataset ('h5' file type).
dataset = utils.SeismicDataset(fname_dataset, 'h5')

# Construct the AEC on the selected device and load the weights chosen in
# Section 4.5.
model_AEC = AEC().to(device)
model_AEC = utils.load_weights(model_AEC, weights_AEC, device)

Return to [Section 5.3](#GMMeval)

#### 4.6.2 Training and Validation History

In [None]:
# Plot the history stored in AEC_history.csv inside the selected run folder.
fig = plotting.view_history_AEC(os.path.join(exp_path_AEC, 'AEC_history.csv'), show=True)

In [None]:
# Export the history figure (300 dpi, white background).
fig.savefig(os.path.join(figure_savepath, 'AEC_History.eps'), dpi=300, facecolor='w')

#### 4.6.3 Input, Latent Space, and Reconstruction

In [None]:
# Produce the comparison figure for the loaded AEC using config_AEC from
# Section 4.2. NOTE(review): the meaning of the second argument (0) is not
# visible here — confirm in plotting.compare_images.
fig = plotting.compare_images(model_AEC, 0, config_AEC)

In [None]:
# Export the comparison figure (300 dpi, white background).
fig.savefig(os.path.join(figure_savepath, 'CompareInOut.eps'), dpi=300, facecolor='w')

### 4.7 Evaluate all data through AEC

#### 4.7.1 Configure AEC Evaluation

In [None]:
# Settings for running the trained AEC in 'predict' mode with the weights
# selected in Section 4.5.
parameters = {
    'model': 'AEC',
    'mode': 'predict',
    'show': False,
    'send_message': False,
    'transform': 'vec_norm',
#     'img_index': str(img_index)[1:-1],
    'tb': False,
    'workers': 8,
    'loadmode': 'ram',
    'datafiletype': 'h5',
    'saved_weights': weights_AEC
}
# No hyperparameter grid is supplied for prediction. Note that this rebinds
# `init_path`, replacing the value created for training.
init_path = utils.config_training(universal, parameters)

#### 4.7.2 Evaluate all data

In [None]:
# Display the command that runs the AEC prediction configured above.
print("Run the following in Terminal:")
md(f"`runDC {init_path}`")

#### 4.7.3 Display Metrics for Entire Dataset
This metric is calculated by measuring the MSE of *all* spectrograms in the data set.

In [None]:
# Print the contents of MSE.txt from the run's 'Prediction' folder; the file
# must already exist (it is expected from the prediction step above).
with open(os.path.join(exp_path_AEC, 'Prediction', 'MSE.txt'), 'r') as f:
    print(f.read())

[Return to Top](#contents)
<a id="section5"></a>
***
## <u>5 Gaussian Mixture Model (GMM)</u>

### 5.1 Configure GMM

In [None]:
# Settings for fitting the GMM, using the AEC weights selected in Section 4.5.
parameters = {
    'model': 'GMM',
    'mode': 'fit',
    'show': False,
    'send_message': False,
    'transform': 'vec_norm',
    # List rendered as a comma-separated string (surrounding brackets removed).
    'img_index': str(img_index)[1:-1],
    'tb': False,
    'workers': 8,
    'loadmode': 'ram',
    'datafiletype': 'h5',
    'saved_weights': weights_AEC
}
# Candidate cluster counts as a comma-separated string.
hyperparameters = {
    'n_clusters': '5, 6, 7, 8, 9, 10'
}
# Build the configuration (rebinds `init_path`) and load it into an object
# with display enabled.
init_path = utils.config_training(universal, parameters, hyperparameters)
config_GMM = utils.Configuration(init_path)
config_GMM.load_config()
config_GMM.set_device(device_no)
config_GMM.show = True

### 5.2 Run GMM

In [None]:
# Display the command that runs the GMM fit configured above.
print("Run the following in Terminal:")
md(f"`runDC {init_path}`")

<a id="GMMeval"></a>
### 5.3 Evaluate GMM Performance
Run [4.5](#BestAEC) and 4.6.1 to load AEC model.

#### 5.3.1 Load Data

In [None]:
# Cluster count whose GMM results are inspected in this section.
n_clusters = 8

# GMM outputs sit beneath the selected AEC run, in a folder named after the
# cluster count.
loadpath_GMM = os.path.join(exp_path_AEC, 'GMM', f'n_clusters={n_clusters}')

# Read the three GMM result arrays in a single unpacking (same files, same
# order as before).
centroids_GMM, labels_GMM, silh_scores_GMM = (
    np.load(os.path.join(loadpath_GMM, fname))
    for fname in ('centroids.npy', 'labels.npy', 'silh_scores.npy')
)

# Latent-space array written to the AEC run's 'Prediction' folder.
z_AEC = np.load(os.path.join(exp_path_AEC, 'Prediction', 'Z_AEC.npy'))

#### 5.3.2 View Clustering Statistics

In [None]:
# Show floats in scientific notation with four decimals (global pandas option).
pd.set_option('display.float_format', lambda x: '%.4e' % x)

# Read the per-cluster performance table and, in one chained step, turn the
# 'class' and 'N' columns into integers with -1 standing in for missing values.
df_GMM = pd.read_csv(os.path.join(loadpath_GMM, 'cluster_performance.csv')).assign(
    **{
        'class': lambda frame: frame['class'].fillna(-1).astype(int),
        'N': lambda frame: frame['N'].fillna(-1).astype(int),
    }
)
df_GMM

#### 5.3.3 View Clustering Results

In [None]:
# Gallery of GMM clustering results built from the AEC model, the AEC latent
# array, and the GMM labels/centroids.
# NOTE(review): `p` and the two trailing positional True flags are opaque
# here (p is possibly a norm order; the flags possibly show/save switches) —
# confirm against plotting.cluster_gallery.
p = 2
fig = plotting.cluster_gallery(
    model_AEC,
    dataset,
    fname_dataset,
    device,
    z_AEC,
    labels_GMM,
    centroids_GMM,
    p,
    True,
    True
)

In [None]:
# Export the GMM gallery (300 dpi, white background).
fig.savefig(os.path.join(figure_savepath, 'GMM_Gallery.eps'), dpi=300, facecolor='w')

#### 5.3.4 View Silhouette Analysis

In [None]:
# Silhouette plot for the GMM labels, using the scores loaded in 5.3.1.
fig = plotting.view_silhscore(silh_scores_GMM, labels_GMM, n_clusters, 'GMM')

In [None]:
# Export the GMM silhouette figure (300 dpi, white background).
fig.savefig(os.path.join(figure_savepath, 'GMM_Silh.eps'), dpi=300, facecolor='w')

#### 5.3.5 View t-SNE Analysis

In [None]:
# Prefer cuML's TSNE and fall back to scikit-learn when cuML cannot be
# imported. `Exception` (rather than a bare `except`) keeps the fallback for
# any import-time failure while letting KeyboardInterrupt/SystemExit through.
try:
    from cuml import TSNE
except Exception:
    from sklearn.manifold import TSNE

# Embed the AEC latent array in 2-D. This is the array paired with the GMM
# labels throughout the notebook; the name `z_GMM` used here previously is
# never defined and raised a NameError.
M = len(z_AEC)
results_GMM = TSNE(n_components=2, perplexity=int(M/50), early_exaggeration=2000, learning_rate=int(M/25), n_iter=3000, verbose=0, random_state=2009).fit_transform(z_AEC.astype('float64'))

In [None]:
# Plot the 2-D embedding coloured by GMM label. The trailing positional True
# is passed as `show=` by keyword in Section 7.3.3 (presumably the same
# parameter — confirm).
fig = plotting.view_TSNE(results_GMM, labels_GMM, 't-SNE Results: GMM', True)

In [None]:
# Export the GMM t-SNE figure (300 dpi, white background).
fig.savefig(os.path.join(figure_savepath, 'GMM_TSNE.eps'), dpi=300, facecolor='w')

[Return to Top](#contents)
<a id="section6"></a>
***
## <u>6 Deep Embedded Clustering (DEC)</u>

### 6.1 DEC Model Architecture

In [None]:
# Display a torchinfo summary of a DEC model built with 5 clusters for an
# input of size (1, 1, 87, 100). The cluster count here is only for the
# summary; the evaluated model is constructed later with `n_clusters`.
summary(DEC(n_clusters=5), (1, 1, 87, 100))

<a id="ConfigDCM"></a>
### 6.2 Configure Training
Run [4.5](#BestAEC) first to get AEC weights.

In [None]:
# Fixed settings for DEC training, starting from the AEC weights selected in
# Section 4.5.
parameters = {
    'model': 'DEC',
    'mode': 'train',
    'n_epochs': 400,
    'show': False,
    'send_message': False,
    'transform': 'vec_norm',
    'tb': True,
    'tbport': 6999,
    'workers': 4,
    'loadmode': 'ram',
    'datafiletype': 'h5',
    'init': 'load',
    'update_interval': -1,
    'saved_weights': weights_AEC
}

# Hyperparameter values for this run.
# NOTE(review): 'tol' is a float while every other value is a string —
# confirm utils.config_training accepts both forms.
# NOTE(review): 'n_clusters' is '10' here, whereas Sections 6.4 and 7.1 select
# a run with n_clusters = 8 — make sure the run you select was trained.
hyperparameters = {
    'batch_size': '64',
    'lr': '0.001',
    'n_clusters': '10',
    'gamma': '0.001',
    'tol': 0.003
}
# Build the configuration (rebinds `init_path`) and load it into an object
# with display enabled.
init_path = utils.config_training(universal, parameters, hyperparameters)
config_DEC = utils.Configuration(init_path)
config_DEC.load_config()
config_DEC.set_device(device_no)
config_DEC.show = True

### 6.3 Train DEC Model

Run the following in Terminal:

In [None]:
# Render the DEC training command for the configuration built in Section 6.2.
md(f"`runDC {init_path}`")

To specify which CUDA device(s) is(are) used, prepend the following:

In [None]:
# Same command with CUDA_VISIBLE_DEVICES set; the device number 7 is an
# example and must be changed to a device present on the host.
md(f"`CUDA_VISIBLE_DEVICES=7 runDC {init_path}`")

<a id="BestDEC"></a>
### 6.4 Select Best DEC Run
Use Tensorboard to view outputs from the various hyperparameter runs.

In [None]:
# Hyperparameters of the chosen DEC run; all five are embedded in the run
# folder name below.
n_clusters = 8
batch_size = 64
LR = 0.001
gamma = 0.001
tol = 0.003

# Experiment folder name. NOTE(review): this serial is specific to one
# training session — replace it with the serial of your own experiment.
expserial = 'Exp20210730T172829'
runserial = f'Run_Clusters={n_clusters}_BatchSz={batch_size}_LR={LR}_gamma={gamma}_tol={tol}'
exp_path_DEC = os.path.join(path_output, 'Models', 'DEC', expserial, runserial)
# Path to the final DEC parameters; used by the predict configuration and by
# the model-loading cells.
weights_DEC = os.path.join(exp_path_DEC, 'DEC_Params_Final.pt')
print(weights_DEC)

Return to [Section 6.7](#DECeval)<br>
Return to [Section 7](#section7)<br>
Return to [Section 8](#section8)

### 6.5 Evaluate DEC Training Performance

In [None]:
# Plot DEC training history from the two CSV files in the selected run folder
# (DEC_history.csv and Delta_history.csv).
fig = plotting.view_history_DEC([os.path.join(exp_path_DEC, 'DEC_history.csv'), os.path.join(exp_path_DEC, 'Delta_history.csv')], show=True)

In [None]:
# Export the DEC history figure (300 dpi, white background).
fig.savefig(os.path.join(figure_savepath, 'DEC_History.eps'), dpi=300, facecolor='w')

### 6.6 Evaluate all data through DEC

In [None]:
# Settings for running the trained DEC in 'predict' mode with the weights
# selected in Section 6.4.
parameters = {
    'model': 'DEC',
    'mode': 'predict',
    'show': False,
    'send_message': False,
    'transform': 'vec_norm',
#     'img_index': str(img_index)[1:-1],
    'tb': False,
    'workers': 16,
    'loadmode': 'ram',
    'datafiletype': 'h5',
    'saved_weights': weights_DEC
}
# No hyperparameter grid is supplied for prediction; rebinds `init_path`.
init_path = utils.config_training(universal, parameters)

In [None]:
# Display the command that runs the DEC prediction configured above.
print("Run the following in Terminal:")
md(f"`runDC {init_path}`")

To specify which CUDA device(s) is(are) used, prepend the following:

In [None]:
# Same command with CUDA_VISIBLE_DEVICES set; the device number 1 is an
# example and must be changed to a device present on the host.
md(f"`CUDA_VISIBLE_DEVICES=1 runDC {init_path}`")

<a id="DECeval"></a>
### 6.7 Evaluate DEC Performance
Run [6.4](#BestDEC) first to get DEC weights.

#### 6.7.1 Load Data and Model Parameters

In [None]:
# Open the HDF dataset and build a DEC with the cluster count chosen in
# Section 6.4, then load that run's final weights onto the selected device.
dataset = utils.SeismicDataset(fname_dataset, 'h5')
model_DEC = DEC(n_clusters).to(device)
model_DEC = utils.load_weights(model_DEC, weights_DEC, device)

#### 6.7.2 Load Data

In [None]:
# DEC prediction outputs live in the selected run's 'Prediction' folder.
loadpath_DEC = os.path.join(exp_path_DEC, 'Prediction')

# Read the four result arrays in a single unpacking (same files, same order
# as before).
centroids_DEC, labels_DEC, silh_scores_DEC, z_DEC = (
    np.load(os.path.join(loadpath_DEC, fname))
    for fname in ('centroids_DEC.npy', 'labels_DEC.npy', 'silh_scores.npy', 'Z_DEC.npy')
)

#### 6.7.3 View Clustering Statistics

In [None]:
# Show floats in scientific notation with four decimals (global pandas option).
pd.set_option('display.float_format', lambda x: '%.4e' % x)

# Read the per-cluster performance table and, in one chained step, turn the
# 'class' and 'N' columns into integers with -1 standing in for missing values.
df_DEC = pd.read_csv(os.path.join(loadpath_DEC, 'cluster_performance.csv')).assign(
    **{
        'class': lambda frame: frame['class'].fillna(-1).astype(int),
        'N': lambda frame: frame['N'].fillna(-1).astype(int),
    }
)
df_DEC

#### 6.7.4 View Clustering Results

In [None]:
# Gallery of DEC clustering results built from the DEC model, the DEC latent
# array, and the DEC labels/centroids.
# NOTE(review): `p` and the two trailing positional True flags are opaque
# here — confirm their meaning against plotting.cluster_gallery.
p = 2
fig = plotting.cluster_gallery(
    model_DEC,
    dataset,
    fname_dataset,
    device,
    z_DEC,
    labels_DEC,
    centroids_DEC,
    p,
    True,
    True
)

In [None]:
# Export the DEC gallery under a DEC-specific name so it does not overwrite
# 'GMM_Gallery.eps', which Section 5.3.3 writes to the same folder.
fig.savefig(os.path.join(figure_savepath, 'DEC_Gallery.eps'), dpi=300, facecolor='w')

#### 6.7.5 View Silhouette Analysis

In [None]:
# Silhouette plot for the DEC labels, using the scores loaded in 6.7.2.
fig = plotting.view_silhscore(silh_scores_DEC, labels_DEC, n_clusters, 'DEC')

In [None]:
# Export the DEC silhouette figure under a DEC-specific name so it does not
# overwrite 'GMM_Silh.eps', which Section 5.3.4 writes to the same folder.
fig.savefig(os.path.join(figure_savepath, 'DEC_Silh.eps'), dpi=300, facecolor='w')

#### 6.7.6 View t-SNE Analysis

In [None]:
# Prefer cuML's TSNE and fall back to scikit-learn when cuML cannot be
# imported. `Exception` (rather than a bare `except`) keeps the fallback for
# any import-time failure while letting KeyboardInterrupt/SystemExit through.
try:
    from cuml import TSNE
except Exception:
    from sklearn.manifold import TSNE

# Embed the DEC latent array in 2-D; perplexity and learning rate are scaled
# from the number of samples M.
M = len(z_DEC)
results_DEC = TSNE(n_components=2, perplexity=int(M/50), early_exaggeration=2000, learning_rate=int(M/25), n_iter=3000, verbose=0, random_state=2009).fit_transform(z_DEC.astype('float64'))

In [None]:
# Plot the DEC embedding coloured by DEC label. The title names DEC, matching
# the data being plotted (it previously read "GMM").
fig = plotting.view_TSNE(results_DEC, labels_DEC, 't-SNE Results: DEC', True)

In [None]:
# Export the DEC t-SNE figure under a DEC-specific name so it does not
# overwrite 'GMM_TSNE.eps', which Section 5.3.5 writes to the same folder.
fig.savefig(os.path.join(figure_savepath, 'DEC_TSNE.eps'), dpi=300, facecolor='w')

[Return to Top](#contents)
<a id="section7"></a>
***
## <u>7 Compare GMM & DEC</u>

Run [4.5](#BestAEC) and [6.4](#BestDEC) first to get AEC and DEC weights.

### 7.1 Load Data

In [None]:
# Self-contained reload of everything needed for the GMM/DEC comparison.
# `exp_path_AEC` and `weights_AEC` are NOT redefined here — they come from
# Section 4.5; the DEC run selection is repeated below.
dataset = utils.SeismicDataset(fname_dataset, 'h5')
n_clusters = 8
batch_size = 64
LR = 0.001
gamma = 0.001
tol = 0.003

# Scientific notation with three decimals (Sections 5 and 6 use four).
pd.set_option('display.float_format', lambda x: '%.3e' % x)

# Load AEC/GMM data:
model_AEC = AEC().to(device)
model_AEC = utils.load_weights(model_AEC, weights_AEC, device)

loadpath_GMM = os.path.join(exp_path_AEC, 'GMM', f'n_clusters={n_clusters}')
centroids_GMM = np.load(os.path.join(loadpath_GMM, 'centroids.npy'))
labels_GMM = np.load(os.path.join(loadpath_GMM, 'labels.npy'))
silh_scores_GMM = np.load(os.path.join(loadpath_GMM, 'silh_scores.npy'))
z_AEC = np.load(os.path.join(exp_path_AEC, 'Prediction', 'Z_AEC.npy'))

# Additional arrays loaded only in this section (not used by later cells
# visible in this notebook).
MSE_GMM = np.load(os.path.join(loadpath_GMM, 'X_MSE.npy'))
ip_GMM = np.load(os.path.join(loadpath_GMM, 'X_ip.npy'))

# Per-cluster table; missing 'class'/'N' entries become -1 so the columns can
# be stored as integers.
df_GMM = pd.read_csv(os.path.join(loadpath_GMM, 'cluster_performance.csv'))
df_GMM['class'] = df_GMM['class'].fillna(-1).astype(int)
df_GMM['N'] = df_GMM['N'].fillna(-1).astype(int)

# Load DEC data:
# NOTE(review): this serial duplicates the one in Section 6.4 — keep the two
# in sync when selecting a different experiment.
expserial = 'Exp20210730T172829'
runserial = f'Run_Clusters={n_clusters}_BatchSz={batch_size}_LR={LR}_gamma={gamma}_tol={tol}'
exp_path_DEC = os.path.join(path_output, 'Models', 'DEC', expserial, runserial)
weights_DEC = os.path.join(exp_path_DEC, 'DEC_Params_Final.pt')

model_DEC = DEC(n_clusters).to(device)
model_DEC = utils.load_weights(model_DEC, weights_DEC, device)

loadpath_DEC = os.path.join(exp_path_DEC, 'Prediction')
centroids_DEC = np.load(os.path.join(loadpath_DEC, 'centroids_DEC.npy'))
labels_DEC = np.load(os.path.join(loadpath_DEC, 'labels_DEC.npy'))
silh_scores_DEC = np.load(os.path.join(loadpath_DEC, 'silh_scores.npy'))
z_DEC = np.load(os.path.join(loadpath_DEC, 'Z_DEC.npy'))

MSE_DEC = np.load(os.path.join(loadpath_DEC, 'X_MSE.npy'))
ip_DEC = np.load(os.path.join(loadpath_DEC, 'X_ip.npy'))

df_DEC = pd.read_csv(os.path.join(loadpath_DEC, 'cluster_performance.csv'))
df_DEC['class'] = df_DEC['class'].fillna(-1).astype(int)
df_DEC['N'] = df_DEC['N'].fillna(-1).astype(int)

### 7.2 Cluster Metrics

#### 7.2.1 Intra-cluster Similarity Metrics
Display metrics for intra-cluster sample similarity, comparing GMM (left) with DEC (right).

In [None]:
# Place the two tables side by side: GMM columns first, then DEC columns.
df = pd.concat([df_GMM, df_DEC], axis=1)
df

#### 7.2.2 Label Changes

In [None]:
# Compare GMM labels against DEC labels. This rebinds `df`, replacing the
# side-by-side metrics table from the previous cell.
df = utils.measure_label_change(labels_GMM, labels_DEC)
df

### 7.3 Figure Comparisons

#### 7.3.1 Clustering Results

In [None]:
# Draw the GMM and DEC galleries one after the other, each preceded by a
# printed separator line.
# NOTE(review): `p` and the two trailing positional True flags are opaque
# here — confirm their meaning against plotting.cluster_gallery.
p = 2
print('GMM ' + '=' * 75)
fig1 = plotting.cluster_gallery(
    model_AEC,
    dataset,
    fname_dataset,
    device,
    z_AEC,
    labels_GMM,
    centroids_GMM,
    p,
    True,
    True
)
print('DEC' + '=' * 75)
fig2 = plotting.cluster_gallery(
    model_DEC,
    dataset,
    fname_dataset,
    device,
    z_DEC,
    labels_DEC,
    centroids_DEC,
    p,
    True,
    True
)

In [None]:
# Export both galleries as PDF (fig1 = GMM, fig2 = DEC).
fig1.savefig(os.path.join(figure_savepath, 'Results_GMM.pdf'), dpi=300, facecolor='w')
fig2.savefig(os.path.join(figure_savepath, 'Results_DEC.pdf'), dpi=300, facecolor='w')

#### 7.3.2 Silhouette Analysis

In [None]:
# Silhouette plots for both methods, using the same cluster count.
fig1 = plotting.view_silhscore(silh_scores_GMM, labels_GMM, n_clusters, 'GMM')
fig2 = plotting.view_silhscore(silh_scores_DEC, labels_DEC, n_clusters, 'DEC')

In [None]:
# Export both silhouette figures as PDF (fig1 = GMM, fig2 = DEC).
fig1.savefig(os.path.join(figure_savepath, 'Silh_GMM.pdf'), dpi=300, facecolor='w')
fig2.savefig(os.path.join(figure_savepath, 'Silh_DEC.pdf'), dpi=300, facecolor='w')

#### 7.3.3 t-SNE Results

In [None]:
# Prefer cuML's TSNE and fall back to scikit-learn when cuML cannot be
# imported, as the t-SNE cells in Sections 5.3.5 and 6.7.6 do. The previous
# platform check left `TSNE` undefined on platforms other than macOS/Linux
# and failed on Linux hosts without cuML.
try:
    from cuml import TSNE
except Exception:
    from sklearn.manifold import TSNE

# Embed both latent arrays with identical settings; perplexity and learning
# rate are scaled from the number of AEC samples M.
M = len(z_AEC)
results_GMM = TSNE(n_components=2, perplexity=int(M/50), early_exaggeration=2000, learning_rate=int(M/25), n_iter=3000, verbose=0, random_state=2009).fit_transform(z_AEC.astype('float64'))
results_DEC = TSNE(n_components=2, perplexity=int(M/50), early_exaggeration=2000, learning_rate=int(M/25), n_iter=3000, verbose=0, random_state=2009).fit_transform(z_DEC.astype('float64'))

In [None]:
# Plot each embedding coloured by its own method's labels.
fig1 = plotting.view_TSNE(results_GMM, labels_GMM, 't-SNE Results: GMM', show=True)
fig2 = plotting.view_TSNE(results_DEC, labels_DEC, 't-SNE Results: DEC', show=True)

In [None]:
# Export both t-SNE figures as PDF (fig1 = GMM, fig2 = DEC).
fig1.savefig(os.path.join(figure_savepath, 'tSNE_GMM.pdf'), dpi=300, facecolor='w')
fig2.savefig(os.path.join(figure_savepath, 'tSNE_DEC.pdf'), dpi=300, facecolor='w')

#### 7.3.4 View Latent Space

In [None]:
# Latent-space figure comparing the AEC/GMM and DEC results.
# NOTE(review): `p` and the two trailing positional True flags are opaque
# here — confirm their meaning against plotting.view_latent_space.
p = 2
fig = plotting.view_latent_space(
    z_AEC,
    z_DEC,
    labels_GMM,
    labels_DEC,
    centroids_GMM,
    centroids_DEC,
    n_clusters,
    p,
    True,
    True
)

In [None]:
# Export the latent-space figure as PDF (300 dpi, white background).
fig.savefig(os.path.join(figure_savepath, 'zspace.pdf'), dpi=300, facecolor='w')

#### 7.3.5 Cluster CDFs

In [None]:
# Per-cluster CDF figure comparing the AEC/GMM and DEC results.
# NOTE(review): `p` and the two trailing positional True flags are opaque
# here — confirm their meaning against plotting.view_class_cdf.
p = 2
fig = plotting.view_class_cdf(
    z_AEC,
    z_DEC,
    labels_GMM,
    labels_DEC,
    centroids_GMM,
    centroids_DEC,
    n_clusters,
    p,
    True,
    True
)

In [None]:
# Export the CDF figure as PDF (300 dpi, white background).
fig.savefig(os.path.join(figure_savepath, 'CDF.pdf'), dpi=300, facecolor='w')

#### 7.3.6 Cluster PDFs

In [None]:
# Per-cluster PDF figure comparing the AEC/GMM and DEC results.
# NOTE(review): `p` and the two trailing positional True flags are opaque
# here — confirm their meaning against plotting.view_class_pdf.
p = 2
fig = plotting.view_class_pdf(
    z_AEC,
    z_DEC,
    labels_GMM,
    labels_DEC,
    centroids_GMM,
    centroids_DEC,
    n_clusters,
    p,
    True,
    True
)

In [None]:
# Export the PDF-distribution figure as a PDF file (300 dpi, white background).
fig.savefig(os.path.join(figure_savepath, 'PDF.pdf'), dpi=300, facecolor='w')

[Return to Top](#contents)
<a id="section8"></a>
***
## <u>8 Environmental Data Exploration</u>

### 8.1 Calculate Dataset Statistics
Run [6.4](#BestDEC) first.

#### 8.1.1 Load Catalogue

In [None]:
# Disabled step: builds one {'idx', 'label'} record per sample from labels_DEC
# and passes the list to utils.save_labels together with the DEC run folder.
# NOTE(review): presumably this produces the Labels.csv read in the next
# cell — confirm before relying on it. `M` must equal the number of DEC labels.
# A = [{'idx': i, 'label': labels_DEC[i]} for i in np.arange(M)]
# utils.save_labels(A, os.path.join(exp_path_DEC))

In [None]:
# Undo the scientific-notation float format set in earlier sections.
pd.reset_option('display.float_format')
# The dataset's companion catalogue is the HDF file path with '.csv' appended
# (i.e. '<name>.h5.csv').
path_to_catalogue = f"{fname_dataset}.csv"
# Labels file expected in the selected DEC run folder.
path_to_labels = f"{exp_path_DEC}/Labels.csv"
catalogue = utils.LabelCatalogue([path_to_catalogue, path_to_labels])

#### 8.1.2 Station Statistics
View occurrence frequencies by station and label.

In [None]:
# Station statistics table, largest "N" first.
catalogue.station_statistics().sort_values(by="N", ascending=False)

#### 8.1.3 Amplitude Statistics
View amplitude characteristics for each class.

In [None]:
# Display the amplitude statistics computed by the label catalogue.
catalogue.amplitude_statistics()

#### 8.1.4 Seasonal Statistics
Compare occurrence frequencies in austral summer (JFM) to austral winter (JJA).

In [None]:
# Display the seasonal statistics.
# NOTE(review): the effect of mode=True is not visible here — confirm in
# utils.LabelCatalogue.seasonal_statistics.
catalogue.seasonal_statistics(mode=True)

#### 8.1.5 Peak Frequency Statistics
View average peak frequencies for each class:

In [None]:
# Compute peak frequencies from the HDF dataset in batches of 2048.
# NOTE(review): workers=12 is machine-specific — adjust to the host.
catalogue.get_peak_freq(fname_dataset, batch_size=2048, workers=12)

### 8.2 View Environmental Data & Detection Statistics

#### 8.2.1 View Station DR02

In [None]:
# Seismic station to plot and the identifier passed as `aws`
# (presumably an automatic weather station code — confirm).
station = "DR02"
aws = "gil"
# Time series of detections with three environmental variables, binned by
# hour, over the full record (no `times` window is given).
fig = plotting.view_series(
    station,
    aws,
    path_data,
    path_to_catalogue,
    path_to_labels,
    env_vars=["sea_ice_conc","temp","wind_spd"],
    freq="hour",
    maxcounts=20,
    title=f"Station {station} Inter-annual Scale",
    show=True
)

In [None]:
# Export the figure, named after the current `station` value.
fig.savefig(os.path.join(figure_savepath, f'{station}.eps'), dpi=300, facecolor='w')

#### 8.2.2 View Station RS09

In [None]:
# Station and `aws` identifier for this pair of figures.
station = "RS09"
aws = "mgt"
# Window of interest: 2016-06-15 to 2016-07-15.
start = datetime.datetime(2016,6,15)
stop = datetime.datetime(2016,7,15)
# Figure 1: full record, with the window passed as `vlines`
# (presumably drawn as vertical marker lines — confirm).
fig1 = plotting.view_series(
    station,
    aws,
    path_data,
    path_to_catalogue,
    path_to_labels,
    env_vars=["temp","wind_spd","tide"],
    vlines=[start, stop],
    freq="hour",
    maxcounts=30,
    figsize=(12,9),
    title=f"Station {station} Interannual Scale",
    show=True
)
# Figure 2: the same series restricted to the window via `times`.
# NOTE(review): the title says "Weekly Scale" but the window spans a month.
fig2 = plotting.view_series(
    station,
    aws,
    path_data,
    path_to_catalogue,
    path_to_labels,
    env_vars=["temp","wind_spd","tide"],
    times=[start, stop],
    freq="hour",
    maxcounts=20,
    figsize=(6,9),
    title=f"Station {station} Weekly Scale",
    showlabels=False,
    show=True
)

In [None]:
# Export both figures, named after the current `station` value
# ('_ia' = full-record figure, '_wk' = windowed figure).
fig1.savefig(os.path.join(figure_savepath, f'{station}_ia.eps'), dpi=300, facecolor='w')
fig2.savefig(os.path.join(figure_savepath, f'{station}_wk.eps'), dpi=300, facecolor='w')

#### 8.2.3 Other Stations

In [None]:
# Template for any other station: edit `station`, `aws`, and the window.
# Unlike the RS09 section, no cell saves these figures.
station = "RS17"
aws = "mgt"
# Window of interest: 2016-04-01 to 2016-04-15.
start = datetime.datetime(2016,4,1)
stop = datetime.datetime(2016,4,15)
# Figure 1: full record, with the window passed as `vlines`
# (presumably drawn as vertical marker lines — confirm).
fig1 = plotting.view_series(
    station,
    aws,
    path_data,
    path_to_catalogue,
    path_to_labels,
    env_vars=["temp","wind_spd","tide"],
    vlines=[start, stop],
    freq="hour",
    maxcounts=30,
    figsize=(12,9),
    title=f"Station {station} Interannual Scale",
    show=True
)
# Figure 2: the same series restricted to the window via `times`.
fig2 = plotting.view_series(
    station,
    aws,
    path_data,
    path_to_catalogue,
    path_to_labels,
    env_vars=["temp","wind_spd","tide"],
    times=[start, stop],
    freq="hour",
    maxcounts=20,
    figsize=(6,9),
    title=f"Station {station} Weekly Scale",
    showlabels=False,
    show=True
)

<a href="#contents">Return to Top</a>
<a id="appendixA"></a>
***
## Appendix A: Test for Optimal Number of Clusters

### A.1 Load Data
Run <a href="#BestAEC">4.5</a> first to get AEC weights, then 4.6.1 to load the dataset used below.

In [None]:
# Load the sample index; only the first returned value is kept (the second,
# presumably the validation index, is discarded).
index_tra, _ = utils.load_TraVal_index(fname_dataset, universal['indexpath'])

# Restrict the dataset (loaded in Section 4.6.1) to the training samples and
# wrap it in a loader. NOTE(review): num_workers=16 is machine-specific.
tra_dataset = Subset(dataset, index_tra)
dataloader = DataLoader(tra_dataset, batch_size=512, num_workers=16)

# Rebuild the autoencoder and load the weights selected in Section 4.5. The
# variable defined there is `weights_AEC`; `AEC_weights` does not exist in
# this notebook and raised a NameError.
model = AEC().to(device)
model = utils.load_weights(model, weights_AEC, device)

### A.2 Compute K-means Metrics

In [None]:
# Range of cluster counts given as a "first, last" string...
klist = '2, 20'
# ...expanded to the inclusive integer range 2..20 (hence the +1).
klist = np.arange(int(klist.split(',')[0]), int(klist.split(',')[1])+1)
# Compute the four metric series for every k using the AEC loaded above.
inertia, silh, gap_g, gap_u = models.kmeans_metrics(dataloader, model, device, klist)

### A.3 Plot Metrics

In [None]:
# Plot the four metrics against the tested cluster counts.
fig = plotting.view_cluster_stats(klist, inertia, silh, gap_g, gap_u, show=True)
# Persist the inertia values; with no directory or extension given, np.save
# writes 'kmeans_inertia.npy' in the current working directory.
np.save('kmeans_inertia', inertia)