In [None]:
import os
import numpy as np
import re
from PIL import Image
import networkx as nx
import h5py
from matplotlib import pyplot as plt
import matplotlib
from matplotlib import dates
from sklearn.cluster import (KMeans, SpectralClustering, AgglomerativeClustering, DBSCAN, OPTICS, Birch, MeanShift,
AffinityPropagation)
from sklearn.neighbors import kneighbors_graph
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
from sklearn.metrics import mean_squared_error as mse, silhouette_score, pairwise_distances
from glob import glob
from peakfinder import detect_peaks
from math import floor, ceil
from itertools import combinations
from datetime import datetime
from tqdm import tqdm
from loader import *
from features import *
from visualisation import *
# Keep NumPy's float printing un-suppressed (very small values may print in scientific notation).
np.set_printoptions(suppress=False)
# Scaling function handed to the feature helpers below. `standardize` is not defined in
# this notebook — presumably it comes from one of the star imports above (TODO confirm).
scaler = standardize

In [None]:
# Load the soundings from the level_1p0a HDF5 files with smoothing="poly".
# `load_soundings` is not defined in this notebook (presumably from `loader`), so the meaning
# of each returned value is inferred from its name only — TODO confirm.
# NOTE(review): if the pattern is expanded by glob, `[I|E]` is a character class that also
# matches a literal "|"; `[IE]` would be the tighter spelling — confirm.
wav, soundings, shift_std, shift_mean, space, time = load_soundings("../data/level_1p0a/*[I|E].h5", smoothing="poly")

In [None]:
# Centre the soundings, then derive a reduced version of the wavelength grid and soundings.
# `centre` and `dropout` are external helpers; what the argument 11 controls is not visible
# here — presumably a dropout/decimation factor (TODO confirm).
centred_soundings = centre(soundings)
comp_wav, comp_centred_soundings = dropout(wav, centred_soundings, 11)

In [None]:
# Build the four engineered feature arrays requested by name, passing `scaler` through.
# `create_features` is an external helper; presumably it returns one array per requested
# name, in the order requested — TODO confirm.
grad, polyres, poly_coeffs, curv = create_features(wav, centred_soundings, scaler, "grad", "polyres", "poly_coeffs", "curv")

In [None]:
# Log-transform the polynomial residuals, then rescale each column by the standard
# deviation of its logged values.
logpolyres = np.log(polyres)
logpolyres = logpolyres / logpolyres.std(axis=0)

# Cube-root versions of the remaining feature arrays (the cube root is defined for negatives too).
cbrt_poly_coeffs, cbrt_grad, cbrt_curv = (np.cbrt(arr) for arr in (poly_coeffs, grad, curv))

In [None]:
# Pick one column from each engineered feature array and stack them side by side;
# the `i:i+1` slices keep each pick two-dimensional so hstack yields one column per feature.
features = np.hstack([poly_coeffs[:,0:1], polyres[:,1:2], grad[:,4:5], curv[:,1:2]])
# Same column selection, taken from the log / cube-root transformed arrays.
sep_features = np.hstack([cbrt_poly_coeffs[:,0:1], logpolyres[:,1:2], cbrt_grad[:,4:5], cbrt_curv[:,1:2]])

In [None]:
# Feature spaces used by the clustering cells: scaled soundings (full or `comp_` version)
# concatenated column-wise with the engineered features (raw or `sep_` transformed).
combined_features = np.hstack([scaler(centred_soundings), features])
comp_combined_features = np.hstack([scaler(comp_centred_soundings), features])
sep_combined_features = np.hstack([scaler(centred_soundings), sep_features])
comp_sep_combined_features = np.hstack([scaler(comp_centred_soundings), sep_features])

In [None]:
# NOTE(review): this dict is disabled (wrapped in a string literal), yet several Gaussian
# mixture cells below read `index[...]`, including keys that do not appear here
# (e.g. "cbrt_grad300_350", "grad300_350", "polyres0") — confirm where `index` should come from.
"""index = {"poly_coeffs3": poly_coeffs[:,0], "poly_coeffs2": poly_coeffs[:,1], "poly_coeffs1": poly_coeffs[:,2], 
         "polyres1": polyres[:,0], "polyres2": polyres[:,1], "grad1": grad[:,0], "grad2": grad[:,1], 
         "grad3": grad[:,2], "grad4": grad[:,3], "curv1": curv[:,0], "curv2": curv[:,1], 
         "logpolyres1": logpolyres[:,0], "logpolyres2": logpolyres[:,1], "cbrt_poly_coeffs3": cbrt_poly_coeffs[:,0], 
         "cbrt_poly_coeffs2": cbrt_poly_coeffs[:,1], "cbrt_poly_coeffs1": cbrt_poly_coeffs[:,2], 
         "cbrt_grad1": cbrt_grad[:,0], "cbrt_grad2": cbrt_grad[:,1], "cbrt_grad3": cbrt_grad[:,2], 
         "cbrt_grad4": cbrt_grad[:,3], "cbrt_curv1": cbrt_curv[:,0], "cbrt_curv2": cbrt_curv[:,1]}""";

Positive and negative coefficients of $x^3$ are good at separating negative and positive curvature respectively, with periodic soundings showing high-magnitude coefficients at both ends.

The coefficient of $x^2$ does the same in reverse order. The coefficient of $x$ behaves the same as that of $x^3$.

# Spectral Clustering
Classifies periodic soundings if engineered features are used, puts periodic soundings in one cluster if raw transmissions are used.

Use a kNN graph, because an $\epsilon$-neighbourhood graph works similarly to DBSCAN.

In [None]:
# Spectral clustering into 8 clusters, fitted on `comp_centred_soundings`.
# NOTE(review): no `affinity` is passed, so scikit-learn's default kernel affinity (the one
# `gamma` applies to) is used, whereas the markdown above recommends a kNN graph
# (affinity="nearest_neighbors") — confirm which is intended.
# No random_state is set, so cluster labels may differ between runs.
sc = SpectralClustering(n_clusters=8, gamma=0.1).fit(comp_centred_soundings)

In [None]:
# Show the full-resolution centred soundings grouped by their spectral-clustering label.
# `view_grouped_soundings` is an external helper (presumably from `visualisation`).
clus = view_grouped_soundings(sc.labels_, wav, centred_soundings)

# Agglomerative Clustering
Builds a tree by iteratively merging the closest points; the tree is then cut at the specified number of clusters.

In [None]:
# Dense 0/1 connectivity matrix linking each sample to its 5 nearest neighbours
# (include_self=True counts the sample itself as one of them); used below to constrain merges.
kng = kneighbors_graph(comp_combined_features, 5, mode='connectivity', include_self=True).toarray()

In [None]:
# Connectivity-constrained, average-linkage agglomerative clustering into 20 clusters.
# The affinity must be a real metric here: "precomputed" tells scikit-learn that X is a
# square (n_samples, n_samples) distance matrix, but `comp_combined_features` is a
# (n_samples, n_features) feature matrix, so the fit would fail or be meaningless.
# memory="cache" caches the computed tree in a local "cache" directory.
ac = AgglomerativeClustering(
    n_clusters=20,
    memory="cache",
    compute_full_tree=True,
    affinity="euclidean",
    linkage="average",
    connectivity=kng,
).fit(comp_combined_features)

In [None]:
# Show the full-resolution centred soundings grouped by their agglomerative-clustering label.
clus = view_grouped_soundings(ac.labels_, wav, centred_soundings)

# Gaussian Mixtures 
Superset of k-means: gaussians instead of centroids. Number of components is analogous to number of clusters. 

Weights, means and covariances are initialised using the result of a k-means algorithm. Means are initialised as centroids, weights are the proportion of the dataset assigned to each cluster, and covariances are the within-cluster covariances.

Regularisation adds a small positive constant to the diagonal of the covariance matrix.

In [None]:
# Linearly increasing weights 0..3, nudged by a tiny epsilon so none is exactly zero,
# then normalised so they sum to one.
weights = np.arange(4) + 1e-15
weights = weights / weights.sum()
weights

###### Number of components

In [None]:
# Sweep the number of mixture components (5..29) and record AIC, BIC and the
# per-sample average log-likelihood of each fitted model.
# NOTE(review): `standardised_combined_features` is not defined anywhere in this notebook —
# confirm which feature matrix is meant before a Restart & Run All.
aic, bic, score = [], [], []

for n in range(5, 30):
    gm = GaussianMixture(
        n_components=n, covariance_type="full", init_params="random", verbose=False
    ).fit(standardised_combined_features)
    score.append(gm.score(standardised_combined_features))
    aic.append(gm.aic(standardised_combined_features))
    bic.append(gm.bic(standardised_combined_features))

# One figure per criterion, plotted against the number of components.
for title, curve in (("aic", aic), ("bic", bic), ("log likelihood", score)):
    plt.plot(np.arange(5, 30), curve)
    plt.title(title)
    plt.grid()
    plt.show()

###### Tolerance

In [None]:
# Sweep the EM convergence tolerance (1e-7 down to 1e-10) at 15 components;
# max_iter is made effectively unbounded so that `tol` alone decides when EM stops.
# NOTE(review): `standardised_combined_features` is not defined anywhere in this notebook.
aic, bic, score = [], [], []

for n in np.logspace(-7, -10, 5):
    gm = GaussianMixture(
        n_components=15, covariance_type="full", init_params="random", verbose=False,
        tol=n, max_iter=int(1e15),
    ).fit(standardised_combined_features)
    score.append(gm.score(standardised_combined_features))
    aic.append(gm.aic(standardised_combined_features))
    bic.append(gm.bic(standardised_combined_features))

# One log-x figure per criterion, plotted against the tolerance.
for title, curve in (("aic", aic), ("bic", bic), ("log likelihood", score)):
    plt.semilogx(np.logspace(-7, -10, 5), curve)
    plt.title(title)
    plt.grid()
    plt.show()

###### Regularisation

In [None]:
# Sweep the covariance regularisation (1e-9 down to 1e-12) at 15 components, tol=1e-7.
# NOTE(review): `standardised_combined_features` is not defined anywhere in this notebook.
aic, bic, score = [], [], []

for n in np.logspace(-9, -12, 4):
    gm = GaussianMixture(
        n_components=15, covariance_type="full", init_params="random", verbose=False,
        tol=1e-7, max_iter=int(1e15), reg_covar=n,
    ).fit(standardised_combined_features)

    score.append(gm.score(standardised_combined_features))
    aic.append(gm.aic(standardised_combined_features))
    bic.append(gm.bic(standardised_combined_features))

# One log-x figure per criterion, plotted against the regularisation constant.
for title, curve in (("aic", aic), ("bic", bic), ("log likelihood", score)):
    plt.semilogx(np.logspace(-9, -12, 4), curve)
    plt.title(title)
    plt.grid()
    plt.show()

In [None]:
# Fit a 15-component full-covariance mixture with very tight tolerance and minimal
# regularisation, then label every sample and display the grouped soundings.
# NOTE(review): `norm_combined_features` is not defined anywhere in this notebook.
gm = GaussianMixture(
    n_components=15,
    covariance_type="full",
    init_params="random",
    tol=1e-15,
    max_iter=int(1e15),
    reg_covar=1e-15,
).fit(norm_combined_features)

gmlabels = gm.predict(norm_combined_features)
print(gm.score(norm_combined_features))
clus = view_grouped_soundings(gmlabels, wav, centred_soundings)

In [None]:
# NOTE(review): `eng_features` is not defined in this notebook and `br` is only fitted in the
# BIRCH section further down, so this cell fails on a fresh Restart & Run All. It also
# colours by BIRCH labels inside the Gaussian mixture section — presumably left over from
# out-of-order execution; confirm whether `gmlabels` was intended.
view_feature_space(eng_features, ["poly_coeffs", "polyres", "grad"], br.labels_, 4)

In [None]:
# Compare the four covariance structures on the same feature matrix.
# NOTE(review): `att_standardised_combined_features` and `index` are not defined in this
# notebook (the only `index` dict is in the disabled cell near the top).
for cov in ["full", "tied", "diag", "spherical"]:
    print(f"\n#### covariance type: {cov} ####\n")

    # Use the loop variable: covariance_type was hard-coded to "full", so all four
    # iterations fitted the same kind of model under different headings.
    # fit_predict returns the label array, so `gm` holds labels rather than the estimator.
    gm = GaussianMixture(
        n_components=15, covariance_type=cov, init_params="random", tol=1e-3, max_iter=100,
        reg_covar=1e-14, verbose=True,
    ).fit_predict(att_standardised_combined_features)

    clus = view_clusters(gm, wav, centred_soundings, ["cbrt_poly_coeffs3", index["cbrt_poly_coeffs3"]],
                         ["cbrt_grad300_350", index["cbrt_grad300_350"]], ["logpolyres1", index["logpolyres1"]])

In [None]:
# Compare the four covariance structures on `att_eng_features`, taking the best of
# 3 random initialisations and applying no covariance regularisation (reg_covar=0).
# NOTE(review): `att_eng_features` and `index` are not defined in this notebook.
for cov in ("full", "tied", "diag", "spherical"):
    print(f"\n#### covariance type: {cov} ####\n")
    # fit_predict returns the label array, so `gm` holds labels rather than the estimator.
    gm = GaussianMixture(
        n_components=15,
        covariance_type=cov,
        init_params="random",
        n_init=3,
        tol=1e-5,
        max_iter=1000,
        reg_covar=0,
        verbose=True,
    ).fit_predict(att_eng_features)

    clus = view_clusters(
        gm, wav, centred_soundings,
        ["cbrt_poly_coeffs3", index["cbrt_poly_coeffs3"]],
        ["cbrt_grad300_350", index["cbrt_grad300_350"]],
        ["logpolyres1", index["logpolyres1"]],
    )

In [None]:
# Compare the four covariance structures on `comp_centred_soundings` (no engineered
# features), taking the best of 3 random initialisations with reg_covar=0.
# NOTE(review): `index` is not defined in this notebook.
for cov in ("full", "tied", "diag", "spherical"):
    print(f"\n#### covariance type: {cov} ####\n")
    # fit_predict returns the label array, so `gm` holds labels rather than the estimator.
    gm = GaussianMixture(
        n_components=15,
        covariance_type=cov,
        init_params="random",
        n_init=3,
        tol=1e-5,
        max_iter=1000,
        reg_covar=0,
        verbose=True,
    ).fit_predict(comp_centred_soundings)

    clus = view_clusters(
        gm, wav, centred_soundings,
        ["cbrt_poly_coeffs3", index["cbrt_poly_coeffs3"]],
        ["cbrt_grad300_350", index["cbrt_grad300_350"]],
        ["logpolyres1", index["logpolyres1"]],
    )

In [None]:
# Compare the four covariance structures on the standardised combined features,
# taking the best of 3 random initialisations.
# NOTE(review): `standardised_combined_features` and `index` are not defined in this notebook.
for cov in ["full", "tied", "diag", "spherical"]:
    print(f"\n#### covariance type: {cov} ####\n")
    # fit_predict returns the label array, so `gm` holds labels rather than the estimator.
    gm = GaussianMixture(
        n_components=15, covariance_type=cov, init_params="random", n_init=3, tol=1e-5, max_iter=1000,
        reg_covar=1e-14, verbose=True,
    ).fit_predict(standardised_combined_features)

    # The third axis plots index["polyres1"]; it was previously labelled "logpolyres1",
    # which misdescribed the untransformed data shown.
    clus = view_clusters(gm, wav, centred_soundings, ["poly_coeffs3", index["poly_coeffs3"]],
                         ["grad300_350", index["grad300_350"]], ["polyres1", index["polyres1"]])

In [None]:
# Compare the four covariance structures on `eng_features`, taking the best of
# 3 random initialisations with reg_covar=0.
# NOTE(review): `eng_features` and `index` are not defined in this notebook.
for cov in ("full", "tied", "diag", "spherical"):
    print(f"\n#### covariance type: {cov} ####\n")
    # fit_predict returns the label array, so `gm` holds labels rather than the estimator.
    gm = GaussianMixture(
        n_components=15,
        covariance_type=cov,
        init_params="random",
        n_init=3,
        tol=1e-5,
        max_iter=1000,
        reg_covar=0,
        verbose=True,
    ).fit_predict(eng_features)

    clus = view_clusters(
        gm, wav, centred_soundings,
        ["poly_coeffs3", index["poly_coeffs3"]],
        ["grad300_350", index["grad300_350"]],
        ["polyres0", index["polyres0"]],
    )

# Mean Shift
High bandwidth puts all soundings in the same cluster. A well tuned bandwidth will automatically detect the appropriate number of clusters.

Sliding windows move towards areas of high density, and the bandwidth controls the volume of the sliding windows, so they can cluster the entire dataset into one cluster, create clusters for individual points, or, if fine-tuned, find local density maxima in the feature space.

bandwidth ~ 0.04 for raw spectra

engineered features: bandwidth ~ 2, classifies periodic soundings well

good with non-periodic soundings

In [None]:
# Count how many clusters mean shift finds for ten bandwidths between 0.1 and 1.
# NOTE(review): `standardised_combined_features` is not defined anywhere in this notebook.
n_clusters = []
for n in np.linspace(0.1, 1, 10):
    ms = MeanShift(bandwidth=n, cluster_all=True).fit(standardised_combined_features)
    distinct_labels = set(ms.labels_)
    n_clusters.append(len(distinct_labels))

plt.plot(np.linspace(0.1, 1, 10), n_clusters)

In [None]:
# Mean shift on `comp_centred_soundings` with bandwidth 0.03, using all CPU cores (n_jobs=-1);
# cluster_all=True assigns every sample to a cluster instead of leaving orphans unlabelled.
ms = MeanShift(bandwidth=0.03, cluster_all=True, n_jobs=-1).fit(comp_centred_soundings)

In [None]:
# Number of distinct cluster labels produced by mean shift.
len(set(ms.labels_))

In [None]:
# Show the full-resolution centred soundings grouped by their mean-shift label.
clus = view_grouped_soundings(ms.labels_, wav, centred_soundings)

In [None]:
# NOTE(review): `eng_features` is not defined in this notebook and `br` is only fitted in the
# BIRCH section below, so this cell fails on a fresh Restart & Run All; it also colours by
# BIRCH labels inside the mean-shift section — confirm whether `ms.labels_` was intended.
view_feature_space(eng_features, ["poly_coeffs", "polyres", "grad"], br.labels_, 4)

# BIRCH
Creates a cluster feature tree, branching factor controls how many children can be added to a node in the tree and threshold controls the size of subclusters.

###### optimal parameters for feature spaces
standardised combined_features: threshold = 4.925, branching_factor=5

norm_combined_features: threshold=0.9, branching_factor=50. (Usually decreasing threshold results in more clusters but decreasing from 0.9 to 0.89 decreases number of clusters by 1.)

Separating with logarithm and cube root produces lower-quality clusters.

In [None]:
# BIRCH on the combined features. n_clusters=None skips the final global clustering step,
# so the labels are the leaf subclusters themselves and their count is driven by `threshold`.
br = Birch(n_clusters=None, threshold=4, branching_factor=50).fit(combined_features)

In [None]:
# Number of distinct cluster labels produced by BIRCH.
len(set(br.labels_))

In [None]:
# Show the full-resolution centred soundings grouped by their BIRCH label.
clus = view_grouped_soundings(br.labels_, wav, centred_soundings)

In [None]:
def view_feature_space(features, names, labels, c=None):
    """
    Scatter-plot a feature space coloured by cluster label.

    Two features give one 2D scatter and three give one 3D scatter. With more than
    three, the first two are plotted in 2D and the remaining columns are handled by a
    recursive call, so the features are shown as a sequence of figures.

    Parameters
    ----------
    features : array of shape (n_samples, n_features), with n_features >= 2
    names : sequence of axis labels, one per feature column
    labels : array-like of cluster labels, one per sample
    c : cluster label to highlight, or None
        If None, every point is coloured by its own label. Otherwise points are
        coloured 1.0 where ``labels == c`` and 0.0 elsewhere, so the chosen cluster
        stands out (yellow under matplotlib's default colormap).

    Raises
    ------
    ValueError
        If fewer than two feature columns are supplied.
    """
    n_features = features.shape[1]
    if n_features < 2:
        # Fail before creating a figure, rather than leaving an empty one behind.
        raise ValueError(f"view_feature_space needs at least 2 features, got {n_features}")

    # Accept plain lists as well as arrays; a list would break `labels == c` and `.astype`.
    labels = np.asarray(labels)
    # `is None` rather than `== None`, so a label that compares oddly with None is still safe.
    colours = labels.astype(float) if c is None else (labels == c).astype(float)

    fig = plt.figure()
    if n_features == 3:
        ax = fig.add_subplot(projection="3d")
        ax.scatter(features[:, 0], features[:, 1], features[:, 2], c=colours)
        ax.set_zlabel(names[2])
    else:
        ax = fig.add_subplot()
        ax.scatter(features[:, 0], features[:, 1], c=colours)

    ax.set_xlabel(names[0])
    ax.set_ylabel(names[1])

    if n_features > 3:
        # Plot the remaining columns in further figures (at least two always remain).
        view_feature_space(features[:, 2:], names[2:], labels, c)

In [None]:
# Plot the four engineered features coloured by BIRCH label; with four columns this
# produces two 2D scatter figures (columns 0-1, then columns 2-3).
view_feature_space(features, ["poly_coeffs", "polyres", "gradient", "curvature"], br.labels_)

In [None]:
# NOTE(review): this backend magic sits after every plotting cell, so it has no effect on them
# during a top-to-bottom run; it presumably belongs in the first (imports/config) cell.
%matplotlib inline