In [None]:
#!/usr/bin/env python3

import argparse
import os
import numpy as np
from sklearn.cluster import Birch
from PIL import Image
import matplotlib.pyplot as plt
import shutil
import pandas as pd

from loader import *
from features import *
from visualisation import *
from merging import *

## Loading data ##
# load_soundings() returns six values; only wav and soundings are consumed in this cell.
wav, soundings, shift_std, shift_mean, space, time = load_soundings()
centred_soundings = centre(soundings)

## Creating features ##
feature_names = ("grad", "polyres", "poly_coeffs", "curv")
grad, polyres, poly_coeffs, curv = create_features(
    wav, centred_soundings, standardize, *feature_names
)

# Hand-picked columns: column 0 of poly_coeffs, column 1 of polyres, column 4 of grad.
selected_columns = (poly_coeffs[:, :1], polyres[:, 1:2], grad[:, 4:5])
features = np.concatenate(selected_columns, axis=1)

# Clustering input: the standardized soundings followed by the selected feature columns.
combined_features = np.concatenate(
    (standardize(centred_soundings), features), axis=1
)

## Fitting model ##
# n_clusters=None keeps the raw BIRCH subclusters (no final global clustering step).
model = Birch(n_clusters=None, threshold=2.5, branching_factor=50)
model = model.fit(combined_features)

minorlabels = model.labels_

###### Change in dispersion

In [None]:
# Merge criterion: change in dispersion.
# For every pair of BIRCH clusters, compare the dispersion of each cluster
# (square root of the trace of its feature covariance matrix) with the
# dispersion of the two clusters pooled together. A pair becomes a merge
# candidate when pooling *reduces* the dispersion relative to either member.
#
# The labels come from `minorlabels` (the fitted Birch model's labels_); the
# previous `br.labels_` referred to a name that is never defined in this notebook.
# NOTE(review): `combinations` is not imported explicitly in this notebook;
# presumably it is re-exported by one of the star imports -- confirm.
def _total_dispersion(points):
    """Return sqrt(trace(cov)) of `points`, treating rows as observations."""
    return np.sqrt(np.cov(points.T).trace())


edge_list = []
for pair in combinations(np.unique(minorlabels), 2):
    in_first = minorlabels == pair[0]
    in_second = minorlabels == pair[1]

    cluster1 = combined_features[in_first]
    cluster2 = combined_features[in_second]
    merged_cluster = np.vstack([cluster1, cluster2])

    dispersion1 = _total_dispersion(cluster1)
    dispersion2 = _total_dispersion(cluster2)
    merged_dispersion = _total_dispersion(merged_cluster)

    # Percentage change in dispersion relative to each original cluster.
    inc1 = 100*(merged_dispersion - dispersion1)/dispersion1
    inc2 = 100*(merged_dispersion - dispersion2)/dispersion2

    if inc1 < 0 or inc2 < 0:
        edge_list.append(pair)
        print(f"{pair} dispersion 1: {dispersion1:.3f} dispersion 2: {dispersion2:.3f} merged dispersion: {merged_dispersion:.3f}")
        print(f"increase 1: {inc1:.1f}%  increase 2: {inc2:.1f}%")
        # Only build the pooled soundings when they are actually plotted.
        merged_soundings = np.vstack([centred_soundings[in_first], centred_soundings[in_second]])
        plt.plot(wav, merged_soundings.T)
        plt.ylim([-0.5, 0.5])
        plt.show()

In [None]:
# Graph with one node per BIRCH cluster label and one edge per merge candidate
# in edge_list. Labels come from `minorlabels`; `br` is never defined here.
# NOTE(review): `nx` is not imported explicitly in this notebook; presumably
# networkx arrives through one of the star imports -- confirm.
G = nx.Graph()
G.add_nodes_from(np.unique(minorlabels))
G.add_edges_from(edge_list)

In [None]:
# For each connected component of G, print its index, the full set of member
# labels, and each member label in turn.
for component_index, component in enumerate(nx.connected_components(G)):
    for member in component:
        print(component_index, component, member)

In [None]:
# Plot the centred soundings of every BIRCH cluster, walking through the
# connected components of G so clusters linked by an edge are shown together.
# Fixes two NameErrors on a fresh kernel: `group` (now `cluster_labels`, the
# loop variable actually bound here) and `br.labels_` (now `minorlabels`).
for igroup, cluster_labels in enumerate(nx.connected_components(G)):
    for label in cluster_labels:
        print(igroup, cluster_labels, label)
        print(f"group: {label}")
        cluster = centred_soundings[minorlabels == label]
        plt.plot(wav, cluster.T)
        plt.ylim([-0.5,0.5])
        plt.show()

In [None]:
# NOTE(review): unfinished scratch cell -- it only rebinds `cluster`, and no
# later cell in the visible notebook reads that value; complete or delete it.
# It previously indexed with `pair[0]`, a variable leaked from an earlier
# cell's loop, and with the undefined `br`; it now uses its own loop variable
# and `minorlabels`.
unique_labels = np.unique(minorlabels)
for c in unique_labels:
    for cn in unique_labels:
        if c != cn:
            cluster = combined_features[minorlabels == c]

###### Maximum change

In [None]:
# Merge criterion: maximum change.
# Pool the centred soundings of each pair of BIRCH clusters and measure, per
# column, the range (max - min) across the pooled soundings. The pair becomes
# a merge candidate when the largest of those ranges stays below the threshold.
#
# Labels come from `minorlabels`; `br` is never defined in this notebook.
# NOTE(review): `combinations` is not imported explicitly in this notebook;
# presumably it is re-exported by one of the star imports -- confirm.
max_spread_threshold = 0.09

edge_list = []
for pair in combinations(np.unique(minorlabels), 2):
    cluster1 = centred_soundings[minorlabels == pair[0]]
    cluster2 = centred_soundings[minorlabels == pair[1]]

    merged_cluster = np.vstack([cluster1, cluster2])
    merged_spread = merged_cluster.max(axis=0) - merged_cluster.min(axis=0)
    max_spread = merged_spread.max()

    if max_spread < max_spread_threshold:
        edge_list.append(pair)
        print(f"{pair} max spread: {max_spread}")
        plt.plot(wav, merged_cluster.T)
        plt.ylim([-0.5, 0.5])
        plt.show()

In [None]:
# Graph with one node per BIRCH cluster label and one edge per merge candidate
# in edge_list. Labels come from `minorlabels`; `br` is never defined here.
# NOTE(review): `nx` is not imported explicitly in this notebook; presumably
# networkx arrives through one of the star imports -- confirm.
G = nx.Graph()
G.add_nodes_from(np.unique(minorlabels))
G.add_edges_from(edge_list)

In [None]:
# For each connected component of G, print its index, the full set of member
# labels, and each member label in turn.
for component_index, component in enumerate(nx.connected_components(G)):
    for member in component:
        print(component_index, component, member)

In [None]:
# Labels come from `minorlabels` (the fitted Birch model's labels_); the
# previous `br.labels_` referred to a name never defined in this notebook.
view_grouped_soundings(minorlabels, wav, centred_soundings)

###### Feature similarity

In [None]:
# Merge criterion: feature similarity.
# For each pair of BIRCH clusters, average the first selected feature column
# over each cluster and take the ratio of the smaller-magnitude average to the
# larger-magnitude one. The pair becomes a merge candidate when that ratio
# exceeds 0.9.
#
# Labels come from `minorlabels`; `br` is never defined in this notebook.
# NOTE(review): `combinations` is not imported explicitly in this notebook;
# presumably it is re-exported by one of the star imports -- confirm.
edge_list = []
for pair in combinations(np.unique(minorlabels), 2):
    in_first = minorlabels == pair[0]
    in_second = minorlabels == pair[1]

    cluster1 = centred_soundings[in_first]
    cluster2 = centred_soundings[in_second]

    feature1 = features[in_first, 0]
    feature2 = features[in_second, 0]

    feature_avg1 = feature1.mean(axis=0)
    feature_avg2 = feature2.mean(axis=0)

    # `>=` on the larger-magnitude side makes the two selections complementary:
    # with two strict comparisons, averages of equal magnitude but opposite
    # sign both resolved to feature_avg2 and scored a similarity of +1.
    mn = np.where(abs(feature_avg1) < abs(feature_avg2), feature_avg1, feature_avg2)
    mx = np.where(abs(feature_avg1) >= abs(feature_avg2), feature_avg1, feature_avg2)
    sim = mn/mx

    if sim.sum() > 0.9:
        edge_list.append(pair)
        print(f"clusters: {pair}")
        print(f"feature average 1: {feature_avg1}")
        print(f"feature average 2: {feature_avg2}")
        print(f"similarity: {sim} sum of similarities: {sim.sum()}")
        plt.plot(wav, cluster1.T)
        plt.plot(wav, cluster2.T)
        plt.ylim([-0.5, 0.5])
        plt.show()

In [None]:
# Graph with one node per BIRCH cluster label and one edge per merge candidate
# in edge_list. Labels come from `minorlabels`; `br` is never defined here.
# NOTE(review): `nx` is not imported explicitly in this notebook; presumably
# networkx arrives through one of the star imports -- confirm.
G = nx.Graph()
G.add_nodes_from(np.unique(minorlabels))
G.add_edges_from(edge_list)

In [None]:
# For each connected component of G, print its index, the full set of member
# labels, and each member label in turn.
for component_index, component in enumerate(nx.connected_components(G)):
    for member in component:
        print(component_index, component, member)

In [None]:
# Labels come from `minorlabels` (the fitted Birch model's labels_); the
# previous `br.labels_` referred to a name never defined in this notebook.
clus20 = view_grouped_soundings(minorlabels, wav, centred_soundings, return_cluster=True)

###### Manually choosing feature range

In [None]:
# Displays the distinct values present in finallabels.
# NOTE(review): in the visible notebook `finallabels` is first assigned in the
# last cell (from add_flat_labels), so on a fresh top-to-bottom run this cell
# raises NameError -- run it after that cell, or move it below it.
set(finallabels)

In [None]:
# Labels come from `minorlabels` (the fitted Birch model's labels_); the
# previous `br.labels_` referred to a name never defined in this notebook.
view_grouped_soundings(minorlabels, wav, centred_soundings)

In [None]:
def merge_labels(soundings, labels, grad, polyres, poly_coeffs, curv, debug_plots=False):
    """
    Merges cluster labels into final major classes by considering average features over a cluster

    Inputs
    soundings         2-D array of soundings, one row per sample (row-wise std is used for the flat test)
    labels            Labels as outputted by BIRCH algorithm, one per row of soundings
    grad              gradient calculated by create_features function (columns 0-3 are read)
    polyres           polynomial residual feature array as outputted by create_features function (column 1 is read)
    poly_coeffs       polynomial coefficients feature array (column 0 is read)
    curv              curvature feature array (columns 0-2 are read)
    debug_plots       If True, plot the soundings of each cluster that is classed as
                      positive curvature through the residual/coefficient test
                      (uses the module-level plt). Defaults to False so that
                      labelling has no plotting side effects.

    Outputs
    labels            Merged major labels corresponding to final classes:
                      2 flat, 3 high gradient, 4 negative curvature,
                      5 periodic, 6 positive curvature, 7 small gradient
    """
    FLAT = 2
    HIGH_GRADIENT = 3
    NEGATIVE_CURVATURE = 4
    PERIODIC = 5
    POSITIVE_CURVATURE = 6
    SMALL_GRADIENT = 7

    labels = np.asarray(labels)

    # Minor-cluster labels collected per major class. The order matches the
    # order in which the classes are written into the output below.
    members = {
        FLAT: [],
        PERIODIC: [],
        NEGATIVE_CURVATURE: [],
        POSITIVE_CURVATURE: [],
        HIGH_GRADIENT: [],
        SMALL_GRADIENT: [],
    }

    for c in np.unique(labels):
        in_cluster = labels == c

        # Cluster-averaged quantities that drive the decision chain.
        std = soundings[in_cluster].std(axis=1).mean()
        avg_pres1 = polyres[in_cluster, 1].mean()
        avg_pcoeff = poly_coeffs[in_cluster, 0].mean()
        avg_curv = curv[in_cluster, 0].mean() + curv[in_cluster, 1].mean()
        avg_grad = sum([grad[in_cluster, i].mean() for i in range(1, 3)])

        # Per-column cluster means, computed once; a negative min*max product
        # means the column means change sign.
        curv_means = np.array([curv[in_cluster, i].mean() for i in range(3)])
        grad_means = np.array([grad[in_cluster, i].mean() for i in range(4)])
        extreme_pcoeff = avg_pcoeff < -5 or avg_pcoeff > 3.5

        per_cond = [
            (curv_means.min() * curv_means.max()) < 0,
            (grad_means.min() * grad_means.max()) < 0,
            avg_pres1 > 2 or extreme_pcoeff,
        ]
        is_periodic = all(per_cond)
        strong_residual = avg_pres1 > 5 or extreme_pcoeff

        if (std < 0.01) and not is_periodic:
            major = FLAT

        elif is_periodic:
            major = PERIODIC

        elif strong_residual and avg_curv > 0:
            if debug_plots:
                plt.plot(soundings[in_cluster].T)
                plt.title(f"{per_cond}")
                plt.ylim([0,1])
                plt.show()
            major = POSITIVE_CURVATURE

        elif avg_curv > 2:
            major = POSITIVE_CURVATURE

        elif strong_residual and avg_curv < 0:
            major = NEGATIVE_CURVATURE

        elif avg_curv < -4 and avg_grad > 2:
            major = NEGATIVE_CURVATURE

        elif avg_grad > 3:
            major = HIGH_GRADIENT

        else:
            major = SMALL_GRADIENT

        members[major].append(c)

    # Membership is always tested against the ORIGINAL labels, so a minor label
    # that happens to equal a major class code cannot be relabelled twice.
    # np.isin replaces the deprecated np.in1d.
    labels2 = labels
    for major, minor_labels in members.items():
        labels2 = np.where(np.isin(labels, np.array(minor_labels)), major, labels2)

    return labels2
    
def add_flat_labels(soundings, labels):
    """
    Splits the flat class (label 2) according to the mean value of each sounding.

    Flat soundings whose row mean is above 0.99 are relabelled 0, flat soundings
    whose row mean is below 0.01 are relabelled 1, and every other label is
    passed through unchanged. The relabelled array is returned; the input
    labels are not modified.
    """
    row_means = soundings.mean(axis=1)
    is_flat = labels == 2

    relabel_high = is_flat & (row_means > 0.99)
    relabel_low = is_flat & (row_means < 0.01)

    return np.where(relabel_low, 1, np.where(relabel_high, 0, labels))

In [None]:
# Collapse the BIRCH minor clusters into major classes, then relabel the flat
# class (2) into 0 / 1 where the mean sounding value is above 0.99 / below 0.01.
majorlabels = merge_labels(
    soundings,
    minorlabels,
    grad=grad,
    polyres=polyres,
    poly_coeffs=poly_coeffs,
    curv=curv,
)
finallabels = add_flat_labels(soundings=soundings, labels=majorlabels)