## Job Crafting: Reducing Dimensionality & Clustering

In [None]:
# !!! RESTART KERNEL WHEN SETTING NEW SEED - else UMAP results might be influenced by previous seed !!!
# Seed handed to UMAP as `random_state`; it is also used to tag the result lines
# and the plot file names written by this notebook.
RANDOM_STATE = 772

In [None]:
# Directory holding the input files read below
# ('strategies_raw.csv' and 'strategy_embeddings_masked.npy').
EMBEDDINGS = 'data/'

In [None]:
# use following parameters in evaluation loop if parameter not varying
MIN_DIST = 0.01  # UMAP min_dist, default: 0.1
MIN_CLUSTER_SIZE = 10  # HDBSCAN min_cluster_size (hdbscan library default: 5)

# vary over the following parameters in evaluation loop
K_UMAP = [15, 30, 40, 45, 50]  # passed to UMAP as n_neighbors
N_UMAP = [10, 30, 50]  # passed to UMAP as n_components (target dimensions)
K_HDBSCAN = [10, 15, 20, 25, 30, 35, 40, 45, 50]  # passed to HDBSCAN as min_samples

In [None]:
import warnings
# Suppress every warning for the rest of the session: no category or module
# filter is given, so this also hides deprecation and convergence warnings.
warnings.filterwarnings("ignore")

In [None]:
import pandas as pd
import os
import numpy as np

import umap
import hdbscan

import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [None]:
# create folders to save evaluation files & plots
# makedirs also creates the parent 'evaluation' folder; exist_ok=True avoids the
# check-then-create race and makes this cell safe to re-run
os.makedirs('evaluation/plots', exist_ok=True)

### Load reappraisal texts & embeddings

In [None]:
# load raw reappraisal sentences
# os.path.join keeps the path valid whether or not EMBEDDINGS ends with a separator
data = pd.read_csv(os.path.join(EMBEDDINGS, 'strategies_raw.csv'), sep=';')
# one text per row, taken from the 'strategy text' column
strategies = data['strategy text'].tolist()

In [None]:
# load reappraisal embeddings (np.load already returns a numpy array)
# os.path.join keeps the path valid whether or not EMBEDDINGS ends with a separator
strategy_embeddings = np.load(os.path.join(EMBEDDINGS, 'strategy_embeddings_masked.npy'))

### Evaluate hyperparameters and save to file

In [None]:
### DEFINE EVALUATION FUNCTION ###

def evaluation_loop(k_umap=K_UMAP, n_umap=N_UMAP, k_hdbscan=K_HDBSCAN):
    """Evaluate UMAP in combination with HDBSCAN over a grid of parameters.

    For every combination of UMAP neighbors and UMAP dimensions the global
    ``strategy_embeddings`` are reduced once; the reduced embeddings are then
    clustered with HDBSCAN once per ``min_samples`` value.

    Args:
        k_umap: iterable, values for UMAP ``n_neighbors``.
        n_umap: iterable, values for UMAP ``n_components``.
        k_hdbscan: iterable, values for HDBSCAN ``min_samples``.

    Returns:
        Nested dict ``result[K][N][H]`` with two entries per parameter
        combination: ``'number of clusters'`` (highest cluster label + 1) and
        ``'noise'`` (count of points labelled ``-1``), both as plain Python
        ints.
    """
    print(f"STARTING EVALUATION LOOP FOR SEED {RANDOM_STATE}")
    eval_dict = {}

    for K in k_umap:
        K_eval_dict = {}

        for N in n_umap:
            N_eval_dict = {}

            print(f"- Evaluating UMAP with {K} neighbors and {N} dimensions")
            mapper = umap.UMAP(n_neighbors=K,
                               min_dist=MIN_DIST,
                               n_components=N,
                               metric='cosine',  # default: 'euclidean'
                               random_state=RANDOM_STATE)

            # fit and embed in one step: calling transform() on the training
            # data afterwards is redundant work
            umap_embeddings = mapper.fit_transform(strategy_embeddings)

            print(f"- Running HDBSCAN with {k_hdbscan} neighbors")
            for H in k_hdbscan:
                clusterer = hdbscan.HDBSCAN(min_samples=H,
                                            min_cluster_size=MIN_CLUSTER_SIZE,
                                            metric='euclidean',  # default: 'euclidean'
                                            gen_min_span_tree=True)

                clusterer.fit(umap_embeddings)
                labels = clusterer.labels_

                # cast to plain ints: NumPy scalars would otherwise show up as
                # e.g. 'np.int64(5)' when the results are written to file
                N_eval_dict[H] = {
                    'number of clusters': int(labels.max() + 1),
                    'noise': int((labels == -1).sum()),
                }

            K_eval_dict[N] = N_eval_dict

        eval_dict[K] = K_eval_dict

    return eval_dict

In [None]:
# run evaluation loop for current seed (default parameter grids);
# the nested result dict is indexed as results_seed[k_umap][n_umap][k_hdbscan]
results_seed = evaluation_loop()

In [None]:
# save results for current seed to files
# files are opened in append mode, so every run adds its lines to the existing files
for file_name, metric in (('evaluation/eval-results-no-clusters.txt', 'number of clusters'),
                          ('evaluation/eval-results-noise.txt', 'noise')):
    with open(file_name, 'a') as f:
        for k in K_UMAP:
            for n in N_UMAP:
                # cast to int so NumPy scalars are written as '5', not 'np.int64(5)'
                values = [(h, int(results_seed[k][n][h][metric])) for h in K_HDBSCAN]
                f.write(f"s{RANDOM_STATE}_k{k}_n{n} = {values}\n")
            # blank line between the blocks of different k values
            f.write('\n')

### Plot results

In [None]:
### DEFINE PLOTTING FUNCTION ###

def plot_values(k_umap=K_UMAP, n_umap=N_UMAP, k_hdbscan=K_HDBSCAN, parameter='number of clusters',
                width=1100, height=1000, results=None):
    """Plot number of clusters or noise per evaluation run as a grid of line charts.

    The grid has one row per UMAP neighbors value and one column per UMAP
    dimensions value; each cell plots ``parameter`` against the HDBSCAN values.

    Args:
        k_umap: sequence, values for UMAP nearest neighbors (grid rows).
        n_umap: sequence, values for UMAP dimensions (grid columns).
        k_hdbscan: sequence, values for HDBSCAN nearest neighbors (x axis).
        parameter: string, 'number of clusters' or 'noise'.
        width: integer, plot width.
        height: integer, plot height.
        results: nested dict indexed as ``results[k][n][h][parameter]``;
            defaults to the notebook-level ``results_seed``.

    Returns:
        The plotly figure.
    """
    if results is None:
        # fall back to the results of the current seed computed in the notebook
        results = results_seed

    n_rows, n_cols = len(k_umap), len(n_umap)

    # titles are only given for the first row of subplots (the column headers)
    fig = make_subplots(rows=n_rows, cols=n_cols,
                        subplot_titles=[f"n_UMAP = {n}" for n in n_umap],
                        shared_yaxes=True, vertical_spacing=0.05)

    for row, k in enumerate(k_umap, start=1):
        for col, n in enumerate(n_umap, start=1):
            fig.add_trace(go.Scatter(x=k_hdbscan,
                                     y=[results[k][n][h][parameter] for h in k_hdbscan],
                                     mode='lines+markers'),
                          row=row, col=col)
        # label each row on its left-most y axis
        fig.update_yaxes(title_text=f"k_umap = {k}", row=row, col=1)

    # label the x axes of the bottom row only
    for col in range(1, n_cols + 1):
        fig.update_xaxes(title_text='k_hdbscan', row=n_rows, col=col)

    fig.update_layout(title_text=f"{parameter[:1].upper()}{parameter[1:]} // Seed = {RANDOM_STATE}",
                      title_x=0.5,
                      width=width,
                      height=height,
                      font_size=13,
                      title_font_size=20,
                      showlegend=False)

    # subplot titles are annotations; make them slightly larger than the base font
    fig.update_annotations(font_size=15)

    return fig

In [None]:
# plot results for number of clusters and current seed
fig = plot_values(parameter='number of clusters')
fig.show()
# save to file (SVG, file name tagged with the seed)
fig.write_image(f'evaluation/plots/no-clusters-seed={RANDOM_STATE}.svg')

In [None]:
# plot results for noise and current seed
fig = plot_values(parameter='noise')
fig.show()
# save to file (SVG, file name tagged with the seed)
fig.write_image(f'evaluation/plots/noise-seed={RANDOM_STATE}.svg')