# CMP STEP 5 & 6: Reducing Dimensionality & Clustering

In [None]:
import warnings
# Install a blanket "ignore" filter: every warning category is silenced from
# here on. This keeps cell output readable but also hides deprecation notices.
warnings.filterwarnings("ignore")

In [None]:
import pandas as pd
import numpy as np

import umap
import hdbscan

import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [None]:
# random seed for umap embedding; restart kernel if resetting seed!
# Exercise placeholder: assign a value here before running the cells below.
# The value is forwarded unchanged to umap.UMAP(random_state=...).
RANDOM_STATE = # EX4.1 YOUR CODE HERE

In [None]:
# use following parameters in evaluation loop if parameter not varying
# !!! RESTART KERNEL WHEN SETTING NEW SEED - else UMAP results might be influenced by previous seed !!!

# UMAP parameters
N_NEIGHBORS = 30   # passed as n_neighbors to umap.UMAP
MIN_DIST = 0.01    # passed as min_dist to umap.UMAP
N_COMPONENTS = 30  # default target dimensionality; evaluation_loop sweeps its own values instead
# No-op re-binding of the seed defined in the cell above; it only lists the
# seed together with the other UMAP parameters.
RANDOM_STATE = RANDOM_STATE

# HDBSCAN parameters
MIN_SAMPLES = 30        # default min_samples; evaluation_loop sweeps its own values instead
MIN_CLUSTER_SIZE = 10   # passed as min_cluster_size to hdbscan.HDBSCAN

### Load strategy texts & embeddings

In [None]:
# load raw strategy sentences
# Reads the CSV into a DataFrame and keeps its 'text' column as a plain list.
# NOTE(review): `strategies` is not referenced by the cells visible in this
# notebook section; presumably it is used further on.
data = pd.read_csv('data/strategies_raw_en.csv')
strategies = data['text'].tolist()

In [None]:
# load strategy embeddings from the .npy file (np.load already returns a numpy array)
# presumably one row per entry of `strategies` - TODO confirm
strategy_embeddings = np.load('data/your_strategy_embeddings_masked.npy')

### Define evaluation & plotting function

In [None]:
def evaluation_loop(Ns=(10, 30, 50)):
    """Count HDBSCAN clusters for several UMAP target dimensionalities.

    For every value in ``Ns`` the module-level ``strategy_embeddings`` are
    reduced with UMAP (cosine metric, using ``N_NEIGHBORS``, ``MIN_DIST`` and
    ``RANDOM_STATE``). The reduced points are then clustered with HDBSCAN
    (euclidean metric, ``MIN_CLUSTER_SIZE``) once for every ``min_samples``
    value in ``range(10, 51, 5)``.

    Args:
        Ns: iterable of UMAP ``n_components`` values to evaluate.

    Returns:
        dict mapping each value of ``Ns`` to a list of cluster counts, one
        entry per ``min_samples`` value (10, 15, ..., 50) in that order.
        A count is ``labels_.max() + 1``, so noise points (label -1 in
        HDBSCAN) are not counted as a cluster.
    """
    # HDBSCAN min_samples values swept for every UMAP embedding
    # (plot_values uses the same 10..50 grid on its x axis)
    min_samples_grid = range(10, 51, 5)

    print('STARTING EVALUATION LOOP')
    eval_dict = {}

    for N in Ns:
        print(f"--- Embedding with UMAP into {N} dimensions (n_neighbors = {N_NEIGHBORS}) ---")
        # fit_transform returns the embedding learned during fitting directly;
        # a separate transform() call on the training data is not needed.
        umap_embeddings = umap.UMAP(n_neighbors=N_NEIGHBORS,
                                    min_dist=MIN_DIST,
                                    n_components=N,
                                    metric='cosine',
                                    random_state=RANDOM_STATE).fit_transform(strategy_embeddings)

        no_clusters = []

        print('Running HDBSCAN with ...')
        for K in min_samples_grid:
            print(f'... {K} nearest neighbors')

            clusterer = hdbscan.HDBSCAN(min_samples=K,
                                        min_cluster_size=MIN_CLUSTER_SIZE,
                                        metric='euclidean',
                                        gen_min_span_tree=True)

            clusterer.fit(umap_embeddings)

            # labels are 0..k-1 for clusters and -1 for noise -> k = max + 1;
            # cast to a plain int so the result holds no numpy scalars
            no_clusters.append(int(clusterer.labels_.max() + 1))

        eval_dict[N] = no_clusters

    return eval_dict

In [None]:
def plot_values(eval_dict):
    """Plot the number of clusters per HDBSCAN setting, one subplot per UMAP run.

    Args:
        eval_dict: output of ``evaluation_loop`` - a dict mapping a UMAP
            ``n_components`` value to the list of cluster counts obtained for
            HDBSCAN k = 10, 15, ..., 50. Any number of entries (>= 1) is
            supported; one subplot column is created per entry.

    Returns:
        The plotly figure: a single row of line+marker subplots sharing the
        y axis.

    Raises:
        ValueError: if ``eval_dict`` is empty.
    """
    Ns = list(eval_dict)
    if not Ns:
        raise ValueError('eval_dict is empty - nothing to plot')

    # one column per evaluated UMAP dimensionality instead of a fixed three
    fig = make_subplots(rows=1,
                        cols=len(Ns),
                        subplot_titles=[f'UMAP_n = {N}' for N in Ns],
                        shared_yaxes=True)

    # x grid matches the min_samples values swept by evaluation_loop
    hdbscan_ks = np.arange(10, 51, 5)

    for col, N in enumerate(Ns, start=1):
        fig.add_trace(go.Scatter(x=hdbscan_ks,
                                 y=eval_dict[N],
                                 mode='lines+markers'),
                      row=1, col=col)
        fig.update_xaxes(title_text='HDBSCAN_k', row=1, col=col)

    # y axes are shared, so only the left-most subplot carries the title
    fig.update_yaxes(title_text='number of clusters', row=1, col=1)

    fig.update_layout(showlegend=False)

    return fig

### Varying UMAP components over HDBSCAN k-nearest neighbors

In [None]:
# evaluate different components of UMAP embedding
# Exercise placeholder: supply the UMAP n_components values to compare as `Ns`
# and close the call. plot_values below draws exactly three subplots.
eval_dict = evaluation_loop(Ns= # EX4.2 YOUR CODE HERE )

In [None]:
# plot number of clusters found for UMAP embeddings with different number of components
# (x axis: HDBSCAN k from 10 to 50 in steps of 5, one subplot per UMAP setting)
fig = plot_values(eval_dict)
fig.show()