# CMP STEP 9: Interpreting & Comparing

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
import pandas as pd
import numpy as np

import umap
import hdbscan 

from bokeh.plotting import figure, show
from bokeh.transform import factor_cmap
from bokeh.palettes import Turbo256
from bokeh.models import HoverTool

In [None]:
# --- UMAP: final hyperparameters -------------------------------------------
N_NEIGHBORS = 30       # passed as n_neighbors
MIN_DIST = 0.01        # passed as min_dist
N_COMPONENTS = 30      # dimensionality of the embedding that gets clustered
RANDOM_STATE = 86531   # seed; also part of the cached UMAP embedding filename

# --- HDBSCAN: final hyperparameters ----------------------------------------
MIN_SAMPLES = 30        # passed as min_samples
MIN_CLUSTER_SIZE = 10   # passed as min_cluster_size

### Load strategy texts & embeddings

In [None]:
# read the raw strategy sentences from CSV and keep the 'text' column as a
# plain Python list (one entry per strategy)
data = pd.read_csv('data/strategies_raw_en.csv')
strategies = data.loc[:, 'text'].to_list()

In [None]:
# load the precomputed strategy embeddings from disk; np.load already returns
# them as a numpy array, so no further conversion is needed
# NOTE(review): presumably one row per entry of `strategies`, in the same
# order — confirm against the script that produced the file
strategy_embeddings = np.load('data/strategy_embeddings_masked.npy')

### Define embedding & plotting functions

In [None]:
def umap_hdbscan(embeds=strategy_embeddings,
                 n_neighbors=N_NEIGHBORS,
                 min_dist=MIN_DIST,
                 n_components=N_COMPONENTS,
                 random_state=RANDOM_STATE,
                 min_cluster_size=MIN_CLUSTER_SIZE,
                 min_samples=MIN_SAMPLES):
    """Cluster the cached UMAP embedding with HDBSCAN.

    The UMAP embedding is not recomputed here: it is loaded from
    ``data/umap_embeddings_seed-<random_state>-(final).npy``.

    Parameters
    ----------
    embeds, n_neighbors, min_dist, n_components
        Accepted for interface compatibility but not used, because the UMAP
        step is read from disk rather than fitted.
        NOTE(review): presumably the cached file was generated from these
        inputs — confirm against the notebook that wrote it.
    random_state
        Seed that identifies which cached UMAP embedding file to load.
    min_cluster_size, min_samples
        Forwarded to ``hdbscan.HDBSCAN``.

    Returns
    -------
    hdbscan.HDBSCAN
        The fitted clusterer (cluster assignments are in ``labels_``).
    """
    # Build the path from the *argument*, not the module constant, so that a
    # caller who asks for another seed actually gets that seed's embedding.
    umap_embed_fin = np.load(f'data/umap_embeddings_seed-{random_state}-(final).npy')

    hdbscan_fin = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size,
                                  min_samples=min_samples,
                                  metric='euclidean')

    hdbscan_fin.fit(umap_embed_fin)

    return hdbscan_fin


def umap_2D(embeds=strategy_embeddings):
    """Project ``embeds`` down to two dimensions with UMAP for plotting.

    Uses the module-level N_NEIGHBORS, MIN_DIST and RANDOM_STATE together
    with the cosine metric, and returns the 2D coordinates produced by
    transforming the same data the reducer was fitted on.
    """
    reducer = umap.UMAP(
        n_neighbors=N_NEIGHBORS,
        min_dist=MIN_DIST,
        n_components=2,
        metric='cosine',
        random_state=RANDOM_STATE,
    )

    # fit and transform are deliberately kept as two separate calls
    reducer.fit(embeds)
    return reducer.transform(embeds)


def plot_ER_atlas(umap_2D, hdbscan, x_range, y_range):
    """Build an interactive bokeh scatter plot of the clustered strategies.

    Parameters
    ----------
    umap_2D
        Array of 2D coordinates; column 0 is plotted as x, column 1 as y.
    hdbscan
        Fitted clusterer exposing ``labels_``; points labelled -1 are drawn
        as noise (grey crosses), all others are coloured by cluster label.
    x_range, y_range
        Axis ranges passed straight to ``bokeh.plotting.figure``.

    Returns
    -------
    The bokeh figure (not yet shown).

    Notes
    -----
    Hover texts come from the module-level ``strategies`` list, which is
    indexed in parallel with ``umap_2D`` and ``hdbscan.labels_``.
    """
    points = np.asarray(umap_2D)
    labels = np.asarray(hdbscan.labels_)
    texts = np.asarray(strategies)
    is_noise = labels == -1

    # split the plotting data into clustered points and noise points
    cluster_labels = labels[~is_noise]
    cluster_df = pd.DataFrame({
        'x': points[~is_noise, 0],
        'y': points[~is_noise, 1],
        'labels': [str(label) for label in cluster_labels],
        'strategy_text': texts[~is_noise],
    })
    noise_df = pd.DataFrame({
        'x': points[is_noise, 0],
        'y': points[is_noise, 1],
        'strategy_text': texts[is_noise],
    })

    # create interactive bokeh plot of ER strategy clusters
    p = figure(x_range=x_range, y_range=y_range, width=900, height=600,
               tools=['pan', 'wheel_zoom', 'reset', 'hover'])
               #, title='Landscape of Emotion Regulation Strategies')

    # Only draw clusters when there are any; max() of an empty label array
    # would raise if every point were noise.
    if cluster_labels.size > 0:
        no_clusters = int(cluster_labels.max()) + 1
        # Spread the clusters evenly over the palette. Clamp the step to 1 and
        # wrap around so more clusters than palette entries cannot produce a
        # zero step or an index error.
        step = max(1, len(Turbo256) // no_clusters)
        # np.unique returns the labels sorted, so every cluster id is tied to
        # a fixed colour. Iterating a set of label strings (as done before)
        # gives an arbitrary order that can change between sessions.
        label_ids = np.unique(cluster_labels)
        factors = [str(label_id) for label_id in label_ids]
        palette = [Turbo256[(int(label_id) * step) % len(Turbo256)]
                   for label_id in label_ids]
        p.scatter(x='x', y='y', source=cluster_df, size=4, fill_alpha=0.5,
                  color=factor_cmap('labels', palette, factors))

    p.scatter(x='x', y='y', source=noise_df, size=4, fill_alpha=0.5,
              color='#B3CCCC', marker='cross')

    # hide grid and axes: the UMAP coordinates carry no interpretable units
    p.xgrid.visible = False
    p.ygrid.visible = False
    p.xaxis.visible = False
    p.yaxis.visible = False
    p.title.align = 'center'

    # show the strategy sentence when hovering over a point
    hover = p.select(dict(type=HoverTool))
    hover.tooltips = [('Text', '@strategy_text')]

    # SVG output with transparent background and border
    p.output_backend = "svg"
    p.background_fill_color = None
    p.border_fill_color = None

    return p

### Compute final UMAP & HDBSCAN combination and plot

In [None]:
# fit HDBSCAN on the final UMAP embedding, and compute a separate 2D UMAP
# projection of the same strategy embeddings for plotting
hdbscan_fin = umap_hdbscan(strategy_embeddings)
umap_2D_masked = umap_2D(strategy_embeddings)

In [None]:
# build the cluster atlas for the chosen viewport and display it
h = plot_ER_atlas(umap_2D=umap_2D_masked,
                  hdbscan=hdbscan_fin,
                  x_range=(3, 13),
                  y_range=(5, 15))
show(h)

### Inspect clusters for interpretation

In [None]:
# pair every strategy sentence with the cluster label HDBSCAN assigned to it
label_df = pd.DataFrame({
    'Strategy': strategies,
    'Label': hdbscan_fin.labels_,
})

In [None]:
# print sentences for cluster CLUSTER
# EX7.1 YOUR CODE HERE: replace the placeholder below with the label of the
# cluster you want to inspect (a value from label_df.Label; the plotting code
# treats -1 as noise). A concrete default keeps the cell runnable as-is.
CLUSTER = 0

print(f'CLUSTER {CLUSTER}: \n')
# select the sentences of the chosen cluster and print them one per line
for sentence in label_df.loc[label_df.Label == CLUSTER, 'Strategy']:
    print(sentence)