# CMP STEP 9: Interpreting & Comparing

In [None]:
# Platform switch read inside umap_hdbscan(): when True, the UMAP step is skipped and a
# precomputed UMAP embedding is loaded from the 'not_linux/' folder below EMBEDDINGS instead.
NOT_LINUX = False  # set to True, if you have trouble replicating results with your operating system

In [1]:
# choose based on Steps 5 & 6 - Evaluation:
# (these constants are the default arguments of umap_hdbscan(); umap_2D() reuses the UMAP ones)
### UMAP final parameters ###
N_NEIGHBORS = 30
MIN_DIST = 0.01
N_COMPONENTS = 30  # dimensionality of the UMAP space that HDBSCAN clusters in
RANDOM_STATE = 86531  # UMAP seed; also part of the precomputed 'not_linux' file name

### HDBSCAN final parameters ###
MIN_SAMPLES = 30
MIN_CLUSTER_SIZE = 10

# path prefix of the input files (raw strategy texts, embeddings, precomputed UMAP output);
# it is combined with file names by plain string concatenation, so keep the trailing slash
EMBEDDINGS = 'data/'

In [2]:
# make sure the NLTK stop word corpus is available locally;
# it is read further down via stopwords.words('german')
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/alina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
import warnings
warnings.filterwarnings("ignore")

import os

import pandas as pd
import numpy as np

import umap
import hdbscan 

from bokeh.plotting import figure
from bokeh.transform import factor_cmap
from bokeh.palettes import Turbo256
from bokeh.models import HoverTool
from bokeh.io import export_svg

from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

In [4]:
# create folder to save solution plots
# (exist_ok avoids the separate existence check and the race between check and creation)
os.makedirs('solution', exist_ok=True)

### Load strategy texts & embeddings

In [5]:
# load raw strategy sentences from the semicolon-separated CSV;
# os.path.join keeps the path valid whether or not EMBEDDINGS ends with a slash
data = pd.read_csv(os.path.join(EMBEDDINGS, 'strategies_raw.csv'), sep=';')
# plain Python list of the texts in the 'strategy text' column (row order preserved)
strategies = data['strategy text'].tolist()

In [6]:
# load the stored strategy embeddings (np.load already returns them as a numpy array);
# os.path.join keeps the path valid whether or not EMBEDDINGS ends with a slash
strategy_embeddings = np.load(os.path.join(EMBEDDINGS, 'strategy_embeddings_masked.npy'))

### Define embedding & plotting functions

In [7]:
def umap_hdbscan(embeds=strategy_embeddings,
                 n_neighbors=N_NEIGHBORS,
                 min_dist=MIN_DIST,
                 n_components=N_COMPONENTS,
                 random_state=RANDOM_STATE,
                 min_cluster_size=MIN_CLUSTER_SIZE,
                 min_samples=MIN_SAMPLES):
    """Reduce the embeddings with UMAP (cosine metric) and cluster them with HDBSCAN.

    If the module-level switch NOT_LINUX is True, no UMAP model is fitted; a
    precomputed UMAP embedding is loaded from disk instead. In that case
    `embeds` and all UMAP arguments are ignored, and the file name is built
    from the module-level RANDOM_STATE (not from the `random_state` argument).

    Returns the fitted `hdbscan.HDBSCAN` object (euclidean metric on the
    UMAP output); cluster assignments are available as `.labels_`.
    """
    if NOT_LINUX != True:
        # gives majority of cluster solutions of 1.000 different UMAP seeds
        reducer = umap.UMAP(
            n_neighbors=n_neighbors,
            min_dist=min_dist,
            n_components=n_components,
            metric='cosine',
            random_state=random_state,
        )
        reducer.fit(embeds)
        reduced = reducer.transform(embeds)
    else:
        precomputed_path = EMBEDDINGS + f'not_linux/umap_embeddings_seed-{RANDOM_STATE}-(final).npy'
        reduced = np.load(precomputed_path)

    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        metric='euclidean',
    )
    clusterer.fit(reduced)

    return clusterer


def umap_2D(embeds=strategy_embeddings):
    """Project the embeddings to two dimensions with UMAP, for plotting only.

    Uses the module-level N_NEIGHBORS, MIN_DIST and RANDOM_STATE with the
    cosine metric and returns the transformed 2D coordinates of `embeds`.
    """
    # reduce dimensions further to 2D for plotting
    reducer_2d = umap.UMAP(
        n_neighbors=N_NEIGHBORS,
        min_dist=MIN_DIST,
        n_components=2,
        metric='cosine',
        random_state=RANDOM_STATE,
    )
    reducer_2d.fit(embeds)

    return reducer_2d.transform(embeds)


def plot_ER_atlas(umap_2D, hdbscan, x_range, y_range):
    """Build the interactive bokeh scatter plot of the strategy clusters.

    Parameters
    ----------
    umap_2D : array indexed as [:, 0] (x) and [:, 1] (y)
        2D coordinates, row-aligned with the module-level `strategies` list.
        (The parameter name shadows the function `umap_2D` inside this body.)
    hdbscan : fitted clusterer exposing `labels_`
        Label -1 marks unclustered points, which are drawn as grey crosses.
        (The parameter name shadows the `hdbscan` module inside this body.)
    x_range, y_range :
        Axis ranges handed to the bokeh `figure`.

    Returns
    -------
    The bokeh figure (SVG output backend, transparent background and border),
    with a hover tool showing the strategy text of each point.
    """
    coords = np.asarray(umap_2D)
    labels = np.asarray(hdbscan.labels_)
    texts = np.asarray(strategies)
    is_noise = labels == -1

    # create dfs with plotting data for bokeh interactive plot
    cluster_labels = labels[~is_noise]
    cluster_df = pd.DataFrame({
        'x': coords[~is_noise, 0],
        'y': coords[~is_noise, 1],
        'labels': [str(label) for label in cluster_labels],
        'strategy_text': texts[~is_noise],
    })
    noise_df = pd.DataFrame({
        'x': coords[is_noise, 0],
        'y': coords[is_noise, 1],
        'strategy_text': texts[is_noise],
    })

    # all-noise solutions have no cluster labels; max() on an empty array would raise
    no_clusters = int(cluster_labels.max()) + 1 if cluster_labels.size else 0

    # create interactive bokeh plot of ER strategy clusters
    p = figure(x_range=x_range, y_range=y_range, width=900, height=600,
               tools=['pan', 'wheel_zoom', 'reset', 'hover'])
               #, title='Landscape of Emotion Regulation Strategies')

    if no_clusters:
        # spread the clusters evenly over the palette; a step of at least 1 and the
        # modulo keep this valid when there are more clusters than palette entries
        step = max(1, len(Turbo256) // no_clusters)
        colors = tuple(Turbo256[(i * step) % len(Turbo256)] for i in range(no_clusters))
        # fixed factor order: cluster i always gets colors[i]. Building the factors
        # from a set() made the cluster-to-colour mapping arbitrary between runs.
        factors = [str(i) for i in range(no_clusters)]
        p.scatter(x='x', y='y', source=cluster_df, size=4, fill_alpha=0.5,
                  color=factor_cmap('labels', colors, factors))

    p.scatter(x='x', y='y', source=noise_df, size=4, fill_alpha=0.5, color='#B3CCCC', marker='cross')
    p.xgrid.visible = False
    p.ygrid.visible = False
    p.xaxis.visible = False
    p.yaxis.visible = False
    p.title.align = 'center'
    hover = p.select(dict(type=HoverTool))
    hover.tooltips = [('Text', '@strategy_text')]

    # SVG export settings: vector backend without background or border fill
    p.output_backend = "svg"
    p.background_fill_color = None
    p.border_fill_color = None

    return p

### Compute final UMAP & HDBSCAN combination and plot

In [8]:
# fit the final UMAP + HDBSCAN combination on the strategy embeddings
hdbscan_fin = umap_hdbscan(embeds=strategy_embeddings)

# cross-check number of clusters (37) & unclustered points (3067)
print(f'Number of clusters: {hdbscan_fin.labels_.max() + 1}')
print(f'Number of unclustered points: {np.count_nonzero(hdbscan_fin.labels_ == -1)}')

# separate 2D projection of the same embeddings, used only for plotting
umap_2D_masked = umap_2D(embeds=strategy_embeddings)

Number of clusters: 37
Number of unclustered points: 3067


In [9]:
# build the cluster plot; the two tuples are the x- and y-axis ranges handed to the bokeh figure
h = plot_ER_atlas(umap_2D_masked, hdbscan_fin, (-4, 7), (-4.5, 6.5))
# uncomment to write the figure as SVG into the 'solution' folder
# export_svg(h, filename="solution/ER_atlas.svg")

['solution/ER_atlas.svg']

### Inspect clusters for interpretation

In [10]:
# pair every strategy sentence with the cluster label HDBSCAN assigned to it (-1 = unclustered)
label_df = pd.DataFrame({'Strategy': strategies, 'Label': hdbscan_fin.labels_})
# label_df.to_csv('strategies_labels.csv', index=False)

In [11]:
CLUSTER = 0  # label of the cluster whose sentences are printed

print(f'CLUSTER {CLUSTER}: \n')
# one sentence per line, in the original row order
for sentence in label_df.loc[label_df['Label'] == CLUSTER, 'Strategy']:
    print(sentence)

CLUSTER 0: 

Gefuehle der anderen Person mitteilen
eine Person, der man vertraut davon erzaehlen
koerperliches Training
Ich finde Loesungen unabhaengig von diesem Kollegen.
Ich mache meinem Aerger ihm gegenueber Luft.
Ich aendere zukuenftige Plaene, sodass sie moeglichst ohne diesen Kollegen funktionieren.
fruehere Zeit ansetzen
Atemuebungen machen
Ich kann ihm bei seiner Aufgabe helfen, die zur besagten Info fuehrt.
Ich kann die Situation meinem Chef erklaeren.
Ich kann meinen Zeitplan ein wenig aendern.
Pause machen & spaeter weitermachen
Falls er immer zu spaet dran ist mit jmd anderem zusammenarbeiten
Dinge machen fuer die man normalerweise zu wenig Freizeit hat  
ich lasse den Kollegen wissen wie veraergert ich bin
Fragen, warum nicht, koennte plausible Gruende (haben) geben
Erklaeren, dass er es das naechste Mal frueher anfangen soll bzw. An andere denken muss.
fuer das naechste Mal planen, dich nicht von diesem Kollegen abhaengig zu machen
in mich gehen & ueberdenken, ob ich es 

In [12]:
# 0 : --- .34
# 1 : substances (medication) .96
# 2 : situational control (flight anxiety item 2) .96
# 3 : 'fliegen' .94
# 4 : distraction .79
# 5 : relaxation techniques .97
# 6 : therapy .91
# 7 : situational control (sick relatives item 3) .88
# 8 : --- [refocusing?] .65!
# 9 : suppression .86
# 10: 'anrufen' .75
# 11: eating & drinking 1.00
# 12: substances (alcohol) .94
# 13: 'anfreunden' .96
# 14: situational control (colleague gone item 9) .95
# 15: 'Café' .75
# 16: information seeking .91
# 17: withdrawal .88
# 18: 'Angst' .85
# 19: situational control (house warming party item 8) .89
# 20: emotional expression 1.00
# 21: instrumental support .85
# 22: --- .44
# 23: self-enhancement .76
# 24: situational control (sport team item 4) .83
# 25: situational control (university item 5) .88
# 26: situational control (resources item 7) 1.00
# 27: social support .85
# 28: social support 1.00
# 29: social support .96
# 30: situational control (information item 1) .81
# 31: emotional expression .94
# 32: 'beschweren' 1.00
# 33: reappraisal .79
# 34: 'Begriffe aus Arbeitskontext' (Bericht, Betriebsmittel, Klient, Kunde, ...) .80
# 35: instrumental support .90
# 36: instrumental support 1.00

### Extract cluster top words table with c-TF-IDF (class-based TF-IDF)
(see BERTopic tutorial)

In [13]:
### DEFINE TOP WORDS HELPER FUNCTIONS ###

# German stop word list from NLTK, passed to CountVectorizer in c_tf_idf()
# NOTE(review): the top-words output contains tokens such as 'fuer', 'ueber' and 'koennte',
# so transliterated spellings (ue/oe/ae) are presumably not covered by this list - confirm
# whether they should be filtered as well.
GERMAN_STOP = stopwords.words('german')


def c_tf_idf(documents, m):
    """Compute the class-based TF-IDF matrix for one concatenated document per class.

    documents: sequence with a single document of concatenated documents per class
    m: original number of (unconcatenated) documents

    Returns (tf_idf, count): the score matrix with one row per vocabulary term
    and one column per class, and the fitted CountVectorizer (German stop
    words removed) that defines the vocabulary.
    """
    vectorizer = CountVectorizer(stop_words=GERMAN_STOP)
    vectorizer.fit(documents)

    term_counts = vectorizer.transform(documents).toarray()  # rows: classes, columns: terms
    words_per_class = term_counts.sum(axis=1)
    term_freq = term_counts.T / words_per_class  # term count relative to class length

    term_totals = term_counts.sum(axis=0)  # occurrences of each term over all classes
    inverse_freq = np.log(m / term_totals).reshape(-1, 1)

    return term_freq * inverse_freq, vectorizer


def extract_topic_topwords(tf_idf, count, docs_per_topic, n=20):
    """Return the n highest-scoring words of every topic.

    tf_idf: c-TF-IDF matrix with one row per vocabulary term and one column per topic
    count: fitted CountVectorizer that produced the vocabulary
    docs_per_topic: dataframe with a `Topic` column, one row per topic, in the
        same order as the columns of `tf_idf`
    n: number of top words per topic

    Returns a dictionary {topic_label: [(word, score), ...]} with the words
    sorted by descending score; n=0 yields empty lists.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
    # and fall back to the old name for older installations
    if hasattr(count, 'get_feature_names_out'):
        words = count.get_feature_names_out()
    else:
        words = count.get_feature_names()

    labels = list(docs_per_topic.Topic)
    scores_per_topic = tf_idf.T
    # descending order first, then cut: unlike a [-n:] slice this is also correct for n=0
    top_indices = scores_per_topic.argsort()[:, ::-1][:, :n]

    return {
        label: [(words[j], scores_per_topic[i][j]) for j in top_indices[i]]
        for i, label in enumerate(labels)
    }

def top_words_table(hdbscan, n=10):
    """Extract the top n words of each cluster from a c-TF-IDF cluster representation.

    FOR VISUALIZATION PURPOSES.

    hdbscan: fitted clusterer whose `labels_` are row-aligned with the
        module-level `strategies` list (the parameter name shadows the
        `hdbscan` module inside this body)
    n: number of top words per cluster

    Returns a dataframe with the columns 'Cluster' (cluster label, including
    -1 for unclustered points if present) and 'Top Words' (space-separated
    words, highest score first).
    """
    # create single document with all documents (=strategy texts) in single cluster
    docs_df = pd.DataFrame(strategies, columns=["Doc"])
    docs_df['Topic'] = hdbscan.labels_
    docs_per_topic = docs_df.groupby(['Topic'], as_index=False).agg({'Doc': ' '.join})

    # extract topwords
    tf_idf, count = c_tf_idf(docs_per_topic.Doc.values, m=len(strategies))
    top_n_words = extract_topic_topwords(tf_idf, count, docs_per_topic, n=n)

    # build all rows first and create the frame once instead of growing it row by row
    rows = [
        (cluster, ' '.join(word for word, _score in scored_words))
        for cluster, scored_words in top_n_words.items()
    ]

    return pd.DataFrame(rows, columns=['Cluster', 'Top Words'])

In [14]:
# compute and display top words table
# widen the column display (notebook-wide pandas option) so the word lists are not truncated
pd.set_option('display.max_colwidth', 1000)
table = top_words_table(hdbscan_fin)  # default n=10 words per cluster
table

Unnamed: 0,Cluster,Top Words
0,-1,kollegen versuche frage klienten rede mehr freunden fragen situation chef
1,0,darueber fuer ueber koennte reden freunden rede koennen hoeren arbeit
2,1,medikamente nehme beruhigungstabletten beruhigungsmittel nehmen tabletten besorge beruhigende flug beruhigungstropfen
3,2,zug alternativen suche verkehrsmittel schiff bahn fahren transportmöglichkeiten ort suchen
4,3,fliegen flug fliege angst statistiken trotzdem person sicherheit informiere einfach
5,4,ablenken lenke ab abzulenken musik ablenkung film sport schauen schaue
6,5,entspannen meditiere pause entspannungstechniken entspannungsübungen tief atmen mache atme atemübungen
7,6,therapie flugangst therapeuten gehe gehen überwinden psychologen psychologische therapieren versuche
8,7,besuchen krankenhaus besuche verwandten fahre verwandte besuch fahren sofort person
9,8,zeit spass erledigen sinnvoll dinge macht derweil tun hobbies sachen
