## Job crafting: Interpreting & Comparing

In [None]:
# Choose these values based on Steps 5 & 6 (Evaluation):
### UMAP parameters ###
N_NEIGHBORS = 30  # passed as n_neighbors to umap.UMAP in umap_2D()
MIN_DIST = 0.01  # passed as min_dist to umap.UMAP in umap_2D()
N_COMPONENTS = 30  # not used in the code visible here (the 2D map hard-codes n_components=2); presumably the clustering dimensionality -- TODO confirm
RANDOM_STATE = 44669  # UMAP random_state; also used to pick the matching row of the robustness-check labels

### HDBSCAN parameters ###
# NOTE(review): neither value is referenced in the code visible here; presumably they
# document the settings that produced the loaded cluster labels -- TODO confirm
MIN_SAMPLES = 10
MIN_CLUSTER_SIZE = 10

# Path prefix that is string-concatenated with data file names, so the trailing slash is required.
EMBEDDINGS = 'data/'

In [None]:
import nltk
# Fetch the NLTK stopword corpus; stopwords.words(...) is used for the top-words table.
nltk.download('stopwords')

In [None]:
import warnings
# NOTE(review): this silences every warning raised after this point, including
# deprecation and numerical warnings -- consider narrowing the filter.
warnings.filterwarnings("ignore")

In [None]:
import os

import pandas as pd
import numpy as np

import umap

from bokeh.plotting import figure, show, save
from bokeh.transform import factor_cmap
from bokeh.palettes import Turbo256, Viridis256, Plasma256
from bokeh.models import HoverTool
# from bokeh.io import export_svg

from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

In [None]:
# Create the folder that receives the solution plots and tables.
# exist_ok=True keeps the cell idempotent without a separate (racy) existence check.
os.makedirs('solution', exist_ok=True)

### Load strategy texts & embeddings

In [None]:
# read the raw strategy sentences (semicolon-separated file) and keep the text column as a list
data = pd.read_csv(f'{EMBEDDINGS}strategies_raw.csv', sep=';')
strategies = data['strategy text'].to_list()

In [None]:
# load the pre-computed (masked) strategy embeddings from the .npy file
strategy_embeddings = np.load(f'{EMBEDDINGS}strategy_embeddings_masked.npy')

### Define embedding & plotting functions

In [None]:
def umap_2D(embeds=strategy_embeddings):
    """Reduce embeddings to two dimensions with UMAP for plotting.

    embeds: embeddings to project (the default is the strategy_embeddings
            object that existed when this function was defined)
    returns: the 2D UMAP coordinates of `embeds`

    Uses N_NEIGHBORS, MIN_DIST and RANDOM_STATE with the cosine metric.
    """
    reducer = umap.UMAP(
        n_neighbors=N_NEIGHBORS,
        min_dist=MIN_DIST,
        n_components=2,
        metric='cosine',
        random_state=RANDOM_STATE,
    )
    # fit on the data first, then transform the same data (kept as two explicit steps)
    reducer.fit(embeds)
    return reducer.transform(embeds)


def _sample_palette(palette, n):
    # Pick n evenly spaced colors from palette; cycles through it when n exceeds its length.
    if n <= 0:
        return []
    if n > len(palette):
        return [palette[i % len(palette)] for i in range(n)]
    step = len(palette) // n
    return list(palette[::step][:n])


def plot_ER_atlas(umap_2D, labels, keep_clusters, x_range, y_range, map_type=1):
    """Build an interactive bokeh scatter plot of the strategy clusters.

    umap_2D: 2D representation of the sentence embeddings (one row per strategy)
    labels: cluster label per strategy; -1 marks noise points
    keep_clusters: sequence (list or array) of booleans, one per cluster, True for
                   clusters that survived the intrusion task. Entry i is matched to
                   cluster label i, so labels are expected to run from 0 to
                   len(keep_clusters) - 1.
    x_range, y_range: axis boundaries of the plot
    map_type: 1 (kept clusters in green shades, discarded clusters in red shades)
              2 (kept clusters in multiple colors, discarded clusters in dark grey)
              3 (all clusters in multiple colors)

    returns: the bokeh figure (SVG output backend, transparent background)
    raises: ValueError if map_type is not 1, 2 or 3

    The hover texts are read from the module-level `strategies` list.
    """
    if map_type not in (1, 2, 3):
        raise ValueError(f"map_type must be 1, 2 or 3, got {map_type!r}")

    coords = np.asarray(umap_2D)
    labels = np.asarray(labels)
    texts = np.asarray(strategies)
    # plain Python bools so that lists and numpy arrays are both accepted
    keep_flags = [bool(flag) for flag in keep_clusters]
    n_keep = sum(keep_flags)
    n_discard = len(keep_flags) - n_keep

    # split the points into clustered points and noise (label -1)
    is_noise = labels == -1

    cluster_df = pd.DataFrame()
    cluster_df['x'] = coords[~is_noise, 0]
    cluster_df['y'] = coords[~is_noise, 1]
    cluster_df['labels'] = [str(label) for label in labels[~is_noise]]
    cluster_df['strategy_text'] = texts[~is_noise]

    noise_df = pd.DataFrame()
    noise_df['x'] = coords[is_noise, 0]
    noise_df['y'] = coords[is_noise, 1]
    noise_df['strategy_text'] = texts[is_noise]

    # one color per cluster, in cluster order
    if map_type == 3:
        colors = _sample_palette(Turbo256, len(keep_flags))
    else:
        if map_type == 1:
            colors_keep = _sample_palette(Viridis256[150:200], n_keep)
            colors_discard = _sample_palette(Plasma256[150:200], n_discard)
        else:
            colors_keep = _sample_palette(Turbo256, n_keep)
            colors_discard = ['#7D8E8E'] * n_discard
        keep_iter = iter(colors_keep)
        discard_iter = iter(colors_discard)
        colors = [next(keep_iter) if keep else next(discard_iter) for keep in keep_flags]

    factors = [str(i) for i in range(len(keep_flags))]

    p = figure(x_range=x_range, y_range=y_range, width=900, height=600,
               tools=['pan', 'wheel_zoom', 'reset', 'hover'])
    p.scatter(x='x', y='y', source=cluster_df, size=4, fill_alpha=0.5,
              color=factor_cmap('labels', colors, factors))
    p.scatter(x='x', y='y', source=noise_df, size=4, fill_alpha=0.5,
              color='#B3CCCC', marker='cross')
    p.xgrid.visible = False
    p.ygrid.visible = False
    p.xaxis.visible = False
    p.yaxis.visible = False
    p.title.align = 'center'
    hover = p.select(dict(type=HoverTool))
    hover.tooltips = [('Text', '@strategy_text')]

    # SVG backend with transparent background and border
    p.output_backend = "svg"
    p.background_fill_color = None
    p.border_fill_color = None

    return p

### Compute 2D UMAP representation and plot

In [None]:
# Load the cluster labels of all robustness-check runs and pick the run whose seed equals RANDOM_STATE.
labels_robust = np.load("robustness_check/data/labels.npy")
random_seeds_robust = np.load("robustness_check/data/random_seeds.npy")
seed_matches = np.where(random_seeds_robust == RANDOM_STATE)[0]
if len(seed_matches) == 0:
    # fail with a readable message instead of an IndexError on the empty match array
    raise ValueError(f"RANDOM_STATE={RANDOM_STATE} not found in robustness_check/data/random_seeds.npy")
labels = labels_robust[seed_matches[0]]

In [None]:
# load the intrusion survey results; a cluster is kept when its fraction exceeds 0.67
survey_analysis = pd.read_csv('intrusion/intrusion_survey_results.txt')
survey_analysis.columns = ['Cluster', 'Fraction']

keep_clusters = [bool(fraction > 0.67) for fraction in survey_analysis['Fraction']]

In [None]:
# 2D UMAP coordinates of the strategy embeddings, used for the cluster map
umap_2D_masked = umap_2D(strategy_embeddings)

In [None]:
# draw the cluster map (map_type 2: kept clusters colored, discarded clusters grey) and save it as HTML
h = plot_ER_atlas(
    umap_2D=umap_2D_masked,
    labels=labels,
    keep_clusters=keep_clusters,
    x_range=(10, 16),
    y_range=(4, 14),
    map_type=2,
)
# show(h)
save(h, filename="solution/strategy_class_map.html")
# export_svg(h, filename="solution/ER_atlas_green_red.svg")

### Inspect clusters for interpretation

In [None]:
# pair every strategy sentence (column 'reap') with its cluster label (column 'Label')
label_df = pd.DataFrame({'reap': strategies, 'Label': labels})
# label_df.to_csv('strategies_labels.csv', index=False)

In [None]:
CLUSTER = 0  # label of the cluster whose sentences are printed

print(f'CLUSTER {CLUSTER}: \n')
cluster_sentences = label_df.loc[label_df.Label == CLUSTER, 'reap'].tolist()
for sentence in cluster_sentences:
    print(sentence)

In [None]:
# Write the sentences of every kept cluster to a text file.
# Iterating over keep_clusters itself (instead of a hard-coded cluster count) covers
# every cluster and cannot index past the end of the list.
with open("solution/cluster_content.txt", "w", encoding="utf-8") as f:
    for cluster, keep in enumerate(keep_clusters):
        if not keep:
            continue
        f.write(f"CLUSTER {cluster}\n")
        for sentence in label_df.loc[label_df.Label == cluster, 'reap'].tolist():
            f.write(f"{sentence}\n")
        f.write("\n")

### Extract cluster top words table with c-TF-IDF (class-based TF-IDF)
(see BERTopic tutorial)

In [None]:
### DEFINE TOP WORDS HELPER FUNCTIONS ###

# Stopword list handed to CountVectorizer in c_tf_idf().
# NOTE(review): the constant is named ENGLISH_STOP but holds NLTK's Spanish stopword list.
# Presumably the strategy texts are Spanish and only the name is stale -- confirm and rename if so.
ENGLISH_STOP = stopwords.words('spanish')


def c_tf_idf(documents, m):
    """Compute a class-based TF-IDF matrix.

    documents: one concatenated document per class
    m: original number of (unconcatenated) documents
    returns: (tf_idf, count) -- the weight matrix with one row per vocabulary
             term and one column per class, and the fitted CountVectorizer
    """
    vectorizer = CountVectorizer(stop_words=ENGLISH_STOP)
    vectorizer.fit(documents)
    term_counts = vectorizer.transform(documents).toarray()

    # term frequency: counts divided by the total number of counted words of each class
    words_per_class = term_counts.sum(axis=1)
    tf = term_counts.T / words_per_class

    # inverse document frequency: log(m / total count of the term), as a column vector
    term_totals = term_counts.sum(axis=0)
    idf = np.log(m / term_totals).reshape(-1, 1)

    return tf * idf, vectorizer


def extract_topic_topwords(tf_idf, count, docs_per_topic, n=20):
    """Collect the n highest-weighted words of every topic.

    tf_idf: c-TF-IDF matrix (rows: vocabulary terms, columns: topics)
    count: fitted vectorizer providing get_feature_names_out()
    docs_per_topic: dataframe whose Topic column lists the topic labels
                    in the same order as the columns of tf_idf
    n: number of top words per topic
    returns: dictionary {topic_label: [(word, weight), ...]} with the
             highest weight first
    """
    vocabulary = count.get_feature_names_out()
    weights_by_topic = tf_idf.T
    # argsort is ascending, so the last n positions hold the n largest weights
    top_positions = weights_by_topic.argsort()[:, -n:]

    top_n_words = {}
    for row, topic in enumerate(list(docs_per_topic.Topic)):
        descending = top_positions[row][::-1]
        top_n_words[topic] = [(vocabulary[col], weights_by_topic[row][col]) for col in descending]

    return top_n_words

def top_words_table(labels, n=10):
    """Extract the top n words of each cluster from a c-TF-IDF cluster representation.

    For visualization purposes.

    labels: cluster label for every entry of the module-level `strategies` list
    n: number of top words per cluster
    returns: dataframe with one row per cluster and the columns 'Cluster' and
             'Top Words' (the top words joined by spaces, highest weight first)
    """
    # concatenate all documents (= strategy texts) of a cluster into a single document
    docs_df = pd.DataFrame(strategies, columns=["Doc"])
    docs_df['Topic'] = labels
    docs_per_topic = docs_df.groupby(['Topic'], as_index=False).agg({'Doc': ' '.join})

    # extract top words
    tf_idf, count = c_tf_idf(docs_per_topic.Doc.values, m=len(strategies))
    top_n_words = extract_topic_topwords(tf_idf, count, docs_per_topic, n=n)

    # build all rows first and create the table once instead of growing it row by row
    rows = [
        (cluster, ' '.join(word for word, _ in scored_words))
        for cluster, scored_words in top_n_words.items()
    ]

    return pd.DataFrame(rows, columns=['Cluster', 'Top Words'])

In [None]:
# widen the column display so the full word lists are visible, then build, save and show the table
pd.options.display.max_colwidth = 1000
table = top_words_table(labels)
table.to_csv("solution/cluster_top_words.csv", index=False)
table