## Job Crafting: Creating Intrusion Task

In [None]:
### FINAL UMAP PARAMETERS ###
# Module-level defaults for the UMAP dimensionality-reduction step.

N_NEIGHBORS = 30        # default for the n_neighbors argument of umap.UMAP
MIN_DIST = 0.01         # default for the min_dist argument of umap.UMAP
N_COMPONENTS = 30       # default for the n_components argument of umap.UMAP
METRIC_UMAP = 'cosine'  # distance metric for the UMAP step
RANDOM_STATE = 44669    # default UMAP seed, fixed so the embedding can be recreated

### FINAL HDBSCAN PARAMETERS ###
# Module-level defaults for the HDBSCAN clustering step.

MIN_CLUSTER_SIZE = 10           # default for min_cluster_size of hdbscan.HDBSCAN
MIN_SAMPLES = 10                # default for min_samples of hdbscan.HDBSCAN
METRIC_HDBSCAN = 'euclidean'    # distance metric for the HDBSCAN step

# Directory prefix (including the trailing slash) that is prepended to the names
# of the input files (raw strategy CSV and embedding .npy); despite the name it
# is a path, not the embeddings themselves.
EMBEDDINGS = 'data/'

In [None]:
import numpy as np
import pandas as pd

import umap
import hdbscan

import os
import random

### Load strategy texts & embeddings

In [None]:
# Read the semicolon-separated table of raw strategy sentences and keep the
# 'strategy text' column as a plain Python list (one entry per row).
strategies_path = EMBEDDINGS + 'strategies_raw.csv'
data = pd.read_csv(strategies_path, sep=';')
strategies = data['strategy text'].tolist()

In [None]:
# Load the pre-computed (masked) strategy embeddings from the .npy file.
# NOTE(review): presumably one row per entry of `strategies`, in the same
# order - confirm against the script that produced the file.
embeddings_path = EMBEDDINGS + 'strategy_embeddings_masked.npy'
strategy_embeddings = np.load(embeddings_path)

### Define helper functions

In [None]:
def umap_hdbscan(embeds=strategy_embeddings,
                 n_neighbors=N_NEIGHBORS,
                 min_dist=MIN_DIST,
                 n_components=N_COMPONENTS,
                 random_state=RANDOM_STATE,
                 min_cluster_size=MIN_CLUSTER_SIZE,
                 min_samples=MIN_SAMPLES,
                 metric_umap=METRIC_UMAP,
                 metric_hdbscan=METRIC_HDBSCAN):
    """Reduce embeddings with UMAP and cluster the reduced points with HDBSCAN.

    Args:
        embeds: Embeddings to cluster; fitted and transformed by UMAP.
        n_neighbors: UMAP ``n_neighbors``.
        min_dist: UMAP ``min_dist``.
        n_components: UMAP ``n_components`` (dimensionality of the reduced space).
        random_state: UMAP seed. Per the original author's note, the default
            seed yields the cluster solution found for the majority of 5,000
            different UMAP seeds.
        min_cluster_size: HDBSCAN ``min_cluster_size``.
        min_samples: HDBSCAN ``min_samples``.
        metric_umap: Distance metric used by UMAP (defaults to ``METRIC_UMAP``).
        metric_hdbscan: Distance metric used by HDBSCAN on the reduced points
            (defaults to ``METRIC_HDBSCAN``).

    Returns:
        The fitted ``hdbscan.HDBSCAN`` object (cluster labels in ``labels_``).
    """
    # The metrics come from the module-level constants instead of being
    # hard-coded, so changing METRIC_UMAP / METRIC_HDBSCAN actually takes effect.
    umap_fin = umap.UMAP(n_neighbors=n_neighbors,
                         min_dist=min_dist,
                         n_components=n_components,
                         metric=metric_umap,
                         random_state=random_state).fit(embeds)

    # Kept as fit(...) followed by transform(...) (rather than fit_transform)
    # so the reduced points are produced exactly as in the original pipeline.
    umap_embed_fin = umap_fin.transform(embeds)

    hdbscan_fin = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size,
                                  min_samples=min_samples,
                                  metric=metric_hdbscan)

    hdbscan_fin.fit(umap_embed_fin)

    return hdbscan_fin


# produce intrusion task survey
def sentence_intrusion_survey(hdbscan, k=4, items=None, random_state=15):
    """Draw sentence-intrusion items and write the survey plus its answer key.

    Every item consists of ``k`` sentences: ``k - 1`` sentences sampled from one
    cluster and one intruder sampled from any *other* cluster (unclustered
    points, label ``-1``, are never used as intruders). The sentences inside
    an item and the order of the items are shuffled.

    Two files are (over)written on every call:

    * ``intrusion/survey.txt`` - the items, each headed by ``ITEM <n>``.
    * ``intrusion/survey_solution.txt`` - CSV-like key with the item number,
      the source cluster and the 0-based position of the intruder in the item.

    Args:
        hdbscan: Fitted clustering object exposing ``labels_``, aligned
            element-wise with the module-level ``strategies`` list.
        k: Number of sentences per item (must be at least 2).
        items: Number of items to draw for each cluster, indexed by cluster
            label. ``None`` is treated as an empty list.
        random_state: Seed for the global ``random`` module, so the same
            survey is drawn on every run.

    Raises:
        ValueError: If ``k < 2``, or if a cluster / intruder pool holds fewer
            sentences than requested (raised by ``random.sample``).
        IndexError: If ``items`` has no entry for some cluster.
    """
    if k < 2:
        raise ValueError('k must be at least 2 (k-1 cluster sentences plus 1 intruder)')
    if items is None:
        items = []

    survey_path = 'intrusion/survey.txt'
    solution_path = 'intrusion/survey_solution.txt'
    # Make sure the output directory exists; the files themselves are opened
    # in 'w' mode below, which truncates any previous survey. (The original
    # removed 'intrusion/survey/survey.txt' but appended to
    # 'intrusion/survey.txt', so reruns kept appending duplicate items.)
    os.makedirs('intrusion', exist_ok=True)

    labels = np.asarray(hdbscan.labels_)
    all_sentences = np.asarray(strategies)
    not_noise = labels != -1

    solutions = []
    tasks = []
    random.seed(random_state)

    for cluster in range(max(labels) + 1):
        in_cluster = labels == cluster

        cluster_sentences = list(all_sentences[in_cluster])
        sentences = random.sample(cluster_sentences, k=(k - 1) * items[cluster])

        # Intruder candidates: every clustered sentence outside this cluster.
        intruder_sentences = list(all_sentences[~in_cluster & not_noise])
        intruders = random.sample(intruder_sentences, k=items[cluster])

        for x, intruder in enumerate(intruders):
            start = x * (k - 1)
            # Tag each sentence so the intruder can still be located after the
            # shuffle even if its text equals one of the cluster sentences
            # (list.index on the raw text would return the first duplicate).
            tagged = [(sentence, False) for sentence in sentences[start:start + k - 1]]
            tagged.append((intruder, True))
            random.shuffle(tagged)
            position = next(i for i, (_, is_intruder) in enumerate(tagged) if is_intruder)
            solutions.append((cluster, position))
            tasks.append([sentence for sentence, _ in tagged])

    # Shuffle items and their solutions together; iterating the pairs directly
    # also copes with an empty survey (no clusters / zero items requested).
    paired = list(zip(tasks, solutions))
    random.shuffle(paired)

    with open(survey_path, 'w', encoding='utf-8') as f:
        for number, (task, _) in enumerate(paired, start=1):
            f.write('\n')
            f.write(f'ITEM {number}')
            for sentence in task:
                f.write('\n')
                f.write(sentence)
            f.write('\n')

    with open(solution_path, 'w', encoding='utf-8') as f:
        f.write('Item, Cluster, Solution')
        f.write('\n')
        for number, (_, (cluster, position)) in enumerate(paired, start=1):
            f.write(f'{number}, {cluster}, {position}')
            f.write('\n')

### Recreate final solution

In [None]:
# Refit UMAP + HDBSCAN with the final parameters and sanity-check the result:
# the final solution is expected to contain 40 clusters.
hdbscan_fin = umap_hdbscan(strategy_embeddings)
fitted_labels = hdbscan_fin.labels_
n_clusters = max(fitted_labels) + 1                    # labels run from 0 to max
n_unclustered = int((fitted_labels == -1).sum())       # label -1 marks unclustered points
print(f'Number of clusters: {n_clusters}')
print(f'Number of unclustered points: {n_unclustered}')

### Draw intrusion survey items

In [None]:
# Evaluate approximately (at least) 30% of the sentences of every cluster:
# each item uses 3 in-cluster sentences, so the number of items per cluster is
# the cluster size times 0.3, divided by 3 and rounded up.
cluster_labels = hdbscan_fin.labels_
items_per_cluster = []
for cluster_id in range(max(cluster_labels) + 1):
    cluster_size = int(np.sum(cluster_labels == cluster_id))
    n_items = int(3 * np.ceil(cluster_size * 0.3 / 3) / 3)
    items_per_cluster.append(n_items)
sentence_intrusion_survey(hdbscan_fin, items=items_per_cluster)