In [None]:
# requirements 
# i suggest to use a virtual environment when installing these packages 
!pip install -U pandas
!pip install -U scikit-learn
!pip install -U sentence-transformers

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer

import warnings
warnings.filterwarnings('ignore')

In [None]:
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# keyphrases.csv lives in the shared drive (not public data)
df = pd.read_csv('keyphrases.csv')  # file containing keyphrases from our old narratives

# Empty CSV cells are read as NaN (a float), which is not text the encoder can
# embed. Drop them and coerce the rest to str *before* encoding, so that
# keyphrases[i] always corresponds to row i of corpus_embeddings.
keyphrases = df['ngram_range_keyphrases'].dropna().astype(str).to_numpy()

# embed keyphrases
corpus_embeddings = embedding_model.encode(sentences=keyphrases,
                                           convert_to_numpy=True,
                                           show_progress_bar=True)

# normalization (optional): divide each embedding row by its Euclidean norm
corpus_embeddings = corpus_embeddings / np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)

In [None]:
from sklearn.cluster import KMeans

# Fixed seed: without it the centroid initialisation is random, so the cluster
# assignments (and any judgement about which k looks best) change on every run.
RANDOM_STATE = 42

kmeans_arr = []

# compute kmeans for k = 5 to k = 15; kmeans_arr[i] holds the labels for k = i + 5
for n_clusters in range(5, 16):
    clustering_model = KMeans(n_clusters=n_clusters, random_state=RANDOM_STATE)
    clustering_model.fit(corpus_embeddings)
    cluster_assignment = clustering_model.labels_
    kmeans_arr.append(np.array(cluster_assignment))

In [None]:
# One entry per k in 5..15; each entry starts as k empty buckets, one per cluster.
clustered_sentences = []
for k in range(5, 16):
    clustered_sentences.append([[] for _ in range(k)])

# run_pos is the position inside kmeans_arr (0 for the first run), not the
# number of clusters. Each phrase is filed under the label that run gave it.
for run_pos, labels in enumerate(kmeans_arr):
    for phrase_pos, label in enumerate(labels):
        clustered_sentences[run_pos][label].append(keyphrases[phrase_pos])

In [None]:
import json

best_k = 10  # 10 clusters seemed to be the best

# clustered_sentences is ordered by k starting at k = 5, so the run with
# best_k clusters sits at position best_k - 5. Indexing with 10 directly
# would export the k = 15 run under a "_10" file name.
index = best_k - 5
assert len(clustered_sentences[index]) == best_k, "selected run does not have best_k clusters"

with open(f'clustered_sentences_{best_k}.json', 'w') as f:
    json.dump(clustered_sentences[index], f, indent=4, sort_keys=True)