In [1]:
import sys
import json
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt

In [2]:
VERSION = 7
FILE_NAME = f"ap_doc_clusters_{VERSION}.json"

In [3]:
with open(FILE_NAME, "r") as fin:
    clusters = json.load(fin)
len(clusters)

127

In [4]:
cluster_list = list(clusters.values())

In [5]:
corpus = [" ".join(v) for v in cluster_list]
vectorizer = TfidfVectorizer(stop_words="english", max_df=0.95, min_df=2, ngram_range=(1, 3))
tfidf_scores = vectorizer.fit_transform(corpus)
vectorizer.get_feature_names_out()

array(['00', '000', '000 00', ..., 'zooming', 'zskgke326',
       'zskgke326 files'], dtype=object)

In [6]:
tfidf_scores.shape

(127, 54874)

In [7]:
n_top = 10
feature_names = vectorizer.get_feature_names_out()
for cix, docs in sorted(enumerate(cluster_list), key=(lambda x: len(x[1])), reverse=True):
    top_features_ind = np.argsort(tfidf_scores[cix, :].toarray().ravel())[-n_top:][::-1]
    top_features = [feature_names[ix] for ix in top_features_ind]
    weights = tfidf_scores[cix, top_features_ind].toarray().ravel().tolist()
    title = f"cluster #{cix} ({len(docs)} docs)"
    print(title)
    print("=" * len(title))
    for tf, w in zip(top_features, weights):
        count = sum(1 for doc in docs if all(word in doc for word in tf.split(" ")))
        print(f"{tf} ({count}): {w:.4f}")
    print()

cluster #53 (155 docs)
waste (34): 0.1790
public (97): 0.1674
innovation (85): 0.1326
data (128): 0.1262
development (96): 0.1242
solutions (95): 0.1238
local (94): 0.1092
undp (38): 0.1082
learning (92): 0.1015
digital (60): 0.1013

cluster #54 (50 docs)
iraq (1): 0.3050
local (36): 0.1377
data (46): 0.1300
waste (15): 0.1285
undp (9): 0.1223
tourism (10): 0.1173
innovation (29): 0.1122
climate (17): 0.0993
change (30): 0.0945
acclab (0): 0.0944

cluster #56 (49 docs)
climate (23): 0.1684
blue (7): 0.1579
economy (22): 0.1537
sector (30): 0.1429
blue economy (6): 0.1375
solutions (33): 0.1195
local (30): 0.1084
verde (1): 0.1077
development (37): 0.1046
waste (14): 0.1035

cluster #57 (43 docs)
informal (19): 0.1605
iraq (0): 0.1444
portfolio (23): 0.1388
social contract (2): 0.1315
social (19): 0.1250
food (8): 0.1241
youth (16): 0.1164
contract (3): 0.1044
solutions (33): 0.1009
economy (14): 0.1002

cluster #94 (42 docs)
government (28): 0.1332
public (26): 0.1242
social (27): 0.12