In [1]:
import sys
import json
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt

In [2]:
VERSION = 7
FILE_NAME = f"ap_doc_clusters_{VERSION}.json"

In [3]:
with open(FILE_NAME, "r") as fin:
    clusters = json.load(fin)
len(clusters)

125

In [4]:
cluster_list = list(clusters.values())

In [11]:
corpus = [" ".join(v) for v in cluster_list]
vectorizer = TfidfVectorizer(stop_words="english", max_df=0.95, min_df=2, ngram_range=(1, 3))
tfidf_scores = vectorizer.fit_transform(corpus)
vectorizer.get_feature_names_out()

array(['00', '000', '000 00', ..., 'zskgke326 files', 'μg', 'μg m3'],
      dtype=object)

In [12]:
tfidf_scores.shape

(125, 56803)

In [14]:
n_top = 10
feature_names = vectorizer.get_feature_names_out()
for cix, docs in sorted(enumerate(cluster_list), key=(lambda x: len(x[1])), reverse=True):
    top_features_ind = np.argsort(tfidf_scores[cix, :].toarray().ravel())[-n_top:][::-1]
    top_features = [feature_names[ix] for ix in top_features_ind]
    weights = tfidf_scores[cix, top_features_ind].toarray().ravel().tolist()
    title = f"cluster #{cix} ({len(docs)} docs)"
    print(title)
    print("=" * len(title))
    for tf, w in zip(top_features, weights):
        count = sum(1 for doc in docs if all(word in doc for word in tf.split(" ")))
        print(f"{tf} ({count}): {w:.4f}")
    print()

cluster #43 (130 docs)
public (76): 0.1639
innovation (67): 0.1460
development (79): 0.1379
data (109): 0.1333
design (95): 0.1237
social (81): 0.1193
undp (32): 0.1097
solutions (72): 0.1095
new (92): 0.1061
digital (55): 0.1056

cluster #50 (68 docs)
waste (33): 0.4154
iraq (1): 0.1732
solutions (45): 0.1294
local (53): 0.1284
waste management (30): 0.1261
management (32): 0.1129
diapers (3): 0.1095
economy (26): 0.1074
tourism (19): 0.1073
blue (6): 0.0987

cluster #70 (58 docs)
sector (39): 0.1579
data (52): 0.1493
climate (17): 0.1414
solutions (37): 0.1184
economy (15): 0.1117
informal (17): 0.1077
nbsp (8): 0.1053
local (37): 0.1026
tourism (10): 0.1011
lab (44): 0.0976

cluster #46 (42 docs)
data (38): 0.1336
waste (12): 0.1243
new (30): 0.1063
development (28): 0.1045
lab (29): 0.1040
undp (10): 0.0993
innovation (22): 0.0983
communities (22): 0.0966
local (25): 0.0949
new capital (4): 0.0946

cluster #112 (36 docs)
nbsp (11): 0.1677
waste (8): 0.1555
innovation (17): 0.1386
p