In [1]:
import sys
import json
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt

In [2]:
VERSION = 7
FILE_NAME = f"ap_doc_clusters_{VERSION}.json"

In [3]:
with open(FILE_NAME, "r") as fin:
    clusters = json.load(fin)
len(clusters)

543

In [15]:
cluster_list = list(clusters.values())

In [16]:
corpus = [" ".join(v) for v in cluster_list]
vectorizer = TfidfVectorizer()
tfidf_scores = vectorizer.fit_transform(corpus)
vectorizer.get_feature_names_out()

array(['00', '000', '000usd', ..., 'განვითარებისთვის', 'ენერგოეფექტურობა',
       'მდგრადი'], dtype=object)

In [17]:
tfidf_scores.shape

(543, 19529)

In [29]:
# plot_top_words(cluster_list, vectorizer.get_feature_names_out(), tfidf_scores, n_top=10)
n_top = 10
feature_names = vectorizer.get_feature_names_out()
for cix, docs in sorted(enumerate(cluster_list), key=(lambda x: len(x[1])), reverse=True):
    top_features_ind = np.argsort(tfidf_scores[cix, :].toarray().ravel())[-n_top:][::-1]
    top_features = [feature_names[ix] for ix in top_features_ind]
    weights = tfidf_scores[cix, top_features_ind].toarray().ravel().tolist()
    title = f"cluster #{cix} ({len(docs)} docs)"
    print(title)
    print("=" * len(title))
    for tf, w in zip(top_features, weights):
        count = sum(1 for doc in docs if tf in doc)
        print(f"{tf} ({count}): {w:.4f}")
    print()

cluster #10 (10 docs)
de (10): 0.4473
les (10): 0.4222
des (10): 0.4054
la (10): 0.2884
et (10): 0.2873
le (10): 0.2037
dattes (1): 0.1422
déchets (4): 0.1257
dans (10): 0.1115
en (10): 0.1112

cluster #7 (8 docs)
de (8): 0.5069
des (8): 0.3678
la (8): 0.3464
et (8): 0.3204
les (8): 0.3139
le (8): 0.2309
du (8): 0.1389
une (8): 0.1293
nous (6): 0.1252
avec (8): 0.1134

cluster #336 (7 docs)
the (7): 0.5136
of (7): 0.3118
and (7): 0.3092
to (7): 0.2319
depopulation (4): 0.2138
serbia (2): 0.2067
in (7): 0.1791
serbian (0): 0.1306
migration (4): 0.1191
for (7): 0.1105

cluster #385 (6 docs)
the (6): 0.5335
iraq (1): 0.3677
and (6): 0.2977
of (6): 0.2558
to (6): 0.2520
in (6): 0.2301
iraqi (0): 0.1397
mosul (1): 0.1163
with (6): 0.0958
undp (2): 0.0938

cluster #398 (6 docs)
the (6): 0.5677
and (6): 0.2823
of (6): 0.2790
to (6): 0.2632
in (6): 0.1931
on (6): 0.1144
food (1): 0.1136
with (6): 0.0989
as (6): 0.0983
local (4): 0.0855

cluster #402 (6 docs)
the (6): 0.5560
and (6): 0.3821
sdg