In [1]:
import sys
import json
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt

In [2]:
VERSION = 6
FILE_NAME = f"ap_doc_clusters_{VERSION}.json"

In [3]:
with open(FILE_NAME, "r") as fin:
    clusters = json.load(fin)
len(clusters)

543

In [14]:
def plot_top_words(clus, feature_names, scores, n_top):
    fig, axes = plt.subplots(len(clus) // 5 + 1, 5, figsize=(30, 15), sharex=True)
    axes = axes.flatten()
    for cix, _ in enumerate(clus):
        top_features_ind = np.argsort(scores[cix, :].toarray().ravel())[-n_top:][::-1]
        top_features = [feature_names[ix] for ix in top_features_ind]
        weights = scores[cix, top_features_ind].toarray().ravel().tolist()

        ax = axes[cix]
        y_pos = np.arange(len(top_features))
        ax.barh(y_pos, weights, height=0.7, linewidth=0.1, hatch="//")
        ax.set_yticks(y_pos, labels=top_features)
        ax.set_title(f"Cluster {cix}", fontdict={"fontsize": 30})
        ax.invert_yaxis()
        ax.tick_params(axis="both", which="major", labelsize=20)
        for pos in ["top", "right", "left"]:
            ax.spines[pos].set_visible(False)
        fig.suptitle(f"{cix}", fontsize=40)

    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.show()

In [15]:
cluster_list = list(clusters.values())

In [16]:
corpus = [" ".join(v) for v in cluster_list]
vectorizer = TfidfVectorizer()
tfidf_scores = vectorizer.fit_transform(corpus)
vectorizer.get_feature_names_out()

array(['00', '000', '000usd', ..., 'განვითარებისთვის', 'ენერგოეფექტურობა',
       'მდგრადი'], dtype=object)

In [17]:
tfidf_scores.shape

(543, 19529)

In [21]:
# plot_top_words(cluster_list, vectorizer.get_feature_names_out(), tfidf_scores, n_top=10)
n_top = 10
feature_names = vectorizer.get_feature_names_out()
for cix, _ in enumerate(cluster_list):
    top_features_ind = np.argsort(tfidf_scores[cix, :].toarray().ravel())[-n_top:][::-1]
    top_features = [feature_names[ix] for ix in top_features_ind]
    weights = tfidf_scores[cix, top_features_ind].toarray().ravel().tolist()
    for tf, w in zip(top_features, weights):
        print(f"{tf}: {w:.4f}")
    print()

de: 0.4109
raison: 0.3016
en: 0.2728
nous: 0.2415
au: 0.2249
année: 0.2107
cours: 0.1952
accompagner: 0.1685
concernant: 0.1581
sommes: 0.1581

et: 0.2837
de: 0.2796
ceux: 0.2522
abondance: 0.2522
abbattoires: 0.2522
ménagers: 0.2367
des: 0.2314
produits: 0.2043
déchets: 0.2043
il: 0.1871

réalisée: 0.4980
activité: 0.4745
crise: 0.2547
le: 0.2176
non: 0.2171
de: 0.1893
ras: 0.1779
la: 0.1410
connu: 0.1336
mener: 0.1274

de: 0.4260
les: 0.2805
emballage: 0.2508
le: 0.2204
nous: 0.2152
produits: 0.1784
défi: 0.1633
voir: 0.1612
des: 0.1592
avons: 0.1546

les: 0.3723
de: 0.3600
des: 0.3576
et: 0.2557
déchets: 0.2368
plastiques: 0.2273
une: 0.2003
la: 0.1770
pour: 0.1404
le: 0.1390

la: 0.3699
et: 0.3674
de: 0.3518
des: 0.3425
les: 0.2378
jeunes: 0.2269
le: 0.1712
jeune: 0.1400
dans: 0.1262
au: 0.1246

de: 0.3468
les: 0.3464
et: 0.3408
des: 0.3358
la: 0.2772
le: 0.2535
moringa: 0.2528
pour: 0.1759
du: 0.1620
nous: 0.1596

de: 0.5069
des: 0.3678
la: 0.3464
et: 0.3204
les: 0.3139
le: 0.2309