In [None]:
%run './model/multi_corpus.py'
%run './constants.py'

from itertools import accumulate
import operator

In [None]:
# Build the per-field corpora once (helper presumably defined by the %run'd scripts above).
corpora = co_occurence_graphs()

# Split each corpus entry into two lookups keyed by field name:
#   Gs  -> the entry stored under 'G' (used below as a networkx graph)
#   Dfs -> the entry stored under 'Df' (presumably the matching DataFrame -- TODO confirm)
Gs = {name: corpora[name]['G'] for name in corpora}
Dfs = {name: corpora[name]['Df'] for name in corpora}

In [None]:
# Louvain community detection is randomized: without a fixed seed every kernel
# restart can yield a different partition, which silently changes the figures
# and tables produced further down. Pin the seed so the notebook is reproducible.
LOUVAIN_SEED = 42

communities = {}        # field name -> list of communities (each a set of nodes)
community_lengths = {}  # field name -> size of each community, in detection order
community_number = {}   # field name -> number of communities found
for field_name, G in Gs.items():
    print(field_name)  # progress marker: one line per processed field
    comms = nx.community.louvain_communities(G, seed=LOUVAIN_SEED)
    communities[field_name] = comms
    community_lengths[field_name] = [len(comm) for comm in comms]
    community_number[field_name] = len(comms)

In [None]:
sns.reset_defaults()

# savefig does not create missing directories, so make sure the target exists.
comm_dist_dir = os.path.join(LATEX_FIGURES_PATH, 'co_occurrence_louvain_comm_dist')
os.makedirs(comm_dist_dir, exist_ok=True)

# One figure per field: bars show community sizes (largest first), the overlaid
# line shows the cumulative share of nodes covered by the communities so far.
for field_name, comms in community_lengths.items():

    fig = plt.figure(figsize=(15, 10))
    ax = fig.add_axes([0, 0, 1, 1])

    comm_id = range(len(comms))
    sorted_comms = sorted(comms, reverse=True)
    cum_comms = list(accumulate(sorted_comms, operator.add))

    # Cumulative fraction of all nodes, ending at 1.0 for the last community.
    dist = np.array(cum_comms) / sum(sorted_comms)

    ax.bar(comm_id, sorted_comms, edgecolor='k')
    ax.set_xlabel('Community (ranked by size)')
    ax.set_ylabel('Number of nodes')

    # Secondary y-axis so the [0, 1] cumulative curve is not squashed by the bar scale.
    ax2 = ax.twinx()
    ax2.plot(
        comm_id,
        dist,
        'ro-',
        markeredgewidth=1,
        markeredgecolor='k',
        markerfacecolor="white",
        linewidth=3,
    )
    ax2.set_ylabel('Cumulative share of nodes')

    # One tick per community; set on the axes explicitly rather than via the
    # pyplot "current axes" state (the x-axis is shared with ax2).
    ax.set_xticks(comm_id)

    # NOTE: no fig.tight_layout() here -- axes created with add_axes() are not
    # managed by tight_layout (it only warns); bbox_inches='tight' below already
    # crops the saved image to the drawn content.
    fig.savefig(
        os.path.join(comm_dist_dir, f'{field_name.capitalize()}.png'),
        transparent=True,
        dpi=150,
        bbox_inches='tight'
    )

    plt.show()
    # Release the figure explicitly so looping over many fields does not pile up open figures.
    plt.close(fig)

In [None]:
# Build a table with one column per Louvain community of the selected field,
# listing that community's top terms ordered by weighted degree.
field_name = 'Political Science'
G = Gs[field_name]
comms = communities[field_name]

# Number of terms listed per community (i.e. the height of the table).
n = 20

# Global rank of every node by weighted degree (rank 0 = highest degree).
# Ranking globally first, then sorting each community by rank, keeps tie order
# deterministic: ties follow the graph's node order, not set iteration order.
degree = dict(G.degree(weight='weight'))
sorted_degrees = sorted(degree.items(), key=lambda tup: tup[1], reverse=True)
node_order = {node: rank for rank, (node, _) in enumerate(sorted_degrees)}

columns = []
for i, comm in enumerate(comms):
    # Best-ranked members of this community, at most n of them.
    top_terms = sorted(comm, key=node_order.__getitem__)[:n]
    # Pad short communities with nulls so every column has exactly n rows.
    top_terms += [None] * (n - len(top_terms))
    columns.append(pl.Series(f'Community {i}', top_terms))

# Construct the frame once instead of re-creating it for every added column.
df = pl.DataFrame(columns)

In [None]:
# Render the per-community top-term table built in the previous cell.
df