In [None]:
%run './model/multi_corpus.py'
%run './constants.py'

from scipy.cluster.hierarchy import linkage, dendrogram

from matplotlib import cm

from itertools import combinations

In [None]:
# Load the per-field corpora, then split them into two lookups keyed by field
# name: one for the graphs ('G') and one for the data frames ('Df').
# n_edges=100 is passed straight to co_occurence_graphs; presumably it caps the
# number of edges per graph — confirm in multi_corpus.py.
corpora = co_occurence_graphs(n_edges=100)
Gs, Dfs = (
    {name: entry[key] for name, entry in corpora.items()}
    for key in ('G', 'Df')
)

In [None]:
import itertools
import scipy

G = Gs['Sociology']

# Community-detection algorithm used to build the hierarchy:
# 'louvain' or 'girvan_newman'.
method = 'girvan_newman'

# `hier_comms` must be ordered from the coarsest partition to the finest one
# (ending with singletons): the distance loop below lets every finer level
# overwrite the distance written by the coarser levels before it.
if method == 'louvain':
    # louvain_partitions yields the finest partition first and the coarsest
    # last, so it is reversed before the singleton level is appended.
    hier_comms = list(nx.community.louvain_partitions(G, weight='weight', resolution=1, threshold=1e-07, seed=None))[::-1]
    hier_comms.append([{node} for node in G])
elif method == 'girvan_newman':
    # girvan_newman already goes from few large communities down to singletons.
    hier_comms = list(nx.community.girvan_newman(G))
else:
    raise ValueError(f'Unknown community detection method: {method!r}')

print('Nodes: ', len(G))
print('Levels: ', len(hier_comms)-1)

# Index nodes in the graph's own order (rather than set iteration order) so
# that matrix rows and dendrogram leaves line up the same way on every run.
nodes = list(G)
ids = {node: idx for idx, node in enumerate(nodes)}

n_levels = len(hier_comms)

# Pairs that never share a community keep the maximum distance `n_levels`.
M_dst = np.full((len(nodes), len(nodes)), n_levels)

dsts = []          # distance assigned at each level
modularities = []  # modularity of each level's partition
for i, level in enumerate(hier_comms):
    modularities.append(nx.community.modularity(G, level, weight='weight', resolution=1))
    # Coarser levels get larger distances; the last level gets 0.
    dst = n_levels - i - 1
    for cluster in level:
        members = [ids[node] for node in cluster]
        for a, b in itertools.combinations(members, 2):
            M_dst[a, b] = dst
            M_dst[b, a] = dst
    dsts.append(dst)

# squareform requires a zero diagonal to convert the square matrix into the
# condensed form expected by linkage.
np.fill_diagonal(M_dst, 0)
A_dst = scipy.spatial.distance.squareform(M_dst)
M_linkage = scipy.cluster.hierarchy.linkage(A_dst)

fig, ax = plt.subplots(1, figsize=(25, 15))
scipy.cluster.hierarchy.dendrogram(M_linkage, labels=nodes, leaf_rotation=90., leaf_font_size=15, ax=ax)
plt.show()

In [None]:
# Modularity of each partition in the hierarchy, in the order the levels were
# processed (index 0 is the first level of `hier_comms`).
x = range(len(modularities))
y = modularities

fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_xlabel('Hierarchy level')
ax.set_ylabel('Modularity')
ax.set_title('Modularity per hierarchy level')
plt.show()

In [None]:
# Distance value assigned to node pairs at each level of the hierarchy
# (index 0 is the first level of `hier_comms`).
x = range(len(dsts))
y = dsts

fig, ax = plt.subplots()
ax.plot(x, y)
ax.set_xlabel('Hierarchy level')
ax.set_ylabel('Assigned distance')
ax.set_title('Distance assigned per hierarchy level')
plt.show()

In [None]:
# Choose the field to work on and fetch its graph and data frame from the
# per-field lookups.
field_name = 'Sociology'
G, df = Gs[field_name], Dfs[field_name]

In [None]:
import networkx as nx
from community import community_louvain
from itertools import combinations

def merge_communities(graph, seed=None):
    """Collapse the Louvain communities of ``graph`` into super-nodes.

    Parameters
    ----------
    graph : networkx.Graph
        Graph to partition. Every edge must carry a ``'weight'`` attribute
        (a missing one raises ``KeyError``). The edge aggregation assumes an
        undirected simple graph — TODO confirm against callers.
    seed : int, random.Random or None, optional
        Forwarded to ``nx.community.louvain_communities`` so the partition
        can be made reproducible. Defaults to ``None`` (unseeded).

    Returns
    -------
    super_graph : networkx.Graph
        One node per community, with a ``size`` attribute holding the number
        of member nodes. Each super-node has a self-loop whose ``weight`` is
        the total weight of the edges inside the community (self-loops of
        ``graph`` are not counted). Two super-nodes are connected when the
        total weight of the edges between their communities is positive.
    partition : dict
        Maps every node of ``graph`` to the index of its community.
    """
    # Detect communities on the graph that was passed in, not on whatever
    # notebook-level `G` happens to be bound at call time.
    partition = {}
    communities = {}
    for index, members in enumerate(nx.community.louvain_communities(graph, seed=seed)):
        communities[index] = list(members)
        for node in members:
            partition[node] = index

    # Accumulate edge weights per unordered community pair in a single pass
    # over the edges instead of probing every pair of nodes with has_edge.
    pair_weights = {}
    for u, v, data in graph.edges(data=True):
        if u == v:
            # Self-loops of the input graph are excluded from the totals.
            continue
        first, second = partition[u], partition[v]
        key = (first, second) if first <= second else (second, first)
        pair_weights[key] = pair_weights.get(key, 0) + data['weight']

    super_graph = nx.Graph()

    # One super-node per community, always with a self-loop (weight 0 when the
    # community has no internal edges).
    for index, members in communities.items():
        super_graph.add_node(index, size=len(members))
        super_graph.add_edge(index, index, weight=pair_weights.get((index, index), 0))

    # Community indices are enumerated in increasing order, so every pair
    # produced here is already in the (smaller, larger) form used as key.
    for first, second in combinations(communities, 2):
        total_weight = pair_weights.get((first, second), 0)
        if total_weight > 0:
            super_graph.add_edge(first, second, weight=total_weight)

    return super_graph, partition


In [None]:
# Collapse the communities of G into a super-node graph; `partition` maps each
# node to the index of its community.
G_merged, partition = merge_communities(G)

In [None]:
# Draw the community super-graph. Marker size is 50x the community's `size`
# attribute.
node_sizes = {n: d['size'] * 50 for n, d in G_merged.nodes(data=True)}
nodelist = list(node_sizes.keys())
node_size = list(node_sizes.values())

# NOTE(review): edge_size is computed but never passed to draw_networkx_edges
# below (width is fixed at 1.0 and `width=width` is commented out).
edge_sizes = {(u, v): d['weight'] * 50 for u, v, d in G_merged.edges(data=True)}
edgelist = list(edge_sizes.keys())
edge_size = list(edge_sizes.values())

fig, ax = plt.subplots(1, figsize=(15, 15))

# Layout computed by Graphviz through networkx's pygraphviz interface.
pos = nx.nx_agraph.graphviz_layout(G_merged, prog='sfdp') # dot, twopi, fdp, sfdp, circo

# Nodes are put at z-order 1 so they are drawn above the edges (z-order -1).
nx.draw_networkx_nodes(
    G_merged, 
    pos,
    nodelist=nodelist,
    node_size=node_size,
    # node_color='#00ABB3',
    # node_color=node_color,
    node_shape='o',
    alpha=None,
    cmap=plt.cm.Blues,
    ax=ax,
    linewidths=1.0,
    edgecolors='k',
).set_zorder(1)

# Each super-node is labelled with its own community index.
nx.draw_networkx_labels(G_merged, pos=pos, ax=ax, labels=dict(zip(nodelist, nodelist)))

# NOTE(review): calling .set_zorder on the return value assumes a single
# collection is returned (as opposed to a list of patches) — confirm for the
# installed networkx version.
nx.draw_networkx_edges(
    G_merged, 
    pos,
    width=1.0,
    edge_color='k',
    style='solid',
    alpha=0.5,
    arrowsize=10,
    ax=ax,
    nodelist=nodelist,
    node_size=node_size,
    node_shape='o',
    connectionstyle='arc3',
    # width=width,
).set_zorder(-1)

ax.grid(False)

plt.show()
# plt.savefig(
#     os.path.join(LATEX_FIGURES_PATH, 'co_occurrence_graphs', f'{field_name.capitalize()}.png'), 
#     transparent=True, 
#     dpi=150 
# )

In [None]:
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
import pandas as pd
import seaborn as sns
from collections import defaultdict

# Define parameters
sizes = [25, 25, 25]
p_matrix = [
    [0.5, 0.1, 0.01],
    [0.1, 0.5, 0.01],
    [0.01, 0.01, 0.5],
]
# One seed for graph generation, community detection and layout so the whole
# figure is reproducible.
sbm_seed = 0

# Create a stochastic block model
G = nx.stochastic_block_model(sizes, p_matrix, seed=sbm_seed)
nx.write_weighted_edgelist(G, './link_clustering/weighted_edgelist.edgelist', delimiter='\t')

# Use the Louvain method for community detection, then flatten the result
# into a node -> community index mapping.
communities = nx.community.louvain_communities(G, seed=sbm_seed)
communities = {n: i for i, comm in enumerate(communities) for n in comm}

# Get the number of communities
num_communities = len(set(communities.values()))
# plt.get_cmap is used because matplotlib.cm.get_cmap was deprecated and then
# removed in recent matplotlib releases.
cmap = plt.get_cmap('plasma', num_communities)
color_map = [cmap(communities[node]) for node in G]

pos = nx.spring_layout(G, seed=sbm_seed)

# KDE: collect the layout coordinates of every community's nodes.
pos_array = defaultdict(list)
for n, coord in pos.items():
    # NOTE(review): node 25 is additionally added to community 0's points,
    # whatever community it actually belongs to. Kept as in the original;
    # confirm whether this is an intentional overlap illustration.
    if n == 25:
        pos_array[0].append(coord)
    pos_array[communities[n]].append(coord)

fig, ax = plt.subplots(figsize=(8, 6))

nx.draw_networkx_nodes(G, pos=pos, node_color=color_map, alpha=0.9, edgecolors='k', ax=ax)
nx.draw_networkx_edges(G, pos=pos, alpha=0.1, ax=ax)
nx.draw_networkx_labels(G, pos=pos, ax=ax)

for comm, arr in pos_array.items():
    # Use a dedicated name: rebinding `df` here would replace the notebook-level
    # data frame that other cells join against.
    kde_points = pd.DataFrame(arr, columns=['x', 'y'])

    sns.kdeplot(data=kde_points, x="x", y="y", levels=5, thresh=0.1, color=cmap(comm), ax=ax) # , fill=True

plt.show()

In [None]:
# One row per community, holding the list of member Dois and their texts,
# sorted so the community with the most Dois comes first.
# NOTE(review): `comms_dict` is not defined in any cell visible here; it is
# used as a Doi -> community mapping. `df` must provide 'Doi' and 'Text'.
df_comm = (
    pl.DataFrame([
        pl.Series('Doi', list(comms_dict.keys())),
        pl.Series('Community', list(comms_dict.values())),
    ])
    # Attach each Doi's columns from `df`; Dois missing from `df` get nulls.
    .join(df, on='Doi', how='left')
    .select(
        pl.col('Doi'), 
        pl.col('Community'), 
        pl.col('Text'),
    )
    .groupby('Community')
    .agg(
        pl.col('Doi'), 
        pl.col('Text')
    )
    # Presumably flattens the per-document text lists into one list per
    # community — assumes 'Text' is a list column; TODO confirm.
    .with_columns(
        pl.col('Text')
        .arr.eval(pl.element().explode())
    )
    .sort(pl.col('Doi').arr.lengths(), descending=True)
)
df_comm

In [None]:
# Toy hierarchy: one tuple of clusters per level, from the single root cluster
# down to singletons.
clusters = [
    ({0, 1, 6, 2, 3, 4},),
    ({0, 1, 6}, {2, 3, 4}),
    ({0, 1, 6}, {2, 3}, {4}),
    ({0}, {1}, {6}, {2}, {3}, {4}),
]

# Freeze every cluster so it is hashable and can be used as a graph node.
clusters = [tuple(map(frozenset, level)) for level in clusters]

root = clusters[0][0]
n_levels = len(clusters)

G = nx.DiGraph()

# Connect each cluster to the strictly smaller clusters it contains on the
# next level. A cluster that reappears unchanged is not linked to itself.
for coarse, fine in zip(clusters, clusters[1:]):
    for parent in coarse:
        for child in fine:
            if not child < parent:
                continue
            # The edge has to exist before the root -> child path can be found.
            G.add_edge(parent, child)
            depth = len(nx.shortest_path(G, root, child)) - 1
            G[parent][child]['weight'] = n_levels - depth


pos = nx.nx_agraph.graphviz_layout(G, prog="dot")
nx.draw(G, with_labels=True, pos=pos)

# Every edge received a weight above, so this covers all edges.
edge_labels = nx.get_edge_attributes(G, 'weight')

nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_color='k')

plt.show()

In [None]:
# Display the graph currently bound to `G` (whichever cell assigned it last).
G

In [None]:
# Node ids represented by the rows/columns of the matrix below, in order.
# Passing them to the dendrogram keeps the last leaf labelled 6 instead of its
# positional index 5.
leaf_labels = [0, 1, 2, 3, 4, 6]

# Hand-written symmetric distance matrix for the toy hierarchy.
M_dst = [
    # 0  1  2  3  4  6
    [ 0, 1, 3, 3, 3, 1], # 0
    [ 1, 0, 3, 3, 3, 1], # 1

    [ 3, 3, 0, 1, 2, 3], # 2
    [ 3, 3, 1, 0, 2, 3], # 3

    [ 3, 3, 2, 2, 0, 3], # 4

    [ 1, 1, 3, 3, 3, 0], # 6
]

# linkage expects the condensed (upper-triangle) form of the matrix.
A_dst = scipy.spatial.distance.squareform(M_dst)

M_linkage = scipy.cluster.hierarchy.linkage(A_dst)

scipy.cluster.hierarchy.dendrogram(M_linkage, labels=leaf_labels)

plt.show()

In [None]:
# Draw `Gc` with a layered (multipartite) layout.
# NOTE(review): `Gc` is not defined in any cell visible here; its nodes must
# carry a "layer" attribute for multipartite_layout.
sns.reset_defaults()
# NOTE(review): `cmap` is only referenced by a commented-out argument below.
cmap = cm.plasma

node_size = 50

# pos = nx.nx_agraph.graphviz_layout(Gc, prog="twopi") # dot, twopi, fdp, sfdp, circo
pos = nx.multipartite_layout(Gc, subset_key="layer", align="horizontal")


plt.figure(figsize=(15, 15))

nodes = nx.draw_networkx_nodes(
    Gc, 
    pos, 
    # nodelist=nodes_with_term,
    node_size=node_size,
    node_color='red',
    node_shape='o',
    alpha=None,
    # cmap=cmap,
    vmin=None,
    vmax=None,
    ax=None,
    linewidths=1.0, 
    edgecolors='k', 
    # label=None, 
    margins=None
)
# Nodes at z-order 1, edges at -1 below, so nodes are drawn on top.
nodes.set_zorder(1)

edges = nx.draw_networkx_edges(
    Gc, 
    pos,
    edgelist=None,
    width=1.0,
    edge_color='k',
    style='solid',
    alpha=0.5,
    arrowstyle=None,
    arrowsize=10,
    edge_cmap=None,
    edge_vmin=None,
    edge_vmax=None,
    ax=None,
    arrows=None,
    label=None,
    node_size=node_size,
    nodelist=None,
    node_shape='o',
    connectionstyle='arc3',
    min_source_margin=0,
    min_target_margin=0,
)

# Iterating assumes draw_networkx_edges returned a list of patches (as for a
# directed graph) rather than a single collection — TODO confirm for `Gc`.
for c in edges:
    c.set_zorder(-1)
# edges.set_zorder(-1)

# NOTE(review): no artist above is given a label (label=None / commented
# out), so this legend has nothing to show.
plt.legend()

plt.grid(False)
plt.box(False)

# root = root.replace('/', '-')

plt.show()
# plt.savefig(
#     os.path.join(LATEX_FIGURES_PATH, 'genealogy_trees', f'{root}.png'), 
#     transparent=True, 
#     dpi=300,
#     bbox_inches='tight',
# )

In [None]:

# Re-plot the dendrogram of `Z` with human-readable leaf labels.
# NOTE(review): `leaves` (a collection of Dois) and `Z` (a linkage matrix) must
# already exist when this cell runs; neither is defined above it in the
# visible notebook. `leaves` is rebound to a dict below, so re-running this
# cell passes that dict to is_in — verify the cell is safe to run twice.

# Doi -> "(<first piece of the first author entry>, <year>)" for every leaf
# found in the field's parquet file. The two selected columns become the
# key/value pairs of the dict.
new_leaves = dict(
    pl.read_parquet(f'./output/main_dfs/{field_name}.parquet')
    .select(
        pl.col('Doi'), 
        pl.concat_str([
            pl.lit('('),
            # First author entry, cut at the first ', ' (presumably leaving
            # the surname — confirm against the 'Authors' format).
            pl.col('Authors').arr.first().str.split(', ').arr.first(),
            pl.lit(', '),
            pl.col('Date').dt.year(),
            pl.lit(')')
        ])
    )
    .filter(pl.col('Doi').is_in(leaves))
    .to_numpy()
)

# Start from identity labels so leaves missing from the parquet file keep
# their Doi, then overwrite the ones that were found.
leaves = dict(zip(leaves, leaves))
leaves.update(new_leaves)

fig = plt.figure(figsize=(15, 18))
ax = fig.add_axes([0, 0, 1, 1])

# ax.set_xlabel('Node')
ax.set_ylabel('Distance')

dendrogram(Z, labels=list(leaves.values()), ax=ax)
ax.yaxis.grid(False)
ax.xaxis.grid(False)

# fig.tight_layout()

plt.show()

In [None]:
# Number of connected components of the graph currently bound to `G`.
# NOTE(review): networkx only defines this for undirected graphs; verify which
# cell assigned `G` last.
print(nx.number_connected_components(G))

In [None]:
# Dense adjacency matrix of G as a NumPy array.
m = nx.to_numpy_array(G)

In [None]:
# Ward linkage over `m`. Because `m` is a 2-D array, linkage treats each row
# as an observation vector rather than as precomputed distances —
# NOTE(review): confirm this is the intended input.
Z = linkage(m, method='ward')

In [None]:
# Label the leaves with the graph's nodes. A plain list is passed because
# indexing `G.nodes` with an integer is a node-attribute lookup, not a
# positional lookup, so the NodeView itself cannot serve as a label sequence.
dn = dendrogram(Z, labels=list(G.nodes))

In [None]:
# Quick look at the graph currently bound to `G`, using networkx's default
# drawing settings.
nx.draw(G)

In [None]:
# Detect Louvain communities in the Sociology graph, then build one row per
# (document, reference) pair annotated with the community of both ends.
G = Gs['Sociology']
print(G)
print(nx.number_connected_components(G))
nodes = list(G.nodes)

comms = nx.community.louvain_communities(G)
print(len(comms))

# Only these three columns of the field's data frame are needed below.
df = Dfs['Sociology'].select(pl.col('Doi'), pl.col('Text'), pl.col('References'))

# Node -> index of the community that contains it.
labelled_comms = {
    member: label
    for label, members in enumerate(comms)
    for member in members
}
nx.set_node_attributes(G, labelled_comms, 'Community')

# Community table joined with the document columns; each document's text
# list is reduced to its unique elements.
df_comms = (
    pl.from_dict({
        'Doi': labelled_comms.keys(),
        'Community': labelled_comms.values(),
    })
    .join(df, on='Doi', how='left')
    .with_columns(pl.col('Text').arr.eval(pl.element().unique()))
)

# One row per cited reference; the self-join looks up the community of the
# cited Doi and stores it in 'CommunityReference' (null when the reference is
# not in the table).
doi_to_community = df_comms.select(pl.col('Doi'), pl.col('Community'))
df_comms = df_comms.explode('References').join(
    doi_to_community,
    right_on='Doi',
    left_on='References',
    how='left',
    suffix='Reference'
)

df_comms.head()

In [None]:
# References whose citing documents all belong to one community, each flagged
# with Core=True.
df_core = (
    df_comms
    # Collect, for every cited reference, the citing Dois and their communities.
    .groupby('References')
    .agg(
        pl.col('Doi'),
        pl.col('Community'),
    )
    # Keep references cited from exactly one distinct community.
    .filter(pl.col('Community').arr.unique().arr.lengths().eq(1))
    .select(
        pl.col('References'),
        pl.lit(True).alias('Core'),
    )
)

In [None]:
# Flag every document of `df` as core or not and store the flag on the graph.
# The selected two-column frame is unpacked into its two column Series.
dois, cores = (
    df
    # Match documents against the core references by Doi.
    .join(df_core, left_on='Doi', right_on='References', how='left')
    # Documents without a match have a null 'Core', turned into False here.
    # NOTE(review): fill_null is called on the whole frame, not only on
    # 'Core' — confirm no other column is affected.
    .fill_null(False)
    .select(
        pl.col('Doi'),
        pl.col('Core'),
    )
)

# Doi -> core flag.
core_attrs = dict(
    zip(
        dois.to_list(),
        cores.to_list(),
    )
)

nx.set_node_attributes(G, core_attrs, 'Core')

In [None]:
# For each Doi, list its own community (repeated once per row) next to the
# communities of the references it cites.
(
    df_comms
    .select(
        pl.col('Doi'),
        pl.col('Community'),
        pl.col('CommunityReference'),
    )
    .groupby(pl.col('Doi'))
    .agg(pl.all())
)

In [None]:
# Store each node's community index on the graph as the 'Community' attribute.
# NOTE(review): the cell that builds `labelled_comms` already makes this same
# call; this repeat looks redundant.
nx.set_node_attributes(G, labelled_comms, 'Community')

In [None]:
# Preview the first rows of the (document, reference) community table.
df_comms.head()

In [None]:
# Per community, the list of text elements that occur in exactly one row of
# `df_comms`.
# NOTE(review): the filter counts rows, not distinct communities, and
# `df_comms` holds one row per (document, reference) pair. If the goal is
# "terms exclusive to one community", an `.arr.unique()` before
# `.arr.lengths()` (as in the df_core cell) may be needed — confirm.
(
    df_comms
    # One row per text element.
    .explode('Text')
    .groupby('Text')
    .agg(pl.col('Community'))
    .filter(pl.col('Community').arr.lengths().eq(1))
    # The list has a single entry, so take it as the element's community.
    .with_columns(pl.col('Community').arr.first())
    .groupby('Community')
    .agg(pl.col('Text'))
    # .unique(subset=['Community', 'Text'])
    # .groupby('Community')
    # .agg(pl.col('Text'))
)