In [1]:
# Execute the project scripts inside this notebook's namespace (IPython `%run`).
# Names used by later cells but not defined in the notebook itself (e.g. `sns`, `np`,
# `nx`, `nx_comm`, `pl`, `os`, `co_citation_graphs`, `LATEX_TABLE_PATH`) presumably
# come from these two files — TODO confirm.
%run './model/multi_corpus.py'
%run './constants.py'

# Default size (width, height) for figures drawn after this cell.
sns.set(rc = {'figure.figsize':(15,8)})

In [2]:
# Build the per-field corpora once, then expose two lookup tables keyed by
# field name: one for the 'G' entry and one for the 'Df' entry of each corpus.
corpora = co_citation_graphs()
Gs = {name: entry['G'] for name, entry in corpora.items()}
Dfs = {name: entry['Df'] for name, entry in corpora.items()}

Gender Studies Graph with 1070 nodes and 2505 edges
Geometry Graph with 749 nodes and 1262 edges
Geophysics Graph with 22475 nodes and 119997 edges
Economics Graph with 5710 nodes and 81283 edges
Language & Linguistics Graph with 1925 nodes and 13565 edges
Probability & Statistics Graph with 2802 nodes and 13312 edges
Material Engineering Graph with 32157 nodes and 450244 edges
Artificial Intelligence Graph with 2817 nodes and 15003 edges
Sociology Graph with 3037 nodes and 31327 edges
International Business Graph with 4191 nodes and 82833 edges
Political Science Graph with 3367 nodes and 21319 edges
Genetics & Genomics Graph with 11868 nodes and 73011 edges
Immunology Graph with 17730 nodes and 271452 edges
Human Resources & Organizations Graph with 4657 nodes and 43787 edges
Ethnic & Cultural Studies Graph with 747 nodes and 1502 edges
Neurology Graph with 22058 nodes and 348235 edges


In [4]:
def gini(x):
    """Return the Gini coefficient of the values in ``x``.

    The coefficient is half the relative mean absolute difference::

        G = sum_i sum_j |x_i - x_j| / (2 * n**2 * mean(x))

    It is evaluated through the equivalent closed form on the sorted values
    ``x_(1) <= ... <= x_(n)``::

        G = sum_i (2*i - n - 1) * x_(i) / (n * sum(x))

    which needs O(n log n) time and O(n) memory instead of materialising the
    full n-by-n matrix of pairwise differences.

    Parameters
    ----------
    x : iterable of numbers
        Any iterable of numeric values: list, NumPy array (flattened if
        multi-dimensional), generator, or a dict view such as
        ``dict(G.degree()).values()``.

    Returns
    -------
    float
        The Gini coefficient. ``nan`` for empty input or when every value is
        zero (0/0); ``inf`` when the values sum to zero but are not all equal.
    """
    # Dict views and generators are not array-likes: NumPy would wrap them in
    # a 0-d object array, so materialise them first.
    if not isinstance(x, np.ndarray):
        x = list(x)
    values = np.sort(np.asarray(x, dtype=float), axis=None)

    n = values.size
    if n == 0:
        return float('nan')

    ranks = np.arange(1, n + 1)
    weighted_sum = np.dot(2 * ranks - n - 1, values)
    # Keep the nan/inf results of a zero denominator without emitting warnings.
    with np.errstate(divide='ignore', invalid='ignore'):
        return float(weighted_sum / (n * values.sum()))

In [3]:
# Descriptive network statistics per field: maps field name -> flat dict of
# scalar metrics computed on that field's graph (corpus['G']).
d = {}

# Louvain community detection is randomised; a fixed seed keeps the reported
# community count reproducible across runs.
LOUVAIN_SEED = 42

for field_name, corpus in corpora.items():

    print(field_name)

    G = corpus['G']
    n_nodes = nx.number_of_nodes(G)

    # Materialised as a list: it is iterated several times below and gini()
    # needs a real sequence rather than a dict view.
    degree = [node_degree for _, node_degree in G.degree()]

    # Connected components are computed once and reused for both the
    # component count and the largest-component metrics.
    components = list(nx.connected_components(G))
    large_component = max(components, key=len)
    G_large_component = G.subgraph(large_component)

    # Expensive on large graphs, and reported under two column names below.
    avg_clustering = nx.average_clustering(G)

    d[field_name] = {
        'Nodes': n_nodes,
        'Edges': nx.number_of_edges(G),
        'Density': nx.density(G),
        'AvgDegree': sum(degree) / len(degree),
        'AvgClustering': avg_clustering,
        # Path-based metrics are only defined on a connected graph, so they
        # are measured on the largest connected component.
        'DiameterLargestComponent': nx.diameter(G_large_component),
        'AvgShortestPath': nx.average_shortest_path_length(G_large_component),
        # The former 'ShortestPath' entry stored nx.shortest_path_length(G),
        # an all-pairs iterator rather than a scalar, which cannot be a table
        # cell; 'AvgShortestPath' is the scalar summary.
        # nx.triangles counts each triangle once per vertex, hence the exact
        # integer division by 3.
        'Triangles': sum(nx.triangles(G).values()) // 3,
        'Gini': gini(degree),
        'Louvain': len(nx_comm.louvain_communities(G, seed=LOUVAIN_SEED)),
        'Components': len(components),
        'Clustering': avg_clustering,
        'Transitivity': nx.transitivity(G),
        # Degree centralization: sum of (max degree - degree), normalised.
        # NOTE(review): Freeman's normalisation for simple undirected graphs
        # is (n-1)*(n-2); confirm that (n-1)**2 is intended.
        'Centralization': float(n_nodes * max(degree) - sum(degree)) / (n_nodes - 1)**2,
        'Isolates': nx.number_of_isolates(G),
        'Loops': nx.number_of_selfloops(G),
        # 'LabelPropagation': len(nx_comm.label_propagation_communities(G)),
    }

# One row per field: the field name first, then its metrics from `d`.
df = pl.DataFrame([{'Field': field_name} | desc_d for field_name, desc_d in d.items()])

latex = (
    df
        .to_pandas()
        .to_latex(
            index=False,
            na_rep=' ',
            bold_rows=True,
            float_format="%.2f",
            # Field names contain '&' (e.g. "Language & Linguistics"), which is
            # the LaTeX column separator. pandas >= 2.0 no longer escapes by
            # default, so request it explicitly.
            escape=True,
        )
)
# Plain write mode is enough (the file is never read back); a fixed encoding
# keeps the output independent of the platform default.
with open(os.path.join(LATEX_TABLE_PATH, 'co_occurrence_desc_stats.tex'), 'w', encoding='utf-8') as file:
    file.write(latex)


Gender Studies
Geometry
Geophysics
Economics
Language & Linguistics
Probability & Statistics
Material Engineering
Artificial Intelligence
Sociology
International Business
Political Science
Genetics & Genomics
Immunology
Human Resources & Organizations
Ethnic & Cultural Studies
Neurology


  latex = df.to_pandas().to_latex(
