In [None]:
%run './model/multi_corpus.py'
%run './constants.py'

from matplotlib import cm


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

import umap.umap_ as umap

import functools

from scipy.spatial.distance import pdist, squareform

import itertools

In [None]:
# Load the citation corpora (a mapping of field name -> corpus) and split each
# corpus into its graph ('G') and its document frame ('Df').
corpora = citation_graphs()

Gs = {}
Dfs = {}
for field_name, corpus in corpora.items():
    Gs[field_name] = corpus['G']
    Dfs[field_name] = corpus['Df']

In [None]:
def vectorization_method(method, texts):
    match method:
        case 'tfidf':
            tfidf = TfidfVectorizer()
            vecs = tfidf.fit_transform(texts.to_list()).toarray()
        case 'doc2vec':
            documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(texts.str.split(' ').to_list())]
            model = Doc2Vec(documents, vector_size=100, window=2, min_count=1, workers=4)
            vecs = np.array([model.dv[i] for i in range(len(texts))])
    return vecs

def format_field_name(field_name):
    """Break a field name over several lines so it fits as a plot label.

    A name containing ``' & '`` is broken after each ampersand and its other
    spaces are left alone; any other name is broken at every space. A name
    without spaces is returned unchanged.
    """
    if ' & ' in field_name:
        return field_name.replace(' & ', ' &\n')
    return field_name.replace(' ', '\n')

In [None]:
# Number of highest in-degree documents kept per field.
n = 5_000

df_fields = []
for field_name, df in Dfs.items():

    G = Gs[field_name]
    in_deg = G.in_degree()
    # Sort by in-degree, highest first. The previous ascending sort made the
    # `[:n]` slice keep the n *lowest* in-degree nodes instead of the top ones.
    sorted_in_deg = sorted(in_deg, key=lambda tup: tup[1], reverse=True)
    top_in_deg = [doi for doi, _ in sorted_in_deg[:n]]

    # Keep only the selected documents, flatten their token list into one
    # string and remember which field they came from.
    df_fields.append(
        df
        .filter(pl.col('Doi').is_in(top_in_deg))
        .select(
            pl.col('Doi'),
            pl.col('Text').arr.join(' '),
            pl.lit(field_name).alias('Field'),
        )
    )
df_fields = pl.concat(df_fields)

print(df_fields.shape)

# Pull the document texts and their field labels out of the combined frame.
texts = df_fields['Text'].to_numpy()
fields = df_fields['Field'].to_numpy()

# TF-IDF representation of every document, ignoring English stop words.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_embedding = tfidf.fit_transform(texts)

# Project the TF-IDF vectors with UMAP; the fixed random_state keeps the
# layout reproducible between runs.
reducer = umap.UMAP(random_state=42)
embs = reducer.fit_transform(tfidf_embedding)

# Attach the UMAP embedding to every document and expose its first and last
# component as plain 'x' / 'y' columns.
df_fields = df_fields.with_columns(pl.Series('Embedding', embs)).with_columns([
    pl.col('Embedding').arr.first().alias('x'),
    pl.col('Embedding').arr.last().alias('y'),
])
print(df_fields.shape)

# Mean position of each field's documents in the embedding.
centroids = df_fields.groupby('Field').agg([pl.mean('x'), pl.mean('y')])
print(centroids.shape)

In [None]:

sns.set_theme(style='white')

# A single axes filling the whole 15x15 inch figure.
fig = plt.figure(figsize=(15, 15))
ax = fig.add_axes([0, 0, 1, 1])

# One marker per document, coloured by field.
g = sns.scatterplot(
    data=df_fields,
    x="x",
    y="y",
    hue="Field",
    alpha=0.5,
    edgecolor='k',
    ax=ax,
)

# Write each field's (line-wrapped) name at the centroid of its documents.
for field_name, x, y in centroids.rows():
    ax.text(
        x,
        y,
        format_field_name(field_name),
        horizontalalignment='center',
        size='medium',
        color='black',
        weight='semibold',
    )

# Strip spines, axis labels and tick labels: the coordinates carry no meaning.
sns.despine(bottom=True, left=True)
g.set(xlabel=None, ylabel=None, xticklabels=[], yticklabels=[])
ax.legend(frameon=False)

plt.savefig(
    os.path.join(LATEX_FIGURES_PATH, 'all_tfidf_umap.png'),
    transparent=True,
    dpi=300,
    bbox_inches='tight',
)

In [None]:
# Pairwise distances between the field centroids, rounded to two decimals.
coords = centroids.select(['x', 'y']).to_numpy()
distances = np.round(squareform(pdist(coords)), 2)
dist_df = pl.from_numpy(distances)

# Mask the diagonal and the upper triangle so every pair is shown only once.
mask = np.triu(np.ones_like(dist_df))

# Line-wrapped field names, in the same order as the centroid rows.
labels = list(map(format_field_name, centroids['Field']))

plt.figure(figsize=(15,15))
sns.heatmap(
    dist_df,
    mask=mask,
    annot=True,
    xticklabels=labels,
    yticklabels=labels,
)

plt.savefig(
    os.path.join(LATEX_FIGURES_PATH, 'all_tfidf_umap_distance_heatmap.png'),
    transparent=True,
    dpi=300,
    bbox_inches='tight',
)

# Descriptive Statistics

In [None]:
# Load the three network types one after the other and keep only the graph
# ('G') of every field. `corpora` ends up holding the last corpora loaded.
graphs_by_kind = []
for load_corpora in (citation_graphs, co_citation_graphs, co_occurence_graphs):
    corpora = load_corpora()
    graphs_by_kind.append(
        {field_name: corpus['G'] for field_name, corpus in corpora.items()}
    )

Gs_citation, Gs_co_citation, Gs_co_occurrence = graphs_by_kind

In [None]:
import pandas as pd


def _graph_size_table(graphs, network_label):
    """Tabulate the node and edge count of every field's graph.

    Parameters
    ----------
    graphs : dict
        Mapping of field name -> networkx graph.
    network_label : str
        Top-level column label for the two count columns (e.g. "Citation").

    Returns
    -------
    pandas.DataFrame
        One row per field with the tuple-keyed columns ``("", "Field")``,
        ``(network_label, "Nodes")`` and ``(network_label, "Edges")``.
    """
    return pd.DataFrame({
        ("", "Field"): list(graphs.keys()),
        (network_label, "Nodes"): [nx.number_of_nodes(G) for G in graphs.values()],
        (network_label, "Edges"): [nx.number_of_edges(G) for G in graphs.values()],
    })


# One size table per network type; they share the ("", "Field") key column.
df_citation_desc = _graph_size_table(Gs_citation, "Citation")
df_co_citation_desc = _graph_size_table(Gs_co_citation, "Co-Citation")
df_co_occurrence_desc = _graph_size_table(Gs_co_occurrence, "Co-Occurrence")

# Merge the three per-network size tables into one wide table, matching rows
# on the shared ("", "Field") column.
dfs = [
    df_citation_desc,
    df_co_citation_desc,
    df_co_occurrence_desc,
]
df_final = functools.reduce(lambda left, right: pd.merge(left, right, on=[('', 'Field')]), dfs)

# Render the merged size table as LaTeX, store it with the other thesis tables
# and echo it for a quick visual check.
latex = df_final.to_latex(index=False, na_rep=' ', bold_rows=True, float_format="%.2f")

desc_stats_path = os.path.join(LATEX_TABLE_PATH, 'all_desc_stats.tex')
with open(desc_stats_path, 'w+') as file:
    file.write(latex)

print(latex)

# Clusters




In [None]:
# Citation networks: keep only the graph of each field.
corpora = citation_graphs()
Gs_citation = {name: entry['G'] for name, entry in corpora.items()}

# Co-citation networks.
corpora = co_citation_graphs()
Gs_co_citation = {name: entry['G'] for name, entry in corpora.items()}

# Co-occurrence networks.
corpora = co_occurence_graphs()
Gs_co_occurrence = {name: entry['G'] for name, entry in corpora.items()}

In [None]:
# Louvain is randomised; fix the seed (the same value the UMAP reducer in this
# notebook uses) so the community counts are reproducible between runs.
LOUVAIN_SEED = 42


def _louvain_count_table(graphs, column, seed=LOUVAIN_SEED):
    """Count the Louvain communities of every field's graph.

    Parameters
    ----------
    graphs : dict
        Mapping of field name -> networkx graph.
    column : str
        Name of the count column in the returned frame.
    seed : int
        Seed passed to ``nx.community.louvain_communities``.

    Returns
    -------
    polars.DataFrame
        Columns ``"Field"`` and ``column``, one row per field.
    """
    counts = {}
    for field_name, G in graphs.items():
        print(field_name)  # progress: community detection can take a while
        counts[field_name] = len(nx.community.louvain_communities(G, seed=seed))
    return pl.DataFrame({
        "Field": list(counts.keys()),
        column: list(counts.values()),
    })


df_citation_comms = _louvain_count_table(Gs_citation, "Citation")
df_co_citation_comms = _louvain_count_table(Gs_co_citation, "Co-Citation")
df_co_occurrence_comms = _louvain_count_table(Gs_co_occurrence, "Co-Occurrence")

dfs = [df_citation_comms, df_co_citation_comms, df_co_occurrence_comms]

# One row per field with the community count of each network type.
df_final = functools.reduce(lambda left, right: left.join(right, on='Field'), dfs)

In [None]:
# Export the per-field community counts as a LaTeX table and echo the result.
communities_table = df_final.to_pandas()
latex = communities_table.to_latex(index=False, bold_rows=True)

communities_path = os.path.join(LATEX_TABLE_PATH, 'all_louvain_communities.tex')
with open(communities_path, 'w+') as file:
    file.write(latex)

print(latex)

In [None]:
# Pearson correlation between the per-field counts of every pair of network
# types; each result column is named "<first> <second>".
(
    df_final
    .select(pl.exclude('Field'))
    .select([
        pl.pearson_corr(first, second).alias(f'{first} {second}')
        for first, second in itertools.combinations(
            ['Citation', 'Co-Citation', 'Co-Occurrence'], 2
        )
    ])
)

In [None]:
def _count_per_field(graphs, count_components):
    """Print each field name and collect ``count_components(G)`` per field."""
    counts = {}
    for field_name, G in graphs.items():
        print(field_name)
        counts[field_name] = count_components(G)
    return counts


def _counts_to_frame(counts, column):
    """Turn a field -> count mapping into a two-column polars frame."""
    return pl.DataFrame({
        "Field": list(counts.keys()),
        column: list(counts.values()),
    })


# The citation graphs are counted by weakly connected components (presumably
# because they are directed -- confirm); the other two by connected components.
citation_comps = _count_per_field(Gs_citation, nx.number_weakly_connected_components)
df_citation_comps = _counts_to_frame(citation_comps, "Citation")

co_citation_comps = _count_per_field(Gs_co_citation, nx.number_connected_components)
df_co_citation_comps = _counts_to_frame(co_citation_comps, "Co-Citation")

co_occurrence_comps = _count_per_field(Gs_co_occurrence, nx.number_connected_components)
df_co_occurrence_comps = _counts_to_frame(co_occurrence_comps, "Co-Occurrence")

dfs = [df_citation_comps, df_co_citation_comps, df_co_occurrence_comps]

# One row per field with the component count of each network type.
df_final = functools.reduce(lambda left, right: left.join(right, on='Field'), dfs)

In [None]:
# Export the per-field component counts as a LaTeX table and echo the result.
components_table = df_final.to_pandas()
latex = components_table.to_latex(index=False, bold_rows=True)

components_path = os.path.join(LATEX_TABLE_PATH, 'all_components.tex')
with open(components_path, 'w+') as file:
    file.write(latex)

print(latex)

# Groups

In [None]:
# Switch the working set to the co-citation corpora: `Gs` holds each field's
# graph ('G') and `Dfs` its document frame ('Df').
corpora = co_citation_graphs()

Gs = {}
Dfs = {}
for field_name, corpus in corpora.items():
    Gs[field_name] = corpus['G']
    Dfs[field_name] = corpus['Df']

In [None]:
def avg_degree(G):
    """Return the mean node degree of ``G``.

    ``G.degree()`` must yield ``(node, degree)`` pairs. A graph without nodes
    has no degrees to average; 0.0 is returned instead of dividing by zero.
    """
    degrees = [deg for _, deg in G.degree()]
    if not degrees:
        return 0.0
    return sum(degrees) / len(degrees)

def triangles(G):
    """Return the number of triangles in ``G``.

    The per-node triangle counts are summed and divided by three, since each
    triangle is reported once for each of its three nodes.
    """
    per_node = nx.triangles(G)
    total = sum(per_node.values())
    return total / 3

def gini(G):
    """Return the Gini coefficient of the degree distribution of ``G``.

    Uses the closed form over the ascending-sorted degrees x_(1) <= ... <= x_(n),

        gini = 2 * sum(i * x_(i)) / (n * sum(x)) - (n + 1) / n,

    which equals half the relative mean absolute difference over all ordered
    pairs, but needs O(n log n) time and O(n) memory instead of the n-by-n
    difference matrix built by ``np.subtract.outer``.

    Returns 0.0 when the graph has no nodes or every degree is zero, where the
    ratio is undefined (the previous implementation produced NaN).
    """
    degrees = np.sort(np.asarray([deg for _, deg in G.degree()], dtype=float))
    count = degrees.size
    total = degrees.sum()
    if count == 0 or total == 0:
        return 0.0
    ranks = np.arange(1, count + 1)
    return float(2.0 * np.dot(ranks, degrees) / (count * total) - (count + 1) / count)

def louvain_communities(G, seed=42):
    """Return the number of Louvain communities found in ``G``.

    Louvain is randomised, so the seed is fixed by default: repeated calls on
    the same graph then report the same count. Pass ``seed=None`` to restore
    the unseeded behaviour.
    """
    return len(nx.community.louvain_communities(G, seed=seed))

def components(G):
    """Return the number of connected components of ``G``.

    Counts the components without first collecting every component's node set
    in a list, using the same networkx call as the component tables earlier in
    this notebook.
    """
    return nx.number_connected_components(G)

def centralization(G):
    """Return the degree centralization of ``G``.

    Computed as ``(n * max_degree - sum_of_degrees) / (n - 1) ** 2`` with
    ``n = len(G)``. Graphs with fewer than two nodes return 0.0; they
    previously raised ``ValueError`` (no nodes) or ``ZeroDivisionError``
    (one node).

    NOTE(review): Freeman's degree centralization normalises by
    ``(n - 1) * (n - 2)``; this keeps the original ``(n - 1) ** 2`` -- confirm
    which normalisation is intended.
    """
    size = len(G)
    if size < 2:
        return 0.0
    degrees = [deg for _, deg in G.degree()]
    return float(size * max(degrees) - sum(degrees)) / (size - 1) ** 2

# Registry of graph measures: display name -> function taking a graph and
# returning a number. Insertion order fixes the order of the plotted pairs.
#
# Currently disabled candidates, kept for reference:
#   'Nodes': nx.number_of_nodes
#   'Edges': nx.number_of_edges
#   'DiameterLargestComponent': nx.diameter(G.subgraph(large_component))
#   'AvgShortestPath': nx.average_shortest_path_length(G_large_component)
#   'ShortestPath': nx.shortest_path_length
#   'Gini': gini
#   'Isolates': nx.number_of_isolates(G)
#   'Loops': nx.number_of_selfloops(G)
#   'LabelPropagation': len(nx_comm.label_propagation_communities(G))
measurments = dict(
    Density=nx.density,
    AvgDegree=avg_degree,
    Triangles=triangles,
    Louvain=louvain_communities,
    Components=components,
    AvgClustering=nx.average_clustering,
    Transitivity=nx.transitivity,
    Centralization=centralization,
)

# Every unordered pair of measure names, one bivariate plot each.
measurment_combs = list(itertools.combinations(measurments, 2))
print(len(measurment_combs))
print(list(measurments))

In [None]:
sns.reset_defaults()

# Fields left out of every bivariate plot.
EXCLUDED_FIELDS = ['Gender Studies', 'Geometry', 'Ethnic & Cultural Studies']

plotted_graphs = {
    field_name: G for field_name, G in Gs.items() if field_name not in EXCLUDED_FIELDS
}
plotted_fields = list(plotted_graphs.keys())

# Evaluate every measure exactly once per field. The pair loop below only
# looks values up, so expensive measures are no longer recomputed for each
# pair they appear in, and a field shows the same value in every plot.
measurement_values = {}
for name, measure in measurments.items():
    print(f'\rcomputing {name}', flush=True, end=' ')
    measurement_values[name] = [measure(G) for G in plotted_graphs.values()]

# Make sure the target directory exists before the first figure is saved.
bivariate_dir = os.path.join(LATEX_FIGURES_PATH, 'all_bivariate')
os.makedirs(bivariate_dir, exist_ok=True)

for m1, m2 in measurment_combs:

    print(f'\r{m1} - {m2}', flush=True, end=' ')

    # One row per field: its name and the two measures being compared.
    df = pl.DataFrame([
        pl.Series('Field', plotted_fields, pl.Utf8),
        pl.Series(m1, measurement_values[m1], pl.Float32),
        pl.Series(m2, measurement_values[m2], pl.Float32),
    ])

    x_mean, y_mean, x_median, y_median = df.select(
        pl.col(m1).mean().alias('x_mean'),
        pl.col(m2).mean().alias('y_mean'),
        pl.col(m1).median().alias('x_median'),
        pl.col(m2).median().alias('y_median'),
    ).row(0)

    fig = plt.figure(figsize=(10,10))
    sns.scatterplot(data=df, x=m1, y=m2)

    # Red lines mark the means, blue lines the medians.
    plt.axhline(y=y_mean, c='r')
    plt.axvline(x=x_mean, c='r')

    plt.axhline(y=y_median, c='b')
    plt.axvline(x=x_median, c='b')

    # Label every point with its (line-wrapped) field name.
    for field_name, x, y in df.rows():
        plt.text(
            x, y, format_field_name(field_name),
            horizontalalignment='left', size='medium', color='black', weight='semibold',
        )

    plt.savefig(
        os.path.join(bivariate_dir, f'{m1}_{m2}.png'),
        transparent=True,
        dpi=150,
        bbox_inches='tight'
    )

    plt.show()
    # Release the figure so the open figures do not pile up across the loop.
    plt.close(fig)