# Comparing centrality measures in Estonian and English Wikipedias

## Setup

In [1]:
!pip install -q networkx

You should consider upgrading via the '/home/eeriksp/.config/jupyterlab-desktop/jlab_server/bin/python3.8 -m pip install --upgrade pip' command.[0m


In [3]:
import pandas as pd
import networkx as nx
import heapq
from multiprocessing import Pool


## Read data

In [4]:
def filter_categories(data: pd.DataFrame) -> pd.DataFrame:
    """Return the rows of ``data`` whose ``node_name`` is not a category page.

    A row is dropped when its ``node_name`` starts with the literal prefix
    ``"Category:"``. Rows whose ``node_name`` is missing are kept.

    Args:
        data: Frame with a string column named ``node_name``.

    Returns:
        The filtered frame; the original index labels are preserved.
    """
    # na=False makes a missing name count as "not a category". Without it the
    # mask can contain NaN, which cannot be negated with ``~``.
    is_category = data["node_name"].str.startswith("Category:", na=False)
    return data[~is_category]

In [5]:
# Node tables: read each language's CSV and pass it through filter_categories.
nodes_en = pd.read_csv("../data/nodes_en.csv").pipe(filter_categories)
nodes_et = pd.read_csv("../data/nodes_et.csv").pipe(filter_categories)

# Edge lists are used exactly as read.
edges_en = pd.read_csv("../data/edges_en.csv")
edges_et = pd.read_csv("../data/edges_et.csv")

nodes_en

Unnamed: 0,node_id,node_name,node_summary,node_et_equivalent
0,27685,Tantra,Tantra (; ) refers to an esoteric yogic tr...,Tantrad
1,911,African Americans in Tennessee,African Americans are the second largest ethni...,
2,3359,Asia Minor,,
3,13296,Hot in Cleveland (season 5),The fifth season of the TV Land original sitco...,
4,6720,Christopher Duggan,Christopher John Hesketh Duggan (4 November 19...,
...,...,...,...,...
31816,7475,Cormac Burke (priest),,
31817,30894,Western world,"The Western world, also known as the West, pri...",Läänemaailm
31818,13108,History of the Hungarian language,"right|thumb|250px|The ""Funeral Sermon and Pray...",
31819,140,1996 Nobel Prize in Literature,The 1996 Nobel Prize in Literature was awarded...,


In [6]:
# Show the English edge list loaded above.
edges_en

Unnamed: 0,source,target
0,22260,150
1,22260,264
2,22260,266
3,22260,402
4,22260,460
...,...,...
69144,5237,6054
69145,5237,6064
69146,5237,6125
69147,5237,22836


## Construct graphs

In [7]:
def create_graph(nodes: pd.DataFrame, edges: pd.DataFrame) -> nx.Graph:
    """Build an undirected graph from ``edges`` and attach node attributes.

    The graph structure comes from ``edges`` via ``nx.from_pandas_edgelist``
    with its default column names. ``nodes`` only supplies attributes: every
    column other than ``node_id`` becomes an attribute of the node with that id.

    Args:
        nodes: Frame with a ``node_id`` column plus attribute columns.
        edges: Edge list frame.

    Returns:
        The annotated ``nx.Graph``.
    """
    graph = nx.from_pandas_edgelist(edges, create_using=nx.Graph())
    # Keep one row per node_id so each id maps to exactly one attribute dict.
    attrs_by_id = (
        nodes.drop_duplicates(subset="node_id")
        .set_index("node_id")
        .to_dict("index")
    )
    nx.set_node_attributes(graph, attrs_by_id)
    return graph


In [8]:
# One graph per language edition.
g_en = create_graph(nodes=nodes_en, edges=edges_en)
g_et = create_graph(nodes=nodes_et, edges=edges_et)

# Spot-check the attributes attached to a single node (id 27685).
g_en.nodes[27685]

{'node_name': 'Tantra',
 'node_summary': '   Tantra (; ) refers to an  esoteric yogic tradition that developed on the Indian subcontinent from the middle of the 1st millennium CE onwards in both Hinduism and Buddhism.  The term tantra, in the Indian traditions, also means any systematic broadly applicable "text, theory, system, method, instrument, technique or practice".  A key feature of these traditions is the use of mantras, and thus they are commonly referred to as Mantramārga ("Path of Mantra") in Hinduism or Mantrayāna ("Mantra Vehi.',
 'node_et_equivalent': 'Tantrad'}

## Compute centrality metrics

In [9]:
# PageRank score for every node of the English graph.
pagerank_en_scores = nx.pagerank(g_en)
# Ids of the ten highest-scoring nodes, best first.
heapq.nlargest(
    10,
    pagerank_en_scores.keys(),
    key=lambda node_id: pagerank_en_scores[node_id],
)

[2386, 4305, 3148, 3569, 2131, 4481, 1108, 1412, 988, 4938]

In [31]:
def compute_centrality_measures(g: nx.Graph) -> pd.DataFrame:
    """Compute five centrality measures for ``g``, one after another.

    Args:
        g: Graph to analyse.

    Returns:
        A frame with the columns ``degree``, ``betweenness``, ``eigenvector``,
        ``closeness`` and ``pagerank``, built from the dictionaries returned by
        the corresponding networkx functions.
    """
    # (column name, networkx function) pairs, evaluated in this order.
    measures = (
        ("degree", nx.degree_centrality),
        ("betweenness", nx.betweenness_centrality),
        ("eigenvector", nx.eigenvector_centrality),
        ("closeness", nx.closeness_centrality),
        ("pagerank", nx.pagerank),
    )
    return pd.DataFrame.from_dict({name: measure(g) for name, measure in measures})


In [37]:

def compute_centrality_measures(g: nx.Graph) -> pd.DataFrame:
    """Compute five centrality measures for ``g`` in worker processes.

    Each measure is submitted to a ``multiprocessing.Pool`` as its own task.
    This definition rebinds the name used by the serial version defined
    earlier in the notebook.

    Args:
        g: Graph to analyse; it is passed as the argument of every task.

    Returns:
        A frame with the columns ``degree``, ``betweenness``, ``eigenvector``,
        ``closeness`` and ``pagerank``.
    """
    # Column name -> networkx function; tasks are submitted in this order.
    measures = {
        "degree": nx.degree_centrality,
        "betweenness": nx.betweenness_centrality,
        "eigenvector": nx.eigenvector_centrality,
        "closeness": nx.closeness_centrality,
        "pagerank": nx.pagerank,
    }
    with Pool() as pool:
        pending = {
            name: pool.apply_async(measure, (g,))
            for name, measure in measures.items()
        }
        # Collect the results before leaving the ``with`` block, which shuts
        # the pool down.
        return pd.DataFrame.from_dict(
            {name: job.get() for name, job in pending.items()}
        )

In [12]:
# Scratch run of two measures directly on the English graph.
degree_en = nx.degree_centrality(g_en)
# NOTE(review): the saved output of this cell is a KeyboardInterrupt —
# presumably raised while this call was still running; confirm before relying
# on betweenness_en being defined.
betweenness_en = nx.betweenness_centrality(g_en)

KeyboardInterrupt: 

In [None]:
    # NOTE(review): this cell has no enclosing ``def``, so the ``return`` below
    # is a SyntaxError if the cell is run. It repeats the body of the serial
    # compute_centrality_measures defined earlier in the notebook — restore the
    # function header or delete the cell.
    return pd.DataFrame.from_dict({
        "degree": nx.degree_centrality(g),
        "betweenness": nx.betweenness_centrality(g),
        "eigenvector": nx.eigenvector_centrality(g),
        "closeness": nx.closeness_centrality(g),
        "pagerank": nx.pagerank(g)
    })

In [38]:
# Centrality table for the English graph.
centrality_en = compute_centrality_measures(g=g_en)

In [None]:
import seaborn as sns