In [1]:
def gini(array): # https://github.com/oliviaguest/gini
    """Calculate the Gini coefficient of a numpy array.

    Parameters
    ----------
    array : array_like
        Real-valued data of any shape and any numeric dtype. The data is
        converted to float64 and flattened into a private copy, so the
        caller's object is never modified.

    Returns
    -------
    numpy.float64
        The Gini coefficient of the values after they have been shifted to be
        non-negative (only when a negative value is present) and offset by a
        small epsilon so that no value is exactly zero.

    Raises
    ------
    ValueError
        If ``array`` contains no elements.
    """
    # based on bottom eq: http://www.statsdirect.com/help/content/image/stat0206_wmf.gif
    # from: http://www.statsdirect.com/help/default.htm#nonparametric_methods/gini.htm

    # Cast to float64 *before* the in-place arithmetic below: adding the
    # epsilon in place to an integer-typed array raises a casting error.
    # flatten() always returns a copy, so the in-place updates stay local.
    values = np.asarray(array, dtype=np.float64).flatten()  # all values are treated equally, arrays must be 1d
    if values.size == 0:
        raise ValueError("gini() requires at least one value")

    lowest = np.amin(values)
    if lowest < 0:
        values -= lowest  # values cannot be negative
    values += 0.0000001  # values cannot be 0
    values = np.sort(values)  # values must be sorted

    n = values.shape[0]  # number of array elements
    index = np.arange(1, n + 1)  # 1-based rank of each sorted element
    return ((np.sum((2 * index - n - 1) * values)) / (n * np.sum(values)))  # Gini coefficient

def oliviaguest_gini(array):
    """Helper: cast ``array`` to a float64 numpy array and return its ``gini()`` value."""
    # gini() needs an explicit float cast before it is called.
    as_float = np.array(array, dtype=np.float64)
    return gini(as_float)

# https://stackoverflow.com/questions/39512260/calculating-gini-coefficient-in-python-numpy
def mvd_G(v):
    """Approximate the Gini coefficient from a decile-binned Lorenz curve.

    Parameters
    ----------
    v : array_like
        One-dimensional collection of values (e.g. per-node out-degrees).

    Returns
    -------
    bins : numpy.ndarray
        The eleven percentile levels 0, 10, ..., 100.
    yvals : list
        For each percentile level, the percentage of the total of ``v`` that
        is held by the values less than or equal to that percentile of ``v``.
    gini_val : float
        ``(equality_area - lorenz_area) / equality_area``, with both areas
        computed by the trapezoidal rule over ``bins``.
    """
    # Boolean-mask indexing below needs an ndarray; plain lists would fail.
    v = np.asarray(v)
    # np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid;
    # use the new name when present and fall back on older NumPy releases.
    trapezoid = getattr(np, "trapezoid", None) or np.trapz

    bins = np.linspace(0., 100., 11)
    total = float(np.sum(v))
    yvals = []
    for b in bins:
        bin_vals = v[v <= np.percentile(v, b)]
        bin_fraction = (np.sum(bin_vals) / total) * 100.0
        yvals.append(bin_fraction)
    # perfect equality area
    pe_area = trapezoid(bins, x=bins)
    # lorenz area
    lorenz_area = trapezoid(yvals, x=bins)
    gini_val = (pe_area - lorenz_area) / float(pe_area)
    return bins, yvals, gini_val

def mvd_gini(array, title='', color=''):
    """Plot the binned Lorenz curve and a histogram for ``array``.

    Draws a two-row figure: the top panel shows the observed Lorenz curve
    returned by ``mvd_G`` against the perfect-equality diagonal, with the
    binned Gini approximation in the title; the bottom panel is a 20-bin
    histogram of the values.

    Parameters
    ----------
    array : array_like
        Values to analyse. They are cast to int64, so fractional parts are
        discarded before any computation.
    title : str, optional
        Prefix for the top panel's title.
    color : str, optional
        Matplotlib colour for the observed curve and the histogram. An empty
        string (the default) lets matplotlib pick its default colour.

    Returns
    -------
    None
    """
    import matplotlib.pyplot as plt
    v = np.array(array, dtype='int64')
    bins, result, gini_val = mvd_G(v)
    # An empty string is not a valid matplotlib colour; map it to None so the
    # default argument falls back to matplotlib's own colour choice.
    series_color = color or None
    # Set the default size *before* creating the figure so that the figure
    # created here already uses it (previously only later figures did).
    plt.rcParams["figure.figsize"] = (20,10)
    plt.figure()
    plt.subplot(2, 1, 1)
    plt.plot(bins, result, label="observed", color=series_color)
    plt.plot(bins, bins, '--', label="perfect eq.", color='black')
    plt.xlabel("fraction of population")
    plt.ylabel("fraction of wealth")
    plt.title(title + " GINI APPROX (BINNED): %.4f" %(gini_val))
    plt.legend()
    plt.subplot(2, 1, 2)
    plt.hist(v, bins=20, color=series_color)

In [2]:
import networkx as nx
import numpy as np

# GEXF input files; each is loaded with gephi_to_nx() in the main cell.
FULL_GEPHI_FILE = 'PT-pruned.gexf'  # full graph, loaded as GF
SLICE_1 = 'PT-slice1.gexf'  # slice 1, loaded as G1
SLICE_2 = 'PT-slice2.gexf'  # slice 2, loaded as G2


def gephi_to_nx(file):
    """Read a GEXF file into a networkx graph and print its node and edge counts.

    Parameters
    ----------
    file : path or file-like
        Passed straight to ``nx.read_gexf``.

    Returns
    -------
    The graph object produced by ``nx.read_gexf``.
    """
    graph = nx.read_gexf(file)
    node_count = graph.number_of_nodes()
    edge_count = graph.number_of_edges()
    print(f'Nodes = {node_count!s}')
    print(f'Edges = {edge_count!s}')
    return graph

def unweighted_node_list(G):
    """Return the out-degree of every node of ``G``, in ``nx.nodes(G)`` order.

    Edge direction convention used by this notebook: A -> B if B retweeted A,
    i.e. B amplified/rebroadcast A, so A influences B.

    References:
      https://link.springer.com/article/10.1007/s13721-016-0140-7
      https://dl.acm.org/doi/abs/10.1145/1988688.1988767?download=true
    """
    out_degrees = []
    for current in nx.nodes(G):
        out_degrees.append(G.out_degree(current))
    return out_degrees

def weighted_node_list(G):
    """Return the weighted out-degree of every node of ``G``, in node order.

    The weighted out-degree of a node is the sum of the ``'weight'``
    attribute over its outgoing edges; a node without outgoing edges
    contributes 0.

    This relies on networkx's built-in weighted degree view instead of
    looking up each edge's data by hand. Compared with the manual lookup it
    also works on multigraphs (where ``get_edge_data(u, v)`` is keyed by edge
    key rather than by attribute name) and treats an edge without a
    ``'weight'`` attribute as weight 1 instead of raising ``KeyError``.

    Parameters
    ----------
    G : networkx.DiGraph or networkx.MultiDiGraph

    Returns
    -------
    list
        One number per node.
    """
    return [weighted_outdeg for _node, weighted_outdeg in G.out_degree(weight='weight')]


def isolate_copy_subgraph(G, community_id):
    """Return a copy of ``G`` restricted to one Louvain community.

    All removals are performed on ``G.copy()``, so the graph passed in is
    preserved. A one-line summary (kept / original / culled node counts) is
    printed.

    Parameters
    ----------
    G : networkx graph whose nodes carry a ``'louvain'`` attribute.
    community_id : value compared (with ``!=``) against each node's
        ``'louvain'`` attribute.

    Returns
    -------
    The pruned copy of ``G``.
    """
    pruned = G.copy()

    # In Gephi's native XML (per Ignacio) a node carries e.g.
    #   <attvalue for="1" value="40" />    # the louvain community_id; e.g. 40=MAGA
    # where the attribute is declared as
    #   <attribute id="1" title="louvain" type="long" />
    community_of = nx.get_node_attributes(pruned, 'louvain')

    nodes_before = pruned.number_of_nodes()

    # Cull every node whose community differs from the requested one.
    # NOTE(review): only nodes present in the attribute mapping are examined,
    # so a node lacking a 'louvain' attribute would presumably be kept - confirm.
    outsiders = [
        node
        for node, community in community_of.items()
        if community != community_id
    ]
    pruned.remove_nodes_from(outsiders)
    culled_count = len(outsiders)

    nodes_after = pruned.number_of_nodes()
    assert nodes_after == (nodes_before - culled_count)

    print(f'Subgraph copy with community id={community_id!s} has total nodes={nodes_after!s}'
          f' | originally {nodes_before!s} | culled {culled_count!s}')

    return pruned

In [3]:
## TEST for isolate_copy_subgraph
# G = gephi_to_nx(FULL_GEPHI_FILE)
# unweighted_outdegs = unweighted_node_list(G)
# isolate_copy_subgraph(G,40)

In [4]:
### NEW main code starts here ###

# Excel init
from openpyxl import Workbook


def _gini_pair(graph):
    """Return the (unweighted, weighted) out-degree Gini coefficients of ``graph``."""
    unweighted = oliviaguest_gini(unweighted_node_list(graph))
    weighted = oliviaguest_gini(weighted_node_list(graph))
    return unweighted, weighted


wb = Workbook()
ws = wb.active
# Header row: one (unweighted, weighted) column pair per graph.
ws.append([
    'Community',
    'GraphFullSlices (unweighted)', 'GraphFullSlices (weighted)',
    'GraphSlice1 (unweighted)', 'GraphSlice1 (weighted)',
    'GraphSlice2 (unweighted)', 'GraphSlice2 (weighted)',
])

# full graph, GF, and slice G1, slice G2
print('%%% Processing: full graphs (GF, G1, G2) ')

GF, G1, G2 = (gephi_to_nx(path) for path in (FULL_GEPHI_FILE, SLICE_1, SLICE_2))

gf_unweighted_gini, gf_weighted_gini = _gini_pair(GF)
g1_unweighted_gini, g1_weighted_gini = _gini_pair(G1)
g2_unweighted_gini, g2_weighted_gini = _gini_pair(G2)

ws.append([
    'ALL',
    gf_unweighted_gini, gf_weighted_gini,
    g1_unweighted_gini, g1_weighted_gini,
    g2_unweighted_gini, g2_weighted_gini,
])


### iterate for 'spatial' community distributions ###
# data per 20200915 Ignacio email
communities = [
    {'id': 13,  'label': '13/Democrats',     'color': 'blue'},
    {'id': 48,  'label': '48/Republicans',   'color': 'red'},
    {'id': 6,   'label': '06/Unorthodox',    'color': 'green'},
    {'id': 49,  'label': '49/Public Health', 'color': 'yellow'},
    {'id': 104, 'label': '104/Antivaxxers',  'color': 'magenta'},
]

for c in communities:
    print('%%% Processing: community ' + c['label'])
    # Isolate the community in all three graphs first, then compute the Ginis.
    sgf, sg1, sg2 = (isolate_copy_subgraph(graph, c['id']) for graph in (GF, G1, G2))

    sgf_unweighted_gini, sgf_weighted_gini = _gini_pair(sgf)
    sg1_unweighted_gini, sg1_weighted_gini = _gini_pair(sg1)
    sg2_unweighted_gini, sg2_weighted_gini = _gini_pair(sg2)

    ws.append([
        c['label'],
        sgf_unweighted_gini, sgf_weighted_gini,
        sg1_unweighted_gini, sg1_weighted_gini,
        sg2_unweighted_gini, sg2_weighted_gini,
    ])

wb.save('gini_matrix.xlsx')
print('%%% XLSX generation completed.')

%%% Processing: full graphs (GF, G1, G2) 
Nodes = 383032
Edges = 3627175
Nodes = 383032
Edges = 695736
Nodes = 383032
Edges = 3033515
%%% Processing: community 13/Democrats
Subgraph copy with community id=13 has total nodes=91792 | originally 383032 | culled 291240
Subgraph copy with community id=13 has total nodes=91792 | originally 383032 | culled 291240
Subgraph copy with community id=13 has total nodes=91792 | originally 383032 | culled 291240
%%% Processing: community 48/Republicans
Subgraph copy with community id=48 has total nodes=70247 | originally 383032 | culled 312785
Subgraph copy with community id=48 has total nodes=70247 | originally 383032 | culled 312785
Subgraph copy with community id=48 has total nodes=70247 | originally 383032 | culled 312785
%%% Processing: community 06/Unorthodox
Subgraph copy with community id=6 has total nodes=60027 | originally 383032 | culled 323005
Subgraph copy with community id=6 has total nodes=60027 | originally 383032 | culled 323005
Subg