In [1]:
import os
import networkx as nx
import numpy as np     
import pandas as pd
from help_functions_drawing import *
from help_functions_plots import *
from help_functions_misc import *

In [10]:
# Display the parent directory of the current working directory. The experiment
# functions in this notebook write their CSV output into sub-folders of this
# directory (ER_analysis, HK_analysis, FB_analysis).
os.path.dirname(os.getcwd())

'c:\\Users\\P281866\\Documents\\PhD\\Programming\\Majority_Illusion\\Github'

In [11]:
### Simulation with Erdös-Rényi graphs
def run_experiment_ER(min_n, max_n, step_n, nr_epochs, p_blue_values, p_edge_values):
    """Simulate majority illusions on randomly coloured Erdös-Rényi graphs.

    For each number of nodes n in range(min_n, max_n + 1, step_n), each edge
    probability in p_edge_values and each value in p_blue_values, nr_epochs
    graphs are generated with nx.erdos_renyi_graph(n, p_edge). Every graph is
    coloured with color_randomly / color_edges and evaluated with
    has_illusions; a number of structural measures of the graph are recorded
    alongside the illusion results.

    Args:
        min_n, max_n, step_n: bounds and step of the node counts (max_n inclusive).
        nr_epochs: number of graphs generated per parameter combination.
        p_blue_values: iterable of values passed to color_randomly as p_blue.
        p_edge_values: iterable of edge probabilities for the ER model.

    Returns:
        pandas.DataFrame with one row per generated graph. 'avg_path_length'
        is the string 'NA' for disconnected graphs and 'EV_centr' is 'NA' when
        the eigenvector-centrality computation fails.

    Side effects:
        Prints progress per n and writes the frame as a ';'-separated CSV into
        the 'ER_analysis' folder next to the current working directory (the
        folder is created if it does not exist yet).
    """
    graph_type = 'ER'
    out_dir = os.path.join(os.path.dirname(os.getcwd()), 'ER_analysis')
    filename = f'minn{min_n}_maxn{max_n}_step{step_n}_epochs{nr_epochs}_weak_{graph_type}.csv'

    # Make sure the output folder exists *before* the (potentially very long)
    # simulation, so a missing directory cannot cost us the results at the end.
    os.makedirs(out_dir, exist_ok=True)

    columns = ['n', 'p_edge', 'p_blue', 'mi', 'wmi', 'nr_nodes_str_ill', 'nr_nodes_weak_ill',
               'deg_assort_coef', 'deg_seq', 'avg_path_length', 'CC', 'EV_centr', 'close_centr',
               'between_centr', 'frac_largest_comp', 'tie']

    # Rows are collected in a plain list and turned into a DataFrame once;
    # concatenating a one-row frame per graph copies the whole table each time.
    records = []

    for n in range(min_n, max_n + 1, step_n):
        print(f'n: {n}')
        for p_edge in p_edge_values:
            for p_blue in p_blue_values:
                for _epoch in range(nr_epochs):
                    G = nx.erdos_renyi_graph(n, p_edge)
                    color_randomly(G, p_blue)
                    color_edges(G)
                    (has_strict_illusion, has_weak_illusion, nr_str_ill_nodes, nr_weak_ill_nodes,
                     _nodes_under_illusion, global_tie) = has_illusions(G)

                    ###### Other measures of G:
                    deg_assort_coef = nx.degree_assortativity_coefficient(G)  # Degree assortativity coefficient
                    deg_seq = list(dict(nx.degree(G)).values())  # Degree sequence
                    if nx.is_connected(G):
                        avg_path_length = nx.average_shortest_path_length(G)  # Average shortest path length
                    else:
                        # The average shortest path length is undefined (raises) on disconnected graphs.
                        avg_path_length = 'NA'
                    CC = nx.transitivity(G)  # Clustering coefficient

                    # Centrality measures:
                    try:
                        EV_centr = list(nx.eigenvector_centrality(G, max_iter=500).values())  # Eigenvector centrality
                    except nx.NetworkXException:
                        # Only networkx failures (e.g. the power iteration not converging)
                        # are treated as "no value"; KeyboardInterrupt and genuine
                        # programming errors are no longer swallowed.
                        print('EV_centrality failed. Use NA instead.')
                        EV_centr = 'NA'

                    close_centr = list(nx.closeness_centrality(G).values())  # Closeness centrality
                    between_centr = list(nx.betweenness_centrality(G).values())  # Betweenness centrality

                    # Fraction of nodes in largest component:
                    largest_component = max(nx.connected_components(G), key=len)
                    frac_largest_comp = len(largest_component) / G.number_of_nodes()

                    records.append({
                        'n': n,
                        'p_edge': p_edge,
                        'p_blue': p_blue,
                        'mi': has_strict_illusion,
                        'wmi': has_weak_illusion,
                        'nr_nodes_str_ill': nr_str_ill_nodes,
                        'nr_nodes_weak_ill': nr_weak_ill_nodes,
                        'deg_assort_coef': deg_assort_coef,
                        'deg_seq': deg_seq,
                        'avg_path_length': avg_path_length,
                        'CC': CC,
                        'EV_centr': EV_centr,
                        'close_centr': close_centr,
                        'between_centr': between_centr,
                        'frac_largest_comp': frac_largest_comp,
                        'tie': global_tie,
                    })

    # Passing `columns` keeps the column order (and an empty-but-labelled frame
    # when no graph was generated at all).
    data = pd.DataFrame(records, columns=columns)

    # Save data in csv-file:
    data.to_csv(os.path.join(out_dir, filename), header=True, sep=';')
    return data

In [None]:
# Parameter grid for the Erdös-Rényi experiment.
min_n, max_n, step_n = 10, 10, 10
nr_epochs = 1

# np.arange with a float step accumulates rounding noise (e.g. 0.600000001);
# rounding to two decimals gives clean grid values such as 0.6.
p_blue_values = np.around(np.arange(0.1, 0.5 + 0.01, 0.1), 2)
p_edge_values = np.around(np.arange(0.1, 0.9 + 0.01, 0.2), 2)

ER_data = run_experiment_ER(
    min_n, max_n, step_n, nr_epochs, p_blue_values, p_edge_values
)

In [None]:
### Calculate average values for the properties that contain a value per node:

# Eigenvector centrality: rows whose parsed list is empty/falsy get 'NA'
# (the simulation stores 'NA' when the centrality computation failed;
# presumably to_float_list maps that to an empty value -- TODO confirm).
ER_data['EV_centr'] = ER_data['EV_centr'].apply(to_float_list)
average = [
    sum(per_node) / len(per_node) if per_node else 'NA'
    for per_node in ER_data['EV_centr']
]
ER_data['avg_EV_centr'] = average

# Closeness centrality, betweenness centrality and degree sequence all follow
# the same recipe: parse the per-node list, then take its arithmetic mean.
for per_node_col, avg_col in (
    ('close_centr', 'avg_close_centr'),
    ('between_centr', 'avg_between_centr'),
    ('deg_seq', 'avg_degree'),
):
    ER_data[per_node_col] = ER_data[per_node_col].apply(to_float_list)
    average = [sum(per_node) / len(per_node) for per_node in ER_data[per_node_col]]
    ER_data[avg_col] = average

# Save the new data:
pathER = os.path.dirname(os.getcwd()) +'/ER_analysis/'
filenameER = 'ER_data.csv'
ER_data.to_csv(pathER + filenameER, header=True, sep=';')

In [14]:
### Simulation with Holme-Kim graphs
def run_experiment_HK(min_n, max_n, step_n, nr_epochs, p_blue_values, p_values):
    """Simulate majority illusions on randomly coloured Holme-Kim graphs.

    For each number of nodes n in range(min_n, max_n + 1, step_n), each m in
    {10%, 50%, 90% of n} (m is the number of random edges added for each new
    node), each p in p_values (probability of adding a triangle after adding a
    random edge) and each value in p_blue_values, nr_epochs graphs are
    generated with nx.powerlaw_cluster_graph(n, m, p). Every graph is coloured
    with color_randomly / color_edges, its mixed-edge statistics are computed
    with homopily, and it is evaluated with has_illusions. A number of
    structural measures of the graph are recorded as well.

    The m values are derived inside the loop (they are not an argument)
    because networkx requires 1 <= m <= n; consequently n must be at least 10
    for the smallest m (n // 10) to be valid.

    Note: if there are no connections, or the graph is complete, there is no
    majority-majority illusion, so those columns will only contain 0 values.

    Args:
        min_n, max_n, step_n: bounds and step of the node counts (max_n inclusive).
        nr_epochs: number of graphs generated per parameter combination.
        p_blue_values: iterable of values passed to color_randomly as p_blue.
        p_values: iterable of triangle-formation probabilities.

    Returns:
        pandas.DataFrame with one row per generated graph. 'avg_path_length'
        is the string 'NA' for disconnected graphs and 'EV_centr' is 'NA' when
        the eigenvector-centrality computation fails.

    Side effects:
        Prints progress per n and writes the frame as a ';'-separated CSV into
        the 'HK_analysis' folder next to the current working directory (the
        folder is created if it does not exist yet).
    """
    graph_type = 'HK'
    out_dir = os.path.join(os.path.dirname(os.getcwd()), 'HK_analysis')
    filename = f'minn{min_n}_maxn{max_n}_step{step_n}_epochs{nr_epochs}_weak_{graph_type}homophily.csv'

    # Make sure the output folder exists *before* the (potentially very long)
    # simulation, so a missing directory cannot cost us the results at the end.
    os.makedirs(out_dir, exist_ok=True)

    columns = ['n', 'm', 'p', 'p_blue', 'mi', 'wmi', 'nr_nodes_str_ill', 'nr_nodes_weak_ill',
               'deg_assort_coef', 'deg_seq', 'avg_path_length', 'CC', 'EV_centr', 'close_centr',
               'between_centr', 'frac_largest_comp', 'tie', 'probability_mixed_edge',
               'actual_fraction_mixed_edges']

    # Rows are collected in a plain list and turned into a DataFrame once;
    # concatenating a one-row frame per graph copies the whole table each time.
    records = []

    for n in range(min_n, max_n + 1, step_n):
        print(f'n: {n}')
        # m is 10, 50, and 90 percent of n (rounded down to an integer).
        m_values = [n // 10, n // 2, 9 * n // 10]
        for m in m_values:
            for p in p_values:
                for p_blue in p_blue_values:
                    for _epoch in range(nr_epochs):
                        G = nx.powerlaw_cluster_graph(n, m, p)
                        (nr_red_nodes, nr_blue_nodes) = color_randomly(G, p_blue)
                        (nr_edges, nr_mixed_edges) = color_edges(G)
                        (probability_mixed_edge, actual_fraction_mixed_edges) = homopily(
                            n, nr_red_nodes, nr_blue_nodes, nr_edges, nr_mixed_edges)
                        (has_strict_illusion, has_weak_illusion, nr_str_ill_nodes, nr_weak_ill_nodes,
                         _nodes_under_illusion, global_tie) = has_illusions(G)

                        ###### Other measures of G:
                        deg_assort_coef = nx.degree_assortativity_coefficient(G)  # Degree assortativity coefficient
                        deg_seq = list(dict(nx.degree(G)).values())  # Degree sequence
                        if nx.is_connected(G):
                            avg_path_length = nx.average_shortest_path_length(G)  # Average shortest path length
                        else:
                            # The average shortest path length is undefined (raises) on disconnected graphs.
                            avg_path_length = 'NA'
                        CC = nx.transitivity(G)  # Clustering coefficient

                        # Centrality measures:
                        try:
                            EV_centr = list(nx.eigenvector_centrality(G, max_iter=500).values())  # Eigenvector centrality
                        except nx.NetworkXException:
                            # Only networkx failures (e.g. the power iteration not converging)
                            # are treated as "no value"; KeyboardInterrupt and genuine
                            # programming errors are no longer swallowed.
                            print('EV_centrality failed. Use NA instead.')
                            EV_centr = 'NA'

                        close_centr = list(nx.closeness_centrality(G).values())  # Closeness centrality
                        between_centr = list(nx.betweenness_centrality(G).values())  # Betweenness centrality

                        # Fraction of nodes in largest component:
                        largest_component = max(nx.connected_components(G), key=len)
                        frac_largest_comp = len(largest_component) / G.number_of_nodes()

                        records.append({
                            'n': n,
                            'm': m,
                            'p': p,
                            'p_blue': p_blue,
                            'mi': has_strict_illusion,
                            'wmi': has_weak_illusion,
                            'nr_nodes_str_ill': nr_str_ill_nodes,
                            'nr_nodes_weak_ill': nr_weak_ill_nodes,
                            'deg_assort_coef': deg_assort_coef,
                            'deg_seq': deg_seq,
                            'avg_path_length': avg_path_length,
                            'CC': CC,
                            'EV_centr': EV_centr,
                            'close_centr': close_centr,
                            'between_centr': between_centr,
                            'frac_largest_comp': frac_largest_comp,
                            'tie': global_tie,
                            'probability_mixed_edge': probability_mixed_edge,
                            'actual_fraction_mixed_edges': actual_fraction_mixed_edges,
                        })

    # Passing `columns` keeps the column order (and an empty-but-labelled frame
    # when no graph was generated at all).
    data = pd.DataFrame(records, columns=columns)

    data.to_csv(os.path.join(out_dir, filename), header=True, sep=';')
    return data

In [None]:
# Parameter grid for the Holme-Kim experiment.
min_n, max_n, step_n = 10, 100, 10
nr_epochs = 1000

# np.arange with a float step accumulates rounding noise (e.g. 0.600000001);
# rounding to two decimals gives clean grid values such as 0.6.
p_blue_values = np.around(np.arange(0.1, 0.5 + 0.01, 0.1), 2)
print(p_blue_values)
p_values = np.around(np.arange(0, 1 + 0.01, 0.2), 2)

HK_data = run_experiment_HK(
    min_n, max_n, step_n, nr_epochs, p_blue_values, p_values
)

In [None]:
# Calculate average values for the properties that contain a value per node:
# eigenvector centrality, closeness centrality, betweenness centrality and
# the degree sequence.
#
# run_experiment_HK stores 'NA' in 'EV_centr' when the eigenvector-centrality
# computation fails. The corresponding parsed value is expected to be
# empty/falsy (the Erdös-Rényi post-processing relies on the same check --
# TODO confirm against to_float_list), so such rows get 'NA' as their average
# instead of raising on sum(...)/len(...).
for per_node_col, avg_col in (
    ('EV_centr', 'avg_EV_centr'),
    ('close_centr', 'avg_close_centr'),
    ('between_centr', 'avg_between_centr'),
    ('deg_seq', 'avg_degree'),
):
    HK_data[per_node_col] = HK_data[per_node_col].apply(to_float_list)
    average = [
        sum(per_node) / len(per_node) if per_node else 'NA'
        for per_node in HK_data[per_node_col]
    ]
    HK_data[avg_col] = average

# Save the new data:
pathHK = os.path.dirname(os.getcwd()) +'/HK_analysis/'
filenameHK = 'HK_data.csv'
HK_data.to_csv(pathHK + filenameHK, header=True, sep=';')

In [15]:
# Facebook:

# Read the gzip-compressed, space-separated edge list. Column names are
# supplied explicitly, so every line of the file is treated as data.
facebook = pd.read_csv(
    "facebook_combined.txt.gz",
    sep=" ",
    names=["start_node", "end_node"],
    compression="gzip",
)

# Build the (undirected) graph: one edge per row of the edge list.
G_fb = nx.from_pandas_edgelist(facebook, source="start_node", target="end_node")

In [17]:
def run_experiment_Facebook(G, nr_epochs, p_blue_values):
    """Test for (weak) majority-majority illusions on random colourings of one graph.

    For each value in p_blue_values the graph G is re-coloured nr_epochs times
    with color_randomly(G, p_blue, start_count_1=True) and color_edges, and
    each colouring is evaluated with has_illusions. No structural measures are
    recorded, since the same graph is used throughout.

    Args:
        G: the networkx graph to colour (the Facebook network in this
            notebook). It is passed to the colouring helpers on every epoch,
            so it is presumably modified in place -- verify against
            color_randomly / color_edges.
        nr_epochs: number of random colourings per p_blue value.
        p_blue_values: iterable of values passed to color_randomly as p_blue.

    Returns:
        pandas.DataFrame with one row per colouring and the columns
        'p_blue', 'mi', 'wmi', 'nr_nodes_str_ill', 'nr_nodes_weak_ill', 'tie'.

    Side effects:
        Prints progress per p_blue and writes the frame as a ';'-separated CSV
        ('FB_data.csv') into the 'FB_analysis' folder next to the current
        working directory (the folder is created if it does not exist yet).
    """
    out_dir = os.path.join(os.path.dirname(os.getcwd()), 'FB_analysis')
    filename = 'FB_data.csv'

    # Make sure the output folder exists *before* the simulation, so a missing
    # directory cannot cost us the results at the end.
    os.makedirs(out_dir, exist_ok=True)

    columns = ['p_blue', 'mi', 'wmi', 'nr_nodes_str_ill', 'nr_nodes_weak_ill', 'tie']

    # Rows are collected in a plain list and turned into a DataFrame once;
    # concatenating a one-row frame per epoch copies the whole table each time.
    records = []

    for p_blue in p_blue_values:
        print(f'p_blue: {p_blue}')  # To keep track of where we are
        for _epoch in range(nr_epochs):
            color_randomly(G, p_blue, start_count_1=True)
            color_edges(G)
            (has_strict_illusion, has_weak_illusion, nr_str_ill_nodes, nr_weak_ill_nodes,
             _nodes_under_illusion, global_tie) = has_illusions(G)

            records.append({
                'p_blue': p_blue,
                'mi': has_strict_illusion,
                'wmi': has_weak_illusion,
                'nr_nodes_str_ill': nr_str_ill_nodes,
                'nr_nodes_weak_ill': nr_weak_ill_nodes,
                'tie': global_tie,
            })

    # Passing `columns` keeps the column order (and an empty-but-labelled frame
    # when no colouring was generated at all).
    data = pd.DataFrame(records, columns=columns)

    data.to_csv(os.path.join(out_dir, filename), header=True, sep=';')
    return data

In [None]:
# Parameters for the Facebook experiment.
nr_epochs = 1000

# np.arange with a float step accumulates rounding noise (e.g. 0.600000001);
# rounding to two decimals gives clean grid values such as 0.6.
p_blue_values = np.around(np.arange(0.1, 0.5 + 0.01, 0.1), 2)

FB_data = run_experiment_Facebook(G_fb, nr_epochs, p_blue_values)