In [8]:
import pandas as pd
import numpy as np
import networkx as nx
from utils import *
from algo_evaluation import *

In [9]:
%load_ext autoreload
%autoreload 2

In [11]:
# Load the raw dataset from the local ./data/ directory.
# NOTE(review): load_data arrives through a star import (utils or algo_evaluation) -
# its return type is not visible here; later cells pass `df` to filter_users.
df = load_data("./data/")

In [12]:

def greedy_modularity_communities(graph: nx.Graph):
    """
    Detect communities with networkx's greedy modularity routine,
    reading edge weights from the 'weight' edge attribute.

    Returns the detected communities as a list.
    """
    detect = nx.algorithms.community.greedy_modularity_communities
    found = detect(graph, weight='weight')
    return list(found)



def label_propagation_communities(graph: nx.Graph, seed=None):
    """
    Perform the asynchronous label propagation algorithm with weights.

    The underlying networkx routine is randomised, so two unseeded runs on
    the same graph may return different partitions. Pass `seed` to make a
    run repeatable.

    Args:
        graph: Graph to partition; edge weights are read from the 'weight'
            edge attribute.
        seed: Forwarded unchanged to networkx as its `seed` argument.
            The default (None) keeps the previous, unseeded behaviour.

    Returns:
        The detected communities as a list.
    """
    communities = nx.algorithms.community.asyn_lpa_communities(
        graph, weight='weight', seed=seed
    )
    return list(communities)


def girvan_newmann_communities(graph: nx.Graph):
    """
    Run the Girvan-Newman algorithm with networkx's default edge selection
    and return its first split as a list of communities.
    """
    # Only the first partition produced by the generator is consumed;
    # deeper levels could be taken by advancing it further.
    splits = nx.community.girvan_newman(graph)
    first_split = next(splits)
    return list(first_split)


def girvan_newman_with_weights(graph: nx.Graph):
    """
    Perform the Girvan-Newman algorithm with weights.

    Edge betweenness is computed on a 'distance' edge attribute. Any edge
    that does not already carry 'distance' gets ``1 / weight``, so heavier
    edges count as shorter. Edges without a 'weight' attribute are treated
    as weight 1.

    The distances are written to a copy of `graph`; the caller's graph is
    left untouched.

    Args:
        graph: Graph to partition, with similarity-like 'weight' edge
            attributes.

    Returns:
        The first split produced by Girvan-Newman, as a list of node sets.
        A graph without edges (including an empty graph) yields its
        connected components.

    Note:
        A zero 'weight' still hits ``1 / weight``: plain Python numbers
        raise ZeroDivisionError, as in the previous implementation.
    """
    def most_valuable_edge(G):
        """Returns the edge with the highest betweenness centrality
        in the graph `G`, using 'distance' as the path length.
        """
        betweenness = nx.edge_betweenness_centrality(G, weight='distance')
        return max(betweenness, key=betweenness.get)

    # 'distance' is an *edge* attribute, so it has to be checked per edge
    # (the old check looked at node attributes and was always true).
    # Working on a copy keeps the caller's graph free of the helper attribute.
    working_graph = graph.copy()
    for _, _, edge_data in working_graph.edges(data=True):
        if 'distance' not in edge_data:
            edge_data['distance'] = 1 / edge_data.get('weight', 1)

    communities = nx.community.girvan_newman(working_graph, most_valuable_edge=most_valuable_edge)
    first_split = next(communities)     # take the first split (maybe extend this)
    return list(first_split)
    

def louvain_communities(graph: nx.Graph, seed=None):
    """
    Perform the louvain algorithm with weights.

    Louvain visits nodes in a randomised order, so unseeded runs on the
    same graph may return different partitions. Pass `seed` to make a run
    repeatable.

    Args:
        graph: Graph to partition; edge weights are read from the 'weight'
            edge attribute.
        seed: Forwarded unchanged to networkx as its `seed` argument.
            The default (None) keeps the previous, unseeded behaviour.

    Returns:
        The detected communities as a list.
    """
    communities = nx.algorithms.community.louvain_communities(
        graph, weight='weight', seed=seed
    )
    return list(communities)



# Registry of the community-detection routines to benchmark,
# keyed by the label shown in the run log and in the results.
algos = dict([
    ('Greedy Modularity', greedy_modularity_communities),
    ('Label Propagation', label_propagation_communities),
    ('Girvan Newman', girvan_newmann_communities),
    ('Girvan Newman with Weights', girvan_newman_with_weights),
    ('Louvain', louvain_communities),
])


In [13]:
# Benchmark every algorithm in `algos` on user graphs of increasing size.
# select categories to analyze
channels = ['Sport']

# Collected results, keyed by the number of users actually selected.
results_over_num_users = {}
# Per-algorithm time budget handed to perform_algos
# (presumably seconds, i.e. 10 minutes - confirm against perform_algos).
timeout = 60*10

for num_users in [100, 200, 500, 1000, 2000]:

    # num_users is only an upper bound: the recorded run returned 1144 users for the 2000 cap.
    filtered_df, articles_per_user, selected_users = filter_users(df, channels, min_articles=4, max_num_users=num_users)
    print(f"Number of selected users: {len(selected_users)}")

    # create graph
    # Pairwise weights between the selected users (iom is defined outside this notebook).
    weights = iom(selected_users, articles_per_user)
    # Use the 75th percentile of all weights as the cut-off passed to build_graph.
    # NOTE(review): the recorded runs printed a threshold of 0.0 for every size -
    # confirm that build_graph still prunes edges as intended with that value.
    threshold = np.percentile(weights.flatten(), 75)
    print(f"Threshold: {threshold}")
    graph = build_graph(selected_users, weights, threshold)

    # NOTE(review): the meaning of the positional `True` is not visible here - check perform_algos.
    results = perform_algos(graph, algos, True, timeout)

    # Key by the real user count rather than the requested cap.
    results_over_num_users[len(selected_users)] = results

Number of selected users: 100


100%|██████████| 100/100 [00:00<00:00, 3663.66it/s]

Threshold: 0.0
Running Greedy Modularity...





Running Label Propagation...
Running Girvan Newman...
Running Girvan Newman with Weights...
Running Louvain...
Number of selected users: 200


100%|██████████| 200/200 [00:00<00:00, 1806.91it/s]

Threshold: 0.0
Running Greedy Modularity...





Running Label Propagation...
Running Girvan Newman...
Running Girvan Newman with Weights...
Algorithm Girvan Newman with Weights timed out.
Running Louvain...
Number of selected users: 500


100%|██████████| 500/500 [00:00<00:00, 740.84it/s]


Threshold: 0.0
Running Greedy Modularity...
Running Label Propagation...
Running Girvan Newman...
Running Girvan Newman with Weights...
Algorithm Girvan Newman with Weights timed out.
Running Louvain...
Number of selected users: 1000


100%|██████████| 1000/1000 [00:02<00:00, 379.02it/s]


Threshold: 0.0
Running Greedy Modularity...
Running Label Propagation...
Running Girvan Newman...
Algorithm Girvan Newman timed out.
Running Girvan Newman with Weights...
Algorithm Girvan Newman with Weights timed out.
Running Louvain...
Number of selected users: 1144


100%|██████████| 1144/1144 [00:03<00:00, 309.31it/s]


Threshold: 0.0
Running Greedy Modularity...
Running Label Propagation...
Running Girvan Newman...
Algorithm Girvan Newman timed out.
Running Girvan Newman with Weights...
Algorithm Girvan Newman with Weights timed out.
Running Louvain...


In [15]:
# Report runtime, modularity and community count of every algorithm, one section per graph size.
for num_users, results in results_over_num_users.items():
    print(f"Number of users: {num_users}")
    for algo in results:
        result = results[algo]
        # A falsy 'community' entry is reported as 0 communities.
        print(f"""{algo}:
            \tTime: {result['runtime']}
            \tModularity: {result['modularity']}
            \tNumber of communities: {len(result['community']) if result['community'] else 0}""")
    print("\n")

Number of users: 100
Greedy Modularity:
            	Time: 0.1621408462524414
            	Modularity: 0.1628962778314541
            	Number of communities: 4
Label Propagation:
            	Time: 0.06281399726867676
            	Modularity: -2.9976021664879227e-15
            	Number of communities: 1
Girvan Newman:
            	Time: 0.8417932987213135
            	Modularity: -7.957152113314905e-06
            	Number of communities: 2
Girvan Newman with Weights:
            	Time: 16.05259656906128
            	Modularity: -7.957152113314905e-06
            	Number of communities: 2
Louvain:
            	Time: 0.09161591529846191
            	Modularity: 0.1675332898525525
            	Number of communities: 5


Number of users: 200
Greedy Modularity:
            	Time: 0.4774501323699951
            	Modularity: 0.1801240171095746
            	Number of communities: 3
Label Propagation:
            	Time: 0.11560654640197754
            	Modularity: 1.709743457922741e-14
        

In [14]:
# Persist the benchmark results so they can be reloaded without re-running the algorithms.
import pickle

# Output file, relative to the notebook's working directory
file_path = 'results.pickle'

# Serialize the whole results_over_num_users dictionary in binary mode
with open(file_path, 'wb') as file:
    pickle.dump(results_over_num_users, file)

print("Dictionary saved to", file_path)

Dictionary saved to results.pickle
