# Enrich the networks with statistics and also save those statistics in a dictionary

In [2]:
import networkx as nx
from fa2 import ForceAtlas2  #the package should be installed before
import matplotlib.pyplot as plt
from datetime import datetime
from statistics import mean, median, quantiles
from networkx.algorithms.traversal.breadth_first_search import descendants_at_distance
import time
import requests
import glob
import pickle
import os
import shutil

In [3]:
def addAuthorFeatures(G, feat_dict, num_comm):
    """Compute author-network statistics and store them on ``G`` and in ``feat_dict``.

    The first node of ``G`` (in node-iteration order) is removed from a copy
    of the graph before the weakly connected components are computed.
    NOTE(review): presumably that first node is the submission's root/author
    -- confirm against the code that builds the graph.

    Each statistic below is divided by ``num_comm`` and written, in this
    order, to both ``G.graph[<key>]`` and ``feat_dict["authorStats"][<key>]``:

    * ``num_users``: number of nodes of ``G``.
    * ``num_edges``: number of edges of ``G``.
    * ``size_largest_conn_comp``: node count of the largest weakly connected
      component of the graph without its first node.
    * ``recip_value``: reciprocity of ``G``.
    * ``diam``: diameter of that largest component, taken as undirected.
    * ``clust_coeff``: average clustering of the undirected version of ``G``.

    The last four are 0 when the graph without its first node has no
    components (i.e. it is empty).

    Args:
        G: directed networkx graph; modified in place (``G.graph`` only).
        feat_dict: dict that already contains an ``"authorStats"`` dict;
            modified in place.
        num_comm: divisor applied to every statistic (pass 1 for raw values).

    Raises:
        IndexError: if ``G`` has no nodes.
        ZeroDivisionError: if ``num_comm`` is 0.
    """
    # Author-network characteristics
    authorGraph_NoRoot = G.copy()
    authorGraph_NoRoot.remove_node(list(G.nodes())[0])

    # Materialise the components once; they are needed both for the
    # emptiness check and for picking the largest one.
    components = list(nx.weakly_connected_components(authorGraph_NoRoot))
    if not components:
        size_largest_conn_comp = 0
        diam = 0
        recip_value = 0
        clust_coeff = 0
    else:
        largest_cc_authorGraph = authorGraph_NoRoot.subgraph(max(components, key=len))
        size_largest_conn_comp = len(largest_cc_authorGraph)
        diam = nx.algorithms.distance_measures.diameter(
            largest_cc_authorGraph.to_undirected()
        )
        recip_value = nx.algorithms.reciprocity(G)
        # The undirected copy of G is only needed here, so build it lazily.
        clust_coeff = nx.average_clustering(G.to_undirected())

    # Insertion order matches the order in which the keys were historically
    # written to G.graph and feat_dict["authorStats"].
    raw_stats = {
        "num_users": G.number_of_nodes(),
        "num_edges": G.number_of_edges(),
        "size_largest_conn_comp": size_largest_conn_comp,
        "recip_value": recip_value,
        "diam": diam,
        "clust_coeff": clust_coeff,
    }
    scaled_stats = {key: value / num_comm for key, value in raw_stats.items()}

    G.graph.update(scaled_stats)
    feat_dict["authorStats"].update(scaled_stats)
    

    

In [4]:
def comments_and_max_depth_and_H_index(G):
    """Return size, direct-reply, depth and H-index figures for a comment graph.

    The first node of ``G`` (in node-iteration order) is used as the BFS
    source. NOTE(review): presumably that node is the submission itself --
    confirm against the code that builds the graph.

    Returns:
        A 4-tuple ``(num_comments, dir_comments, i, j)`` where

        * ``num_comments`` is ``G.number_of_nodes()`` (the source node is
          counted too);
        * ``dir_comments`` is the number of nodes at distance 1 from the
          source;
        * ``i`` is the smallest distance at which no node is found, i.e. one
          more than the largest distance that still has nodes;
        * ``j`` is the smallest distance ``j`` whose level holds fewer than
          ``j`` nodes.

        NOTE(review): with the usual definitions (deepest non-empty level;
        largest h such that level h has at least h comments) the last two
        values would be ``i - 1`` and ``j - 1``. The values are left
        unchanged here because downstream statistics depend on them --
        confirm which definition is intended.

    Raises:
        IndexError: if ``G`` has no nodes.
    """
    num_comments = G.number_of_nodes()

    # Look the source up once instead of rebuilding the node list per level.
    root = list(G.nodes())[0]

    # level_sizes[d] == number of nodes at distance d from the source.
    # Level 0 always contains the source itself, so the loop body runs at
    # least once and level_sizes[1] is guaranteed to exist afterwards.
    level_sizes = [len(descendants_at_distance(G, root, 0))]
    while level_sizes[-1] > 0:
        level_sizes.append(len(descendants_at_distance(G, root, len(level_sizes))))

    dir_comments = level_sizes[1]
    i = len(level_sizes) - 1  # index of the first empty level

    # The last entry is 0 at index i >= 1, so this loop always returns.
    for j, size in enumerate(level_sizes):
        if size < j:
            return num_comments, dir_comments, i, j

In [5]:
def addCommentsFeatures(G, feat_dict, norma):
    """Compute temporal and comment-tree statistics and store them on ``G`` and in ``feat_dict``.

    Temporal statistics are derived from the ``'created'`` node attribute and
    are all measured relative to the smallest ``'created'`` value:

    * ``lifeTime``: largest minus smallest value;
    * ``averageTime``: mean minus smallest value;
    * ``medianTime``: median minus smallest value;
    * ``lifetime95``: last of the nine decile cut points (``quantiles(..., n=10)[8]``,
      i.e. the 90th percentile despite the name) minus smallest value.

    They are all 0 when ``G`` has two nodes or fewer.

    Comment statistics (``num_comments``, ``num_direct_comments``,
    ``max_depth``, ``H_index``) are the four values returned by
    ``comments_and_max_depth_and_H_index(G)``.

    Every statistic is divided by the returned divisor and written to
    ``G.graph`` as well as to ``feat_dict["timeStats"]`` /
    ``feat_dict["commStats"]``. The ``'created'`` value of the first node that
    carries one is stored unscaled under ``"created"`` in both places.

    Args:
        G: networkx graph whose nodes carry a ``'created'`` attribute;
            ``G.graph`` is modified in place.
        feat_dict: dict that already contains ``"timeStats"`` and
            ``"commStats"`` dicts; modified in place.
        norma: when equal to ``False`` the divisor is 1 (raw values);
            otherwise the divisor is the node count of ``G``.

    Returns:
        The divisor that was applied.
    """
    # Temporal patterns
    timestamps = list(nx.get_node_attributes(G, 'created').values())
    if len(G) <= 2:
        lifeTime = averageTime = medianTime = lifetime95 = 0
    else:
        latest = max(timestamps)
        earliest = min(timestamps)
        lifeTime = latest - earliest
        averageTime = mean(timestamps) - earliest
        medianTime = median(timestamps) - earliest
        lifetime95 = quantiles(timestamps, n=10)[8] - earliest

    # Submission specific characteristics
    par = comments_and_max_depth_and_H_index(G)
    total_nodes, direct_replies, depth, h_index = par
    # Equality (not truthiness) is kept on purpose so that e.g. None still
    # selects normalisation exactly as before.
    num_comm = 1 if norma == False else total_nodes  # noqa: E712

    time_stats = {
        "lifeTime": lifeTime / num_comm,
        "averageTime": averageTime / num_comm,
        "medianTime": medianTime / num_comm,
        "lifetime95": lifetime95 / num_comm,
    }
    comm_stats = {
        "num_comments": total_nodes / num_comm,
        "num_direct_comments": direct_replies / num_comm,
        "max_depth": depth / num_comm,
        "H_index": h_index / num_comm,
    }

    # Graph-level attributes first, then the external feature dictionary.
    G.graph.update(time_stats)
    G.graph.update(comm_stats)
    G.graph["created"] = timestamps[0]

    feat_dict["timeStats"].update(time_stats)
    feat_dict["commStats"].update(comm_stats)
    feat_dict["created"] = timestamps[0]

    return num_comm

In [6]:
'''
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
'''
subreddit = "PoliticalDiscussion"

'''
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
'''


# Enrichment driver.
# For every comment graph found in cG_prove_monthly_<subreddit>/<source_subfolder>:
#   1. compute the comment/time statistics and save the enriched comment graph;
#   2. load the author graph with the same id, compute its statistics and
#      save the enriched author graph;
#   3. collect all statistics in statCov, which is pickled once per run.
# NOTE: nx.read_gpickle / nx.write_gpickle were removed in NetworkX 3.0, so
# this cell needs NetworkX < 3.0.
# NOTE(security): gpickle files are pickles -- only load files you produced.

# One run per entry: False stores raw statistics; True would store them
# divided by the number of nodes of the comment graph (currently disabled).
normals = [False]#, True]
for normal in normals:
    statCov = {}
    source_subfolder = "total"
    subfolder = "total_normalizedNumComm" if normal else "total"

    comment_in_dir = f"cG_prove_monthly_{subreddit}/{source_subfolder}"
    author_in_dir = f"aG_prove_monthly_{subreddit}/{source_subfolder}"
    comment_out_dir = f"cG_enrich_monthly_{subreddit}/{subfolder}"
    author_out_dir = f"aG_enrich_monthly_{subreddit}/{subfolder}"

    # Start from an empty author output folder; tolerate a first run where
    # the folder (or its parent) does not exist yet.
    if os.path.isdir(author_out_dir):
        shutil.rmtree(author_out_dir)
    os.makedirs(author_out_dir)
    # The comment output folder is written to below as well, so make sure it
    # exists. Existing files in it are left in place (and overwritten by name).
    os.makedirs(comment_out_dir, exist_ok=True)

    comNets = glob.glob(comment_in_dir + "/*")
    for i, net in enumerate(comNets):
        if i % 500 == 0:
            print(i, "/", len(comNets))
        # The id is the third "_"-separated token of the file name. Parsing
        # the base name (instead of the whole path) keeps this correct even
        # when the subreddit name itself contains underscores.
        netId = "t3_" + os.path.basename(net).split("_")[2]
        statCov[netId] = {"authorStats": {}, "timeStats": {}, "commStats": {}}

        commentsGraph = nx.read_gpickle(net)
        num_comm = addCommentsFeatures(commentsGraph, statCov[netId], normal)
        nx.write_gpickle(commentsGraph, f"{comment_out_dir}/commGraph_{netId}_{subreddit}")

        authorGraph = nx.read_gpickle(f"{author_in_dir}/aGraph_{netId}_{subreddit}")
        # Author statistics use the same divisor as the comment statistics.
        addAuthorFeatures(authorGraph, statCov[netId], num_comm)
        nx.write_gpickle(authorGraph, f"{author_out_dir}/aGraph_{netId}_{subreddit}")

    with open(f"stat_{subreddit}_{subfolder}.pickle", "wb") as handle:
        pickle.dump(statCov, handle)

0 / 16652
500 / 16652
1000 / 16652
1500 / 16652
2000 / 16652
2500 / 16652
3000 / 16652
3500 / 16652
4000 / 16652
4500 / 16652
5000 / 16652
5500 / 16652
6000 / 16652
6500 / 16652
7000 / 16652
7500 / 16652
8000 / 16652
8500 / 16652
9000 / 16652
9500 / 16652
10000 / 16652
10500 / 16652
11000 / 16652
11500 / 16652
12000 / 16652
12500 / 16652
13000 / 16652
13500 / 16652
14000 / 16652
14500 / 16652
15000 / 16652
15500 / 16652
16000 / 16652
16500 / 16652


In [26]:
# Collect the paths of everything stored in the cg_<subreddit> folder.
autNets = glob.glob(f"cg_{subreddit}/*")

In [27]:
# Bare expression: the notebook shows the collected paths as the cell output.
autNets

['cg_PoliticalDiscussion/cGraph_paf9du_PoliticalDiscussion',
 'cg_PoliticalDiscussion/cGraph_p9ipq2_PoliticalDiscussion',
 'cg_PoliticalDiscussion/cGraph_p6tr0x_PoliticalDiscussion',
 'cg_PoliticalDiscussion/cGraphpz9qzw_PoliticalDiscussion',
 'cg_PoliticalDiscussion/cGraph_p7cjrg_PoliticalDiscussion',
 'cg_PoliticalDiscussion/cGraphq1ekjh_PoliticalDiscussion',
 'cg_PoliticalDiscussion/cGraphq031nz_PoliticalDiscussion',
 'cg_PoliticalDiscussion/cGraph_phe627_PoliticalDiscussion',
 'cg_PoliticalDiscussion/cGraphpw1dmw_PoliticalDiscussion',
 'cg_PoliticalDiscussion/cGraphpzhqkd_PoliticalDiscussion',
 'cg_PoliticalDiscussion/cGraph_pla5rw_PoliticalDiscussion',
 'cg_PoliticalDiscussion/cGraph_pdef00_PoliticalDiscussion',
 'cg_PoliticalDiscussion/cGraph_pm6l4w_PoliticalDiscussion',
 'cg_PoliticalDiscussion/cGraph_pjb4k5_PoliticalDiscussion',
 'cg_PoliticalDiscussion/cGraph_pqf2ad_PoliticalDiscussion',
 'cg_PoliticalDiscussion/cGraph_phyigb_PoliticalDiscussion',
 'cg_PoliticalDiscussion/cGra

In [15]:
# Re-save every graph listed in autNets under a name rebuilt from the fourth
# "_"-separated token of its current path. The source file is not removed here.
# NOTE(review): the target folder/suffix are hard-coded to PoliticalDiscussion
# rather than using `subreddit`, and the rebuilt name has no "_" between
# "cGraph" and the token (unlike e.g. "cGraph_paf9du_..." in the listing
# above) -- confirm both are intended.
for net in autNets:
    authorGraph = nx.read_gpickle(net)
    name_aux = net.split("_")
    name = f"cg_PoliticalDiscussion/cGraph{name_aux[3]}_PoliticalDiscussion"
    nx.write_gpickle(authorGraph, name)

In [23]:
import os

# Delete every file whose path is listed in autNets.
for net in autNets:
    os.unlink(net)

In [29]:
# Bare expression: the notebook shows the number of collected paths.
len(autNets)

1426

In [None]:
# All the subreddits at once.
# The code below is disabled: it sits inside a string literal and is never run.
# NOTE(review): it calls addAuthorFeatures and addCommentsFeatures with two
# arguments, while the definitions in this file take three (num_comm / norma),
# so it would need updating before being re-enabled.
'''
subReddits = ["COVID19", "politics", "LGBTQ"]
allStatistics = {}
for subReddit in subReddits:
    autNets = glob.glob("aG/*_"+subReddit+"_*")
    if subReddit not in allStatistics:
        allStatistics[subReddit]={}
    for net in autNets:
        netId = net.split("_")[1]
        allStatistics[subReddit][netId] = {"authorStats":{},"timeStats":{},"commStats":{}}
        authorGraph = nx.read_gpickle(net)
        addAuthorFeatures(authorGraph, allStatistics[subReddit][netId])
        nx.write_gpickle(authorGraph,"enrich_authorGraphs/autGraph_"+netId+"_"+subReddit)
    
    comNets = glob.glob("commentsGraphs/*_"+subReddit)
    for net in comNets:
        netId = net.split("_")[1]
        commentsGraph = nx.read_gpickle(net)
        addCommentsFeatures(commentsGraph, allStatistics[subReddit][netId])
        nx.write_gpickle(commentsGraph,"enrich_commGraphs/commGraph_"+netId+"_"+subReddit)



with open('allStatistics_1.pickle', 'wb') as handle:
    pickle.dump(allStatistics, handle)

#with open('filename.pickle', 'rb') as handle:
#    b = pickle.load(handle)
'''