# Full and Sliced Network Generator

In [None]:
import pickle
from itertools import combinations
import pprint
import json
import glob
from random import random, randrange
from datetime import datetime, timedelta
import itertools
from collections import defaultdict
import networkx as nx
import numpy
import time
import functools
import pandas as pd

from tqdm import trange, tqdm

In [None]:
# Wall-clock reference point (seconds since the epoch); later statements
# subtract it from time.time() to print elapsed run time.
start_time = time.time()

#### Load the Data

In [None]:
# Load the data.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# 'pruned_retweet_dict' must come from a trusted source.
# The context manager closes the file even if unpickling fails.
with open('pruned_retweet_dict', 'rb') as f:
    retweet_dict = pickle.load(f)

# Collect the author of every entry in retweet_dict. The set is used for
# node creation and membership tests when the network is built; a frozenset
# makes it explicit that it is not modified afterwards.
rt_author_set = frozenset(retweet_dict[key]['author'] for key in retweet_dict)
print(len(rt_author_set))

## Build Full Time Network

### Define Functions

In [None]:
# Directed graph:
# A -> B if B retweeted A.
# Edges are weighted by retweet count.
# Each node has a `count` attribute: the number of tweets in retweet_dict it authored.
def build_PT_network(retweet_dict,rt_author_set,add_metadata = False):
    """Build the directed, weighted retweet network over the whole data set.

    An edge ``author -> retweeter`` is created for every ``rt_list`` entry
    whose two endpoints are both in ``rt_author_set``; its ``weight`` is the
    number of such entries. Self-loops are skipped. Every node carries a
    ``count`` attribute holding the number of ``retweet_dict`` entries it
    authored.

    Args:
        retweet_dict: mapping whose values provide ``'author'`` and
            ``'rt_list'``; ``rt_list`` is iterated as 2-item pairs
            (presumably ``(retweeter, date_string)`` -- confirm against the
            code that produced the pickle).
        rt_author_set: node ids to include in the graph.
        add_metadata: when true, each edge also stores a ``date`` attribute
            holding the earliest date string seen for that edge.

    Returns:
        The populated ``nx.DiGraph``.

    Raises:
        KeyError: if an entry's author is not in ``rt_author_set``.
        ValueError: if ``add_metadata`` is true and a date string does not
            match ``'%y-%m-%d-%H:%M:%S'``.
    """
    # Same layout the sliced builder uses for these date strings. The former
    # '%y-%m-%d-%H:%I:%M' treated the minutes as a 12-hour clock value and
    # the seconds as minutes, so ordinary timestamps failed to parse.
    date_format = '%y-%m-%d-%H:%M:%S'

    G = nx.DiGraph()
    for a in rt_author_set:
        G.add_node(a, count=0)
    for i in tqdm(retweet_dict):
        G.nodes[retweet_dict[i]['author']]['count'] += 1

    for i in tqdm(retweet_dict):
        rlist = retweet_dict[i]['rt_list']
        rauthor = retweet_dict[i]['author']
        for a, b in rlist:
            if rauthor == a:  # no self-loops
                continue
            if a not in rt_author_set or rauthor not in rt_author_set:
                continue
            if G.has_edge(rauthor, a):
                edge = G.edges[rauthor, a]
                edge['weight'] += 1
                # Keep the earliest time tag on the edge.
                if add_metadata:
                    stored_date = datetime.strptime(edge['date'], date_format)
                    new_date = datetime.strptime(b, date_format)
                    if new_date < stored_date:
                        edge['date'] = b
            else:
                G.add_edge(rauthor, a, weight=1)
                if add_metadata:
                    G.edges[rauthor, a]['date'] = b

    return G


In [None]:
def prune_network(G,minimum_edgeweight=0,minumum_count=0,component=True,isolated=False):
    """Prune ``G`` in place and return it.

    The steps run in this order:

    1. If ``minimum_edgeweight > 0``, remove every edge whose ``weight`` is
       less than or equal to ``minimum_edgeweight``.
    2. If ``minumum_count > 0``, remove every node whose ``count`` is less
       than or equal to ``minumum_count``.
    3. If ``component`` is true, keep only the largest weakly connected
       component.
    4. If ``isolated`` is true, remove nodes that have no edges.

    Args:
        G: graph to prune; it is modified, not copied.
        minimum_edgeweight: edge-weight threshold (edges at the threshold
            are removed too).
        minumum_count: node-count threshold (nodes at the threshold are
            removed too). The misspelled name is kept for existing callers.
        component: restrict the graph to its largest weakly connected
            component.
        isolated: drop isolated nodes.

    Returns:
        The same graph object ``G``.
    """
    # Remove low weight edges
    if minimum_edgeweight > 0:
        tocut = [(a, b) for a, b, w in G.edges(data='weight')
                 if w <= minimum_edgeweight]
        G.remove_edges_from(tocut)

    # Remove low count nodes
    if minumum_count > 0:
        tocutnodes = [node for node in G.nodes()
                      if G.nodes[node]['count'] <= minumum_count]
        G.remove_nodes_from(tocutnodes)

    # Keep only the principal (weakly connected) component.
    if component:
        # default=None: a graph left without nodes has no components, and a
        # bare max() would raise ValueError on the empty sequence.
        giant = max(nx.weakly_connected_components(G), key=len, default=None)
        if giant is not None:
            G.remove_nodes_from([node for node in G.nodes() if node not in giant])

    if isolated:
        G.remove_nodes_from(list(nx.isolates(G)))

    return G

### Build the network

In [None]:
# Seconds spent before the network build starts.
elapsed_time = time.time() - start_time
print(elapsed_time)

# Build the full network and save it before any pruning happens.
PT_full = build_PT_network(retweet_dict, rt_author_set, add_metadata=False)
print(PT_full.number_of_nodes())
nx.write_gexf(PT_full, 'PT-full.gexf')

# prune_network modifies the graph it is given and returns that same graph,
# so from here on PT and PT_full refer to the one pruned graph.
PT = prune_network(
    PT_full,
    minimum_edgeweight=0,
    minumum_count=0,
    component=True,
    isolated=False,
)
nx.write_gexf(PT, 'PT-pruned.gexf')
print(PT.number_of_nodes())

# Seconds spent on building, pruning and writing (total minus setup time).
elapsed_time2 = (time.time() - start_time) - elapsed_time
print(elapsed_time2)

## Sliced Networks

### Define Function

In [None]:
def build_PT_network(date1,date2,retweet_dict):
    """Build the retweet network restricted to the window (date1, date2].

    The node set is copied from the module-level graph ``PT``, so every slice
    has the same nodes as the pruned full network. A node's ``count`` is the
    number of its entries dated inside the window, and an edge
    ``author -> retweeter`` gains one unit of ``weight`` for every retweet
    dated inside the window whose endpoints are both in ``PT``.

    Note: this definition reuses the name of the full-time builder and
    replaces it once this code has run.

    Args:
        date1: window start, exclusive, formatted ``"%d/%m/%y"``.
        date2: window end, inclusive, formatted ``"%d/%m/%y"``.
        retweet_dict: mapping whose values provide ``'rt_list'``,
            ``'author'`` and ``'date'``; dates use ``'%y-%m-%d-%H:%M:%S'``.

    Returns:
        The sliced ``nx.DiGraph``.
    """
    window_start = datetime.strptime(date1, "%d/%m/%y")
    window_end = datetime.strptime(date2, "%d/%m/%y")

    sliced = nx.DiGraph()
    for node in PT.nodes():
        sliced.add_node(node, count=0)

    for tweet_id in tqdm(retweet_dict):
        entry = retweet_dict[tweet_id]
        retweets = entry['rt_list']
        rauthor = entry['author']

        # Count the entry itself when it was posted inside the window.
        tweeted_at = datetime.strptime(entry['date'], '%y-%m-%d-%H:%M:%S')
        if window_start < tweeted_at <= window_end and rauthor in PT.nodes():
            sliced.nodes[rauthor]['count'] += 1

        for retweeter, stamp in retweets:
            retweeted_at = datetime.strptime(stamp, '%y-%m-%d-%H:%M:%S')
            if not (window_start < retweeted_at <= window_end):
                continue
            if rauthor == retweeter:  # no self-loops
                continue
            if retweeter not in PT.nodes() or rauthor not in PT.nodes():
                continue
            if sliced.has_edge(rauthor, retweeter):
                sliced.edges[rauthor, retweeter]['weight'] += 1
            else:
                sliced.add_edge(rauthor, retweeter, weight=1)

    return sliced

In [None]:
# The slices are deliberately left unpruned: nodes without edges are kept so
# that each slice has the same node set as the original network.
# The date windows are slightly wider than strictly needed, as a precaution.
slice1_window = ('24/12/19', '11/03/20')
PT1 = build_PT_network(*slice1_window, retweet_dict)
nx.write_gexf(PT1, 'PT-Slice1.gexf')
print("PT1 is done")

slice2_window = ('11/03/20', '29/05/20')
PT2 = build_PT_network(*slice2_window, retweet_dict)
nx.write_gexf(PT2, 'PT-Slice2.gexf')
print("PT2 is done")

In [None]:
# Check the lists are the same   
def comparelists(l1,l2):
    """Print whether two iterables hold equal elements in the same order.

    Both arguments are materialised as lists before comparing, so sequences
    of different length are reported as different. (A pairwise comparison
    with ``map`` stops at the shorter input and would report a shared prefix
    as "the same".)

    Args:
        l1: first iterable.
        l2: second iterable.

    Returns:
        None; the verdict is printed.
    """
    if list(l1) == list(l2):
        print ("The lists are the same")
    else:
        print ("The lists are not the same")
        
# Each slice should list exactly the nodes of the pruned full network.
for sliced_graph in (PT1, PT2):
    comparelists(PT.nodes(), sliced_graph.nodes())

## Add node features to networks

In [None]:
import pandas as pd
data = pd.read_csv("PT-pruned-louvain.gexf.csv")

# Copy the modularity class from the CSV (a Gephi export, per the original
# note) onto the matching node of the full network and of both slices.
# Every 'Id' must already be a node in each graph; an unknown id raises
# KeyError.
for _, record in data.iterrows():
    node_id = record['Id']
    louvain_class = record["modularity_class"]
    for graph in (PT, PT1, PT2):
        graph.nodes[node_id]['louvain'] = louvain_class

# Rewrite the three GEXF files so they include the new node attribute.
annotated_outputs = (
    (PT, 'PT-pruned.gexf'),
    (PT1, 'PT-Slice1.gexf'),
    (PT2, 'PT-Slice2.gexf'),
)
for graph, out_path in annotated_outputs:
    nx.write_gexf(graph, out_path)


In [None]:
# Total wall-clock seconds since start_time was recorded.
elapsed_time = time.time() - start_time
print(elapsed_time)