## Setup

In [None]:
import json
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt

import pickle
from itertools import combinations

import pprint
import json
import glob
from random import random, randrange
import logging
from collections import defaultdict
import sys

import subprocess
import shlex

import scipy
from scipy import stats
from scipy.stats import pearsonr

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

from datetime import datetime, timedelta
from datetime import datetime
from dateutil.parser import parse

import datetime
from collections import Counter
import itertools
import time
import csv
import pandas as pd
import pandas
from tqdm import trange, tqdm

import scipy
from scipy import stats
from scipy.stats import pearsonr

# Wall-clock timestamp taken right after the imports; none of the visible
# cells read it back (presumably used for timing further down -- confirm).
start_time = time.time()

### Load Data

In [None]:
# Three GEXF graphs read from the working directory.  Further down, PT feeds
# the overall columns, PT1 the 'Prior_*' columns and PT2 the 'Posterior_*'
# columns of the feature tables.
PT = nx.read_gexf('PT-pruned.gexf')
PT1 = nx.read_gexf('PT-Slice1.gexf')
PT2 = nx.read_gexf('PT-Slice2.gexf')

In [None]:
# Using the regular dictionary: per-author records keyed by author id.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file,
# so only load an 'author_popularity_dictionary' that you generated yourself.
# The context manager closes the handle even when unpickling raises.
with open('author_popularity_dictionary','rb') as f:
    popularity_dict = pickle.load(f)

In [None]:
# Ids of the five communities under study and their display names;
# labels[i] is the name used for clusters[i].
clusters = [13,48,6,49,104]
labels = ["Democrats","Republicans","Unorthodox","Public Health","Antivaxxers"]

## Functions

In [None]:
# NetworkX ships an out_degree_centrality algorithm,
# https://networkx.github.io/documentation/stable/reference/algorithms/generated/networkx.algorithms.centrality.out_degree_centrality.html#networkx.algorithms.centrality.out_degree_centrality
# but it does not take edge weights into account,
# and edge weights are important here,
# so the weighted degrees are computed by hand below.

def find_node_w_outdegree(node,network):
    """Return the weighted out-degree of ``node`` in ``network`` as a float.

    Walks ``network.neighbors(node)`` -- for a DiGraph these are the
    successors, i.e. the nodes that ``node`` points to
    (https://networkx.org/documentation/stable/reference/classes/generated/networkx.DiGraph.neighbors.html)
    -- and adds up the ``'weight'`` attribute of each outgoing edge.
    A node without outgoing edges yields ``0.0``.
    """
    total_weight = 0.0
    for successor in network.neighbors(node):
        edge_attrs = network.edges[node, successor]
        total_weight = total_weight + edge_attrs['weight']
    return total_weight

def find_node_w_indegree(node,network):
    """Return the weighted in-degree of ``node`` in ``network`` as a float.

    Walks ``network.predecessors(node)`` -- the nodes that point *to*
    ``node`` -- and adds up the ``'weight'`` attribute of each incoming edge.
    A node without incoming edges yields ``0.0``.
    """
    total_weight = 0.0
    for predecessor in network.predecessors(node):
        edge_attrs = network.edges[predecessor, node]
        total_weight = total_weight + edge_attrs['weight']
    return total_weight

def count_neighbors(node,network):
    """Return, as a float, how many successors plus predecessors ``node`` has.

    The two directions are tallied independently, so a node that is both a
    successor and a predecessor of ``node`` contributes 2 to the result.
    """
    successor_total = sum(1 for _ in network.neighbors(node))
    predecessor_total = sum(1 for _ in network.predecessors(node))
    return float(successor_total + predecessor_total)


In [None]:
# Spot check: 'count' attribute of one known node in the full graph.
PT.nodes['JoeBiden']['count']

In [None]:
# Same spot check against the first slice.
PT1.nodes['JoeBiden']['count']

In [None]:
# Same spot check against the second slice.
PT2.nodes['JoeBiden']['count']

In [None]:
def get_author_data(auth):
    """Build the feature row for one author.

    Reads the author's records from the module-level ``popularity_dict`` and
    node/edge attributes from the module-level graphs ``PT`` (overall),
    ``PT1`` (prior) and ``PT2`` (posterior).  Each record is indexed as
    ``entry[0]`` (timestamp string, ``'%y-%m-%d-%H:%M:%S'``) and
    ``entry[1]`` / ``entry[2]`` (added together; the inline comments call
    them followers and friends).  Records are split around the break date
    11 March 2020.

    Args:
        auth: author id; must be a key of ``popularity_dict`` and a node of
            all three graphs.

    Returns:
        A 1-D ``numpy`` array with 18 entries, in this order: id, community
        (``'louvain'`` node attribute), overall / prior / posterior mean of
        ``entry[1] + entry[2]``, overall / prior / posterior ``'count'``,
        weighted out-degree x3, weighted in-degree x3, neighbour count x3,
        author type.  Because the list mixes strings and numbers,
        ``np.array`` stores every element as a string.

        The author type is ``'Continuing'``, ``'Prior Only'``,
        ``'Posterior Only'`` or, when the author has no records at all,
        ``'Inactive'``.

    Raises:
        KeyError: presumably when ``auth`` is missing from
            ``popularity_dict`` or from one of the graphs.
    """
    data = popularity_dict[auth]
    date_break = datetime.datetime.strptime('11/03/20',"%d/%m/%y")
    # Per-period accumulators: [sum of entry[1] + entry[2], number of records]
    prior = np.array([0,0]).astype('int64')
    posterior = np.array([0,0]).astype('int64')
    for entry in data:
        date = datetime.datetime.strptime(entry[0],'%y-%m-%d-%H:%M:%S')
        # NOTE(review): both comparisons are inclusive, so a record stamped
        # exactly at date_break is counted in both periods.
        if date <= date_break:
            prior[0] += entry[1] + entry[2] # count followers and friends
            prior[1] += 1
        if date_break <= date:
            posterior[0] += entry[1] + entry[2] # count followers and friends
            posterior[1] += 1

    has_prior = prior[1] > 0
    has_posterior = posterior[1] > 0

    # An author may appear in only one of the slices, in both, or in neither.
    if has_prior and has_posterior:
        author_type = 'Continuing'
    elif has_prior:
        author_type = 'Prior Only'
    elif has_posterior:
        author_type = 'Posterior Only'
    else:
        # Without this branch author_type stayed unbound for an author with
        # no records and the function died with UnboundLocalError, although
        # the means below already fall back to 0 for that case.
        author_type = 'Inactive'

    prior_mean = prior[0]/prior[1] if has_prior else 0
    posterior_mean = posterior[0]/posterior[1] if has_posterior else 0
    if has_prior or has_posterior:
        followers_mean = (prior[0]+posterior[0])/(prior[1]+posterior[1])
    else:
        followers_mean = 0

    return np.array([
        auth,                                  # Id
        PT.nodes[auth]['louvain'],             # Community
        followers_mean,
        prior_mean,
        posterior_mean,
        PT.nodes[auth]['count'],
        PT1.nodes[auth]['count'],
        PT2.nodes[auth]['count'],
        find_node_w_outdegree(auth,PT),
        find_node_w_outdegree(auth,PT1),
        find_node_w_outdegree(auth,PT2),
        find_node_w_indegree(auth,PT),
        find_node_w_indegree(auth,PT1),
        find_node_w_indegree(auth,PT2),
        count_neighbors(auth,PT),
        count_neighbors(auth,PT1),
        count_neighbors(auth,PT2),
        author_type,
    ])

In [None]:
# Try it out
# NOTATION
# Retweet/Retweets: the times this author was retweeted
# Retweeted: the times this author retweeted someone else

# One-row smoke test of get_author_data before it is run over every author.
results = pd.DataFrame(columns=[
    'Id', 'Community',
    'Mean_Followers', 'Prior_Mean_Followers', 'Posterior_Mean_Followers',
    'Tweet_Count', 'Prior_Tweet_Count', 'Posterior_Tweet_Count',
    'Retweet_Count', 'Prior_Retweet_Count', 'Posterior_Retweet_Count',
    'Retweeted_Count', 'Prior_Retweeted_Count', 'Posterior_Retweeted_Count',
    'Neighbors', 'Prior_Neighbors', 'Posterior_Neighbors',
    'Author_Type',
])

data = get_author_data('JoeBiden')
next_row = len(results)
results.loc[next_row] = data
results

In [None]:
# Switch for the cell below: True rebuilds the 'user_features' file from
# popularity_dict, False skips the rebuild and reuses the file on disk.
generate_data = False

In [None]:
# NOTATION
# Retweet/Retweets: the times this author was retweeted
# Retweeted: the times this author retweeted someone else
if generate_data:
    columns = [
        'Id', 'Community',
        'Mean_Followers', 'Prior_Mean_Followers', 'Posterior_Mean_Followers',
        'Tweet_Count', 'Prior_Tweet_Count', 'Posterior_Tweet_Count',
        'Retweet_Count', 'Prior_Retweet_Count', 'Posterior_Retweet_Count',
        'Retweeted_Count', 'Prior_Retweeted_Count', 'Posterior_Retweeted_Count',
        'Neighbors', 'Prior_Neighbors', 'Posterior_Neighbors',
        'Author_Type',
    ]

    # Gather the rows first and build the frame once.  Growing a DataFrame
    # with `results.loc[len(results)] = row` enlarges it on every author,
    # which gets slower the more authors have been processed.
    rows = []
    for author in tqdm(popularity_dict):
        data = get_author_data(author)
        rows.append(data)

    results = pd.DataFrame(rows, columns=columns)
    results.to_csv("user_features", index=False)

In [None]:
# Load the per-author feature table written by the generation cell
# (the file name has no extension) and display it.
data = pandas.read_csv('user_features')
data

## Now Community Data Frame

In [None]:
# The code to generate this csv is at the bottom of this notebook
# (in the visible cells it is the `if recompute:` cell that ends with
# to_csv('prepost_followers.csv')).
# NOTE: this rebinds `data`, replacing whatever table it held before.
import pandas
data = pandas.read_csv('prepost_followers.csv')
data

In [None]:
# Posterior-to-prior ratios per community: a value above 1 means the quantity
# grew after the break.
data['Rate_Users_Increase'] = data['Posterior_Active_Users'].div(data['Prior_Active_Users'])
data['Rate_Followers_Increase'] = data['Posterior_Mean_Followers'].div(data['Prior_Mean_Followers'])
data

In [None]:
# Community ids (compared against the 'modularity_class' column when the
# community table is rebuilt) and their display names, index-aligned.
clusters = [13,48,6,49,104]
labels = ["Democrats","Republicans","Unorthodox","Public Health","Antivaxxers"]

In [None]:
# Switch for the `if recompute:` cells below: True reloads the inputs and
# rewrites 'prepost_followers.csv', False leaves the file on disk untouched.
recompute = False

In [None]:
if recompute:
    # Node table whose 'Id' and 'modularity_class' columns are read by
    # get_com_nodes.
    com_data = pd.read_csv("PT-pruned-louvain.gexf.csv")

    # Using the popularity dictionary.
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file, so only load a dictionary file that you generated yourself.
    # The context manager closes the handle even when unpickling raises.
    with open('author_popularity_dictionary','rb') as f:
        popularity_dict = pickle.load(f)

In [None]:
def get_com_nodes(com):
    """Return the list of 'Id' values whose 'modularity_class' equals ``com``.

    Reads the module-level ``com_data`` table and shows a tqdm progress bar
    while the ids are collected.
    """
    in_community = com_data['modularity_class'] == com
    member_ids = com_data[in_community]['Id']
    return list(tqdm(member_ids))

In [None]:
def check_auth_time(author,pre=True):
    """Tell whether ``author`` has a record on one side of the break date.

    With ``pre=True`` the answer is True when at least one record of
    ``popularity_dict[author]`` is stamped on or before 11 March 2020; with
    ``pre=False`` when at least one is stamped on or after it.  Both bounds
    are inclusive.
    """
    date_break = datetime.datetime.strptime('11/03/20',"%d/%m/%y")
    # Parse every timestamp up front, so a malformed one raises regardless of
    # where it sits in the record list.
    stamps = [
        datetime.datetime.strptime(entry[0],'%y-%m-%d-%H:%M:%S')
        for entry in popularity_dict[author]
    ]
    if pre:
        return any(stamp <= date_break for stamp in stamps)
    return any(date_break <= stamp for stamp in stamps)

def check_auth_continuity(author):
    """Tell whether ``author`` has records on both sides of the break date.

    True only when ``popularity_dict[author]`` holds at least one record
    stamped on or before 11 March 2020 and at least one stamped on or after
    it (both bounds inclusive, so a single record stamped exactly at the
    break satisfies both).
    """
    date_break = datetime.datetime.strptime('11/03/20',"%d/%m/%y")
    # Parse every timestamp up front, so a malformed one raises regardless of
    # where it sits in the record list.
    stamps = [
        datetime.datetime.strptime(entry[0],'%y-%m-%d-%H:%M:%S')
        for entry in popularity_dict[author]
    ]
    seen_before = any(stamp <= date_break for stamp in stamps)
    seen_after = any(date_break <= stamp for stamp in stamps)
    return seen_before and seen_after

In [None]:
def get_mean_followers(com,pre=True):
    """Return the community mean of the per-author period mean.

    Only authors of community ``com`` with records on both sides of the
    break date (``check_auth_continuity``) are considered.  For each of them
    the values ``entry[1] + entry[2]`` (called followers and friends in the
    inline comments) are averaged over the records of the selected period,
    and those per-author means are then averaged over the authors.

    Changes with respect to the previous version:
      * the per-author sum is divided by the number of records *in the
        selected period*; it used to be divided by the number of all the
        author's records, which understated both period means and was
        inconsistent with how ``get_author_data`` computes its period means;
      * a community with no continuing author yields ``0.0`` instead of
        raising ``ZeroDivisionError``.

    Args:
        com: community id, passed on to ``get_com_nodes``.
        pre: True for records stamped on or before 11 March 2020, False for
            records stamped on or after it (both bounds inclusive).

    Returns:
        The mean as a float.
    """
    date_break = datetime.datetime.strptime('11/03/20',"%d/%m/%y")
    # Consider only those who were active before and after
    com_auth = [node for node in get_com_nodes(com) if check_auth_continuity(node)]
    if not com_auth:
        return 0.0

    com_count = float(0)
    for author in tqdm(com_auth): # only those active before and after
        auth_count = float(0)
        auth_norm = 0  # records of this author that fall in the selected period
        for entry in popularity_dict[author]:
            date = datetime.datetime.strptime(entry[0],'%y-%m-%d-%H:%M:%S')
            in_period = (date <= date_break) if pre else (date_break <= date)
            if in_period:
                auth_count += entry[1] # count followers
                auth_count += entry[2] # count friends
                auth_norm += 1
        # auth_norm >= 1 here: the continuity filter uses the same inclusive
        # comparisons, so every kept author has a record in each period.
        com_count += auth_count/auth_norm
    return com_count/len(com_auth)

In [None]:
# Rebuild the per-community summary table (one row per entry of `clusters`)
# and write it to 'prepost_followers.csv'.  Runs only when `recompute` is True.
if recompute:
    results = pd.DataFrame(columns=['Community_Name','Community_Id','Prior_Active_Users','Posterior_Active_Users',\
                                    'Continuous_Active_Users','Prior_Mean_Followers','Posterior_Mean_Followers',\
                                   'Total_Tweet_Count','Prior_Tweet_Count','Posterior_Tweet_Count', \
                                    'Total_OutDegree','Prior_OutDegree', 'Posterior_OutDegree',\
                                   'Total_InDegree','Prior_InDegree', 'Posterior_InDegree'])
    for i in range(len(clusters)):
        # `data` collects the row values in the same order as the columns above.
        # NOTE: this rebinds the notebook-level name `data` to a plain list.
        data = []
        data.append(labels[i])
        data.append(clusters[i])
        com_nodes = get_com_nodes(clusters[i])
        # Per-community accumulators, reset for every cluster.
        prior_count = 0
        posterior_count = 0
        continuity_count = 0
        total_tweets = 0
        prior_tweets = 0
        posterior_tweets = 0
        total_outdegree = 0
        prior_outdegree = 0
        posterior_outdegree = 0
        total_indegree = 0
        prior_indegree = 0
        posterior_indegree = 0
        
        for node in tqdm(com_nodes):
            # Activity flags come from popularity_dict timestamps.
            if check_auth_time(node,pre=True):
                prior_count += 1
            if check_auth_time(node,pre=False):
                posterior_count += 1
            if check_auth_continuity(node):
                continuity_count += 1
            
            # Tweet counts and weighted degrees come from the graphs:
            # PT for the totals, PT1 for prior, PT2 for posterior.
            total_tweets += PT.nodes[node]['count']
            prior_tweets += PT1.nodes[node]['count']
            posterior_tweets += PT2.nodes[node]['count']
            total_outdegree += find_node_w_outdegree(node,PT)
            prior_outdegree += find_node_w_outdegree(node,PT1)
            posterior_outdegree += find_node_w_outdegree(node,PT2)
            total_indegree += find_node_w_indegree(node,PT)
            prior_indegree += find_node_w_indegree(node,PT1)
            posterior_indegree += find_node_w_indegree(node,PT2)

            
        data.append(prior_count)
        data.append(posterior_count)
        data.append(continuity_count)
        # get_mean_followers looks the community members up again on each call.
        prior_mean = get_mean_followers(clusters[i],pre=True)
        data.append(prior_mean)
        posterior_mean = get_mean_followers(clusters[i],pre=False)
        data.append(posterior_mean)
        data.append(total_tweets)
        data.append(prior_tweets)
        data.append(posterior_tweets)
        data.append(total_outdegree)
        data.append(prior_outdegree)
        data.append(posterior_outdegree)
        data.append(total_indegree)
        data.append(prior_indegree)
        data.append(posterior_indegree)
        
        results.loc[len(results)] = data
    results.to_csv('prepost_followers.csv', index=False)
    # NOTE(review): a bare expression nested inside `if` is not echoed by the
    # notebook's default last-expression display -- confirm whether
    # print/display was intended.
    results

In [None]:
# NOTE(review): a bare expression nested inside `if` is not echoed by the
# notebook's default last-expression display, so this cell likely shows
# nothing -- confirm whether print/display was intended.
if recompute:
    results

### Some Plots

 - group bar charts: https://matplotlib.org/3.1.0/gallery/lines_bars_and_markers/barchart.html#sphx-glr-gallery-lines-bars-and-markers-barchart-py

 - pie charts: https://matplotlib.org/3.1.0/gallery/pie_and_polar_charts/pie_features.html#sphx-glr-gallery-pie-and-polar-charts-pie-features-py

In [None]:
# Community summary table used by the plots below.  The variable name is
# spelled 'comnunity' and the plotting cells rely on that exact spelling.
comnunity_data = pd.read_csv("prepost_followers.csv")
comnunity_data

In [None]:
# Peek at one column as a NumPy array, the form the bar charts consume.
np.array(comnunity_data['Prior_Tweet_Count'])

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np

# 2x2 figure of grouped bar charts comparing each community before and after.
# `fig` and `width` are reused by the remaining panels of this cell.
fig = plt.figure(figsize=(15,10))

# Panel 1 (top-left): active authors per community.
ax = fig.add_subplot(221)


prior = np.array(comnunity_data['Prior_Active_Users'])
posterior = np.array(comnunity_data['Posterior_Active_Users'])

ind = np.arange(len(prior))  # the x locations for the groups
width = 0.30  # the width of the bars

# The two bars of a group are shifted half a bar width left and right of the tick.
rects1 = ax.bar(ind - width/2, prior, width, label='Before', color='b')
rects2 = ax.bar(ind + width/2, posterior, width, label='After',color='r')

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('Author Count')
ax.set_title('Author Count Before and After')
ax.set_xticks(ind)
ax.set_xticklabels(('Democrats', 'Republicans', 'Unorthodox', 'Public Health', 'Antivaxxers'))
ax.legend()


def autolabel(rects, xpos='center', axes=None):
    """
    Attach a text label above each bar in *rects*, displaying its height.

    *xpos* indicates which side to place the text w.r.t. the center of
    the bar. It can be one of the following {'center', 'right', 'left'}.

    *axes* is the Axes object that receives the annotations.  When it is
    omitted the module-level ``ax`` is looked up at call time, which is what
    the function always did before; since ``ax`` is rebound for every panel,
    pass *axes* explicitly to label a specific panel.
    """
    # Resolve the target once so every bar is annotated on the same Axes.
    target = ax if axes is None else axes

    ha = {'center': 'center', 'right': 'left', 'left': 'right'}
    offset = {'center': 0, 'right': 1, 'left': -1}

    for rect in rects:
        height = rect.get_height()
        target.annotate('{}'.format(height),
                        xy=(rect.get_x() + rect.get_width() / 2, height),
                        xytext=(offset[xpos]*3, 3),  # use 3 points offset
                        textcoords="offset points",  # in both directions
                        ha=ha[xpos], va='bottom')


#autolabel(rects1, "center")
#autolabel(rects2, "center")

# Panels 2-4 share one layout; only the subplot slot, the two columns and the
# captions change, so they are drawn from a small table.
width = 0.30  # the width of the bars
group_names = ('Democrats', 'Republicans', 'Unorthodox', 'Public Health', 'Antivaxxers')
panels = (
    (222, 'Prior_Tweet_Count', 'Posterior_Tweet_Count',
     'Tweet Count', 'Tweet Count Before and After'),
    (223, 'Prior_OutDegree', 'Posterior_OutDegree',
     'Retweeted Count', 'Retweeted Count Before and After'),
    (224, 'Prior_InDegree', 'Posterior_InDegree',
     'Retweeting Count', 'Retweeting Count Before and After'),
)

for position, prior_column, posterior_column, y_caption, title in panels:
    ax = fig.add_subplot(position)

    prior = np.array(comnunity_data[prior_column])
    posterior = np.array(comnunity_data[posterior_column])

    ind = np.arange(len(prior))  # the x locations for the groups

    # Before/after bars sit half a bar width left and right of each tick.
    rects1 = ax.bar(ind - width/2, prior, width, label='Before', color='b')
    rects2 = ax.bar(ind + width/2, posterior, width, label='After',color='r')

    ax.set_ylabel(y_caption)
    ax.set_title(title)
    ax.set_xticks(ind)
    ax.set_xticklabels(group_names)
    ax.legend()

fig.tight_layout()

plt.show()