In [None]:
'''
VMP 2022-03-02: used in final report.
Figure 2 a) and Figure 2 b). 
'''

In [None]:
# Base directory of the project ("path/to/base" looks like a placeholder; set it to the real root).
# Later cells build file locations as f"{path}/DATA/...", so no trailing slash is needed here.
path = "path/to/base"

In [None]:
# check RAM
# NOTE: the value printed below is `virtual_memory().total`, i.e. the total reported by psutil,
# even though the message text calls it "available" RAM.
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9  # divide by 1e9 to express the value in (decimal) gigabytes
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

Your runtime has 27.3 gigabytes of available RAM



In [None]:
# basic setup
# Mount Google Drive at /content/gdrive. This needs the google.colab package,
# so the cell is only expected to run inside a Colab runtime.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
from community import community_louvain
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd 
import seaborn as sns
import math
from statistics import median
from matplotlib.lines import Line2D
from matplotlib.colors import ListedColormap
import pickle

In [None]:
''' SETUP '''

' SETUP '

In [None]:
# --- plotting configuration shared by the figures in this notebook ---

# marker shape per gender
mks_g_dct = dict(Female='o', Male='^')

# colour per gender
clrs_g_dct = dict(Female='#fc8d62', Male='#66c2a5')

# the same colours as a list: index 0 = Female, index 1 = Male
clrs_g_lst = [clrs_g_dct['Female'], clrs_g_dct['Male']]

# marker size
ms = 2

# overall seaborn style for the figures
sns.set(style='ticks', font_scale=1.1, font='Arial')

# figure geometry (inches) and resolution
figsize = (4.5 * 1.5, 3.2)
dpi = 300

# font properties handed to the figure legends
import matplotlib.font_manager as font_manager
font = font_manager.FontProperties(family='Arial')

style = 'SS'

In [None]:
''' FUNCTIONS '''

' FUNCTIONS '

In [None]:
### create network (GCC) ### 
def create_network(inpath, filename):
  """Build the giant connected component (GCC) of a collaboration network.

  Reads the CSV at ``f"{inpath}{filename}"``, keeps the unique
  (``AuthorId_x``, ``AuthorId_y``) pairs so that every pair contributes one
  unweighted edge, and returns the largest connected component.

  Parameters
  ----------
  inpath : str
      Directory prefix. It is concatenated with ``filename`` as-is, so it has
      to end with a path separator.
  filename : str
      Name of the CSV file; it must contain the columns ``AuthorId_x`` and
      ``AuthorId_y``.

  Returns
  -------
  The result of ``G.subgraph(...)`` restricted to the nodes of the largest
  connected component of the full graph ``G``.

  Raises
  ------
  ValueError
      If the graph has no nodes (``max`` of an empty sequence).
  """
  edge_pairs = pd.read_csv(f"{inpath}{filename}")[["AuthorId_x", "AuthorId_y"]].drop_duplicates() # unweighted
  G = nx.from_pandas_edgelist(edge_pairs, "AuthorId_x", "AuthorId_y")
  # Only the largest component is needed, so pick it directly instead of sorting
  # every component by size; on ties the first one encountered wins, as before.
  largest_component = max(nx.connected_components(G), key=len)
  return G.subgraph(largest_component)

In [None]:
# label community function
def label_community(G, partition): 
  """Write each node's community id onto the graph as node attribute 'community'.

  G         : graph whose nodes are annotated (modified in place).
  partition : mapping node -> community id; nodes missing from it get None.

  Returns the same graph object G.
  """
  for node in G.nodes():
    node_attrs = G.nodes[node]
    node_attrs['community'] = partition.get(node)
  return G

In [None]:
# subset relevant community 
def get_subgraph(G, partition, focus_author): 
  """Extract the community that contains `focus_author`.

  The focus author's community id is looked up in `partition`; every node of G
  whose 'community' attribute equals that id is kept.

  Returns a tuple (G_sub, G_sub_nodes): the result of G.subgraph(...) on the
  kept nodes, and a one-column DataFrame ('AuthorId') listing its nodes.
  """
  target_community = partition.get(focus_author)
  members = [
    node_id
    for node_id, attrs in G.nodes(data=True)
    if attrs.get("community") == target_community
  ]
  community_graph = G.subgraph(members)
  member_table = pd.DataFrame({'AuthorId': list(community_graph.nodes())})
  return community_graph, member_table

In [None]:
# get node dict
def get_node_dict(df_year, gcc_nodes, FocusAuthorX): 
  """Collect the per-author columns for the nodes of one community.

  df_year is inner-joined with gcc_nodes on 'AuthorId', a 'FocusNode' column
  holding the strings 'true' / 'false' marks the row of FocusAuthorX, and the
  result is returned as a nested dict {column: {AuthorId: value}}.
  """
  merged = df_year.merge(gcc_nodes, on = 'AuthorId', how = "inner")
  focus_flags = ['true' if author == FocusAuthorX else 'false' for author in merged['AuthorId']]
  merged = merged.assign(FocusNode = focus_flags)
  return merged.set_index('AuthorId').to_dict()

In [None]:
# add information (node)
def add_node_info(G, node_info_dct):
  """Copy gender, eigencentrality and focus flag from node_info_dct onto G's nodes.

  node_info_dct is a nested dict {column: {node: value}}. Each node gets the
  attributes 'Gender', 'Eigencentrality' and 'FocusNode'; a node missing from
  a column's inner dict gets None for that attribute.

  Returns the same graph object G (modified in place).
  """
  # (node attribute name, column name in node_info_dct)
  attribute_sources = (
    ('Gender', 'Gender'),
    ('Eigencentrality', 'eigencentrality_unweighted_scaled'),
    ('FocusNode', 'FocusNode'),
  )
  for node in G.nodes():
    for attribute, column in attribute_sources:
      G.nodes[node][attribute] = node_info_dct.get(column).get(node)
  return G

In [None]:
# add information (edge)
def add_edge_info(G, focus_author):
  """Flag every edge of G that touches `focus_author`.

  Sets the edge attribute 'FocusEdge' to the string 'true' when either
  endpoint equals focus_author and to 'false' otherwise.

  Returns the same graph object G (modified in place).
  """
  for u, v in G.edges():
    touches_focus = (u == focus_author) or (v == focus_author)
    G.edges[u, v]['FocusEdge'] = 'true' if touches_focus else 'false'
  return G

In [None]:
def extract_node_info(G):
  """Return the 'Gender', 'Eigencentrality' and 'FocusNode' node values of G.

  Each of the three returned objects is the ``.values()`` view of the dict that
  ``nx.get_node_attributes`` produces for that attribute name.
  """
  gender, eigencentrality, focus = (
    nx.get_node_attributes(G, attribute).values()
    for attribute in ('Gender', 'Eigencentrality', 'FocusNode')
  )
  return gender, eigencentrality, focus

In [None]:
def extract_edge_info(G): 
  """Return the 'FocusEdge' values of G's edges (``.values()`` view of nx.get_edge_attributes)."""
  return nx.get_edge_attributes(G, 'FocusEdge').values()

In [None]:
def frexp10(x):
    """Split x into an integer-rounded base-10 mantissa and its exponent.

    Returns a tuple ``(new_number, format_string)``:
    ``new_number`` is the float parsed from "<mantissa>e<exponent>", where the
    mantissa is x / 10**exponent rounded to an integer; ``format_string`` is that
    mantissa followed by the exponent as a mathtext superscript (always signed,
    zero-padded to width 3).

    x == 0 is not supported: ``math.log10(0)`` raises ValueError.
    """
    exponent = math.floor(math.log10(abs(x)))
    mantissa = round(x / 10**exponent)
    label = '{0}$^{{{1:+03}}}$'.format(mantissa, exponent)
    return float(f'{mantissa}e{exponent}'), label

In [None]:
# plot it 
def plot_fig_1(G1, G2, focus_author, clrs_g_lst, figsize, dpi, LouvainSetting, suptitle, outfolder, outpath):
  """Draw the focus author's community at two points in time and save it as a PDF.

  G1 is drawn in the left panel and G2 in the right panel. Node area is
  proportional to the node's 'Eigencentrality', scaled so that the most central
  node of G1 has area 200 in both panels. Nodes are coloured by 'Gender' and the
  focus author is redrawn on top with a black outline.

  Parameters
  ----------
  G1, G2 : graphs whose nodes carry 'Gender', 'Eigencentrality' and 'FocusNode'
      (as read by extract_node_info).
  focus_author : node id of the highlighted author; must be a node of both graphs.
  clrs_g_lst : two colours, index 0 for 'Female' and index 1 for 'Male';
      any other gender value is drawn grey.
  figsize, dpi : forwarded to plt.subplots.
  LouvainSetting, outfolder, outpath : only used for the output file name
      f"{outpath}{outfolder}/Pruning{LouvainSetting}_Author{focus_author}.pdf".
  suptitle : figure title.

  Uses the module-level `font` for the legend text. Returns None; the figure
  is closed after saving.
  """
  size_scale = 200                 # area (points^2) of the most central node of G1
  legend_exponents = (-2, -3, -4)  # the legend shows centralities 10^-2, 10^-3, 10^-4

  fig, ax = plt.subplots(1, 2, figsize = figsize, dpi = dpi)

  # node sizes in BOTH panels are relative to the most central node of the first network
  _, reference_eigencentrality, _ = extract_node_info(G1)
  max_eigencentrality = max(reference_eigencentrality)

  for panel, G in zip(ax, [G1, G2]):
    node_gender, node_eigencentrality, _ = extract_node_info(G)
    pos = nx.spring_layout(G, seed=42)  # fixed seed -> reproducible layout

    nx.draw_networkx_nodes(
        G,
        pos = pos,
        node_color = [clrs_g_lst[0] if x == 'Female' else clrs_g_lst[1] if x == 'Male' else 'grey' for x in node_gender],
        node_size = [size_scale*(x/max_eigencentrality) for x in node_eigencentrality],
        alpha = 1,
        ax = panel
        )

    # redraw the focus author on top so that it can get a black outline
    focus_eigencentrality = G.nodes[focus_author]['Eigencentrality']
    focus_marker = nx.draw_networkx_nodes(
        G,
        pos = pos,
        nodelist = [focus_author],
        node_color = clrs_g_lst[0], # NOTE(review): hard-coded to the 'Female' colour, as in the original ("always female")
        node_size = size_scale*(focus_eigencentrality/max_eigencentrality),
        alpha = 1,
        linewidths = 1,
        ax = panel
        )
    focus_marker.set_edgecolor('black')

    nx.draw_networkx_edges(
        G,
        pos = pos,
        alpha = 0.1,
        edge_color = 'black',
        ax = panel)

    panel.axis('off')

  # Legend: marker sizes and labels are derived from the SAME exponents.
  # The previous code sized the markers with the literals 10e-02, 10e-03, 10e-04
  # (which are 0.1, 0.01, 0.001) while labelling them 10^-2, 10^-3, 10^-4.
  # Line2D markersize is a diameter whereas node_size is an area, hence the sqrt.
  lines = [
      Line2D([0], [0], linewidth=0, color='grey', marker='o',
             markersize = math.sqrt(size_scale*(float(f'1e{exponent}')/max_eigencentrality)))
      for exponent in legend_exponents
  ]
  labels = ['{0}$^{{{1:+03}}}$'.format(10, exponent) for exponent in legend_exponents]

  fig.subplots_adjust(left=0.15)  # make room for the legend on the left
  legend = fig.legend(lines, labels, title = "Centrality", loc='lower left', labelspacing = 0.5, # upper left before
                      frameon=False, handletextpad=0.1, prop=font, handlelength=1.8,
                      fontsize = 10)
  legend._legend_box.align = 'left'

  # panel captions
  fig.text(0.3, 0.05, 'Mar 2020', fontsize = 14)
  fig.text(0.65, 0.05, 'Jun 2021', fontsize = 14)
  fig.suptitle(f'{suptitle}')
  fig.savefig(f"{outpath}{outfolder}/Pruning{LouvainSetting}_Author{focus_author}.pdf")
  fig.clf()
  plt.close(fig)

In [None]:
''' READ FILES '''

' READ FILES '

In [None]:
# load data 
inPath1 = f"{path}/DATA/collaboration/network/main/"
node_columns = ["AuthorId", "eigencentrality_unweighted_scaled", "Gender"]

# keep only the columns we need, drop duplicate rows, then turn `month` into monthly periods
df_main = (
    pd.read_csv(f"{inPath1}plot_dataframe.csv")
    [["AuthorId", "month", "eigencentrality_unweighted_scaled", "Gender"]]
    .drop_duplicates()
    .assign(month = lambda frame: pd.DatetimeIndex(frame['month']).to_period('M'))
)

# the two monthly snapshots that get plotted
df_main_2020 = df_main.loc[df_main["month"] == "2020-03", node_columns]
df_main_2021 = df_main.loc[df_main["month"] == "2021-06", node_columns]

In [None]:
# Build the collaboration network (largest connected component) for each of the two snapshots.
# `inpath` ends with "/" because create_network joins it with the file name by plain concatenation.
# NOTE: `inpath` is a plain global that is reassigned by later cells.
inpath = f"{path}/DATA/collaboration/network/collaboration/"

filename = "preprints_2016-07_2021-07.csv" # edge list for the snapshot that includes June 2021
GCC_2021 = create_network(inpath, filename)

filename = "preprints_2015-04_2020-04.csv" # edge list for the snapshot that includes March 2020
GCC_2020 = create_network(inpath, filename)

In [None]:
def plot_author(GCC_2020, GCC_2021, partition_2020, partition_2021, FocusAuthorX, LouvainSetting, suptitle, outfolder):
  """Plot the community of one focus author in the 2020 and 2021 networks.

  Both networks are labelled with their partition, the focus author's community
  is extracted from each, and plot_fig_1 is called when
    * both communities have between 10 and 40 nodes, and
    * the focus author's neighbour count differs by at most 1 between them.
  Otherwise nothing is drawn and the function returns silently, so it can be
  called in a loop over many candidate authors.

  Parameters
  ----------
  GCC_2020, GCC_2021 : networks; node attributes are written onto them in place.
  partition_2020, partition_2021 : mappings node -> community id.
  FocusAuthorX : node id of the author to highlight.
  LouvainSetting : value forwarded to plot_fig_1 for the output file name.
  suptitle, outfolder : forwarded to plot_fig_1.

  Reads the module-level names df_main_2020, df_main_2021, clrs_g_lst, figsize,
  dpi and path. Returns None.
  """
  # label community
  GCC_2020_com = label_community(GCC_2020, partition_2020)
  GCC_2021_com = label_community(GCC_2021, partition_2021)
  # subset community
  GCC_2020_sub, GCC_2020_nodes = get_subgraph(GCC_2020_com, partition_2020, FocusAuthorX)
  GCC_2021_sub, GCC_2021_nodes = get_subgraph(GCC_2021_com, partition_2021, FocusAuthorX)

  # reasonable size (checked first, so the degree lookup below only runs for plausible communities)
  if not all(10 <= len(sub.nodes()) <= 40 for sub in (GCC_2020_sub, GCC_2021_sub)):
    return
  # almost matched degree
  degree_2020 = len(GCC_2020_sub[FocusAuthorX])
  degree_2021 = len(GCC_2021_sub[FocusAuthorX])
  if abs(degree_2020 - degree_2021) > 1:
    return

  # create node info dct
  node_info_2020 = get_node_dict(df_main_2020, GCC_2020_nodes, FocusAuthorX)
  node_info_2021 = get_node_dict(df_main_2021, GCC_2021_nodes, FocusAuthorX)
  # add node info
  GCC_2020_sub = add_node_info(GCC_2020_sub, node_info_2020)
  GCC_2020_sub = add_edge_info(GCC_2020_sub, FocusAuthorX)
  GCC_2021_sub = add_node_info(GCC_2021_sub, node_info_2021)
  GCC_2021_sub = add_edge_info(GCC_2021_sub, FocusAuthorX)
  # plot fig
  plot_fig_1(G1 = GCC_2020_sub,
             G2 = GCC_2021_sub,
             focus_author = FocusAuthorX,
             clrs_g_lst = clrs_g_lst,
             figsize = figsize,
             dpi = dpi,
             # use the argument, not the global loop variable `setting` of the calling cell
             LouvainSetting = LouvainSetting,
             suptitle = suptitle,
             outfolder = outfolder,
             outpath = f"{path}/DATA/collaboration/network/figs/final/")

In [None]:
### plot top 1% author ### 

In [None]:
# setup 
inpath = f"{path}/DATA/collaboration/network/community/"
# candidate authors of this case study (loaded here, but only `Author` below is plotted)
FocusAuthors = pd.read_csv(f"{path}/DATA/collaboration/network/case_study/2020-03_2021-06_top1pctDROP.csv")
FocusAuthors = list(FocusAuthors.AuthorId)
outname = ''  # sub-folder inside the figure directory; empty -> save directly into it
suptitle = "Top 1%"
settings = [0.00025]  # values interpolated into the partition pickle file names below
Author = 3013281036  # the single focus author that is plotted

# loop over stuff
for setting in settings: 

  # SECURITY: pickle.load can execute arbitrary code; only load partition files you created yourself.
  with open(f'{inpath}june21_res{setting}.pickle', 'rb') as handle:
    partition_2021 = pickle.load(handle)

  with open(f'{inpath}march20_res{setting}.pickle', 'rb') as handle:
    partition_2020 = pickle.load(handle)

  # to batch over every candidate instead, wrap the call below in `for Author in FocusAuthors:`
  plot_author(GCC_2020 = GCC_2020, 
              GCC_2021 = GCC_2021, 
              partition_2020 = partition_2020, 
              partition_2021 = partition_2021, 
              FocusAuthorX = Author,
              LouvainSetting = setting, # same value that selected the pickles above
              suptitle = suptitle,
              outfolder = outname)

findfont: Font family ['Arial'] not found. Falling back to DejaVu Sans.


In [None]:
# plot it 
import math
def plot_fig_2(G1, G2, focus_author, clrs_g_lst, figsize, dpi, LouvainSetting, suptitle, outfolder, outpath):
  """Draw the focus author's community at two points in time and save it as a PDF.

  G1 is drawn in the left panel and G2 in the right panel. Node area is
  proportional to the node's 'Eigencentrality', scaled so that the most central
  node of G2 (the second network) has area 300 in both panels. Nodes are
  coloured by 'Gender' and the focus author is redrawn on top with a black
  outline.

  Parameters
  ----------
  G1, G2 : graphs whose nodes carry 'Gender', 'Eigencentrality' and 'FocusNode'
      (as read by extract_node_info).
  focus_author : node id of the highlighted author; must be a node of both graphs.
  clrs_g_lst : two colours, index 0 for 'Female' and index 1 for 'Male';
      any other gender value is drawn grey.
  figsize, dpi : forwarded to plt.subplots.
  LouvainSetting, outfolder, outpath : only used for the output file name
      f"{outpath}{outfolder}/Pruning{LouvainSetting}_Author{focus_author}.pdf".
  suptitle : figure title.

  Uses the module-level `font` for the legend text. Returns None; the figure
  is closed after saving.
  """
  size_scale = 300                  # area (points^2) of the most central node of G2
  legend_exponents = (-8, -9, -10)  # the legend shows centralities 10^-8, 10^-9, 10^-10

  # Manual layout tweak: node 2973089958 is moved horizontally by -0.5 in the
  # left panel and +0.5 in the right panel. Skipped when the node is absent, so
  # the function also works for communities that do not contain it.
  shifted_node = 2973089958
  x_shift_per_panel = (-0.5, 0.5)

  fig, ax = plt.subplots(1, 2, figsize = figsize, dpi = dpi)

  # node sizes in BOTH panels are relative to the most central node of the second network
  _, reference_eigencentrality, _ = extract_node_info(G2)
  max_eigencentrality = max(reference_eigencentrality)

  for panel, G, x_shift in zip(ax, [G1, G2], x_shift_per_panel):
    node_gender, node_eigencentrality, _ = extract_node_info(G)
    pos = nx.spring_layout(G, seed=42)  # fixed seed -> reproducible layout

    # fix positions
    if shifted_node in pos:
      pos[shifted_node] += (x_shift, 0)

    nx.draw_networkx_nodes(
        G,
        pos = pos,
        node_color = [clrs_g_lst[0] if x == 'Female' else clrs_g_lst[1] if x == 'Male' else 'grey' for x in node_gender],
        node_size = [size_scale*(x/max_eigencentrality) for x in node_eigencentrality],
        alpha = 1,
        ax = panel
        )

    # redraw the focus author on top so that it can get a black outline
    focus_eigencentrality = G.nodes[focus_author]['Eigencentrality']
    focus_marker = nx.draw_networkx_nodes(
        G,
        pos = pos,
        nodelist = [focus_author],
        node_color = clrs_g_lst[0], # NOTE(review): hard-coded to the 'Female' colour, as in the original ("always female")
        node_size = size_scale*(focus_eigencentrality/max_eigencentrality),
        alpha = 1,
        linewidths = 1,
        ax = panel
        )
    focus_marker.set_edgecolor('black')

    nx.draw_networkx_edges(
        G,
        pos = pos,
        alpha = 0.1,
        edge_color = 'black',
        ax = panel)

    panel.axis('off')

  # Legend: marker sizes and labels are derived from the SAME exponents.
  # The previous code sized the markers with the literals 10e-8, 10e-9, 10e-10
  # (which are 1e-7, 1e-8, 1e-9) while labelling them 10^-8, 10^-9, 10^-10.
  # Line2D markersize is a diameter whereas node_size is an area, hence the sqrt.
  lines = [
      Line2D([0], [0], linewidth=0, color='grey', marker='o',
             markersize = math.sqrt(size_scale*(float(f'1e{exponent}')/max_eigencentrality)))
      for exponent in legend_exponents
  ]
  labels = ['{0}$^{{{1:+03}}}$'.format(10, exponent) for exponent in legend_exponents]

  fig.subplots_adjust(left=0.15)  # make room for the legend on the left
  legend = fig.legend(lines, labels, title = "Centrality", loc='lower left', labelspacing = 0.5,
                      frameon=False, handletextpad=0.1, prop=font, handlelength=1.8,
                      fontsize = 10)
  legend._legend_box.align = 'left'

  # panel captions
  fig.text(0.3, 0.05, 'Mar 2020', fontsize = 14)
  fig.text(0.65, 0.05, 'Jun 2021', fontsize = 14)

  fig.suptitle(f'{suptitle}')
  fig.savefig(f"{outpath}{outfolder}/Pruning{LouvainSetting}_Author{focus_author}.pdf")
  fig.clf()
  plt.close(fig)

In [None]:
def plot_author_2(GCC_2020, GCC_2021, partition_2020, partition_2021, FocusAuthorX, LouvainSetting, suptitle, outfolder):
  """Plot the community of one focus author in the 2020 and 2021 networks via plot_fig_2.

  Both networks are labelled with their partition, the focus author's community
  is extracted from each, and plot_fig_2 is called when
    * both communities have between 10 and 40 nodes, and
    * the focus author's neighbour count differs by at most 1 between them.
  Otherwise nothing is drawn and the function returns silently, so it can be
  called in a loop over many candidate authors.

  Parameters
  ----------
  GCC_2020, GCC_2021 : networks; node attributes are written onto them in place.
  partition_2020, partition_2021 : mappings node -> community id.
  FocusAuthorX : node id of the author to highlight.
  LouvainSetting : value forwarded to plot_fig_2 for the output file name.
  suptitle, outfolder : forwarded to plot_fig_2.

  Reads the module-level names df_main_2020, df_main_2021, clrs_g_lst, figsize,
  dpi and path. Returns None.
  """
  # label community
  GCC_2020_com = label_community(GCC_2020, partition_2020)
  GCC_2021_com = label_community(GCC_2021, partition_2021)
  # subset community
  GCC_2020_sub, GCC_2020_nodes = get_subgraph(GCC_2020_com, partition_2020, FocusAuthorX)
  GCC_2021_sub, GCC_2021_nodes = get_subgraph(GCC_2021_com, partition_2021, FocusAuthorX)

  # reasonable size (checked first, so the degree lookup below only runs for plausible communities)
  if not all(10 <= len(sub.nodes()) <= 40 for sub in (GCC_2020_sub, GCC_2021_sub)):
    return
  # almost matched degree
  degree_2020 = len(GCC_2020_sub[FocusAuthorX])
  degree_2021 = len(GCC_2021_sub[FocusAuthorX])
  if abs(degree_2020 - degree_2021) > 1:
    return

  # create node info dct
  node_info_2020 = get_node_dict(df_main_2020, GCC_2020_nodes, FocusAuthorX)
  node_info_2021 = get_node_dict(df_main_2021, GCC_2021_nodes, FocusAuthorX)
  # add node info
  GCC_2020_sub = add_node_info(GCC_2020_sub, node_info_2020)
  GCC_2020_sub = add_edge_info(GCC_2020_sub, FocusAuthorX)
  GCC_2021_sub = add_node_info(GCC_2021_sub, node_info_2021)
  GCC_2021_sub = add_edge_info(GCC_2021_sub, FocusAuthorX)
  # plot fig
  plot_fig_2(G1 = GCC_2020_sub,
             G2 = GCC_2021_sub,
             focus_author = FocusAuthorX,
             clrs_g_lst = clrs_g_lst,
             figsize = figsize,
             dpi = dpi,
             # use the argument, not the global loop variable `setting` of the calling cell
             LouvainSetting = LouvainSetting,
             suptitle = suptitle,
             outfolder = outfolder,
             outpath = f"{path}/DATA/collaboration/network/figs/final/")

In [None]:
# setup 
inpath = f"{path}/DATA/collaboration/network/community/"
# candidate authors of this case study (loaded here, but only `Author` below is plotted)
# NOTE(review): the file name says "50pctINCREASE" while the title below says "Top 50%" - confirm the wording
FocusAuthors = pd.read_csv(f"{path}/DATA/collaboration/network/case_study/2020-03_2021-06_50pctINCREASE.csv")
FocusAuthors = list(FocusAuthors.AuthorId)
outname = ''  # sub-folder inside the figure directory; empty -> save directly into it
suptitle = "Top 50%"
settings = [0.0001]  # values interpolated into the partition pickle file names below
Author = 2809584808  # the single focus author that is plotted

# loop over stuff
for setting in settings: 

  # SECURITY: pickle.load can execute arbitrary code; only load partition files you created yourself.
  with open(f'{inpath}june21_res{setting}.pickle', 'rb') as handle:
    partition_2021 = pickle.load(handle)

  with open(f'{inpath}march20_res{setting}.pickle', 'rb') as handle:
    partition_2020 = pickle.load(handle)

  # to batch over every candidate instead, wrap the call below in `for Author in FocusAuthors:`
  plot_author_2(GCC_2020 = GCC_2020, 
              GCC_2021 = GCC_2021, 
              partition_2020 = partition_2020, 
              partition_2021 = partition_2021, 
              FocusAuthorX = Author,
              LouvainSetting = setting, # same value that selected the pickles above
              suptitle = suptitle,
              outfolder = outname)

{2344414499: array([0.03088105, 0.29552184]), 2973089958: array([ 1.        , -0.34594588]), 2949170919: array([-0.32173053, -0.13824248]), 2809584808: array([-0.30562141,  0.21150602]), 2120071689: array([0.16251828, 0.13818228]), 2060472055: array([-0.15444895,  0.28369027]), 2244998768: array([ 0.2419625 , -0.08543341]), 2226095541: array([-0.16065236, -0.23058766]), 2746311221: array([-0.16860708, -0.02173687]), 2408461142: array([ 0.0452565 , -0.05286509]), 2950238521: array([ 0.01812096, -0.23212239]), 3088118714: array([-0.35200882,  0.04909736]), 2683217342: array([-0.03567015,  0.128936  ])}
{2344414499: array([0.04397401, 0.53977111]), 2769798590: array([ 0.29250439, -0.747398  ]), 2973089958: array([-1.        , -0.76329491]), 2949170919: array([-0.23544551,  0.11483285]), 2809584808: array([0.03300918, 0.28665907]), 2120071689: array([ 0.04498282, -0.12793956]), 2060472055: array([0.38071946, 0.33484698]), 2244998768: array([-0.1690439 , -0.17414466]), 2226095541: array([0.