In [None]:
'''
VMP 2022-02-24: used in final paper
'''

In [None]:
# Base directory of the project. The data/figure paths used further down are
# built from it as f"{path}/DATA/...".
# NOTE(review): "path/to/base" looks like a placeholder -- set it to the real
# project root before running.
path = "path/to/base"

In [None]:
%matplotlib inline

In [None]:
# check RAM
# NOTE: the value reported is `virtual_memory().total` divided by 1e9 (decimal
# gigabytes), i.e. the `total` field rather than the `available` field, even
# though the message text says "available".
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

Your runtime has 27.3 gigabytes of available RAM



In [None]:
# basic setup
# Mount Google Drive at /content/gdrive. `google.colab` is only importable
# inside a Colab runtime, so this cell fails elsewhere.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from matplotlib.lines import Line2D
import numpy as np
from datetime import datetime
import matplotlib
from matplotlib.ticker import (MultipleLocator, AutoMinorLocator)
from matplotlib.colors import ListedColormap
from matplotlib.ticker import ScalarFormatter

import seaborn as sns
import math 
pd.options.mode.chained_assignment = None  # default='warn'

In [None]:
''' overall setup '''

' overall setup '

In [None]:
# Shared styling for all figures below: per-gender markers/colours, marker
# size, seaborn theme, figure size/dpi and the legend font.

# marker per gender
mks_g_dct = {'Female': 'o',
             'Male': '^'}

# colour per gender
clrs_g_dct = {'Female': '#fc8d62',
              'Male': '#66c2a5'}

# the same colours as an ordered list [Female, Male]; derived from the dict so
# the two can never drift apart
clrs_g_lst = [clrs_g_dct['Female'], clrs_g_dct['Male']]

# marker size
ms = 2

# overall seaborn style for the figures (has to be a bit smaller than productivity)
sns.set(style='ticks', font_scale=1.3, font='Arial')

# figure size / resolution for plots with three panels
figsize = (4.5*3, 3.2)
dpi = 300

# font properties handed to the legends via `prop=font`
import matplotlib.font_manager as font_manager
font = font_manager.FontProperties(family='Arial')

#plt.rcParams['font.family'] = 'sans-serif'
#plt.rcParams['font.sans-serif'] = ['Arial', 'sans-serif']


In [None]:
''' eigencentrality unweighted normalized by FOS '''

' eigencentrality unweighted normalized by FOS '

In [None]:
# Input directory (the cell below reads plot_dataframe.csv from it) and output
# directory for the generated PDFs. Both end with "/" because file names are
# appended directly, e.g. f"{inPath1}plot_dataframe.csv".
inPath1 = f"{path}/DATA/collaboration/network_SI/main/"
outpath = f"{path}/DATA/collaboration/network_SI/figs/final/"

In [None]:
# Read the plotting table and reduce it to what the figures need, in one chain.
df_main = (
    pd.read_csv(f"{inPath1}plot_dataframe.csv")
    # keep only the required columns, one row per distinct combination
    .loc[:, ["AuthorId", "month", "NormalizedName", "eigencentrality_unweighted_scaled", "Gender", "binned"]]
    .drop_duplicates()
    # drop rows whose Gender is "Undefined"
    .loc[lambda d: d["Gender"] != "Undefined"]
    # convert the month column to monthly periods
    .assign(month=lambda d: pd.DatetimeIndex(d["month"]).to_period('M'))
)

In [None]:
''' first plot type (percent in top X percent relative to expected) '''
# here we show both absolute representation of females in top X percent &
# the gap between representation and expected representation 

' first plot type (percent in top X percent relative to expected) '

In [None]:
# function to get the plot (new thing) # 
def plot_baseline(df_main, metric, percents, aggregation, suptitle, limits, clrs_g_dct, figsize, dpi,outpath, outname): 
  """Plot the observed vs. expected 'Female' share among top-ranked rows.

  One panel is drawn per entry of ``percents``. For every month, rows are
  percentile-ranked on ``metric`` within each ``NormalizedName`` group
  (presumably the field of study -- confirm against the data) and only rows
  ranked above ``percent`` are kept. The gender share inside that slice
  ('percent') is compared with the gender share among all rows of the month
  ('expected').

  Parameters
  ----------
  df_main : DataFrame with 'month', 'Gender', 'NormalizedName' and ``metric`` columns.
  metric : name of the column that is ranked.
  percents : percentile cutoffs; each yields a panel titled ``top {100-percent}%``.
  aggregation : name of the computed column drawn as the solid line (e.g. 'percent').
  suptitle : accepted for interface compatibility; no figure title is drawn.
  limits : ``(lower, upper)`` y-axis limits applied to every panel.
  clrs_g_dct : colour per gender; only the 'Female' entry is used.
  figsize, dpi : forwarded to ``plt.subplots``.
  outpath, outname : the figure is saved as ``f"{outpath}{outname}.pdf"``.

  Relies on the module-level ``font`` for the legend. Returns None; the figure
  is saved and then closed.
  """
  # squeeze=False always yields a 2-D array of Axes, so a single cutoff works too
  fig, axes = plt.subplots(1, len(percents), figsize=figsize, dpi=dpi,
                           constrained_layout=True, squeeze=False)
  axes = axes.ravel()
  lower, upper = limits
  female_clr = clrs_g_dct.get('Female')

  for i, percent in enumerate(percents):
    ax = axes[i]
    monthly = []
    for month in df_main['month'].unique():
      month_df = df_main[df_main['month'] == month]
      # rows per gender in this month (basis of the expected share)
      totalN = month_df.groupby('Gender').size().to_frame('totalN').reset_index()
      # percentile rank within each NormalizedName group; keep the top slice
      pct_rank = month_df.groupby('NormalizedName')[metric].rank(pct=True, method='first').mul(100)
      counts = month_df[pct_rank > percent].groupby('Gender').size().to_frame('count').reset_index()
      # merge from the totals (left join) so a gender that is absent from the
      # top slice is kept with count 0 instead of silently disappearing and
      # distorting the expected shares
      shares = totalN.merge(counts, how='left', on='Gender')
      shares['count'] = shares['count'].fillna(0)
      shares = shares.assign(percent=lambda x: x['count'] / x['count'].sum(),
                             expected=lambda x: x['totalN'] / x['totalN'].sum())
      shares['month'] = month
      monthly.append(shares)

    # unique() yields months in order of appearance; sort so the plotted series
    # are chronological regardless of the row order of df_main
    df_eigen = pd.concat(monthly).sort_values('month')
    workable = (df_eigen.pivot_table(values=aggregation, index='month', columns='Gender')
                .rename_axis(None, axis=1)
                .reset_index())
    workable = workable[['month', 'Female']]
    df_expected = df_eigen.loc[df_eigen['Gender'] == 'Female', ['expected', 'month']]

    # solid line: observed share, dashed line: expected share
    workable.plot('month', 'Female', ax=ax, color=female_clr, rot=40)
    df_expected.plot('month', 'expected', ax=ax, color=female_clr, ls='--', rot=40)

    ax.set_ylim(lower, upper)
    ax.set_title(f'top {100-percent}%')
    # vertical reference line at 2020-03
    ax.axvline(x='2020-03', color='grey', ls='--')
    ax.tick_params(axis='x', which='minor', labelsize=12)
    if i == 0:
      # custom legend and y-label on the first panel only
      handles = [Line2D([0], [0], color=female_clr),
                 Line2D([0], [0], color=female_clr, linestyle='--')]
      ax.legend(handles, ['Observed', 'Expected'], prop=font, frameon=False)
      ax.set_ylabel('Centrality ranking')
    else:
      # drop the legend that the pandas plot calls created automatically
      ax.get_legend().remove()
    ax.set_xlabel('')
    ax.yaxis.set_major_locator(plt.LinearLocator(numticks=3))
    ax.yaxis.set_minor_locator(AutoMinorLocator(5))

  fig.savefig(f"{outpath}{outname}.pdf")
  fig.clf()
  plt.close(fig)

In [None]:
# generate the plot (new edition) # 
# Writes f"{outpath}SI_baseline.pdf" with three panels titled "top 1%",
# "top 10%" and "top 50%" (plot_baseline titles each panel 100 - percent).
# NOTE: plot_baseline accepts `suptitle` but never draws it, so the title text
# passed here does not appear in the figure.
plot_baseline(df_main = df_main, 
              metric = 'eigencentrality_unweighted_scaled',
              percents = [99, 90, 50], 
              aggregation = 'percent', 
              suptitle = 'Eigenvector-centrality$_{{{u}}}$: Scientific fields represented proportionally', 
              limits = (.1, .3),
              clrs_g_dct = clrs_g_dct,
              figsize = figsize,
              dpi = dpi,
              outpath = outpath,
              outname = f'SI_baseline') # new naming

In [None]:
''' second plot - not normalized '''

' second plot - not normalized '

In [None]:
# Maps a metric column name to a mathtext display label; passed to
# plot_absolute as `metric_dct`.
metric_dct = {
    'eigencentrality_unweighted_scaled': 'Eigen$_{{{u}}}$',
}
# NOTE(review): `mticker` is not referenced in the visible part of the notebook.
import matplotlib.ticker as mticker


In [None]:
def plot_absolute(df_main, 
                  metric_dct,
                  metric_lst,
                  aggregation_lst,
                  suptitle,
                  clrs_g_dct, 
                  figsize, 
                  dpi, 
                  ms,
                  mks_g_dct,
                  outpath, 
                  outname,
                  ylims=None): 
  """Plot a per-gender quantile of a metric over time, one panel per quantile.

  For each ``(metric, aggregation)`` pair the ``aggregation`` quantile of
  ``metric`` is computed per month and gender (``interpolation='lower'``) and
  drawn as one line per gender ('Female', 'Male').

  Parameters
  ----------
  df_main : DataFrame with 'month', 'Gender' and the metric columns.
  metric_dct : accepted for interface compatibility; currently not used.
  metric_lst : metric column name for each panel.
  aggregation_lst : quantile (0..1) for each panel; the panel is titled
      ``top {round((1 - q) * 100)}%``.
  suptitle : accepted for interface compatibility; no figure title is drawn.
  clrs_g_dct, mks_g_dct : colour / marker per gender.
  figsize, dpi : forwarded to ``plt.subplots``.
  ms : marker size.
  outpath, outname : the figure is saved as ``f"{outpath}{outname}.pdf"``.
  ylims : optional sequence of ``(bottom, top)`` y-limits, one per panel.
      Defaults to the limits that were previously hard-coded for panels 0-2;
      panels without an entry keep matplotlib's automatic limits.

  The legend and y-label are shown on the first panel only. Relies on the
  module-level ``font`` for the legend. Returns None; the figure is saved and
  then closed.
  """
  if ylims is None:
    # limits originally hard-coded for the first three panels
    ylims = [(0, 1.0e-3), (0, 3.0e-5), (0, 2.0e-7)]

  # squeeze=False always yields a 2-D array of Axes, so a single panel works too
  fig, axes = plt.subplots(1, len(aggregation_lst), figsize=figsize, dpi=dpi,
                           constrained_layout=True, squeeze=False)
  axes = axes.ravel()

  class ScalarFormatterForceFormat(ScalarFormatter):
    def _set_format(self):  # Override function that finds format to use.
      self.format = "%1.1f"  # Give format here

  gender_lst = ['Female', 'Male']

  for i, (metric, aggregation) in enumerate(zip(metric_lst, aggregation_lst)):
    ax = axes[i]
    df_ready = df_main.groupby(['month', 'Gender'])[metric].quantile(aggregation, interpolation='lower').reset_index()

    for gender_x in gender_lst:
      df_specific = df_ready[df_ready['Gender'] == gender_x]
      # rot must be numeric: newer matplotlib versions reject strings such as '40'
      df_specific.plot('month', metric, ax=ax, color=clrs_g_dct.get(gender_x), label=f"{gender_x}",
                       ms=ms, marker=mks_g_dct.get(gender_x), rot=40)

    # vertical reference line at 2020-03
    ax.axvline(x='2020-03', color='grey', lw=1, ls='--')
    aggregation_label = int(round((1 - aggregation)*100, 0))
    ax.set_title(f"top {aggregation_label}%")
    ax.tick_params(axis='x', which='minor', labelsize=12)
    ax.set_xlabel('')

    if i == 0:
      # custom legend (plain lines, no markers) and y-label on the first panel only
      handles = [Line2D([0], [0], color=clrs_g_dct.get(g)) for g in gender_lst]
      ax.legend(handles, gender_lst, frameon=False, prop=font)
      ax.set_ylabel('Centrality')
    else:
      # drop the legend that the pandas plot calls created automatically
      ax.get_legend().remove()

    if i < len(ylims):
      ax.set_ylim(ylims[i])

    ## basic setup of y ticks
    ax.yaxis.set_major_locator(plt.LinearLocator(numticks=6))
    ax.yaxis.set_minor_locator(AutoMinorLocator(6))

    ## scaling y ticks: fixed "%1.1f" mantissa with a shared power-of-ten offset
    yfmt = ScalarFormatterForceFormat()
    yfmt.set_powerlimits((0, 0))
    ax.yaxis.set_major_formatter(yfmt)
    ax.ticklabel_format(axis='y', useMathText=True)

  fig.savefig(f"{outpath}{outname}.pdf")
  fig.clf()
  plt.close(fig)


In [None]:
# run it 
# Writes f"{outpath}SI_absolute.pdf": the 0.99, 0.9 and 0.5 quantiles of
# 'eigencentrality_unweighted_scaled' per month and gender, in panels titled
# "top 1%", "top 10%" and "top 50%".
# NOTE: plot_absolute accepts `suptitle` but never draws it.
plot_absolute(df_main = df_main, 
              metric_dct = metric_dct,
              metric_lst = ['eigencentrality_unweighted_scaled'] * 3,
              aggregation_lst = [0.99, 0.9, 0.5],
              suptitle = 'Eigenvector-centrality$_{{{u}}}$: Absolute values',
              clrs_g_dct = clrs_g_dct, 
              figsize = figsize, 
              dpi = dpi, 
              ms = ms,
              mks_g_dct = mks_g_dct,
              outpath = outpath,
              outname = f'SI_absolute')

In [None]:
''' plot for SI '''

' plot for SI '

In [None]:
def plot_gender(df_main, metric, percents, aggregation, suptitle, limits, clrs_g_lst, figsize, dpi, outpath, outname): 
  """Plot each gender's relative over-/under-representation among top-ranked rows.

  One panel is drawn per entry of ``percents``. For every month, rows are
  percentile-ranked on ``metric`` within each ``NormalizedName`` group
  (presumably the field of study -- confirm against the data) and only rows
  ranked above ``percent`` are kept. For each gender the plotted value is
  ``(percent - expected) / expected``, where 'percent' is the gender's share
  inside the top slice and 'expected' its share among all rows of the month.

  Parameters
  ----------
  df_main : DataFrame with 'month', 'Gender', 'NormalizedName' and ``metric`` columns.
  metric : name of the column that is ranked.
  percents : percentile cutoffs; each yields a panel titled ``top {100-percent}%``.
  aggregation : accepted for interface compatibility; the 'difference' column
      is always the one plotted.
  suptitle : accepted for interface compatibility; no figure title is drawn.
  limits : ``(lower, upper)`` y-axis limits applied to every panel.
  clrs_g_lst : colours in the order [Female, Male].
  figsize, dpi : forwarded to ``plt.subplots``.
  outpath, outname : the figure is saved as ``f"{outpath}{outname}.pdf"``.

  Relies on the module-level ``font`` for the legend. Returns None; the figure
  is saved and then closed.
  """
  # squeeze=False always yields a 2-D array of Axes, so a single cutoff works too
  fig, axes = plt.subplots(1, len(percents), figsize=figsize, dpi=dpi,
                           constrained_layout=True, squeeze=False)
  axes = axes.ravel()
  lower, upper = limits

  for i, percent in enumerate(percents):
    ax = axes[i]
    monthly = []
    for month in df_main['month'].unique():
      month_df = df_main[df_main['month'] == month]
      # rows per gender in this month (basis of the expected share)
      totalN = month_df.groupby('Gender').size().to_frame('totalN').reset_index()
      # percentile rank within each NormalizedName group; keep the top slice
      pct_rank = month_df.groupby('NormalizedName')[metric].rank(pct=True, method='first').mul(100)
      counts = month_df[pct_rank > percent].groupby('Gender').size().to_frame('count').reset_index()
      # merge from the totals (left join) so a gender that is absent from the
      # top slice is kept with count 0 instead of silently disappearing
      shares = totalN.merge(counts, how='left', on='Gender')
      shares['count'] = shares['count'].fillna(0)
      shares = shares.assign(percent=lambda x: x['count'] / x['count'].sum(),
                             expected=lambda x: x['totalN'] / x['totalN'].sum(),
                             difference=lambda x: -((x["expected"] - x["percent"])/x["expected"]))
      shares['month'] = month
      monthly.append(shares)

    # one row per month, one column per gender (pivot_table sorts by month)
    df_eigen = pd.concat(monthly)
    workable = (df_eigen.pivot_table(values='difference', index='month', columns='Gender')
                .rename_axis(None, axis=1)
                .reset_index())
    workable = workable[['month', 'Female', 'Male']]  # column order matches clrs_g_lst

    # overlaid (not stacked) areas; rot must be numeric, newer matplotlib
    # versions reject strings such as '40'
    workable.plot.area(x='month', stacked=False, ax=ax, color=clrs_g_lst, alpha=0.7, rot=40)

    ax.set_ylim(lower, upper)
    ax.set_title(f'top {100-percent}%')
    # vertical reference line at 2020-03
    ax.axvline(x='2020-03', color='lightgrey', lw=1, ls='--')
    ax.tick_params(axis='x', which='minor', labelsize=12)
    if i == 0:
      # custom legend and y-label on the first panel only
      handles = [Line2D([0], [0], color=clrs_g_lst[0]),
                 Line2D([0], [0], color=clrs_g_lst[1])]
      ax.legend(handles, ['Female', 'Male'], loc='lower left', prop=font)
      ax.set_ylabel('Centrality ranking')
    else:
      # drop the legend that the pandas plot call created automatically
      ax.get_legend().remove()
    ax.set_xlabel('')
    ax.yaxis.set_major_locator(plt.LinearLocator(numticks=7))
    ax.yaxis.set_minor_locator(AutoMinorLocator(7))

  fig.savefig(f"{outpath}{outname}.pdf")
  fig.clf()
  plt.close(fig)


In [None]:
# generate the plot # 
# Writes f"{outpath}SI_gender.pdf" with three panels titled "top 1%",
# "top 10%" and "top 50%".
# NOTE: plot_gender always plots its computed 'difference' column, so the
# `aggregation` value passed here has no effect; `suptitle` is not drawn either.
plot_gender(df_main = df_main, 
            metric = 'eigencentrality_unweighted_scaled',
            percents = [99, 90, 50], 
            aggregation = 'percent', 
            suptitle = 'Eigenvector-centrality$_{{{u}}}$: Scientific fields represented proportionally', 
            limits = (-0.4, 0.2), 
            clrs_g_lst = clrs_g_lst,
            figsize = figsize,
            dpi = dpi,
            outpath = outpath,
            outname = f'SI_gender')