In [None]:
'''
VMP 2022-02-24: used in final report
'''

In [None]:
# check RAM: report the memory of this runtime in decimal gigabytes (bytes / 1e9).
# NOTE: `.total` is the machine's total memory, not the currently free memory,
# even though the printed message says "available".
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

Your runtime has 54.8 gigabytes of available RAM



In [None]:
# overall path to the project; the input and output folders used below are built from it.
# NOTE(review): "path/to/base" looks like a placeholder - set it to the real project folder before running.
path = "path/to/base"

In [None]:
# basic setup: mount Google Drive at /content/gdrive (only works inside Google Colab)
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# imports
from community import community_louvain
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd 
import pickle

In [None]:
# paths specific to the global dataset, both located under `path`:
#   inPath1 - folder the main network dataframe is read from
#   outpath - folder the case-study samples are written to
# both end with "/" because file names are appended by plain string concatenation.
inPath1 = f"{path}/DATA/collaboration/network/main/"
outpath = f"{path}/DATA/collaboration/network/case_study/"

In [None]:
# load the plot dataframe, convert `month` to monthly periods, drop authors
# whose gender is "Undefined" and keep one row per unique combination of the
# columns needed for sampling.
df_main = (
    pd.read_csv(f"{inPath1}plot_dataframe.csv")
      .assign(month = lambda frame: pd.DatetimeIndex(frame['month']).to_period('M'))
      .loc[lambda frame: frame["Gender"] != "Undefined",
           ["AuthorId",
            "Gender",
            "NormalizedName",
            "month",
            "eigencentrality_unweighted_scaled",
            "degree_unweighted"]]
      .drop_duplicates()
)

In [None]:
''' create function '''

' create function '

In [None]:
def get_sample(df_main, periods, percentiles, labels, top_bottom, outname,
               n_sample = 50, random_state = None, out_dir = None):
  '''
  Sample female authors from one eigencentrality band whose eigencentrality
  changed by a given factor between two months while their degree stayed the
  same, and write the sample to a csv file.

  Parameters
  ----------
  df_main : dataframe with the columns AuthorId, Gender, NormalizedName, month,
      eigencentrality_unweighted_scaled and degree_unweighted.
  periods : (period_1, period_2), the earlier and the later month. Each value is
      compared with `==` against the `month` column and is also used in the
      output file name.
  percentiles : quantile edges handed to pd.qcut for the period_1
      eigencentrality (e.g. [0, .985, .995, 1]).
  labels : one label per quantile bin; only the bin labelled "target" is kept.
  top_bottom : (lower, higher), inclusive bounds on the ratio
      eigencentrality(period_2) / eigencentrality(period_1).
  outname : suffix of the output file name.
  n_sample : number of authors to draw (default 50).
  random_state : forwarded to DataFrame.sample; pass an int to make the written
      sample reproducible (default None = a different sample on every run).
  out_dir : prefix the file name is appended to; defaults to the notebook-level
      `outpath`. It is concatenated as-is, so it has to end with a path separator.

  Returns
  -------
  None. Writes "{out_dir}{period_1}_{period_2}_{outname}.csv".

  Raises
  ------
  ValueError : if fewer than n_sample authors pass the filters. Nothing is
      written in that case.
  '''
  # unpack
  lower, higher = top_bottom
  period_1, period_2 = periods
  columns = ["AuthorId",
             "eigencentrality_unweighted_scaled",
             "Gender",
             "NormalizedName",
             "degree_unweighted"]

  # two time-periods; the later one gets "_new" names so both values survive the merge
  df_first = df_main.loc[df_main["month"] == period_1, columns].drop_duplicates()
  df_second = (df_main.loc[df_main["month"] == period_2, columns]
               .rename(columns = {'eigencentrality_unweighted_scaled': 'eigencentrality_unweighted_scaled_new',
                                  'degree_unweighted': 'degree_unweighted_new'})
               .drop_duplicates())

  # women inside the "target" quantile band of period_1
  # (assign instead of column assignment: avoids writing into a filtered slice)
  df_first = df_first.assign(percentile = pd.qcut(df_first["eigencentrality_unweighted_scaled"],
                                                  percentiles,
                                                  labels = labels))
  df_target_female = df_first[(df_first["percentile"] == "target") &
                              (df_first["Gender"] == "Female")]

  # author has to appear in both periods.
  # Gender / NormalizedName exist on both sides, so they come out with _x / _y suffixes.
  df_both = df_target_female.merge(df_second, how = "inner", on = "AuthorId")

  # increase or decrease (ratio new / old)
  df_both = df_both.assign(eigencentrality_improvement = lambda x: x.eigencentrality_unweighted_scaled_new / x.eigencentrality_unweighted_scaled)

  # eigencentrality fall or improve & degree stays the same.
  # the degree check compares period_1 against period_2 (the "_new" column);
  # comparing the column with itself would keep every row.
  in_range = df_both["eigencentrality_improvement"].between(lower, higher)
  same_degree = df_both["degree_unweighted"] == df_both["degree_unweighted_new"]
  df_filter = df_both[in_range & same_degree]

  # stop with a clear message instead of letting .sample() fail
  if len(df_filter) < n_sample:
    raise ValueError(f"rethink parameters: only {len(df_filter)} authors pass the filters, {n_sample} needed")

  df_sample = df_filter.sample(n = n_sample, random_state = random_state)

  # write file
  if out_dir is None:
    out_dir = outpath
  df_sample.to_csv(f"{out_dir}{period_1}_{period_2}_{outname}.csv", index = False)

In [None]:
# female authors in the 0.985-0.995 eigencentrality quantile band of 2020-03
# whose eigencentrality in 2021-06 is between 10% and 50% of its 2020-03 value.
get_sample(df_main = df_main, 
           periods = ("2020-03", "2021-06"),
           percentiles = [0, .985, .995, 1],
           labels = ["low", "target", "high"],
           top_bottom = (0.1, 0.5), # new/old ratio between 0.1 and 0.5, i.e. a drop by a factor of 2 to 10
           outname = 'top1pctDROP')

In [None]:
# female authors in the 0.50-0.51 eigencentrality quantile band of 2020-03
# whose eigencentrality in 2021-06 is between 2 and 10 times its 2020-03 value.
get_sample(df_main = df_main, 
           periods = ("2020-03", "2021-06"),
           percentiles = [0, .5, .51, 1],
           labels = ['low', 'target', 'high'],
           top_bottom = (2, 10), # new/old ratio between 2 and 10, i.e. an increase by a factor of 2 to 10
           outname = '50pctINCREASE')