In [None]:
''' VMP 2022-03-02: used in SI. 
Now uses one overall path '''

In [None]:
# overall path to the project
# All data locations used further down (inPath / outPath) are built from this
# value with f-strings, so it must NOT end with a trailing slash.
# NOTE(review): "path/to/base" looks like a placeholder - set it to the real
# project root before running.
path = "path/to/base"

In [None]:
# check RAM
# Reports memory in decimal gigabytes (bytes / 1e9).
# NOTE(review): the value read is `.total`, while the message says "available
# RAM" - presumably this is the machine's total memory rather than what is
# currently free; confirm before relying on the number.
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

Your runtime has 54.8 gigabytes of available RAM



In [None]:
# basic setup
# Mounts Google Drive under /content/gdrive.
# NOTE(review): `google.colab` is presumably only importable inside a Colab
# runtime, so this cell has to be skipped/replaced when running elsewhere.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import pandas as pd
import numpy as np
import math 
import re 
from tqdm import tqdm
from os import listdir
from os.path import isfile, join
pd.options.mode.chained_assignment = None  # default='warn'

In [None]:
''' check network size (relative) '''

' check network size (relative) '

In [None]:
### paper centric ### 

In [None]:
# slice the meta paper - much faster like this. 
def get_papers(paaar, paper_GCC, start_date, end_date):
  '''
  Select the rows of `paaar` that fall in one time window and join them
  with the per-author table `paper_GCC`.

  Parameters
  ----------
  paaar : pd.DataFrame
    PaperAuthorAffiliationsAttributesRepo [PaperId, AuthorId, Date] & distinct.
    Only the columns 'Date' and 'AuthorId' are used here.
  paper_GCC : pd.DataFrame
    Table with an 'AuthorId' column (in this notebook: one metrics file).
  start_date : lower bound of the window, inclusive (compared with `>=`).
  end_date : upper bound of the window, exclusive (compared with `<`).
    Both bounds are compared directly against paaar['Date'], so they must be
    comparable with that column's values.

  Returns
  -------
  pd.DataFrame
    The windowed rows of `paaar` plus a constant 'start_date' column,
    inner-joined with `paper_GCC` on 'AuthorId'. `paaar` itself is not modified.
  '''
  in_window = (paaar["Date"] >= start_date) & (paaar["Date"] < end_date)
  paper_slice = paaar[in_window]

  # sanity check: shows the first and last date that actually made it into the window
  print(f"inclusive: from {paper_slice.Date.min()} to {paper_slice.Date.max()}")

  # .assign returns a new frame, so the column is never written onto a slice
  # of the caller's data (no reliance on chained-assignment warnings being off).
  paper_slice = paper_slice.assign(start_date = start_date)

  # inner join: keeps only rows whose AuthorId is present in paper_GCC
  paper_GCC_ready = paper_slice.merge(paper_GCC, how = 'inner', on = 'AuthorId')

  return paper_GCC_ready

In [None]:
# binning & grouping by author-level. 
def prepare_paper_df(df_paper, start_date, grouping): 
  '''
  Collapse a paper-level frame to one row per author and bin authors by age.

  Parameters
  ----------
  df_paper : pd.DataFrame
    Must contain 'AuthorId' and every column named in `grouping`.
    It is only read, never modified.
  start_date : value stored in the constant 'start_date' column of the result.
  grouping : sequence of str
    Columns to average per author. Must include 'ScientificAge', because the
    age bin is computed from the per-author mean of that column.

  Returns
  -------
  pd.DataFrame
    Columns: 'AuthorId', 'binned', the `grouping` columns (per-author means)
    and 'start_date'.
  '''
  # accept any sequence (list, tuple, ...) - list concatenation below needs a list
  metric_cols = list(grouping)

  # pd.cut intervals are open on the left and closed on the right, so the first
  # edge sits just below 0 to make sure an age of exactly 0 lands in '[0-1]'.
  bins = [-0.1, 1, 3, 6, 15, 999]
  labels = ['[0-1]', ']1-3]', ']3-6]', ']6-15]', ']15-999]']

  # groupby builds a new frame, so no defensive copy of df_paper is needed
  author_means = df_paper.groupby('AuthorId')[metric_cols].mean().reset_index()
  author_means['binned'] = pd.cut(author_means['ScientificAge'], bins = bins, labels = labels)

  # fix the column order, then add the window label on a fresh frame
  ordered = author_means[["AuthorId", "binned"] + metric_cols]
  return ordered.assign(start_date = start_date)

In [None]:
# function for creating both exclusive & inclusive version. 
def add_meta_both(author_binned, author_FoS): 
  '''
  Attach author metadata to the binned author table.

  author_binned: one row per author, must have an 'AuthorId' column.
  author_FoS: author metadata with 'AuthorId' and 'Gender' columns
              (use 'AuthorCountryGenderRepo'; country was taken out).

  Only authors present in both tables are kept (inner join on 'AuthorId');
  a missing 'Gender' is replaced by the string 'other'.

  Despite the name, only the inclusive version is returned.
  '''
  merged = pd.merge(author_binned, author_FoS, on = 'AuthorId', how = 'inner')

  # 'CountryCode' is no longer filled here - country was removed from the metadata.
  gender_filled = merged['Gender'].fillna('other')

  return merged.assign(Gender = gender_filled)

In [None]:
def create_files(GCC_files, paaar, AuthorCountryGenderRepo, inPath, outPath): 
  '''
  Build the author-level table for every metrics file and write one combined csv.

  Parameters
  ----------
  GCC_files : iterable of str
    File names (not full paths) inside `inPath`. Each name must contain at
    least two 'YYYY-MM' stamps: the first is taken as the start date of the
    window, the second as the end date.
  paaar : pd.DataFrame
    Paper/author/date table, handed to get_papers for every file.
  AuthorCountryGenderRepo : pd.DataFrame
    Author metadata, handed to add_meta_both for every file.
  inPath, outPath : str
    Used as plain string prefixes (f"{inPath}{file}"), so a directory must
    end with a path separator.

  Side effects
  ------------
  Prints each file name as it is processed and writes
  f"{outPath}preprints_author_inclusive_GCC_df.csv" (without the index).
  Returns None.

  Raises
  ------
  IndexError if a file name holds fewer than two 'YYYY-MM' stamps.
  '''

  # raw string: in a normal string literal '\d' is an invalid escape sequence
  date_pattern = r'\d{4}-\d{2}'

  # columns averaged per author; identical for every file, so defined once
  metric_columns = ['ScientificAge',
                    'eigencentrality_weighted', 
                    'eigencentrality_unweighted', 
                    'eigencentrality_weighted_scaled',
                    'eigencentrality_unweighted_scaled',
                    'coreness',
                    'degree_weighted',
                    'degree_unweighted']

  # one author-level frame per time window
  author_inclusive_GCC_lst = []

  for GCC_file in GCC_files: 

    # progress log
    print(GCC_file)

    # get information from file name (scan the name once, then pick both stamps)
    dates = re.findall(date_pattern, GCC_file)
    start_date, end_date = dates[0], dates[1]

    # load the metrics for this window
    paper_GCC = pd.read_csv(f"{inPath}{GCC_file}")

    ### paper centric ### 
    paper_GCC_ready = get_papers(paaar, paper_GCC, start_date, end_date)

    ### author centric pt. 1 ###
    author_GCC_tmp = prepare_paper_df(paper_GCC_ready, start_date, metric_columns)

    ### author centric pt. 2 ### 
    author_inclusive_GCC = add_meta_both(author_GCC_tmp, AuthorCountryGenderRepo)

    ### add to lists ### 
    author_inclusive_GCC_lst.append(author_inclusive_GCC)

  # concat them 
  author_inclusive_GCC_df = pd.concat(author_inclusive_GCC_lst)

  # save them
  author_inclusive_GCC_df.to_csv(f"{outPath}preprints_author_inclusive_GCC_df.csv", index = False)


In [None]:
''' author centric '''

' author centric '

In [None]:
## load files ## 
# both kinds of authors to check whether it makes a difference 
# Inputs shared by every time window, read once from the preprocessing folder:
#   AuthorCountryGenderRepo - reaches add_meta_both via create_files
#                             (merged on 'AuthorId', 'Gender' is filled there)
#   paaar                   - reaches get_papers via create_files
#                             (filtered on 'Date', merged on 'AuthorId')
inPath = f"{path}/DATA/collaboration/network_SI/preprocessing/"
AuthorCountryGenderRepo = pd.read_csv(f"{inPath}AuthorCountryGenderFoSAll.csv") 
paaar = pd.read_csv(f"{inPath}paaaa_main.csv") 

In [None]:
# inpath (what needs changing)
# Rebinds inPath: from here on it is the metrics folder that is listed for
# files and passed to create_files. It is used as a plain string prefix, so
# the trailing slash is required.
inPath = f"{path}/DATA/collaboration/network_SI/metrics/"

In [None]:
# Names (not full paths) of every entry in inPath ending in "main.csv",
# in sorted (lexicographic) order.
GCC_files = sorted([x for x in listdir(f"{inPath}") if x.endswith("main.csv")])

In [None]:
# outpath 
# Folder that receives the combined csv written by create_files; it is used
# as a plain string prefix, so the trailing slash is required.
outPath = f"{path}/DATA/collaboration/network_SI/main/"

In [None]:
# NB: probably not the best long-term setup.

In [None]:
# Run the whole pipeline: one pass per metrics file (each file name and its
# date range are printed), then a single combined csv
# "preprints_author_inclusive_GCC_df.csv" is written to outPath.
create_files(GCC_files = GCC_files,
             paaar = paaar, 
             AuthorCountryGenderRepo = AuthorCountryGenderRepo,
             inPath = inPath,
             outPath = outPath)

metrics_2010-01_2015-01_main.csv
inclusive: from 2010-01-01 to 2014-12-31
metrics_2010-02_2015-02_main.csv
inclusive: from 2010-02-01 to 2015-01-31
metrics_2010-03_2015-03_main.csv
inclusive: from 2010-03-01 to 2015-02-28
metrics_2010-04_2015-04_main.csv
inclusive: from 2010-04-01 to 2015-03-31
metrics_2010-05_2015-05_main.csv
inclusive: from 2010-05-01 to 2015-04-30
metrics_2010-06_2015-06_main.csv
inclusive: from 2010-06-01 to 2015-05-31
metrics_2010-07_2015-07_main.csv
inclusive: from 2010-07-01 to 2015-06-30
metrics_2010-08_2015-08_main.csv
inclusive: from 2010-08-01 to 2015-07-31
metrics_2010-09_2015-09_main.csv
inclusive: from 2010-09-01 to 2015-08-31
metrics_2010-10_2015-10_main.csv
inclusive: from 2010-10-01 to 2015-09-30
metrics_2010-11_2015-11_main.csv
inclusive: from 2010-11-01 to 2015-10-31
metrics_2010-12_2015-12_main.csv
inclusive: from 2010-12-01 to 2015-11-30
metrics_2011-01_2016-01_main.csv
inclusive: from 2011-01-01 to 2015-12-31
metrics_2011-02_2016-02_main.csv
inclu