In [None]:
''' VMP 2022-03-02: used in final report.
Now uses one overall path '''

' we actually do have the problem of assigning FoS to papers '

In [None]:
# overall path to the project: the single root from which the data
# directories used further down are derived (f"{path}/DATA/...")
# NOTE(review): "path/to/base" looks like a placeholder - set it to the real
# project root before running
path = "path/to/base"

In [None]:
# check RAM
# virtual_memory().total is divided by 1e9, i.e. the value is reported in
# decimal gigabytes (presumably bytes on input - confirm against psutil docs)
# NOTE(review): `.total` is the total memory of the machine, so "available" in
# the message below does not mean "currently free"
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

Your runtime has 54.8 gigabytes of available RAM



In [None]:
# basic setup: mount Google Drive at /content/gdrive
# (google.colab is only importable inside a Google Colab runtime)
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# install python-igraph into the runtime (imported further down as `ig`)
# NOTE(review): the version is not pinned; the recorded run resolved to
# igraph 0.9.8 - consider pinning so the igraph API used below stays stable
!pip install igraph

Collecting igraph
  Downloading igraph-0.9.8-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.2 MB)
[K     |████████████████████████████████| 3.2 MB 4.2 MB/s 
[?25hCollecting texttable>=1.6.2
  Downloading texttable-1.6.4-py2.py3-none-any.whl (10 kB)
Installing collected packages: texttable, igraph
Successfully installed igraph-0.9.8 texttable-1.6.4


In [None]:
import pandas as pd 
from tqdm import tqdm
import networkx as nx
from functools import reduce
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math 
import re 
import igraph as ig
from os import listdir
from os.path import isfile, join
# switch off pandas' chained-assignment (SettingWithCopy) warning globally
pd.options.mode.chained_assignment = None 
# use matplotlib's 'ggplot' style sheet for every figure in the notebook
plt.style.use('ggplot')

In [None]:
def igraph_metrics(filename, inpath, outpath, date_pattern): 
  """Compute author-level network metrics for one collaboration file.

  Reads the CSV ``filename`` from ``inpath``, counts how often every
  (AuthorId_x, AuthorId_y) pair occurs and uses that count as the weight of
  an undirected edge. Node metrics are computed on the giant connected
  component (GCC) only. Two CSV files are written to ``outpath``:

  * ``metrics_{start}_{end}_main.csv``: one row per author in the GCC with
    eigenvector centrality (weighted/unweighted, scaled/unscaled), coreness
    and degree (weighted/unweighted).
  * ``metrics_{start}_{end}_netsize.csv``: a single row with node counts,
    edge counts and summed edge weights of the full graph and of the GCC.

  Args:
    filename: name of the input CSV; must contain the columns ``AuthorId_x``
      and ``AuthorId_y`` and at least two matches of ``date_pattern`` in its
      name.
    inpath: directory that contains ``filename``.
    outpath: directory the two result files are written to.
    date_pattern: regular expression; its first two matches in ``filename``
      are used as start and end date in the output file names.

  Returns:
    None. The results are only written to disk.

  Raises:
    IndexError: if ``filename`` contains fewer than two matches of
      ``date_pattern``.
  """

  # get information from file name: the first two matches delimit the window
  dates = re.findall(date_pattern, filename)
  if len(dates) < 2:
    raise IndexError(
        f"expected two matches of {date_pattern!r} in file name "
        f"{filename!r}, found {len(dates)}")
  start_date, end_date = dates[:2]

  print(f"\nstart_date: {start_date} \nend_date: {end_date}")

  # read csv; join() works with and without a trailing slash on inpath
  df_tmp = pd.read_csv(join(inpath, filename))
  df_tmp = df_tmp[["AuthorId_x", "AuthorId_y"]]

  # one weighted edge per author pair, weight = number of rows for that pair
  # NOTE(review): (a, b) and (b, a) are counted as different pairs here, but
  # the graph is undirected - if a file holds both orientations this creates
  # parallel edges; confirm the input stores each pair in one orientation
  df_tmp = df_tmp.groupby(['AuthorId_x', 'AuthorId_y']).size().to_frame('weight').reset_index()

  tuples = [tuple(x) for x in df_tmp.values]
  G = ig.Graph.TupleList(tuples, directed = False, edge_attrs = ['weight'])

  # all node metrics below are computed on the giant component only
  # NOTE(review): newer python-igraph releases call this
  # connected_components(); confirm clusters() exists in the installed version
  GCC = G.clusters().giant()

  # eigenvector weighted
  eigencentrality_weighted = GCC.eigenvector_centrality(directed = False, weights = 'weight', scale = False)

  # eigenvector unweighted
  eigencentrality_unweighted = GCC.eigenvector_centrality(directed = False, weights = None, scale = False)

  # eigenvector weighted scaled
  eigencentrality_weighted_scaled = GCC.eigenvector_centrality(directed = False, weights = 'weight', scale = True)

  # eigenvector unweighted scaled
  eigencentrality_unweighted_scaled = GCC.eigenvector_centrality(directed = False, weights = None, scale = True)

  # k-core 
  coreness = GCC.coreness(mode = "all")

  # degree weighted (sum of the weights of the incident edges)
  degree_weighted = GCC.strength(mode = 'all', loops = False, weights = 'weight')

  # degree unweighted
  degree_unweighted = GCC.degree(mode = 'all', loops = False)

  # meta-information: vertex names are the author ids from the edge list
  AuthorId = GCC.vs["name"]

  df_main = pd.DataFrame({
      'AuthorId': AuthorId,
      'eigencentrality_weighted': eigencentrality_weighted,
      'eigencentrality_unweighted': eigencentrality_unweighted,
      'eigencentrality_weighted_scaled': eigencentrality_weighted_scaled,
      'eigencentrality_unweighted_scaled': eigencentrality_unweighted_scaled,
      'coreness': coreness,
      'degree_weighted': degree_weighted,
      'degree_unweighted': degree_unweighted
      })

  # summed edge weights of the full graph and of the giant component
  G_edges_w = sum(G.es['weight'])
  GCC_edges_w = sum(GCC.es['weight'])

  # overall information (single row, every value wrapped in a list)
  df_netsize = pd.DataFrame({
      'G_nodes': [G.vcount()],
      'G_edges': [G.ecount()],
      'G_edges_w': [G_edges_w],
      'GCC_edges_w': [GCC_edges_w],
      'GCC_nodes': [GCC.vcount()],
      'GCC_edges': [GCC.ecount()]
  })

  # save both dataframes
  df_main.to_csv(join(outpath, f"metrics_{start_date}_{end_date}_main.csv"), index = False)
  df_netsize.to_csv(join(outpath, f"metrics_{start_date}_{end_date}_netsize.csv"), index = False)


In [None]:
# input/output directories, both derived from the project root `path`:
#   inpath  - directory listed and read from by the processing loop
#   outpath - directory handed to igraph_metrics for its result files
inpath = f"{path}/DATA/collaboration/network/collaboration/"
outpath = f"{path}/DATA/collaboration/network/metrics/"

In [None]:
# regex patterns & files
date_pattern = '\d{4}-\d{2}'
collaboration_files = [x for x in listdir(f"{inpath}") if re.findall("\d.csv", x)]

# loop over files for these fields 
for collaboration_file in collaboration_files: 
  igraph_metrics(filename = collaboration_file, 
                  inpath = inpath, 
                  outpath = outpath,  
                  date_pattern = date_pattern)


start_date: 2010-01 
end_date: 2015-01

start_date: 2010-02 
end_date: 2015-02

start_date: 2010-03 
end_date: 2015-03

start_date: 2010-04 
end_date: 2015-04

start_date: 2010-05 
end_date: 2015-05

start_date: 2010-06 
end_date: 2015-06

start_date: 2010-07 
end_date: 2015-07

start_date: 2010-08 
end_date: 2015-08

start_date: 2010-09 
end_date: 2015-09

start_date: 2010-10 
end_date: 2015-10

start_date: 2010-11 
end_date: 2015-11

start_date: 2010-12 
end_date: 2015-12

start_date: 2011-01 
end_date: 2016-01

start_date: 2011-02 
end_date: 2016-02

start_date: 2011-03 
end_date: 2016-03

start_date: 2011-04 
end_date: 2016-04

start_date: 2011-05 
end_date: 2016-05

start_date: 2011-06 
end_date: 2016-06

start_date: 2011-07 
end_date: 2016-07

start_date: 2011-08 
end_date: 2016-08

start_date: 2011-09 
end_date: 2016-09

start_date: 2011-10 
end_date: 2016-10

start_date: 2011-11 
end_date: 2016-11

start_date: 2011-12 
end_date: 2016-12

start_date: 2012-01 
end_date: 2017-01
