In [1]:
# Report the memory size of this runtime before any Spark work starts.
from psutil import virtual_memory

# Read the `total` field and express it in decimal gigabytes (divide by 1e9).
total_bytes = virtual_memory().total
ram_gb = total_bytes / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

Your runtime has 54.8 gigabytes of available RAM



In [2]:
# basic setup
%%capture
from google.colab import drive
drive.mount('/content/gdrive')
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://dlcdn.apache.org/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz
!tar -xvzf spark-3.1.2-bin-hadoop3.2.tgz
! pip install -q findspark
! pip install pyspark

In [3]:
# Point Spark tooling at the distribution unpacked into /content.
import os
os.environ.update(SPARK_HOME="/content/spark-3.1.2-bin-hadoop3.2")

In [4]:
# Imports and Spark session bootstrap. Statement order matters in this cell.
import findspark
# Runs before the pyspark imports below; presumably it picks up the SPARK_HOME
# value exported in the earlier cell -- confirm against the findspark docs.
findspark.init()
import sys
import pandas as pd
# Make the project modules stored on Drive importable. The <...> part is a
# placeholder and must be replaced with the real folder before running.
sys.path.insert(0,'/content/gdrive/MyDrive/<path to MAGspark1.py and MAG.py>')
from pyspark.sql import SparkSession
# NOTE(review): wildcard import; none of the visible cells uses a name from
# pyspark.sql.types explicitly.
from pyspark.sql.types import *
from MAG import MicrosoftAcademicGraph
from MAGspark1 import get_mag_with_node_connection
import datetime, time 
# `mag` is the object the analysis cells call getDataframe(...) on; `spark` is
# the object whose conf is adjusted in the next cell.
mag, spark = get_mag_with_node_connection()

In [5]:
from pyspark.sql.functions import col

# Switch on the Arrow setting for PySpark; the analysis cells below convert
# Spark results with toPandas().
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

In [6]:
# Output folder for every CSV written below. The <...> part is a placeholder.
# The analysis cells build file names as `gpath + "<name>.csv"`, so the value
# must end with a path separator; joining with "" appends one only if missing.
# (`os` is already imported by the SPARK_HOME cell above.)
gpath = os.path.join("/content/gdrive/MyDrive/<path to destination>", "")

In [None]:
'''
Compute the monthly ratio of preprints (DocType 'Repository') to all other
doctypes, All file. One CSV is written per year.
'''


def monthly_preprint_ratio(papers):
  """Return the per-month ratio of preprint papers to all other papers.

  Args:
    papers: pandas DataFrame with the columns 'PaperId', 'Date' and 'DocType'.
      'Date' may hold anything pd.to_datetime accepts. The frame is not
      modified.

  Returns:
    DataFrame with the columns 'Date' (monthly Period) and 'Proportion', where
    'Proportion' is the number of unique PaperIds with DocType 'Repository'
    divided by the number of unique PaperIds with any other DocType (a missing
    DocType counts as "other"). A month lacking either group yields NaN.
  """
  by_month = papers.assign(Date=pd.to_datetime(papers['Date']).dt.to_period("M"))
  preprints = by_month[by_month.DocType == 'Repository'].groupby('Date')['PaperId'].nunique()
  others = by_month[by_month.DocType != 'Repository'].groupby('Date')['PaperId'].nunique()
  # Division aligns both series on the month index.
  return (preprints / others).reset_index(name='Proportion')


run = False

if run:
  years = range(2021, 2022)
  for year in years:
    # 2021 is cut off at 30 June; every other year runs to 31 December.
    m, d = (6, 30) if year == 2021 else (12, 31)

    df = (
      mag.getDataframe('PaperAuthorAffiliationsAttributesAll')
      .select("PaperId", "Date", "DocType")
      .filter((col("Date") >= datetime.date(year, 1, 1)) & (col("Date") <= datetime.date(year, m, d)))
      .distinct()
      .toPandas()
    )

    # os.path.join keeps the file inside gpath whether or not gpath ends with "/".
    out_file = os.path.join(gpath, f"RatiopreprintstoAll_{year}.csv")
    monthly_preprint_ratio(df).to_csv(out_file, index=False)

    print(year)


In [None]:
'''
Compute overall stats for table in productivity panel, All file (filtered)
'''

run = True

if run:
  # Distinct (PaperId, AuthorId, Date, Gender) rows dated 2019-01-01 .. 2021-06-30.
  df = (
    mag.getDataframe('PaperAuthorAffiliationsAttributesAll')
    .select("PaperId", "AuthorId", "Date", "Gender")
    .filter((col("Date") >= datetime.date(2019, 1, 1)) & (col("Date") <= datetime.date(2021, 6, 30)))
    .distinct()
    .toPandas()
  )

  df['Date'] = pd.to_datetime(df['Date']).dt.to_period("Y")

  for year in range(2019, 2022):
    wdw = df[df['Date'].dt.year == year]
    # nunique() counts in vectorised code instead of building a Python set
    # out of millions of values.
    n_documents = wdw['PaperId'].nunique()
    # Rows whose Gender is not -1 (presumably -1 marks an unknown gender -- confirm).
    n_authorships = int((wdw['Gender'] != -1).sum())
    n_uniq_authors = wdw['AuthorId'].nunique()

    print(year, n_documents, n_authorships, n_uniq_authors)

In [None]:
'''
Compute the monthly ratio of preprints (DocType 'Repository') to all other
doctypes, NoFilter file. All years go into one CSV.
'''

run = True

if run:

  results = []  # one (Date, Proportion) frame per year
  years = range(2014, 2022)

  for year in years:
    # 2021 is cut off at 30 June; every other year runs to 31 December.
    m, d = (6, 30) if year == 2021 else (12, 31)

    df = (
      mag.getDataframe('PaperAuthorAffiliationsAttributesNoFilter')
      .select("PaperId", "Date", "DocType")
      .filter((col("Date") >= datetime.date(year, 1, 1)) & (col("Date") <= datetime.date(year, m, d)))
      .distinct()
      .toPandas()
    )

    df['Date'] = pd.to_datetime(df['Date']).dt.to_period("M")

    # Unique preprint papers per month divided by unique papers of any other
    # DocType per month; the division aligns both series on the month.
    preprints = df[df.DocType == 'Repository'].groupby('Date')['PaperId'].nunique()
    others = df[df.DocType != 'Repository'].groupby('Date')['PaperId'].nunique()
    results.append((preprints / others).reset_index(name='Proportion'))
    print(year)

  # Stack the yearly frames directly; this keeps column dtypes instead of
  # round-tripping every row through a Python list.
  dfout = pd.concat(results, ignore_index=True)
  # os.path.join keeps the file inside gpath whether or not gpath ends with "/".
  dfout.to_csv(os.path.join(gpath, "RatiopreprintstoNoFilter.csv"), index=False)


2014
2015
2016
2017
2018
2019
2020
2021


In [None]:
# Re-export of the overall NoFilter ratios (same file name the previous cell uses).
# `dfout` is reassigned by the by-field cells further down, so refuse to write
# when it no longer has the overall (Date, Proportion) layout.
assert list(dfout.columns) == ['Date', 'Proportion'], "dfout does not hold the overall NoFilter ratios"
# os.path.join keeps the file inside gpath whether or not gpath ends with "/".
dfout.to_csv(os.path.join(gpath, "RatiopreprintstoNoFilter.csv"), index=False)


In [None]:
'''
Compute overall stats for table in productivity panel, NoFilter file
'''

run = True

if run:
  # Distinct (PaperId, AuthorId, Date, Gender) rows dated 2019-01-01 .. 2021-06-30.
  df = (
    mag.getDataframe('PaperAuthorAffiliationsAttributesNoFilter')
    .select("PaperId", "AuthorId", "Date", "Gender")
    .filter((col("Date") >= datetime.date(2019, 1, 1)) & (col("Date") <= datetime.date(2021, 6, 30)))
    .distinct()
    .toPandas()
  )

  df['Date'] = pd.to_datetime(df['Date']).dt.to_period("Y")

  for year in range(2019, 2022):
    wdw = df[df['Date'].dt.year == year]
    # nunique() counts in vectorised code instead of building a Python set
    # out of millions of values.
    n_documents = wdw['PaperId'].nunique()
    # Rows whose Gender is not -1 (presumably -1 marks an unknown gender -- confirm).
    n_authorships = int((wdw['Gender'] != -1).sum())
    n_uniq_authors = wdw['AuthorId'].nunique()

    print(year, n_documents, n_authorships, n_uniq_authors)

2019 6409642 9749795 12642389
2020 7192663 10192716 14183053
2021 2675189 5405354 6758598


In [13]:
'''
Compute the monthly ratio of preprints (DocType 'Repository') to all other
doctypes by field, All file (filtered)
'''

run = True

if run:

  fos = ['biology',
         'chemistry',
         'computer science',
         'engineering',
         'environmental science',
         'geography',
         'geology',
         'materials science',
         'mathematics',
         'physics']

  results = []  # one (Date, Proportion, Field) frame per field

  for field in fos:
    # One query per field over the fixed window 2011-01-01 .. 2021-06-30.
    df = (
      mag.getDataframe('PaperAuthorAffiliationsAttributesAll')
      .select("PaperId", "Date", "DocType", "NormalizedName")
      .filter((col("NormalizedName") == field) & (col("Date") >= datetime.date(2011, 1, 1)) & (col("Date") <= datetime.date(2021, 6, 30)))
      .distinct()
      .toPandas()
    )

    df['Date'] = pd.to_datetime(df['Date']).dt.to_period("M")

    # Unique preprint papers per month divided by unique papers of any other
    # DocType per month; the division aligns both series on the month.
    preprints = df[df.DocType == 'Repository'].groupby('Date')['PaperId'].nunique()
    others = df[df.DocType != 'Repository'].groupby('Date')['PaperId'].nunique()
    res = (preprints / others).reset_index(name='Proportion')
    res['Field'] = field
    results.append(res)
    print(field)

  # Stack the per-field frames directly; this keeps column dtypes instead of
  # round-tripping every row through a Python list.
  dfout = pd.concat(results, ignore_index=True)
  # os.path.join keeps the file inside gpath whether or not gpath ends with "/".
  dfout.to_csv(os.path.join(gpath, "DateRatiopreprintstoAllField.csv"), index=False)


biology
chemistry
computer science
engineering
environmental science
geography
geology
materials science
mathematics
physics


In [14]:
'''
Compute the monthly ratio of preprints (DocType 'Repository') to all other
doctypes by field, NoFilter file
'''

run = True

if run:

  fos = ['biology',
         'chemistry',
         'computer science',
         'engineering',
         'environmental science',
         'geography',
         'geology',
         'materials science',
         'mathematics',
         'physics']

  results = []  # one (Date, Proportion, Field) frame per field

  for field in fos:
    # One query per field over the fixed window 2011-01-01 .. 2021-06-30.
    df = (
      mag.getDataframe('PaperAuthorAffiliationsAttributesNoFilter')
      .select("PaperId", "Date", "DocType", "NormalizedName")
      .filter((col("NormalizedName") == field) & (col("Date") >= datetime.date(2011, 1, 1)) & (col("Date") <= datetime.date(2021, 6, 30)))
      .distinct()
      .toPandas()
    )

    df['Date'] = pd.to_datetime(df['Date']).dt.to_period("M")

    # Unique preprint papers per month divided by unique papers of any other
    # DocType per month; the division aligns both series on the month.
    preprints = df[df.DocType == 'Repository'].groupby('Date')['PaperId'].nunique()
    others = df[df.DocType != 'Repository'].groupby('Date')['PaperId'].nunique()
    res = (preprints / others).reset_index(name='Proportion')
    res['Field'] = field
    results.append(res)
    print(field)

  # Stack the per-field frames directly; this keeps column dtypes instead of
  # round-tripping every row through a Python list.
  dfout = pd.concat(results, ignore_index=True)
  # os.path.join keeps the file inside gpath whether or not gpath ends with "/".
  dfout.to_csv(os.path.join(gpath, "DateRatiopreprintstoNoFilterField.csv"), index=False)


biology
chemistry
computer science
engineering
environmental science
geography
geology
materials science
mathematics
physics
