In [1]:
# Report how much memory this runtime has (psutil's total figure, in decimal gigabytes).
from psutil import virtual_memory

ram_bytes = virtual_memory().total
ram_gb = ram_bytes / 1e9
ram_summary = 'Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb)
print(ram_summary)

Your runtime has 54.8 gigabytes of available RAM



In [2]:
# basic setup
%%capture
from google.colab import drive
drive.mount('/content/gdrive')
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://dlcdn.apache.org/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz
!tar -xvzf spark-3.1.2-bin-hadoop3.2.tgz
! pip install -q findspark
! pip install pyspark

In [3]:
# Export SPARK_HOME so that later cells can locate the Spark 3.1.2 distribution directory.
import os
os.environ.update(SPARK_HOME="/content/spark-3.1.2-bin-hadoop3.2")

In [4]:
# Imports plus creation of the MAG handle and Spark session.
# Statement order matters in this cell: findspark.init() runs before the pyspark
# imports, and the sys.path entry is added before the MAG / MAGspark1 imports.
import findspark
findspark.init()  # presumably picks up SPARK_HOME set in an earlier cell — see findspark docs
import sys
import pandas as pd
sys.path.insert(0,'/content/gdrive/MyDrive/***') # path to MAGspark1.py and MAG.py files
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from MAG import MicrosoftAcademicGraph
from MAGspark1 import get_mag_with_node_connection # Lasse works, but we should be able to do this faster.
import datetime, time 
# The helper returns a pair that is unpacked into `mag` (used below via mag.getDataframe)
# and `spark` (used below via spark.conf.set). NOTE(review): exact types live in MAGspark1.py — confirm there.
mag, spark = get_mag_with_node_connection()

In [5]:
# Turn on Arrow-based columnar transfer for PySpark; per the Spark docs this setting
# speeds up DataFrame.toPandas(), which the export cells below rely on.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
# `col` is used to build the Date filters in the export cells.
from pyspark.sql.functions import col

In [6]:
# Base output location that the export cells prepend to every CSV file name.
# The value here is a placeholder — replace it with a real path before running an export.
gpath = "path to data"

In [7]:
# Make files for preprints, each year has 1 file.
# The export only runs when `run` is True; while it is False this cell does nothing.

run = False

if run:
  columns = ["PaperId", "AuthorId", "Date", "Gender", "ScientificAge", "CountryCode"]
  date_col = col("Date")
  years = range(2010, 2022)

  for year in years:
    # Only the upper date bound differs between years: 2021 stops before July 1,
    # every other year runs through December 31 (inclusive).
    if year == 2021:
      before_end = date_col < datetime.date(year, 7, 1)
    else:
      before_end = date_col <= datetime.date(year, 12, 31)

    # Columns are selected once; filtering on Date leaves the schema unchanged, so a
    # second select is unnecessary. toPandas() collects the year's rows to the driver.
    # os.path.join inserts the separator if `gpath` lacks a trailing slash.
    mag.getDataframe('PaperAuthorAffiliationsAttributesRepo') \
      .select(*columns) \
      .filter((date_col >= datetime.date(year, 1, 1)) & before_end) \
      .distinct() \
      .toPandas() \
      .to_csv(os.path.join(gpath, f"preprints_{year}.csv"), index=False)

In [8]:
# Make files for all doctypes, each year has 1 file.
# The export only runs when `run` is True; while it is False this cell does nothing.

run = False

if run:
  columns = ["PaperId", "AuthorId", "Date", "Gender", "ScientificAge", "CountryCode"]
  date_col = col("Date")
  years = range(2010, 2022)

  for year in years:
    # Only the upper date bound differs between years: 2021 stops before July 1,
    # every other year runs through December 31 (inclusive).
    if year == 2021:
      before_end = date_col < datetime.date(year, 7, 1)
    else:
      before_end = date_col <= datetime.date(year, 12, 31)

    # Columns are selected once; filtering on Date leaves the schema unchanged, so a
    # second select is unnecessary. toPandas() collects the year's rows to the driver.
    # os.path.join inserts the separator if `gpath` lacks a trailing slash.
    mag.getDataframe('PaperAuthorAffiliationsAttributesAll') \
      .select(*columns) \
      .filter((date_col >= datetime.date(year, 1, 1)) & before_end) \
      .distinct() \
      .toPandas() \
      .to_csv(os.path.join(gpath, f"PaperAuthorDateGenderSciAgeAll_{year}.csv"), index=False)

In [9]:
# Make files for all doctypes not filtered (keep authors <25 & rec_and_cite),
# each year has 1 file.
# The export only runs when `run` is True.

run = True

if run:
  columns = ["PaperId", "AuthorId", "Date", "Gender", "ScientificAge", "CountryCode"]
  years = range(2010, 2022)
  for year in years:
    # 2021 is exported through June 30 only; every other year through December 31.
    end_month, end_day = (6, 30) if year == 2021 else (12, 31)

    # Columns are selected once; filtering on Date leaves the schema unchanged, so a
    # second select is unnecessary. toPandas() collects the year's rows to the driver.
    # os.path.join inserts the separator if `gpath` lacks a trailing slash.
    mag.getDataframe('PaperAuthorAffiliationsAttributesNoFilter') \
      .select(*columns) \
      .filter((col("Date") >= datetime.date(year, 1, 1)) & (col("Date") <= datetime.date(year, end_month, end_day))) \
      .distinct() \
      .toPandas() \
      .to_csv(os.path.join(gpath, f"PaperAuthorDateGenderSciAgeNoFilter_{year}.csv"), index=False)