In [None]:
''' VMP 2022-03-02: used in final report.
This document runs overall preprocessing. 
Takes data from the HPC in PySpark format & 
creates the necessary .csv files for the collaboration analysis.
Now uses one overall path
'''

In [None]:
# overall path to the project
# Every other location in this notebook is derived from this value: the code
# directory added to sys.path (f'{path}/CODE/Database') and the CSV directory
# (f"{path}/DATA/collaboration/network/preprocessing/").
# NOTE(review): "path/to/base" looks like a placeholder - set it to the real
# project root before running.
path = "path/to/base"

In [None]:
# check RAM
# Converts the `total` field of psutil's memory report to gigabytes
# (1 GB = 1e9 bytes here) and prints it with one decimal place.
from psutil import virtual_memory

ram_gb = virtual_memory().total / 1e9
print(
    'Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb)
)

Your runtime has 54.8 gigabytes of available RAM



In [None]:
# basic setup
%%capture
from google.colab import drive
drive.mount('/content/gdrive')
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://dlcdn.apache.org/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz
!tar -xvzf spark-3.1.2-bin-hadoop3.2.tgz
! pip install -q findspark
! pip install pyspark

In [None]:
# Tell findspark / PySpark where the Spark distribution lives.
# Presumably this is the directory unpacked from the tarball in the setup cell
# (assumes the notebook's working directory is /content - TODO confirm).
import os

os.environ.update({"SPARK_HOME": "/content/spark-3.1.2-bin-hadoop3.2"})

In [None]:
# import stuff
# Statement order matters in this cell: findspark.init() is called before the
# pyspark imports, and the sys.path entry is added before the MAG / MAGspark1
# imports. `path` is the project root set in the configuration cell.
import findspark
findspark.init()
import sys
import pandas as pd
# Put the project's CODE/Database directory first on the module search path;
# presumably this is where MAG.py and MAGspark1.py live - TODO confirm.
sys.path.insert(0, f'{path}/CODE/Database') # path to data from HPC
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import col
from MAG import MicrosoftAcademicGraph
from MAGspark1 import get_mag_with_node_connection 
import datetime, time 
# The helper returns a pair: `mag` is the object whose getDataframe(...) is
# used by every export cell below; `spark` is presumably the SparkSession
# (not used directly in the cells visible here).
mag, spark = get_mag_with_node_connection()

In [None]:
''' dataset for collaboration network '''

' dataset for collaboration network '

In [None]:
# Export for collaboration.ipynb: the distinct (PaperId, AuthorId, Date) rows
# dated before 2021-07-01, collected to pandas and written as CSV without the
# index column.
(
    mag.getDataframe('PaperAuthorAffiliationsAttributesRepo')
    .select('PaperId', 'AuthorId', 'Date')
    .filter(col('Date') < datetime.date(2021, 7, 1))
    .distinct()
    .toPandas()
    .to_csv(
        f"{path}/DATA/collaboration/network/preprocessing/paaar_collaboration.csv",
        index=False,
    )
)

In [None]:
''' paper author for main '''

' paper author for main '

In [None]:
# Export for main.ipynb: same table and date cut-off (before 2021-07-01) as the
# collaboration export, with the ScientificAge column added. Distinct rows are
# collected to pandas and written as CSV without the index column.
(
    mag.getDataframe('PaperAuthorAffiliationsAttributesRepo')
    .select('PaperId', 'AuthorId', 'Date', 'ScientificAge')
    .filter(col('Date') < datetime.date(2021, 7, 1))
    .distinct()
    .toPandas()
    .to_csv(
        f"{path}/DATA/collaboration/network/preprocessing/paaar_main.csv",
        index=False,
    )
)

In [None]:
''' author information for main '''

' author information for main '

In [None]:
# author information
# Writes the distinct (AuthorId, Gender, CountryCode) rows to
# AuthorCountryGenderRepo.csv (no date filter is applied in this cell).
# The chain is wrapped in parentheses instead of using backslash continuations:
# the original had a space after one of the backslashes, which is a SyntaxError.
# NOTE(review): an author with more than one distinct (Gender, CountryCode)
# combination produces several rows - confirm downstream code expects that.
(
    mag.getDataframe('PaperAuthorAffiliationsAttributesRepo')
    .select('AuthorId', 'Gender', 'CountryCode')
    .distinct()
    .toPandas()
    .to_csv(
        f"{path}/DATA/collaboration/network/preprocessing/AuthorCountryGenderRepo.csv",
        index=False,
    )
)

In [None]:
# main field of study
# Collects the distinct (PaperId, AuthorId, Date, NormalizedName) rows dated
# before 2021-07-01 into a pandas DataFrame.
# NOTE(review): only the upper date bound is applied here; the "2010" start in
# the original comment ([2010-2021-06]) is not enforced by this cell -
# presumably the source table already starts there; confirm.
paaarFoS = (
    mag.getDataframe('PaperAuthorAffiliationsAttributesRepo')
    .select('PaperId', 'AuthorId', 'Date', 'NormalizedName')
    .filter(col('Date') < datetime.date(2021, 7, 1))
    .distinct()
    .toPandas()
)

In [None]:
# the 10 STEM fields (values are compared against the NormalizedName column)
fields = [
    "biology",
    "chemistry",
    "computer science",
    "engineering",
    "environmental science",
    "geography",
    "geology",
    "materials science",
    "mathematics",
    "physics",
]

In [None]:
# Keep only the rows whose NormalizedName is one of the entries in `fields`.
paaarFoSmain = paaarFoS.loc[paaarFoS['NormalizedName'].isin(fields)]

In [None]:
# FoS: one main field of study per author.
# Step 1: per AuthorId, take the mode of NormalizedName. pd.Series.mode returns
#         every most-frequent value, so an author with a tie gets several rows.
# Step 2: keep only the AuthorId and NormalizedName columns (reset_index also
#         adds a positional column from the mode result, which is dropped here).
# Step 3: shuffle the rows and keep the first row per author, i.e. break ties
#         at random. random_state is fixed so that re-running the notebook
#         writes the same CSV (the original shuffle was unseeded).
paaarFoSunique = (
    paaarFoSmain
    .groupby('AuthorId')['NormalizedName']
    .apply(pd.Series.mode)
    .reset_index()
)
paaarFoSunique = paaarFoSunique[["AuthorId", "NormalizedName"]]
paaarFoSunique = (
    paaarFoSunique
    .sample(frac=1, random_state=0)
    .drop_duplicates(subset='AuthorId')
    .reset_index(drop=True)
)
paaarFoSunique.to_csv(f"{path}/DATA/collaboration/network/preprocessing/paaarFoSunique.csv", index = False)

In [None]:
# merge inner with other Author information
# Reads back the author table written by the "author information" cell. That
# cell writes AuthorCountryGenderRepo.csv; the original read AuthorGenderRepo.csv,
# a file this notebook never creates.
AuthorGenderRepo = pd.read_csv(f"{path}/DATA/collaboration/network/preprocessing/AuthorCountryGenderRepo.csv")
# Inner join on AuthorId: only authors present in both tables are kept.
AuthorGenderFoSRepo = AuthorGenderRepo.merge(paaarFoSunique, on = "AuthorId", how = "inner")
AuthorGenderFoSRepo.to_csv(f"{path}/DATA/collaboration/network/preprocessing/AuthorCountryGenderFoSRepo.csv", index = False)

In [None]:
''' author information for plots (by month) '''

' author information for plots (by month) '

In [None]:
# Export for the by-month plots (written to paaar_check_plot.csv): the distinct
# (PaperId, AuthorId, Date, Gender, NormalizedName) rows dated before
# 2021-07-01, collected to pandas and written as CSV without the index column.
(
    mag.getDataframe('PaperAuthorAffiliationsAttributesRepo')
    .select('PaperId', 'AuthorId', 'Date', 'Gender', 'NormalizedName')
    .filter(col('Date') < datetime.date(2021, 7, 1))
    .distinct()
    .toPandas()
    .to_csv(
        f"{path}/DATA/collaboration/network/preprocessing/paaar_check_plot.csv",
        index=False,
    )
)