In [1]:
'''
VMP 2022-03-23:
Updated version of preprocessing. 
Only relies on one item outside of MAG (GenderizedFirstnames).
'''

'\nVMP 2022-03-12:\nUpdated version of preprocessing. \nOnly relies on one item outside of MAG (GenderizedFirstnames).\n'

In [1]:
import sys  
sys.path.insert(0, '/home/vicp/colab-productivity-covid/DataCuration')
from MAGspark import get_mag_with_cluster_connection
from MAG import MicrosoftAcademicGraph
import os
from pyspark.sql import functions as F, Window
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pyspark.sql.functions import avg
# Switch the working directory to the project's DataCuration folder (the same
# absolute path that was put on sys.path for the MAGspark / MAG imports).
# NOTE(review): this absolute path is user-specific; the notebook will not run
# unchanged from another account or machine.
os.chdir('/home/vicp/colab-productivity-covid/DataCuration')
# Connect to the cluster job and obtain the `mag` helper and the `spark` session
# used by every cell below. 68377 matches the 'cluster_new.job RUNNING 68377'
# entry in this cell's recorded output, so it is presumably a scheduler job id
# that has to be updated for every new cluster session -- confirm.
# NOTE(review): the unit of memory_per_executor is not visible here (MB?) -- confirm.
mag, spark = get_mag_with_cluster_connection(68377, 
                               memory_per_executor=16000)

['NAME STATE JOBID', 'simple-gpu PENDING 68375', 'simple-gpu PENDING 68370', '1.train.0 PENDING 68356', '1.train.1 PENDING 68357', 'nvmi_test.job PENDING 68362', 'nvmi_test.job PENDING 68363', 'translate-sbu PENDING 68378', 'resnet-container-RTX RUNNING 68098', 'cluster_new.job RUNNING 68377', 'mpi RUNNING 68309', 'mpi RUNNING 68182', 'eval RUNNING 68371', 'deep-significance-experiment RUNNING 68361', 'jupyter-notebook RUNNING 68376', 'translate-sbu RUNNING 68360', 'run_all_benchmarks.job RUNNING 68323', 'jupyter-notebook RUNNING 68213', 'jupyter-notebook RUNNING 68211', 'jupyter-notebook RUNNING 68210', '']


In [2]:
spark

In [4]:
#author_affiliations = mag.getDataframe('PaperAuthorAffiliations')
#papers = mag.getDataframe('Papers')
#ag = mag.getDataframe('AuthorsGenderized')
#affiliations = mag.getDataframe('Affiliations')
#paa_att = mag.getDataframe('PaperAuthorAffiliationsAttributes')
#prf = mag.getDataframe('PaperRootField')

# Root Field
Taken from Data Preparation (Lasse)

In [5]:
def paper_root_field_mag(mag):
    """
    Computes a mapping between PaperId and level-0 (root) Field of Study.
    Each paper can have multiple level-0 Fields of Study, so a PaperId may
    appear on several output rows.

    Every row also carries an ``is_stem`` flag: 1 when the root field is one
    of the ten FieldOfStudyIds hard-coded in the query, 0 otherwise.

    The result is written with ``mag.saveFile`` as "PaperRootField.txt" in
    the "datacuration" location. Nothing is returned.

    Parameters
    ----------
    mag : object exposing ``getDataframe``, ``query_sql`` and ``saveFile``
          (the helper returned by ``get_mag_with_cluster_connection``).
    """

    # The returned handles are not needed: the SQL below refers to the tables
    # by name. (The previous `pfos = fos = ...` chained assignment silently
    # overwrote the FieldsOfStudy handle with the PaperFieldsOfStudy one.)
    # Presumably getDataframe also makes the table visible to query_sql --
    # confirm against the MAG helper.
    mag.getDataframe('FieldsOfStudy')
    mag.getDataframe('PaperFieldsOfStudy')
    
    # NOTE(review): the flag is aliased `is_stem` here, while later queries in
    # this notebook filter on `IsStem`; presumably the schema used when the
    # saved file is loaded back names the column IsStem -- confirm.
    query = """
        SELECT pfs.PaperId, fs.FieldOfStudyId, 
        CASE WHEN fs.FieldOfStudyId IN (41008148,
                                        121332964,
                                        192562407,
                                        127413603,
                                        39432304,
                                        185592680,
                                        127313418,
                                        86803240,
                                        205649164,
                                        33923547)
        THEN 1 ELSE 0 END as is_stem
        FROM PaperFieldsOfStudy pfs
        INNER JOIN FieldsOfStudy fs ON pfs.FieldOfStudyId = fs.FieldOfStudyId
        WHERE fs.Level = 0
    """
    paper_fos = mag.query_sql(query)
    
    # write to file
    mag.saveFile(paper_fos, "datacuration", "PaperRootField.txt")
    

In [None]:
paper_root_field_mag(mag)

# AuthorCountries
Taken from Data Curation (Lasse)

In [None]:
def count_author_names_and_country(mag): 
    """
    Assigns each author the country in which they have the most authorships.

    Step 1 counts, per (AuthorId, affiliation Iso3166Code), the number of
    PaperAuthorAffiliations rows and ranks the countries of every author by
    that count (CountryRowNumber = 1 is the most frequent one). The ranking
    is registered as the temporary view ``AuthorCnt``.

    Step 2 left-joins Authors with that ranking and keeps the top-ranked
    country, or NULL for authors that have no row in the ranking.

    The result (AuthorId, DisplayName, Country) is written with
    ``mag.saveFile`` as "AuthorCountries.txt" in the "datacuration" location.
    Nothing is returned.

    Parameters
    ----------
    mag : object exposing ``getDataframe``, ``query_sql`` and ``saveFile``.
    """
    
    # Handles are not used directly; the SQL refers to the tables by name.
    # Presumably getDataframe makes them visible to query_sql -- confirm.
    mag.getDataframe('PaperAuthorAffiliations')
    mag.getDataframe('Affiliations')
    mag.getDataframe('Authors')
    
    # The secondary sort key makes the ranking reproducible: with only
    # COUNT(*) DESC, two countries with the same number of authorships were
    # ranked in arbitrary order, so the chosen country could change between
    # runs. NULLS LAST keeps a missing country code from winning such a tie.
    # NOTE(review): affiliations without an Iso3166Code are still counted as
    # their own (NULL) group, so an author's top-ranked Country can be NULL
    # when that group has the strictly highest count.
    query = """
        SELECT paa.AuthorId, a.Iso3166Code as Country, COUNT(*) as num_authorships,
        ROW_NUMBER() OVER(PARTITION BY paa.AuthorId ORDER BY COUNT(*) DESC, a.Iso3166Code ASC NULLS LAST) AS CountryRowNumber
        FROM PaperAuthorAffiliations paa
        INNER JOIN Affiliations AS a ON paa.AffiliationId = a.AffiliationId
        GROUP BY paa.AuthorId, a.Iso3166Code
    """
    author_countries = mag.query_sql(query)
    author_countries.createOrReplaceTempView('AuthorCnt')

    # LEFT JOIN keeps authors without any ranked country (CountryRowNumber is
    # null for them); everyone else contributes only their rank-1 row.
    query = """
        SELECT a.AuthorId, DisplayName, ac.Country
        FROM Authors a 
        LEFT JOIN AuthorCnt ac ON a.AuthorId = ac.AuthorId
        WHERE ac.CountryRowNumber is null OR ac.CountryRowNumber = 1
    """
    
    author_countries_count = mag.query_sql(query)
    
    # write to file
    mag.saveFile(author_countries_count, "datacuration", "AuthorCountries.txt")


In [None]:
count_author_names_and_country(mag)

# Genderization
Taken from Data Curation (Lasse)

In [None]:
def assign_genders_to_authors(mag): 
    """
    Attach a gender estimate to every row of AuthorCountries.

    The first token of DisplayName (the text before the first space) is
    matched against GenderizedFirstnames.Firstname, together with the
    author's country; a missing country on either side is treated as the
    literal 'unknown' for the comparison. Authors without a match are kept
    (LEFT JOIN) and get genderized = -3.

    The result is written with ``mag.saveFile`` as "AuthorsGenderized.txt"
    in the "datacuration" location. Nothing is returned.

    Parameters
    ----------
    mag : object exposing ``getDataframe``, ``query_sql`` and ``saveFile``.
    """
    # Load both inputs; the SQL below refers to them by table name only.
    for table_name in ('AuthorCountries', 'GenderizedFirstnames'):
        mag.getDataframe(table_name)

    # NOTE(review): for a DisplayName without a space, POSITION(...) is 0 and
    # LEFT(name, -1) presumably yields an empty string, so such authors would
    # never match a first name -- confirm this is intended.
    gender_join_sql = """
        SELECT 
            ac.AuthorId,
            ac.DisplayName, 
            ac.Country,
            gs.gender,
            COALESCE(gs.genderized, -3) as genderized
        FROM AuthorCountries ac 
        LEFT JOIN GenderizedFirstnames gs ON LEFT(ac.DisplayName, POSITION(' ' in ac.DisplayName) - 1) = gs.Firstname 
                                           AND COALESCE(ac.Country, 'unknown') = COALESCE(gs.Country, 'unknown')
        ORDER BY gs.genderized DESC, ac.AuthorId
    """

    genderized_authors = mag.query_sql(gender_join_sql)

    # Persist the joined table.
    mag.saveFile(genderized_authors, "datacuration", "AuthorsGenderized.txt")

In [None]:
assign_genders_to_authors(mag)

## ProjectPapers 

In [None]:
def paper_family_attributes(mag): 
    """
    Builds the project's paper table: one representative paper per family.

    A family is identified by FamilyId, or by the PaperId itself when
    FamilyId is NULL. Among papers with Year >= 2010 that have a root field
    with IsStem = 1, the earliest-dated paper of each family is kept
    (PubOrderInFamily = 1). That paper is then joined with

    * its number of distinct authors (``num_authors``), and
    * PaperRootField again, to expose ``FieldOfStudyId``. This second join
      is not restricted to IsStem = 1, so a selected paper contributes one
      output row per root field it has.

    Output columns: PaperId, FamilyId, FieldOfStudyId, DocType, Date,
    PubOrderInFamily, num_authors. The result is written with
    ``mag.saveFile`` as "ProjectPapersAllDocType.txt" in the "datacuration"
    location. Nothing is returned.

    Parameters
    ----------
    mag : object exposing ``getDataframe``, ``query_sql`` and ``saveFile``.
    """
    
    # Handles are not used directly; the SQL refers to the tables by name.
    # Presumably getDataframe makes them visible to query_sql -- confirm.
    mag.getDataframe('PaperAuthorAffiliations')
    mag.getDataframe('Papers')
    mag.getDataframe('PaperRootField')
    
    # `p.PaperId ASC` is a tie-break: ordering by Date alone left the choice
    # between same-date papers of one family arbitrary, so the representative
    # paper could differ between runs.
    query = """
    SELECT 
        x.PaperId, 
        x.FamilyId,
        prf1.FieldOfStudyId,
        x.DocType,
        x.Date,
        x.PubOrderInFamily,
        na.num_authors 
    FROM (
        SELECT 
            p.PaperId as PaperId, 
            COALESCE(p.FamilyId, p.PaperId) as FamilyId,
            p.DocType as DocType, 
            p.Date as Date,
            ROW_NUMBER() OVER(PARTITION BY COALESCE(p.FamilyId, p.PaperId) ORDER BY p.Date ASC, p.PaperId ASC) AS PubOrderInFamily
        FROM Papers p 
        INNER JOIN PaperRootField prf ON p.PaperId = prf.PaperId
        WHERE IsStem = 1 AND p.Year >= 2010 
    ) x 
    INNER JOIN (
        SELECT paa.PaperId, COUNT(DISTINCT(paa.AuthorId)) as num_authors 
        FROM PaperAuthorAffiliations paa 
        GROUP BY paa.PaperId
    ) na ON x.PaperId = na.PaperId 
    INNER JOIN PaperRootField prf1 ON x.PaperId = prf1.PaperId
    WHERE PubOrderInFamily = 1
    """
    
    paper_families = mag.query_sql(query)
    mag.saveFile(paper_families, "datacuration", "ProjectPapersAllDocType.txt")
    

In [None]:
paper_family_attributes(mag)

# Scientific Age

In [None]:
def scientic_age(mag): 
    """
    Compute, per author, the date of their earliest paper.

    For every AuthorId in PaperAuthorAffiliations two values are produced:

    * MinDateStem -- earliest Papers.Date among rows whose root field has
      IsStem = 1 (NULL when the author has none);
    * MinDate     -- earliest Papers.Date over all joined rows.

    Both joins are INNER, so only papers that have a PaperRootField entry
    and a Papers entry take part.

    The result is written with ``mag.saveFile`` as
    "AuthorScientificAgeAllDocType.txt" in the "datacuration" location.
    Nothing is returned.

    Parameters
    ----------
    mag : object exposing ``getDataframe``, ``query_sql`` and ``saveFile``.
    """
    # Load the three inputs; the SQL below refers to them by table name only.
    for table_name in ('PaperAuthorAffiliations', 'Papers', 'PaperRootField'):
        mag.getDataframe(table_name)

    first_publication_sql = """
        SELECT paa.AuthorId, 
        MIN(CASE WHEN IsStem = 1 THEN p.Date ELSE null END) as MinDateStem,
        MIN(p.Date) as MinDate
        FROM PaperAuthorAffiliations paa 
        INNER JOIN PaperRootField prf ON paa.PaperId = prf.PaperId
        INNER JOIN Papers p ON paa.PaperId = p.PaperId
        GROUP BY paa.AuthorId
    """

    mag.saveFile(
        mag.query_sql(first_publication_sql),
        "datacuration",
        "AuthorScientificAgeAllDocType.txt",
    )


In [None]:
scientic_age(mag)

# PaperAuthorAffiliationsAttributes

In [9]:
# Load the tables referenced by the SQL in paper_author_affiliations_with_attributes()
# below. Unlike the earlier functions, that function does not call mag.getDataframe
# itself, so (per the original note) at least some of these loads are necessary here.
# Presumably getDataframe is what makes a table visible to mag.query_sql -- confirm.
pp = mag.getDataframe('ProjectPapersAllDocType')
# NOTE(review): `sa` loads the same table as `asa` further down; one of the two
# looks redundant -- confirm nothing else uses `sa` before removing it.
sa = mag.getDataframe('AuthorScientificAgeAllDocType')
affiliations = mag.getDataframe('Affiliations')
paa = mag.getDataframe('PaperAuthorAffiliations')
ag = mag.getDataframe('AuthorsGenderized')
asa = mag.getDataframe('AuthorScientificAgeAllDocType')

In [10]:
def paper_author_affiliations_with_attributes(mag):
    """
    Build the authorship table of the project papers, enriched with
    author attributes.

    Every ProjectPapersAllDocType row is joined with its
    PaperAuthorAffiliations rows and, per author, with AuthorsGenderized
    and AuthorScientificAgeAllDocType (both INNER, so authors missing from
    either table are dropped). The affiliation's country code is attached
    with a LEFT JOIN and may therefore be NULL.

    Derived columns:

    * Year / Month / Quarter -- integer parts of the paper date;
    * Gender        -- ag.Genderized when it is 0 or 1, otherwise -1;
    * ScientificAge -- days between the paper date and the author's
      MinDate, divided by 365.0.

    The function does not load any table itself; the tables named in the
    query must already be available to ``mag.query_sql``.

    The result is written with ``mag.saveFile`` as
    "PaperAuthorAffiliationsAttributesAllDocType.txt" in the "datacuration"
    location. Nothing is returned.

    Parameters
    ----------
    mag : object exposing ``query_sql`` and ``saveFile``.
    """
    authorship_attributes_sql = """
    SELECT
        pp.PaperID,
        paa.AuthorId,
        pp.Date,
        INT(YEAR(pp.Date)) as Year,
        INT(MONTH(pp.Date)) as Month,
        INT(QUARTER(pp.Date)) as Quarter,
        CASE WHEN ag.Genderized IN (0,1) THEN ag.Genderized ELSE -1 END as Gender,
        DATEDIFF(pp.Date, asa.MinDate) / 365.0 as ScientificAge, 
        a.Iso3166Code as CountryCode
        FROM ProjectPapersAllDocType pp
        INNER JOIN PaperAuthorAffiliations paa ON pp.PaperId = paa.PaperId
        INNER JOIN AuthorsGenderized ag ON paa.AuthorId = ag.AuthorId 
        INNER JOIN AuthorScientificAgeAllDocType asa ON ag.AuthorId = asa.AuthorId 
        LEFT JOIN Affiliations a ON paa.AffiliationId = a.AffiliationId
    """

    mag.saveFile(
        mag.query_sql(authorship_attributes_sql),
        "datacuration",
        "PaperAuthorAffiliationsAttributesAllDocType.txt",
    )

In [11]:
paper_author_affiliations_with_attributes(mag)

# ProjectAuthors

In [12]:
# Load the table written by paper_author_affiliations_with_attributes(); the SQL in
# project_authors() below refers to it by name but does not load it itself.
paaa = mag.getDataframe('PaperAuthorAffiliationsAttributesAllDocType')

In [13]:
def project_authors(mag): 
    """
    Builds the author table of the project.

    Keeps every AuthorsGenderized row whose AuthorId occurs in
    PaperAuthorAffiliationsAttributesAllDocType and attaches the author's
    MinDate and MinDateStem from AuthorScientificAgeAllDocType (LEFT JOIN,
    so both are NULL for authors missing from that table).

    The function does not load any table itself; the tables named in the
    query must already be available to ``mag.query_sql``.

    The result is written with ``mag.saveFile`` as
    "ProjectAuthorsAllDocType.txt" in the "datacuration" location. Nothing
    is returned.

    Parameters
    ----------
    mag : object exposing ``query_sql`` and ``saveFile``.
    """
    
    # The scientific-age table is aliased `asa`, not `as`: AS is an SQL
    # keyword, and using it as an alias only parses while Spark treats it as
    # non-reserved (it is rejected when ANSI mode is enabled).
    query = """
    SELECT ag.*, asa.MinDate, asa.MinDateStem
    FROM AuthorsGenderized ag
    LEFT JOIN AuthorScientificAgeAllDocType asa ON ag.AuthorId = asa.AuthorId 
    WHERE ag.AuthorId IN (SELECT DISTINCT(AuthorId) FROM PaperAuthorAffiliationsAttributesAllDocType)
    """
    
    # Named differently from the function so the local no longer shadows it.
    authors_in_project = mag.query_sql(query)
    mag.saveFile(authors_in_project, "datacuration", "ProjectAuthorsAllDocType.txt")

In [14]:
project_authors(mag)