In [None]:
'''
VMP 2022-03-02:
The first preprocessing step after Lasse.
NB: we probably need to add a function to get 'PaperRootField'.
'''

In [1]:
import sys  
sys.path.insert(0, '/home/vicp')
from MAGspark import get_mag_with_cluster_connection
from MAG import MicrosoftAcademicGraph
import os
from pyspark.sql import functions as F, Window
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pyspark.sql.functions import avg
# Switch the working directory for the rest of the notebook.
os.chdir('/home/vicp')

# Connect to the cluster and keep both handles: `mag` is used for every table
# load/query/save below, `spark` is echoed in the next cell.
# NOTE(review): 60830 matches the JOBID of a running `cluster_new.job` in the
# listing this cell printed — presumably it has to be updated per session.
mag, spark = get_mag_with_cluster_connection(
    60830,
    memory_per_executor=16000,
)

['NAME STATE JOBID', 'cluster_new.job RUNNING 60832', 'cluster_new.job RUNNING 60831', 'cluster_new.job RUNNING 60830', 'jupyter-notebook RUNNING 60829', 'simple-gpu RUNNING 60734', 'jupyter-notebook RUNNING 60827', 'jupyter-notebook RUNNING 60828', 'jupyter-notebook RUNNING 60825', '']


In [2]:
# Echo the `spark` handle returned by get_mag_with_cluster_connection so the notebook displays it.
spark

In [3]:
# Load the base tables used throughout the notebook, in the same order as before,
# and bind each one to its module-level name.
author_affiliations, papers, ag, affiliations, prf = [
    mag.getDataframe(table_name)
    for table_name in (
        'PaperAuthorAffiliations',
        'Papers',
        'AuthorsGenderized',
        'Affiliations',
        'PaperRootField',
    )
]
# Left disabled, as in the original cell:
#paa_att = mag.getDataframe('PaperAuthorAffiliationsAttributes')

## ProjectPapers 

In [4]:
def paper_family_attributes(mag):
    """Select the first publication of every paper family and save it.

    Loads PaperAuthorAffiliations, Papers and PaperRootField, runs one SQL
    query through ``mag.query_sql`` and writes the result with
    ``mag.saveFile`` to "datacuration" / "ProjectPapersAllDocType.txt".

    What the query does:
      * keeps papers that have a PaperRootField row, IsStem = 1 and
        Year >= 2010;
      * groups them into families keyed by COALESCE(FamilyId, PaperId), so a
        paper without a FamilyId forms its own family;
      * ranks the papers of each family by Date, breaking ties by PaperId,
        and keeps only rank 1. The ranking only sees rows that passed the
        filter above, so "first" means first among the filtered papers;
      * attaches num_authors (distinct AuthorId per paper in
        PaperAuthorAffiliations; papers with no author rows are dropped by
        the inner join) and FieldOfStudyId from PaperRootField.

    Output columns: PaperId, FamilyId, FieldOfStudyId, DocType, Date,
    PubOrderInFamily, num_authors.

    Args:
        mag: object providing ``getDataframe(name)``, ``query_sql(sql)`` and
            ``saveFile(dataframe, folder, filename)``.

    Returns:
        None. The result is only written via ``mag.saveFile``.
    """
    # The SQL refers to these tables by name, so each one is loaded first.
    # NOTE(review): presumably getDataframe is what makes a table visible to
    # mag.query_sql — confirm in the MAG helper.
    for table_name in ('PaperAuthorAffiliations', 'Papers', 'PaperRootField'):
        mag.getDataframe(table_name)

    # The secondary sort key p.PaperId makes the rank-1 pick reproducible when
    # several papers of a family share the same Date; without it ROW_NUMBER
    # may number tied rows differently from run to run.
    # NOTE(review): Spark sorts NULLs first for ASC by default, so a paper with
    # a NULL Date would rank first in its family — confirm whether that can
    # occur for Year >= 2010 and whether NULLS LAST is wanted.
    query = """
    SELECT
        x.PaperId,
        x.FamilyId,
        prf1.FieldOfStudyId,
        x.DocType,
        x.Date,
        x.PubOrderInFamily,
        na.num_authors
    FROM (
        SELECT
            p.PaperId as PaperId,
            COALESCE(p.FamilyId, p.PaperId) as FamilyId,
            p.DocType as DocType,
            p.Date as Date,
            ROW_NUMBER() OVER(
                PARTITION BY COALESCE(p.FamilyId, p.PaperId)
                ORDER BY p.Date ASC, p.PaperId ASC
            ) AS PubOrderInFamily
        FROM Papers p
        INNER JOIN PaperRootField prf ON p.PaperId = prf.PaperId
        WHERE IsStem = 1 AND p.Year >= 2010
    ) x
    INNER JOIN (
        SELECT paa.PaperId, COUNT(DISTINCT(paa.AuthorId)) as num_authors
        FROM PaperAuthorAffiliations paa
        GROUP BY paa.PaperId
    ) na ON x.PaperId = na.PaperId
    INNER JOIN PaperRootField prf1 ON x.PaperId = prf1.PaperId
    WHERE PubOrderInFamily = 1
    """

    paper_families = mag.query_sql(query)
    mag.saveFile(paper_families, "datacuration", "ProjectPapersAllDocType.txt")
    

In [5]:
# Run the ProjectPapers build: executes the query and saves ProjectPapersAllDocType.txt.
paper_family_attributes(mag)

# Scientific Age

In [6]:
def scientic_age(mag):
    """Compute each author's earliest publication dates and save them.

    Loads PaperAuthorAffiliations, Papers and PaperRootField, then produces
    one row per AuthorId with:
      * MinDateStem - earliest Papers.Date among the author's papers with
        IsStem = 1 (NULL when the author has none);
      * MinDate     - earliest Papers.Date among all of the author's joined
        papers.

    The result is written with ``mag.saveFile`` to
    "datacuration" / "AuthorScientificAgeAllDocType.txt".

    NOTE(review): PaperRootField is INNER JOINed, so papers without a
    PaperRootField row contribute to neither column, MinDate included —
    confirm that this is intended.

    Args:
        mag: object providing ``getDataframe(name)``, ``query_sql(sql)`` and
            ``saveFile(dataframe, folder, filename)``.

    Returns:
        None.
    """
    # Load every table the SQL mentions; the returned frames are not needed here.
    for source_table in ('PaperAuthorAffiliations', 'Papers', 'PaperRootField'):
        mag.getDataframe(source_table)

    first_dates_sql = """
        SELECT paa.AuthorId, 
        MIN(CASE WHEN IsStem = 1 THEN p.Date ELSE null END) as MinDateStem,
        MIN(p.Date) as MinDate
        FROM PaperAuthorAffiliations paa 
        INNER JOIN PaperRootField prf ON paa.PaperId = prf.PaperId
        INNER JOIN Papers p ON paa.PaperId = p.PaperId
        GROUP BY paa.AuthorId
    """

    first_dates = mag.query_sql(first_dates_sql)
    mag.saveFile(
        first_dates,
        "datacuration",
        "AuthorScientificAgeAllDocType.txt",
    )


In [7]:
# Run the scientific-age build: executes the query and saves AuthorScientificAgeAllDocType.txt.
scientic_age(mag)

# PaperAuthorAffiliationsAttributes

In [8]:
# here the load is necessary: the query in paper_author_affiliations_with_attributes
# refers to ProjectPapersAllDocType and AuthorScientificAgeAllDocType by name, and
# that function does not load them itself.
# NOTE(review): presumably getDataframe is what makes the files saved by the two
# functions above visible to mag.query_sql — confirm in the MAG helper.
pp = mag.getDataframe('ProjectPapersAllDocType')
sa = mag.getDataframe('AuthorScientificAgeAllDocType')

In [9]:
def paper_author_affiliations_with_attributes(mag):
    """Attach author attributes to every project paper and save the rows.

    Joins ProjectPapersAllDocType to PaperAuthorAffiliations and, per author,
    to AuthorsGenderized and AuthorScientificAgeAllDocType (both INNER JOINs,
    so authors missing from either table are dropped). Affiliations is LEFT
    JOINed, so CountryCode is NULL when no affiliation matches.

    Output columns:
      * PaperID, AuthorId, Date;
      * Year, Month, Quarter - integer parts of the paper's Date;
      * Gender        - AuthorsGenderized.Genderized when it is 0 or 1,
                        otherwise -1;
      * ScientificAge - days between the paper's Date and the author's
                        MinDate, divided by 365.0;
      * CountryCode   - Affiliations.Iso3166Code.

    The result is written with ``mag.saveFile`` to
    "datacuration" / "PaperAuthorAffiliationsAttributesAllDocType.txt".

    This function does not load any table itself; every table named in the
    query must already be available to ``mag.query_sql``.

    Args:
        mag: object providing ``query_sql(sql)`` and
            ``saveFile(dataframe, folder, filename)``.

    Returns:
        None.
    """
    attributes_sql = """
    SELECT
        pp.PaperID,
        paa.AuthorId,
        pp.Date,
        INT(YEAR(pp.Date)) as Year,
        INT(MONTH(pp.Date)) as Month,
        INT(QUARTER(pp.Date)) as Quarter,
        CASE WHEN ag.Genderized IN (0,1) THEN ag.Genderized ELSE -1 END as Gender,
        DATEDIFF(pp.Date, asa.MinDate) / 365.0 as ScientificAge, 
        a.Iso3166Code as CountryCode
        FROM ProjectPapersAllDocType pp
        INNER JOIN PaperAuthorAffiliations paa ON pp.PaperId = paa.PaperId
        INNER JOIN AuthorsGenderized ag ON paa.AuthorId = ag.AuthorId 
        INNER JOIN AuthorScientificAgeAllDocType asa ON ag.AuthorId = asa.AuthorId 
        LEFT JOIN Affiliations a ON paa.AffiliationId = a.AffiliationId
    """

    enriched_rows = mag.query_sql(attributes_sql)
    mag.saveFile(
        enriched_rows,
        "datacuration",
        "PaperAuthorAffiliationsAttributesAllDocType.txt",
    )

In [10]:
# Run the attributes build: executes the query and saves PaperAuthorAffiliationsAttributesAllDocType.txt.
paper_author_affiliations_with_attributes(mag)

# ProjectAuthors

In [11]:
# Load the table saved by paper_author_affiliations_with_attributes; the query in project_authors refers to it by name.
paaa = mag.getDataframe('PaperAuthorAffiliationsAttributesAllDocType')

In [12]:
def project_authors(mag):
    """Save the genderized authors that appear in the project, with first-publication dates.

    Keeps every AuthorsGenderized row whose AuthorId occurs in
    PaperAuthorAffiliationsAttributesAllDocType and adds MinDate and
    MinDateStem from AuthorScientificAgeAllDocType. That table is LEFT JOINed,
    so both columns are NULL for an author without a row in it.

    The result is written with ``mag.saveFile`` to
    "datacuration" / "ProjectAuthorsAllDocType.txt".

    This function does not load any table itself; every table named in the
    query must already be available to ``mag.query_sql``.

    Args:
        mag: object providing ``query_sql(sql)`` and
            ``saveFile(dataframe, folder, filename)``.

    Returns:
        None.
    """
    # The scientific-age table is aliased `asa`, not `as`: AS is a SQL keyword
    # (reserved in standard SQL and in Spark's ANSI mode), so using it as a
    # table alias only parses under Spark's lenient default keyword handling.
    # `asa` is also the alias the attributes query uses for the same table.
    query = """
    SELECT ag.*, asa.MinDate, asa.MinDateStem
    FROM AuthorsGenderized ag
    LEFT JOIN AuthorScientificAgeAllDocType asa ON ag.AuthorId = asa.AuthorId
    WHERE ag.AuthorId IN (SELECT DISTINCT(AuthorId) FROM PaperAuthorAffiliationsAttributesAllDocType)
    """

    # Named differently from the function so the local does not shadow it.
    authors_in_project = mag.query_sql(query)
    mag.saveFile(authors_in_project, "datacuration", "ProjectAuthorsAllDocType.txt")

In [13]:
# Run the ProjectAuthors build: executes the query and saves ProjectAuthorsAllDocType.txt.
project_authors(mag)