# 11. Field of Study Entity Counts   

In [None]:
from pyspark import SparkContext
from pyspark.sql import SparkSession, Window
from pyspark.sql.types import *
import pyspark.sql.functions as sf


# Base URI of the MAG dataset; input file names are appended to this when loading.
rootpath = 'wasbs://mag-2018-09-27@magtrainingsource.blob.core.windows.net/mag/'
# Path prefix under which this notebook writes its CSV results.
outputDir = '/output/jiaxin/pyspark/'
# NOTE(review): the three settings below are not referenced in any cell visible
# here; presumably shared with other notebooks in the series -- confirm before removing.
conferenceShortName = 'WWW'
confFieldCitationGrowthRateStartYear = 2013
conferenceAnalyticsBaseDir = '/output/conferenceAnalytics/'

In [None]:
# Reuse the active SparkContext if one exists, otherwise create one.
sc = SparkContext.getOrCreate()
# Wrap the context in a SparkSession; the cells below use spark.read to load data.
spark = SparkSession(sc)

In [None]:
# Get all [Paper] -> [FieldOfStudy] -> [Author] -> [Affiliation] relationships.
#
# Each input is read as tab-separated CSV with no header option, so column
# names are assigned positionally with toDF(). No schema is supplied, so every
# column is loaded as a string.
Papers = spark.read.load(rootpath + "Papers.txt", format="csv", sep="\t") \
    .toDF("PaperId", "Rank", "Doi", "DocType", "PaperTitle", "OriginalTitle", "BookTitle",
          "Year", "Date", "Publisher", "JournalId", "ConferenceSeriesId", "ConferenceInstanceId",
          "Volume", "Issue", "FirstPage", "LastPage", "ReferenceCount", "CitationCount",
          "EstimatedCitationCount", "CreatedDate")

PaperFieldsOfStudy = spark.read.load(rootpath + "PaperFieldsOfStudy.txt", format="csv", sep="\t") \
    .toDF("PaperId", "FieldOfStudyId", "Score")

PaperAuthorAffiliations = spark.read.load(rootpath + "PaperAuthorAffiliations.txt", format="csv", sep="\t") \
    .toDF("PaperId", "AuthorId", "AffiliationId", "AuthorSequenceNumber", "OriginalAffiliation")

# Inner-join the three tables on PaperId and keep only the columns needed for
# the per-field entity counts computed later.
paperAuthorAffiliationFieldsOfStudy = Papers.join(PaperFieldsOfStudy, "PaperId", 'inner') \
    .join(PaperAuthorAffiliations, "PaperId", 'inner') \
    .select("FieldOfStudyId", "PaperId", "AuthorId", "AffiliationId", "JournalId", "ConferenceSeriesId", "Year")

# Original U-SQL that the join above was translated from. Kept as a comment
# for reference only; as raw text it is not valid Python.
#
# @paperAuthorAffiliationFieldsOfStudy =
#     SELECT PaperFieldsOfStudy.FieldOfStudyId,
#            PaperFieldsOfStudy.PaperId,
#            PaperAuthorAffiliations.AuthorId,
#            PaperAuthorAffiliations.AffiliationId,
#            Papers.JournalId,
#            Papers.ConferenceSeriesId,
#            Papers.Year
#     FROM Papers
#          INNER JOIN
#              PaperFieldsOfStudy
#          ON Papers.PaperId == PaperFieldsOfStudy.PaperId
#          INNER JOIN
#              PaperAuthorAffiliations
#          ON Papers.PaperId == PaperAuthorAffiliations.PaperId;


# Get distinct affiliation, author, journal and conference counts for each
# field of study by year.
#
# Translated from the original U-SQL:
#
# @authorAffiliationJournalConferenceCountByYearAndFos =
#     SELECT COUNT(DISTINCT AffiliationId) AS AffiliationCount,
#            COUNT(DISTINCT AuthorId) AS AuthorCount,
#            COUNT(DISTINCT JournalId) AS JournalCount,
#            COUNT(DISTINCT ConferenceSeriesId) AS ConferenceCount,
#            Year,
#            FieldOfStudyId
#     FROM @paperAuthorAffiliationFieldsOfStudy
#     GROUP BY Year,
#              FieldOfStudyId;
authorAffiliationJournalConferenceCountByYearAndFos = paperAuthorAffiliationFieldsOfStudy \
    .groupby("Year", "FieldOfStudyId") \
    .agg(sf.countDistinct("AffiliationId").alias("AffiliationCount"),
         sf.countDistinct("AuthorId").alias("AuthorCount"),
         sf.countDistinct("JournalId").alias("JournalCount"),
         sf.countDistinct("ConferenceSeriesId").alias("ConferenceCount"))


# Re-format result from {FieldOfStudyId, Year, AuthorCount, AffiliationCount, JournalCount, ConferenceCount} to
# {FieldOfStudyId, Year, EntityType, Count} for easier consumption by visual components.
#
# Translated from the original U-SQL, which UNION ALLs one SELECT per entity type:
#
# @entityStatsByYearAndFos =
#     SELECT FieldOfStudyId, Year, "Affiliation" AS EntityType, AffiliationCount AS Count
#     FROM @authorAffiliationJournalConferenceCountByYearAndFos
#         UNION ALL
#     SELECT FieldOfStudyId, Year, "Author" AS EntityType, AuthorCount AS Count
#     FROM @authorAffiliationJournalConferenceCountByYearAndFos
#         UNION ALL
#     SELECT FieldOfStudyId, Year, "Journal" AS EntityType, JournalCount AS Count
#     FROM @authorAffiliationJournalConferenceCountByYearAndFos
#         UNION ALL
#     SELECT FieldOfStudyId, Year, "Conference" AS EntityType, ConferenceCount AS Count
#     FROM @authorAffiliationJournalConferenceCountByYearAndFos;

# (EntityType label, source count column), in the same order as the U-SQL query.
entityCountColumns = [
    ("Affiliation", "AffiliationCount"),
    ("Author", "AuthorCount"),
    ("Journal", "JournalCount"),
    ("Conference", "ConferenceCount"),
]

entityStatsByYearAndFos = None
for entityType, countColumn in entityCountColumns:
    entityStats = authorAffiliationJournalConferenceCountByYearAndFos.select(
        "FieldOfStudyId",
        "Year",
        sf.lit(entityType).alias("EntityType"),
        sf.col(countColumn).alias("Count"))
    # DataFrame.union keeps duplicates and matches columns by position, i.e. it
    # behaves like SQL UNION ALL; every slice has the same four columns in order.
    if entityStatsByYearAndFos is None:
        entityStatsByYearAndFos = entityStats
    else:
        entityStatsByYearAndFos = entityStatsByYearAndFos.union(entityStats)


entityStatsByYearAndFos.write.csv(outputDir + "entityStatsByYearAndFos.csv", mode='overwrite', header='true')

In [None]:
# Output all [FieldOfStudyId, Name] pairs for easier lookup in visuals.
#
# Translated from the original U-SQL:
#
# @fieldsOfStudy =
#     SELECT FieldOfStudyId,
#            NormalizedName AS Name
#     FROM FieldsOfStudy;
#
# FieldsOfStudy.txt is read the same way as the other inputs (tab-separated,
# no header option), so Spark names the columns _c0, _c1, ...
# NOTE(review): assumes column 0 is FieldOfStudyId and column 2 is
# NormalizedName, per the published MAG FieldsOfStudy schema -- confirm against
# the 2018-09-27 snapshot. Selecting by position avoids hard-coding the total
# column count, which toDF() would require to match exactly.
FieldsOfStudy = spark.read.load(rootpath + "FieldsOfStudy.txt", format="csv", sep="\t")

fieldsOfStudy = FieldsOfStudy.select(
    sf.col("_c0").alias("FieldOfStudyId"),
    sf.col("_c2").alias("Name"))

fieldsOfStudy.write.csv(outputDir + "fieldsOfStudy.csv", mode='overwrite', header='true')

In [None]:
# Final cell: shut down the SparkContext created at the top of the notebook.
sc.stop()