# 11. Field of Study Entity Counts   

In [None]:
from pyspark import SparkContext
from pyspark.sql import SparkSession, Window
from pyspark.sql.types import *
import pyspark.sql.functions as sf


# Notebook-wide configuration.
# rootpath: prefix of the Azure Blob (wasbs://) container holding the MAG
#           tab-separated dump; every input table below is read relative to it.
# outputDir: prefix under which the result CSV folders are written.
rootpath = 'wasbs://mag-2018-09-27@magtrainingsource.blob.core.windows.net/mag/'
outputDir = '/output/jiaxin/pyspark/'
# NOTE(review): the two conference settings below are not referenced by any
# cell visible in this notebook; presumably shared boilerplate -- confirm
# before removing.
conferenceShortName = 'WWW'
conferenceAnalyticsBaseDir = '/output/conferenceAnalytics/'

In [None]:
# Obtain the SparkContext (getOrCreate) and wrap it in a SparkSession,
# which provides the DataFrame reader used by the cells below.
sc = SparkContext.getOrCreate()
spark = SparkSession(sc)

In [None]:
# Get all [Paper] -> [FieldOfStudy] -> [Author] -> [Affiliation] relationships.
# The MAG files are header-less TSVs, so column names are assigned positionally
# with toDF(); all columns are read as strings (no schema is supplied).
Papers = spark.read.load(rootpath + "Papers.txt", format="csv", sep="\t") \
.toDF("PaperId", "Rank", "Doi", "DocType", "PaperTitle", "OriginalTitle", "BookTitle", 
      "Year", "Date", "Publisher", "JournalId", "ConferenceSeriesId", "ConferenceInstanceId", 
      "Volume", "Issue", "FirstPage", "LastPage", "ReferenceCount", "CitationCount", 
      "EstimatedCitationCount", "CreatedDate")

PaperFieldsOfStudy = spark.read.load(rootpath + "PaperFieldsOfStudy.txt", format="csv", sep="\t") \
.toDF("PaperId", "FieldOfStudyId", "Score")

PaperAuthorAffiliations = spark.read.load(rootpath + "PaperAuthorAffiliations.txt", format="csv", sep="\t") \
.toDF("PaperId", "AuthorId", "AffiliationId", "AuthorSequenceNumber", "OriginalAffiliation")

# One row per (paper, field of study, author/affiliation) combination, keeping
# only the columns needed for the distinct counts below.
paperAuthorAffiliationFieldsOfStudy = Papers.join(PaperFieldsOfStudy, "PaperId", 'inner') \
.join(PaperAuthorAffiliations, "PaperId", 'inner') \
.select("FieldOfStudyId", "PaperId", "AuthorId", "AffiliationId", "JournalId", "ConferenceSeriesId", "Year")


# Get distinct affiliation, author, journal and conference count for each field of study by year
authorAffiliationJournalConferenceCountByYearAndFos = paperAuthorAffiliationFieldsOfStudy.groupby("Year", "FieldOfStudyId") \
.agg(sf.countDistinct("AffiliationId").alias("AffiliationCount"), sf.countDistinct("AuthorId").alias("AuthorCount"), 
     sf.countDistinct("JournalId").alias("JournalCount"), sf.countDistinct("ConferenceSeriesId").alias("ConferenceCount"))


# Re-format result from {FieldOfStudyId, Year, AuthorCount, AffiliationCount, JournalCount, ConferenceCount} to 
# {FieldOfStudyId, Year, EntityType, Count} for easier consumption by visual components.
# After the aggregation only Year, FieldOfStudyId and the four *Count columns
# exist, so EntityType has to be a string literal naming the entity kind; the
# raw id columns (ConferenceSeriesId, AffiliationId, ...) are no longer
# available and referencing them fails analysis.
# union() matches columns by position, so every branch selects the same
# four columns in the same order.
entityStatsByYearAndFos = authorAffiliationJournalConferenceCountByYearAndFos.selectExpr("FieldOfStudyId", "Year", "'Conference' as EntityType", "ConferenceCount as Count") \
.union(authorAffiliationJournalConferenceCountByYearAndFos.selectExpr("FieldOfStudyId", "Year", "'Affiliation' as EntityType", "AffiliationCount as Count")) \
.union(authorAffiliationJournalConferenceCountByYearAndFos.selectExpr("FieldOfStudyId", "Year", "'Author' as EntityType", "AuthorCount as Count")) \
.union(authorAffiliationJournalConferenceCountByYearAndFos.selectExpr("FieldOfStudyId", "Year", "'Journal' as EntityType", "JournalCount as Count"))

# Written as a folder of CSV part files with a header row; an existing output is replaced.
entityStatsByYearAndFos.write.csv(outputDir + "entityStatsByYearAndFos.csv", mode='overwrite', header='true')

In [None]:
# output all [FieldOfStudyId, Name] for easier lookup in visuals
# FieldsOfStudy is not loaded by any earlier cell, so read it here the same
# way as the other MAG tables (header-less TSV, names assigned positionally).
# NOTE(review): the column list assumes the 2018-09 MAG FieldsOfStudy schema
# (9 columns) -- confirm against the dataset; toDF() fails on a count mismatch.
FieldsOfStudy = spark.read.load(rootpath + "FieldsOfStudy.txt", format="csv", sep="\t") \
.toDF("FieldOfStudyId", "Rank", "NormalizedName", "DisplayName", "MainType", "Level",
      "PaperCount", "CitationCount", "CreatedDate")

fieldsOfStudy = FieldsOfStudy.selectExpr("FieldOfStudyId", "NormalizedName as Name")

# Written as a folder of CSV part files with a header row; an existing output is replaced.
fieldsOfStudy.write.csv(outputDir + "fieldsOfStudy.csv", mode='overwrite', header='true')

In [None]:
# Shut down the SparkContext now that all outputs have been written.
sc.stop()