## 01. Field of Study Top Authors  

In [None]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from CreateFunctions_mag import *
import pyspark.sql.functions as sf

In [None]:
# Notebook parameters.
# Root location of the MAG data in Azure blob storage; handed to every table
# loader called further down.
rootpath = 'wasbs://mag-2018-09-27@magtrainingsource.blob.core.windows.net/mag/'
# Prefix that the result file name is appended to when the output is written.
outputDir = '/output/jiaxin/pyspark/'
# Field of study to analyse; compared against the NormalizedName column.
fieldName = 'computer science'
# Number of top-ranked authors kept in the final result.
n_top = 100

Start Spark context

In [None]:
# Obtain the Spark entry points. Both calls are getOrCreate(), so a context or
# session that the notebook environment may already have started is reused and
# re-running this cell does not build a second session object.
sc = SparkContext.getOrCreate()
spark = SparkSession.builder.getOrCreate()

In [None]:
# Build the "top authors of a field of study" table and write it out as CSV.
#
# Every loaded table is bound to its own variable instead of rebinding the
# loader function of the same name, so this cell can be executed repeatedly
# (rebinding turns the loader into a DataFrame, and the next run fails with
# "'DataFrame' object is not callable").

# Resolve the id of the target field from its normalized name
fieldsOfStudyDf = FieldsOfStudy(rootpath, spark)

targetFieldOfStudyId = fieldsOfStudyDf \
    .where(sf.col("NormalizedName").isin(fieldName)) \
    .select("FieldOfStudyId")


# Get all paperIds for the field
paperFieldsOfStudyDf = PaperFieldsOfStudy(rootpath, spark)

fieldPaperIds = paperFieldsOfStudyDf \
    .join(targetFieldOfStudyId, "FieldOfStudyId", 'inner') \
    .select("PaperId")


# Count incoming references per referenced paper to get its citation count
paperReferencesDf = PaperReferences(rootpath, spark)

paperCitationCounts = paperReferencesDf \
    .groupby("PaperReferenceId") \
    .count() \
    .selectExpr("PaperReferenceId as PaperId", "count as CitationCount")


# Keep the citation counts of the field's papers only
Citation = paperCitationCounts.join(fieldPaperIds, "PaperId", 'inner')


# Use PaperAuthorAffiliations to get the field paper -> field author relationship.
# The table may contain several rows for the same (PaperId, AuthorId) pair
# (e.g. one per affiliation of that author on that paper). Reduce it to distinct
# pairs first, otherwise such papers are counted more than once per author.
paperAuthorAffiliationsDf = PaperAuthorAffiliations(rootpath, spark)

paperAuthors = paperAuthorAffiliationsDf \
    .select("PaperId", "AuthorId") \
    .distinct()

# Number of field papers per author
PublicationCount = paperAuthors \
    .join(fieldPaperIds, "PaperId", 'inner') \
    .groupby("AuthorId") \
    .count() \
    .selectExpr("AuthorId", "count as PublicationCount")

# Total citations received by each author's field papers
CitationCount = paperAuthors \
    .join(Citation, "PaperId", 'inner') \
    .groupby("AuthorId") \
    .agg(sf.sum("CitationCount").alias("CitationCount"))


# Then join against Authors to attach the author's display name
authorsDf = Authors(rootpath, spark)

authorPublications = authorsDf \
    .select("AuthorId", "DisplayName") \
    .join(PublicationCount, "AuthorId", 'inner') \
    .selectExpr("AuthorId", "DisplayName as Name", "PublicationCount")


# Combine publication and citation counts per author. The inner join drops
# authors that have no row in CitationCount.
FoS_Authors = authorPublications \
    .join(CitationCount, "AuthorId", 'inner') \
    .select("Name", "CitationCount", "PublicationCount")


# Get top n authors, ranked by publication count with citation count as the
# tie-breaker.
# NOTE(review): swap the two sort keys if ranking by citations is intended.
Top_Authors = FoS_Authors \
    .sort(sf.desc("PublicationCount"), sf.desc("CitationCount")) \
    .limit(n_top)

# Save results (an existing output at this path is overwritten)
Top_Authors.write.csv(outputDir + 'Top_Authors.csv', mode='overwrite', header='true')

Stop Spark context

In [None]:
# Stop the SparkContext now that the results have been written; the `sc` and
# `spark` handles cannot be used for further Spark work after this call.
sc.stop()