# 01. Field of Study Top Authors  

In [None]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as sf 


# --- Job parameters -------------------------------------------------------
# Normalized name of the field of study whose authors are ranked
targetFoS = 'computer science'
# Maximum number of authors kept in the final ranking
n_top = 100
# Base location (wasbs:// URI) the tab-separated input files are read from
rootpath = 'wasbs://mag-2018-09-27@magtrainingsource.blob.core.windows.net/mag/'
# Directory prefix the result CSV is written under
outputDir = '/output/user99/pyspark/'

In [None]:
# Reuse the active SparkContext when there is one (otherwise create it),
# then wrap it in a SparkSession for the DataFrame API used below.
sc = SparkContext.getOrCreate()
spark = SparkSession(sparkContext=sc)


# Look up the FieldOfStudyId of the target field: read the headerless,
# tab-separated FieldsOfStudy file, name its columns, keep only the rows whose
# NormalizedName matches targetFoS, and retain just the id column.
FieldsOfStudy = (
    spark.read
    .format("csv")
    .option("sep", "\t")
    .load(rootpath + "FieldsOfStudy.txt")
    .toDF(
        "FieldOfStudyId", "Rank", "NormalizedName", "DisplayName", "MainType",
        "Level", "PaperCount", "CitationCount", "CreatedDate",
    )
    .filter(sf.col("NormalizedName").isin(targetFoS))
    .select("FieldOfStudyId")
)


# Collect the ids of every paper tagged with the target field: the inner join
# on FieldOfStudyId discards papers belonging to other fields.
PaperFieldsOfStudy = (
    spark.read
    .format("csv")
    .option("sep", "\t")
    .load(rootpath + "PaperFieldsOfStudy.txt")
    .toDF("PaperId", "FieldOfStudyId", "Score")
    .join(FieldsOfStudy, on="FieldOfStudyId", how='inner')
    .select("PaperId")
)


# Count incoming references per referenced paper (across all papers, not only
# the target field). Each row of PaperReferences is one
# [citing paper] -> [referenced paper] edge, so the group size is the number
# of citations the referenced paper received. The result is re-keyed as
# (PaperId, CitationCount) so it can be joined on PaperId.
PaperReferences = (
    spark.read
    .format("csv")
    .option("sep", "\t")
    .load(rootpath + "PaperReferences.txt")
    .toDF("PaperId", "PaperReferenceId")
    .groupBy("PaperReferenceId")
    .count()
    .selectExpr("PaperReferenceId as PaperId", "count as CitationCount")
)


# Restrict the per-paper citation counts to papers in the target field.
# Inner join: field papers that were never referenced have no row in
# PaperReferences and therefore do not appear here.
Citation = PaperReferences.join(PaperFieldsOfStudy, on="PaperId", how='inner')


# Number of target-field papers per author.
#
# PaperAuthorAffiliations carries an AffiliationId column, so an author who
# lists several affiliations on one paper may occupy several rows for that
# paper. Reduce to distinct (PaperId, AuthorId) pairs first so each paper is
# counted once per author, then keep only field papers and count per author.
PublicationCount = (
    spark.read.load(rootpath + "PaperAuthorAffiliations.txt", format="csv", sep="\t")
    .toDF("PaperId", "AuthorId", "AffiliationId", "AuthorSequenceNumber", "OriginalAffiliation")
    .select("PaperId", "AuthorId")
    .distinct()
    .join(PaperFieldsOfStudy, "PaperId", 'inner')
    .groupby("AuthorId")
    .count()
    .withColumnRenamed("count", "PublicationCount")
)


# Total citations received by each author's target-field papers.
#
# As with the publication count, PaperAuthorAffiliations may hold one row per
# (paper, author, affiliation). Without de-duplication a paper's citation
# count would be added once per affiliation row, inflating the author's total.
# Keep distinct (PaperId, AuthorId) pairs, attach each field paper's citation
# count, and sum per author.
CitationCount = (
    spark.read.load(rootpath + "PaperAuthorAffiliations.txt", format="csv", sep="\t")
    .toDF("PaperId", "AuthorId", "AffiliationId", "AuthorSequenceNumber", "OriginalAffiliation")
    .select("PaperId", "AuthorId")
    .distinct()
    .join(Citation, "PaperId", 'inner')
    .groupby("AuthorId")
    .agg(sf.sum("CitationCount").alias("CitationCount"))
)


# Attach display names to the per-author publication counts. Only AuthorId and
# DisplayName are kept from the Authors file (its own PaperCount/CitationCount
# columns are dropped); the inner join keeps only authors that have a
# PublicationCount row. DisplayName is exposed as Name.
Authors = (
    spark.read
    .format("csv")
    .option("sep", "\t")
    .load(rootpath + "Authors.txt")
    .toDF(
        "AuthorId", "Rank", "NormalizedName", "DisplayName", "LastKnownAffiliationId",
        "PaperCount", "CitationCount", "CreatedDate",
    )
    .select("AuthorId", "DisplayName")
    .join(PublicationCount, on="AuthorId", how='inner')
    .selectExpr("AuthorId as AuthorId", "DisplayName as Name", "PublicationCount as PublicationCount")
)


# Combine each author's name and publication count with their citation count.
# Inner join on AuthorId: authors without a CitationCount row are dropped.
FoS_Authors = (
    Authors
    .join(CitationCount, on="AuthorId", how='inner')
    .select("Name", "CitationCount", "PublicationCount")
)


# Keep the first n_top authors, ordered by publication count (descending) with
# citation count (descending) as the tie-breaker.
# NOTE(review): if the ranking is meant to be by citations first, swap the two
# sort keys - confirm the intended ordering.
Top_Authors = (
    FoS_Authors
    .orderBy(sf.col("PublicationCount").desc(), sf.col("CitationCount").desc())
    .limit(n_top)
)

# Write the ranking as CSV with a header row, replacing any existing output
# at the same path.
(
    Top_Authors.write
    .mode('overwrite')
    .option('header', 'true')
    .csv(outputDir + "Top_Authors.csv")
)

# Shut down the Spark context now that the job is finished
sc.stop()