# 01. Field of Study Top Authors  

In [None]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, desc
from pyspark.sql.types import *


# Run configuration shared by the cells below.
# Base URI (Azure Blob Storage, wasbs scheme) that the tab-separated input
# file names are appended to.
rootpath = 'wasbs://mag-2018-09-27@magtrainingsource.blob.core.windows.net/mag/'
# Output directory; in this notebook it is only referenced by a commented-out
# write step.
outputDir = '/output/jiaxin/pyspark/'
# Value matched against the NormalizedName column of FieldsOfStudy.
targetFoS = 'computer science'
# Number of top-ranked authors to keep.
n_top = 100

In [None]:
# Start (or reuse) the Spark session and expose its context.
# SparkSession.builder.getOrCreate() is the documented entry point: it returns
# the already-running session when one exists instead of wrapping the context
# in a second, hand-constructed SparkSession.
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext


# Look up the FieldOfStudyId of the target field.
# The file is read as headerless tab-separated text, so column names are
# assigned positionally with toDF before filtering on NormalizedName.
_FieldsOfStudy = (
    spark.read.format("csv")
    .option("sep", "\t")
    .load(rootpath + "FieldsOfStudy.txt")
    .toDF(
        "FieldOfStudyId",
        "Rank",
        "NormalizedName",
        "DisplayName",
        "MainType",
        "Level",
        "PaperCount",
        "CitationCount",
        "CreatedDate",
    )
    .filter(col("NormalizedName").isin(targetFoS))
    .select("FieldOfStudyId")
)


# Collect the PaperId of every paper tagged with the target field:
# the inner join keeps only rows whose FieldOfStudyId was selected above.
_PaperFieldsOfStudy = (
    spark.read.format("csv")
    .option("sep", "\t")
    .load(rootpath + "PaperFieldsOfStudy.txt")
    .toDF("PaperId", "FieldOfStudyId", "Score")
    .join(_FieldsOfStudy, on="FieldOfStudyId", how="inner")
    .select("PaperId")
)


# Count incoming references per referenced paper (over all papers, not only
# the target field) and expose the result as (PaperId, CitationCount).
_PaperReferences = (
    spark.read.format("csv")
    .option("sep", "\t")
    .load(rootpath + "PaperReferences.txt")
    .toDF("PaperId", "PaperReferenceId")
    .groupBy("PaperReferenceId")
    .count()
    .withColumnRenamed("PaperReferenceId", "PaperId")
    .withColumnRenamed("count", "CitationCount")
)


# Restrict the citation counts to papers of the target field.
# Inner join: field papers without any incoming reference are dropped here.
_ReferencesCount = _PaperReferences.join(
    _PaperFieldsOfStudy,
    on="PaperId",
    how="inner",
)


# Join against PaperAuthorAffiliations to get field paper -> field author
# relationships, carrying each paper's CitationCount along.
# The table also has an AffiliationId column, so the same (paper, author)
# pair can appear on several rows (presumably one per affiliation -- confirm
# against the MAG schema). De-duplicate so a paper is counted once per author
# in the per-author aggregation that follows.
_paperauthoraffiliation = (
    spark.read.load(rootpath + "PaperAuthorAffiliations.txt", format="csv", sep="\t")
    .toDF("PaperId", "AuthorId", "AffiliationId", "AuthorSequenceNumber", "OriginalAffiliation")
    .join(_ReferencesCount, "PaperId", "inner")
    .select("AuthorId", "PaperId", "CitationCount")
    .distinct()
)


# Join against Authors to attach each author's display name, then aggregate
# per author to calculate the publication and citation counts.
#
# Only AuthorId and DisplayName are kept from Authors before the join: the
# table has its own CitationCount column, which would otherwise clash with
# the per-paper CitationCount coming from _paperauthoraffiliation.
# DisplayName is part of the grouping key so it survives the aggregation
# (a plain groupby("AuthorId").count() leaves only AuthorId and count).
_Authors = (
    spark.read.load(rootpath + "Authors.txt", format="csv", sep="\t")
    .toDF("AuthorId", "Rank", "NormalizedName", "DisplayName", "LastKnownAffiliationId",
          "PaperCount", "CitationCount", "CreatedDate")
    .select("AuthorId", "DisplayName")
    .join(_paperauthoraffiliation, "AuthorId", "inner")
    .groupby("AuthorId", "DisplayName")
    # Dict form of agg names its outputs "sum(CitationCount)" and
    # "count(PaperId)"; they are renamed right below.
    .agg({"CitationCount": "sum", "PaperId": "count"})
    .withColumnRenamed("sum(CitationCount)", "CitationCount")
    .withColumnRenamed("count(PaperId)", "PublicationCount")
    .selectExpr("AuthorId", "DisplayName as Name", "CitationCount", "PublicationCount")
)
# To persist the result instead of displaying it:
# _Authors.write.save(outputDir + "Top_Authors.csv", format="csv", mode="overwrite")


# Get top n authors by citation count (ties broken by publication count).
# show() returns None, so keep the DataFrame in Fos_Authors and display it in
# a separate step; pass n_top because show() prints only 20 rows by default.
Fos_Authors = _Authors.sort(desc("CitationCount"), desc("PublicationCount")).limit(n_top)
Fos_Authors.show(n_top)


# Stop the Spark context now that the results have been displayed.
# No Spark operation in this notebook can run after this call unless the
# context is created again.
sc.stop()