# 02. Conference Top Authors By Static Rank   

In [None]:
from pyspark import SparkContext
from pyspark.sql import SparkSession, Window
from pyspark.sql.types import *
import pyspark.sql.functions as sf


# Job parameters.
n_top = 20          # number of authors kept in the final ranking
targetConf = "WWW"  # compared with ConferenceSeries.NormalizedName
# Prefix of the tab-separated graph files read by this notebook.
rootpath = "wasbs://mag-2018-09-27@magtrainingsource.blob.core.windows.net/mag/"
# Prefix under which the result CSV is written.
outputDir = "/output/jiaxin/pyspark/"

In [None]:
# Reuse the running SparkContext (or start one), then obtain the session through
# the builder: getOrCreate() hands back the already-active SparkSession when one
# exists instead of constructing another session object around the same context
# every time this cell is re-run.
sc = SparkContext.getOrCreate()
spark = SparkSession.builder.getOrCreate()

In [None]:
# Resolve the conference series id by matching the conference short name
# (NormalizedName) against targetConf.
_conference_series_columns = [
    "ConferenceSeriesId", "Rank", "NormalizedName", "DisplayName", "PaperCount",
    "CitationCount", "CreatedDate",
]
ConferenceSeries = (
    spark.read.load(rootpath + "ConferenceSeries.txt", format="csv", sep="\t")
    .toDF(*_conference_series_columns)
    .filter(sf.col("NormalizedName").isin(targetConf))
    .select("ConferenceSeriesId")
)


# Conference papers: keep only the papers that belong to the selected series.
_paper_columns = [
    "PaperId", "Rank", "Doi", "DocType", "PaperTitle", "OriginalTitle", "BookTitle",
    "Year", "Date", "Publisher", "JournalId", "ConferenceSeriesId", "ConferenceInstanceId",
    "Volume", "Issue", "FirstPage", "LastPage", "ReferenceCount", "CitationCount",
    "EstimatedCitationCount", "CreatedDate",
]
Papers = (
    spark.read.load(rootpath + "Papers.txt", format="csv", sep="\t")
    .toDF(*_paper_columns)
    .join(ConferenceSeries, on="ConferenceSeriesId", how="inner")
    # Rank is stored in the graph as LogProbRank * -1000, so scale it back.
    .selectExpr("PaperId", "Rank * -0.001 as LogProbRank")
)


# Get all [conference paper] -> [conference author] relationships, one row per
# (paper, author) pair carrying that paper's LogProbRank.
#
# NOTE(review): PaperAuthorAffiliations has an AffiliationId column, so an author
# listed with several affiliations on the same paper presumably appears once per
# affiliation (confirm against the MAG schema). Without de-duplication such a
# paper would be counted several times in the per-author sums and counts built
# from this frame. distinct() runs after the join, i.e. on the conference-only
# rows, and is safe because LogProbRank is fully determined by PaperId.
Conf_Authors = (
    spark.read.load(rootpath + "PaperAuthorAffiliations.txt", format="csv", sep="\t")
    .toDF("PaperId", "AuthorId", "AffiliationId", "AuthorSequenceNumber", "OriginalAffiliation")
    .join(Papers, "PaperId", 'inner')
    .select("PaperId", "AuthorId", "LogProbRank")
    .distinct()
    .select("AuthorId", "LogProbRank")
)


# Aggregate static rank and publication count for each author.
#
# Authors are grouped by AuthorId (with DisplayName carried along for output):
# grouping by DisplayName alone would merge different people who happen to share
# a name into a single, inflated entry.
Authors = (
    spark.read.load(rootpath + "Authors.txt", format="csv", sep="\t")
    .toDF("AuthorId", "Rank", "NormalizedName", "DisplayName", "LastKnownAffiliationId",
          "PaperCount", "CitationCount", "CreatedDate")
    .join(Conf_Authors, "AuthorId", 'inner')
    .select("AuthorId", "DisplayName", "LogProbRank")
)

# One pass over the data yields both measures, so no self-join is needed:
#   AuthorProbRank   - sum of exp(LogProbRank) over the author's conference papers
#   PublicationCount - number of rows (papers) contributing to that sum
_author_stats = Authors.groupby("AuthorId", "DisplayName").agg(
    sf.sum(sf.exp("LogProbRank")).alias("AuthorProbRank"),
    sf.count(sf.lit(1)).alias("PublicationCount"),
)

# Kept as thin (lazy) projections so anything still referring to these names works.
AuthorRank = _author_stats.select("AuthorId", "DisplayName", "AuthorProbRank")
PublicationCounts = _author_stats.select("AuthorId", "DisplayName", "PublicationCount")

# Top n_top authors by summed probability rank.
AuthorProbRank = (
    _author_stats
    .sort(sf.desc("AuthorProbRank"))
    .limit(n_top)
)


# Build two rankings of the selected authors, computed by different methods, to
# display on the x and y axis for comparison. Both windows order ascending, so
# the largest rank number goes to the highest score.
_by_summed_rank = Window.orderBy("AuthorProbRank")
# Consider the average quality of a paper instead of the sum over all papers.
_by_average_rank = Window.orderBy(sf.col("AuthorProbRank") / sf.col("PublicationCount"))

conferenceAuthorRanks = AuthorProbRank.select(
    "DisplayName",
    "PublicationCount",
    sf.rank().over(_by_summed_rank).alias("AuthorRank"),
    sf.rank().over(_by_average_rank).alias("AuthorNormalizedRank"),
)


# Persist the ranking as CSV with a header row, replacing any earlier output.
(
    conferenceAuthorRanks.write
    .mode('overwrite')
    .option('header', 'true')
    .csv(outputDir + "conferenceAuthorRanks.csv")
)

In [None]:
# Shut down the SparkContext obtained at the top of the notebook; this is the
# last cell, after the results have been written.
sc.stop()