# 05. Conference Top Authors  

In [None]:
from pyspark import SparkContext
from pyspark.sql import SparkSession, Window
from pyspark.sql.types import *
import pyspark.sql.functions as sf


# --- Notebook parameters -----------------------------------------------------

# Input location of the graph text files and output location for the results.
rootpath = "wasbs://mag-2018-09-27@magtrainingsource.blob.core.windows.net/mag/"
outputDir = "/output/jiaxin/pyspark/"

# Conference to analyse; compared against ConferenceSeries.NormalizedName.
conferenceShortName = "WWW"

# Sizes of the result lists.
conferenceTopPapersCount = 20
conferenceMostCitedAuthorsCount = 20

# Prefix prepended to an entity id to build its details-page URL.
maDetailPagePrefix = "https://academic.microsoft.com/#/detail/"

In [None]:
# Reuse the active SparkContext if one exists (otherwise create one) and wrap
# it in a SparkSession, which provides the `spark.read` entry point used by
# the cells below.
sc = SparkContext.getOrCreate()
spark = SparkSession(sc)

In [None]:
# Look up the ConferenceSeriesId whose NormalizedName equals the configured
# conference short name. The file is tab-separated with no header row, so the
# column names are assigned positionally.
_conferenceSeriesColumns = [
    "ConferenceSeriesId", "Rank", "NormalizedName", "DisplayName",
    "PaperCount", "CitationCount", "CreatedDate",
]
targetConferenceSeriesId = (
    spark.read.load(rootpath + "ConferenceSeries.txt", format="csv", sep="\t")
    .toDF(*_conferenceSeriesColumns)
    .where(sf.col("NormalizedName").isin(conferenceShortName))
    .select("ConferenceSeriesId")
)


# Keep only the papers that belong to the target conference series. The inner
# join against the single-column series-id DataFrame acts as the filter; only
# the columns needed downstream are retained.
_paperColumns = [
    "PaperId", "Rank", "Doi", "DocType", "PaperTitle", "OriginalTitle",
    "BookTitle", "Year", "Date", "Publisher", "JournalId",
    "ConferenceSeriesId", "ConferenceInstanceId", "Volume", "Issue",
    "FirstPage", "LastPage", "ReferenceCount", "CitationCount",
    "EstimatedCitationCount", "CreatedDate",
]
conferencePapers = (
    spark.read.load(rootpath + "Papers.txt", format="csv", sep="\t")
    .toDF(*_paperColumns)
    .join(targetConferenceSeriesId, "ConferenceSeriesId", "inner")
    .select("PaperId", "CitationCount", "OriginalTitle", "Year")
)


# Get all [conference paper] -> [conference author] relationships.
# PaperAuthorAffiliations carries an AffiliationId per row, so the same
# (paper, author) pair can be listed once per affiliation. After projecting
# to (PaperId, AuthorId, CitationCount) such rows are exact duplicates;
# distinct() removes them so each paper is counted once per author in the
# publication and citation aggregates computed later.
conferenceAuthorsPapers = (
    spark.read.load(rootpath + "PaperAuthorAffiliations.txt", format="csv", sep="\t")
    .toDF("PaperId", "AuthorId", "AffiliationId", "AuthorSequenceNumber", "OriginalAffiliation")
    .join(conferencePapers, "PaperId", 'inner')
    .select("PaperId", "AuthorId", "CitationCount")
    .distinct()
)

# Author id -> display name lookup.
Author = (
    spark.read.load(rootpath + "Authors.txt", format="csv", sep="\t")
    .toDF("AuthorId", "Rank", "NormalizedName", "DisplayName", "LastKnownAffiliationId",
          "PaperCount", "CitationCount", "CreatedDate")
    .select("AuthorId", "DisplayName")
)

# Attach the author's display name to every (paper, author) pair.
conferenceAuthorsPapers = (
    conferenceAuthorsPapers.join(Author, "AuthorId", 'inner')
    .selectExpr("DisplayName as AuthorName", "AuthorId", "PaperId", "CitationCount")
)


# Aggregate citation and publication count for each author.
#
# The publication count is computed with a single group-by that carries
# AuthorName along with AuthorId. Joining the per-author count back onto the
# per-paper rows (as was done before) produced one output row per
# (author, paper), so an author with N conference papers appeared N times in
# the result and could occupy several slots of the top list.
conferenceTopAuthors1 = (
    conferenceAuthorsPapers.groupBy("AuthorId", "AuthorName")
    .count()
    .selectExpr("AuthorName", "count as PublicationCount", "AuthorId")
)

# CitationCount comes from a CSV read without a schema, i.e. it is a string
# column. Cast it explicitly so the total is an integer rather than the result
# of an implicit string-to-double conversion.
# NOTE(review): the list is truncated with conferenceTopPapersCount; confirm
# that this (and not conferenceMostCitedAuthorsCount) is the intended size.
conferenceTopAuthors2 = (
    conferenceAuthorsPapers.groupBy("AuthorId")
    .agg(sf.sum(sf.col("CitationCount").cast("long")).alias("CitationCount"))
    .join(conferenceTopAuthors1, "AuthorId", 'inner')
    .withColumn("DetailsUrl", sf.concat(sf.lit(maDetailPagePrefix), "AuthorId"))
    .select("AuthorName", "DetailsUrl", "PublicationCount", "CitationCount")
    .orderBy("CitationCount", ascending=False)
    .limit(conferenceTopPapersCount)
)


# Add two rank columns, one by citation total and one by publication count,
# so the two orderings can be plotted against each other on the x and y axis.
# Both windows are unpartitioned: every row is ranked against all others.
_byCitations = Window.orderBy(sf.desc("CitationCount"))
_byPublications = Window.orderBy(sf.desc("PublicationCount"))

conferenceTopAuthors = (
    conferenceTopAuthors2
    .withColumn("AuthorCitationRank", sf.rank().over(_byCitations))
    .withColumn("AuthorPublicationRank", sf.rank().over(_byPublications))
)

# Persist the ranked authors as CSV with a header row, replacing any previous output.
conferenceTopAuthors.write.csv(
    outputDir + 'conferenceTopAuthors.csv',
    mode='overwrite',
    header='true',
)


# Get all references from [conference paper] -> [referenced paper]





# Get all conference paper -> paper -> author relationships to find most cited authors  





# Aggregate publication and citation count for each author cited by conference paper  






# Aggregate yearly citations received from conference papers for the most cited authors






# Save results
# NOTE(review): `conferenceTopPapers` is not assigned anywhere in the code
# visible in this notebook (the sections above this line are empty
# placeholders), so this statement would raise NameError unless the name is
# defined elsewhere. Confirm which DataFrame is meant to be written here and
# whether the output file name should match it.
conferenceTopPapers.write.csv(outputDir + "conferenceTopPapers.csv", mode='overwrite', header='true')

In [None]:
# Stop the SparkContext obtained at the top of the notebook.
sc.stop()