# 04. Conference Papers Basic Statistics

In [None]:
from pyspark import SparkContext
from pyspark.sql import SparkSession, Window
from pyspark.sql.types import *
import pyspark.sql.functions as sf


# Job configuration: conference short name to analyse, input location of the
# MAG text files, and the folder that receives the results.
targetConf = "WWW"
rootpath = "wasbs://mag-2018-09-27@magtrainingsource.blob.core.windows.net/mag/"
outputDir = "/output/user99/pyspark/"

In [None]:
# Start (or attach to) Spark.
# The session is obtained through the builder, which is the documented entry
# point: it returns the already-active session when one exists instead of
# wrapping the context in a freshly constructed SparkSession on every run.
sc = SparkContext.getOrCreate()
spark = SparkSession.builder.getOrCreate()


# Resolve the ConferenceSeriesId of the series whose NormalizedName matches
# targetConf.  The file is read without a header option, so the columns are
# named positionally through toDF().
conferenceSeriesColumns = [
    "ConferenceSeriesId", "Rank", "NormalizedName", "DisplayName",
    "PaperCount", "CitationCount", "CreatedDate",
]
targetConferenceSeriesId = (
    spark.read.load(rootpath + "ConferenceSeries.txt", format="csv", sep="\t")
    .toDF(*conferenceSeriesColumns)
    .where(sf.col("NormalizedName").isin(targetConf))
    .select("ConferenceSeriesId")
)


# Load Papers.txt and keep only the papers that belong to the target
# conference series (inner join on ConferenceSeriesId).  Only the columns
# needed by the later per-year statistics are retained.
paperColumns = [
    "PaperId", "Rank", "Doi", "DocType", "PaperTitle", "OriginalTitle",
    "BookTitle", "Year", "Date", "Publisher", "JournalId",
    "ConferenceSeriesId", "ConferenceInstanceId", "Volume", "Issue",
    "FirstPage", "LastPage", "ReferenceCount", "CitationCount",
    "EstimatedCitationCount", "CreatedDate",
]
conferencePapers = (
    spark.read.load(rootpath + "Papers.txt", format="csv", sep="\t")
    .toDF(*paperColumns)
    .join(targetConferenceSeriesId, "ConferenceSeriesId", "inner")
    .select("PaperId", "Year", "CitationCount")
)


# Collect every reference edge [Conference Paper] -> [Referenced Paper].
# Joining on PaperId keeps only edges whose citing paper is a conference
# paper and attaches that paper's Year to the edge.
paperReferences = spark.read.load(rootpath + "PaperReferences.txt", format="csv", sep="\t")
conferenceReferencedPapers = (
    paperReferences.toDF("PaperId", "PaperReferenceId")
    .join(conferencePapers, "PaperId", "inner")
    .select("PaperId", "Year", "PaperReferenceId")
)


# Count the reference edges per year; the generated "count" column is
# exposed as ReferenceCount.
referenceEdgesByYear = conferenceReferencedPapers.groupBy("Year").count()
conferenceReferenceStats = referenceEdgesByYear.select(
    "Year",
    sf.col("count").alias("ReferenceCount"),
)


# Get the publication count and the total citation count for each year.
# Both metrics come from one aggregation over conferencePapers rather than
# two separate group-bys that are joined back together on Year.
# Papers without a Year are left out of the per-year statistics (a null Year
# can never satisfy an equi-join on Year, so such rows never reached the
# result of the two-step formulation either).
conferencePaperCitationStats = (
    conferencePapers
    .where(sf.col("Year").isNotNull())
    .groupby("Year")
    .agg(
        sf.count("*").alias("PublicationCount"),
        sf.sum("CitationCount").alias("CitationCount"),
    )
    # Keep the column order downstream code has always seen.
    .select("PublicationCount", "CitationCount", "Year")
)

# Combine the per-year publication/citation totals with the per-year
# reference totals and derive the per-paper averages.
# A left join keeps years whose papers have no rows in PaperReferences; the
# missing ReferenceCount is treated as 0 so the year still appears in the
# output with an AverageReferenceCount of 0 instead of silently vanishing.
# Spark SQL's `/` always performs fractional division, so the averages keep
# their decimals; float(...) only narrows the type of the quotient.
conferencePaperStats = (
    conferencePaperCitationStats
    .join(conferenceReferenceStats, "Year", "left")
    .sort("Year")
    .selectExpr(
        "Year as Year",
        "PublicationCount as PublicationCount",
        "float(CitationCount / PublicationCount) as AverageCitationCount",
        "float(coalesce(ReferenceCount, 0) / PublicationCount) as AverageReferenceCount",
    )
)


# Write the per-year statistics as CSV with a header row, replacing any
# earlier output at the same location.
statsOutputPath = outputDir + "conferencePaperStats.csv"
conferencePaperStats.write.csv(statsOutputPath, mode="overwrite", header="true")


# Stop the Spark context now that the results have been written.
sc.stop()